# We will use a logistic regression using the revised variables as features to predict our target
## Target = absent hours

In [1]:
# Define moderately absent vs. excessively absent (our target)
# we will use the median value of 'absenteeism in hours' and use it as the cut off line because 
# median is numerically stable and rigid
# below the median = normal ; above the median = excessive 

In [2]:
import pandas as pd
preprocessed_data = pd.read_csv('Absenteeism_preprocessed.csv')
preprocessed_data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2


In [3]:
preprocessed_data['Absenteeism Time in Hours'].median()

3.0

In [4]:
# Now we have our cutoff: Absenteeism time over 3 hours will be considered excessive
# We can create a new column that indicates whether excessive absenteeism has been met in that sample

import numpy as np
targets = np.where(preprocessed_data['Absenteeism Time in Hours'] > 3, 1, 0)
# parameters (condition, if true, if false)
preprocessed_data['Excessive Absenteeism'] = targets

# By using the median we implicitly balance the dataset (half targets = 1, half targets = 0)
# This will prevent our modell from learning to output only 0s or only 1s

preprocessed_data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4,1
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2,0
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4,1
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2,0


In [5]:
# check if dataset is balanced
targets.sum() / targets.shape[0]

0.45571428571428574

### 46% 1 ; 54% 0

# Drop Unnecessary Columns

In [6]:
# we drop "Daily Work Load Average," "Day of the Week," and "Distance to Work," columns before standardizing
# Furthermore, we have to exclude dummy variables from standardization to retain interpretability

In [7]:
data_with_targets = preprocessed_data.drop(['Absenteeism Time in Hours', 'Daily Work Load Average', 'Day of the Week', 'Distance to Work'], axis=1)
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,False,False,False,True,7,289,33,30,0,2,1,1
1,False,False,False,False,7,118,50,31,0,1,0,0
2,False,False,False,True,7,179,38,31,0,0,0,0
3,True,False,False,False,7,279,39,24,0,2,0,1
4,False,False,False,True,7,289,33,30,0,2,1,0


In [8]:
# checkpoint check
data_with_targets is preprocessed_data
# If false then the two dataframes are different and this is indeed a checkpoint

False

# Select inputs for Logistic Regression

In [9]:
unscaled_inputs = data_with_targets.iloc[:, :-1]
# above line selects all rows and columns until the last one
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

# Standardize appropriate variables

In [10]:
# create a custom scaler 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin


class CustomScaler(BaseEstimator,TransformerMixin): 

    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy=copy,with_mean=with_mean,with_std=with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method which is based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale the select features
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [11]:
# divide unscaled inputs into columns to include and columns to exclude in standardizations
# ( we exclude all the dummy variables to preserve interpretability
columns_to_exclude = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [12]:
# use list comprehension to create columns to scale list
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_exclude]

In [13]:
# scale the selected columns and fit
absenteeism_scaler = CustomScaler(columns_to_scale)

In [14]:
# fit using mean and stdev and store in the absenteeism scaler object
absenteeism_scaler.fit(unscaled_inputs)

AttributeError: 'CustomScaler' object has no attribute 'copy'

AttributeError: 'CustomScaler' object has no attribute 'copy'

AttributeError: 'CustomScaler' object has no attribute 'copy'