### Creating a logistic regression to predict absenteeism

In [1]:
import pandas as pd
import numpy as np

***Load the data***

In [2]:
data_preprocessed = pd.read_csv('../data/df_preprocessed_absenteeism.csv')
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


#### Create the targets

In [3]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [4]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 
                   1'''in case of true''', 
                   0 '''in case of false''')

In [6]:
targets[:10]

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1])

In [7]:
data_preprocessed['Excessive Absenteeism'] = targets

In [8]:
data_preprocessed[:3]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0


#### A comment on the targets

In [9]:
targets.sum()/ targets.shape[0]   # balance groups

0.45571428571428574

In [10]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'],axis=1)

In [11]:
data_with_targets is data_preprocessed

False

In [12]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


#### Select the inputs for the regression

In [13]:
data_with_targets.shape

(700, 15)

In [16]:
data_with_targets.iloc[:,:-1].head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1


In [17]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

#### Standardize the data

In [63]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self,columns,copy=True,with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
    
    def fit(self,X,y=None):
        self.scaler.fit(X[self.columns],y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled,X_scaled],axis=1)[init_col_order]

In [56]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [57]:
columns_to_scale = ['Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets']

In [64]:
#from sklearn.preprocessing import StandardScaler

absenteeism_scaler = CustomScaler(columns_to_scale)

In [65]:
# will calculate and store the mean and the standard deviation
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Month Value', 'Day of the Week',
                      'Transportation Expense', 'Distance to Work', 'Age',
                      'Daily Work Load Average', 'Body Mass Index', 'Education',
                      'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [66]:
# this will scale the unscaled inputs
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [21]:
scaled_inputs.shape

(700, 14)

In [67]:
scaled_inputs[:2]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.44798,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.44798,-0.01928,-0.58969


#### Split the data into train & test and shuffle

In [23]:
from sklearn.model_selection import train_test_split

In [68]:
# Shuffle = True(default)
X_train,X_test,y_train,y_test = train_test_split(scaled_inputs,targets,
                                                 test_size= 0.2,random_state=42)

In [69]:
X_train.shape, y_train.shape

((560, 14), (560,))

In [70]:
X_test.shape, y_test.shape

((140, 14), (140,))

#### Logistic Regression with Sklearn

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [72]:
reg = LogisticRegression()

In [73]:
reg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [74]:
reg.score(X_train,y_train)

0.7660714285714286

In [76]:
#reg.score(X_test,y_test)

#### Manually check the accuracy

In [77]:
model_outputs = reg.predict(X_train)
model_outputs

array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,

In [78]:
np.sum((model_outputs == y_train))

429

In [79]:
model_outputs.shape[0]

560

In [80]:
np.sum((model_outputs == y_train))/model_outputs.shape[0]

0.7660714285714286

In [81]:
reg.intercept_, reg.coef_

(array([-1.7575216]),
 array([[ 2.92993196,  0.73453473,  3.07517417,  0.99389691,  0.07923215,
         -0.15754023,  0.67570676, -0.05915557, -0.25914578, -0.02183528,
          0.24334102, -0.10857391,  0.41638209, -0.31223952]]))

In [82]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [83]:
feature_name = unscaled_inputs.columns.values

In [84]:
summary_table = pd.DataFrame(columns=['Feature name'],data=feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.929932
1,Reason_2,0.734535
2,Reason_3,3.075174
3,Reason_4,0.993897
4,Month Value,0.079232
5,Day of the Week,-0.15754
6,Transportation Expense,0.675707
7,Distance to Work,-0.059156
8,Age,-0.259146
9,Daily Work Load Average,-0.021835


In [85]:
summary_table.index = summary_table.index +1

In [86]:
summary_table.loc[0] = ['Intercept',reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.757522
1,Reason_1,2.929932
2,Reason_2,0.734535
3,Reason_3,3.075174
4,Reason_4,0.993897
5,Month Value,0.079232
6,Day of the Week,-0.15754
7,Transportation Expense,0.675707
8,Distance to Work,-0.059156
9,Age,-0.259146


In [87]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.757522,0.172472
1,Reason_1,2.929932,18.726356
2,Reason_2,0.734535,2.084512
3,Reason_3,3.075174,21.653653
4,Reason_4,0.993897,2.701742
5,Month Value,0.079232,1.082456
6,Day of the Week,-0.15754,0.854242
7,Transportation Expense,0.675707,1.965422
8,Distance to Work,-0.059156,0.94256
9,Age,-0.259146,0.771711


In [88]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,3.075174,21.653653
1,Reason_1,2.929932,18.726356
4,Reason_4,0.993897,2.701742
2,Reason_2,0.734535,2.084512
7,Transportation Expense,0.675707,1.965422
13,Children,0.416382,1.516465
11,Body Mass Index,0.243341,1.275504
5,Month Value,0.079232,1.082456
10,Daily Work Load Average,-0.021835,0.978401
8,Distance to Work,-0.059156,0.94256


##### Backward elimination
* removing the less significant features and running the model again

In [90]:
data_preprocessed.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours',
       'Excessive Absenteeism'], dtype=object)

In [91]:
data_with_targets_1 = data_preprocessed.drop(['Day of the Week', 'Distance to Work',
       'Age', 'Daily Work Load Average','Absenteeism Time in Hours'],axis=1)

In [92]:
data_with_targets_1.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,30,0,2,1,1
1,0,0,0,0,7,118,31,0,1,0,0
2,0,0,0,1,7,179,31,0,0,0,0
3,1,0,0,0,7,279,24,0,2,0,1
4,0,0,0,1,7,289,30,0,2,1,0


In [93]:
unscaled_inputs_1 = data_with_targets_1.iloc[:,:-1]

In [94]:
unscaled_inputs_1.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [95]:
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [97]:
columns_to_scale_1 = [x for x in unscaled_inputs_1.columns.values if x not in columns_to_omit]

In [98]:
absenteeism_scaler = CustomScaler(columns_to_scale_1)
absenteeism_scaler.fit(unscaled_inputs_1)
scaled_inputs_1 = absenteeism_scaler.transform(unscaled_inputs_1)

***Fit the Model***

In [101]:
#from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(scaled_inputs_1,targets,train_size=0.8,random_state=42)

In [102]:
#from sklearn.linear_model import LogisticRegression
#from sklearn import metrics
reg_log = LogisticRegression().fit(X_train,y_train)

In [104]:
reg_log.score(X_train,y_train)

0.7607142857142857

In [105]:
feature_name_1 = unscaled_inputs_1.columns.values

In [106]:
summary_table = pd.DataFrame(columns=['Feature name'],data=feature_name_1)
summary_table['Coefficient'] = np.transpose(reg_log.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.924847
1,Reason_2,0.721239
2,Reason_3,3.099221
3,Reason_4,1.033398
4,Month Value,0.078742
5,Transportation Expense,0.67582
6,Body Mass Index,0.140898
7,Education,-0.189411
8,Children,0.348226
9,Pets,-0.248117


In [107]:
summary_table.index = summary_table.index +1
summary_table.loc[0] = ['Intercept',reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.757522
1,Reason_1,2.924847
2,Reason_2,0.721239
3,Reason_3,3.099221
4,Reason_4,1.033398
5,Month Value,0.078742
6,Transportation Expense,0.67582
7,Body Mass Index,0.140898
8,Education,-0.189411
9,Children,0.348226


In [108]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table.sort_values('Odds_ratio', ascending=False)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.757522,0.172472
1,Reason_1,2.924847,18.631368
2,Reason_2,0.721239,2.056981
3,Reason_3,3.099221,22.180676
4,Reason_4,1.033398,2.8106
5,Month Value,0.078742,1.081925
6,Transportation Expense,0.67582,1.965645
7,Body Mass Index,0.140898,1.151307
8,Education,-0.189411,0.827447
9,Children,0.348226,1.416552


#### Testing the model

In [109]:
reg_log.score(X_test,y_test)

0.7428571428571429

In [111]:
predicted_proba = reg_log.predict_proba(X_test)
predicted_proba

array([[0.810489  , 0.189511  ],
       [0.84969272, 0.15030728],
       [0.75786909, 0.24213091],
       [0.59937559, 0.40062441],
       [0.51408241, 0.48591759],
       [0.10350895, 0.89649105],
       [0.58682637, 0.41317363],
       [0.40617759, 0.59382241],
       [0.73877445, 0.26122555],
       [0.74605176, 0.25394824],
       [0.86349198, 0.13650802],
       [0.66470279, 0.33529721],
       [0.30640958, 0.69359042],
       [0.48436169, 0.51563831],
       [0.76456263, 0.23543737],
       [0.50514921, 0.49485079],
       [0.90430082, 0.09569918],
       [0.16375997, 0.83624003],
       [0.86612038, 0.13387962],
       [0.52530531, 0.47469469],
       [0.75803464, 0.24196536],
       [0.75028749, 0.24971251],
       [0.72172669, 0.27827331],
       [0.66969464, 0.33030536],
       [0.81059973, 0.18940027],
       [0.13763137, 0.86236863],
       [0.5365027 , 0.4634973 ],
       [0.66286544, 0.33713456],
       [0.76675406, 0.23324594],
       [0.54208834, 0.45791166],
       [0.

In [114]:
predicted_proba[:,1]

array([0.189511  , 0.15030728, 0.24213091, 0.40062441, 0.48591759,
       0.89649105, 0.41317363, 0.59382241, 0.26122555, 0.25394824,
       0.13650802, 0.33529721, 0.69359042, 0.51563831, 0.23543737,
       0.49485079, 0.09569918, 0.83624003, 0.13387962, 0.47469469,
       0.24196536, 0.24971251, 0.27827331, 0.33030536, 0.18940027,
       0.86236863, 0.4634973 , 0.33713456, 0.23324594, 0.45791166,
       0.11440659, 0.13923205, 0.54424774, 0.52685835, 0.23950825,
       0.68898527, 0.27827331, 0.13134394, 0.82581181, 0.16723335,
       0.50609112, 0.24552418, 0.58293329, 0.12875116, 0.21755037,
       0.67119383, 0.71766886, 0.87327171, 0.26933365, 0.13129414,
       0.25394824, 0.27378075, 0.48591759, 0.92522942, 0.13655949,
       0.21755037, 0.96916185, 0.23950825, 0.89777684, 0.19991943,
       0.68660828, 0.1167042 , 0.48720939, 0.59382241, 0.13650802,
       0.34725512, 0.65845407, 0.07675261, 0.23141462, 0.58057363,
       0.25394824, 0.23324594, 0.74971435, 0.27378075, 0.18259

***Saving the model***

In [116]:
import pickle

with open('model_logistic','wb') as file:
    pickle.dump(reg_log,file)

In [117]:
with open('model_log_StandardScaler','wb') as file:
    pickle.dump(absenteeism_scaler,file)