## Creating a Logistic regression to predict absenteeism

### Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd

### Load the data

In [2]:
data_preprocessed= pd.read_csv('Abseteeism_preprocessed.csv')

In [3]:
# Drop the 'Unnamed: 0' column (presumably an index that was saved into the CSV — verify).
# errors='ignore' keeps the cell idempotent: re-running it after the column has
# already been removed no longer raises KeyError.
data_preprocessed= data_preprocessed.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Values,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Create the Targets

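We will classify our targets into two classes: Moderately absent (0) and Excessively absent (1).
Observations at or below the median absenteeism time are labelled Moderately absent, and those strictly above the median are labelled Excessively absent.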

In [5]:
# Binary target: 1 when the absence time is strictly above the median, otherwise 0
absence_hours= data_preprocessed['Absenteeism Time in Hours']
median= absence_hours.median()
targets= np.where(absence_hours > median, 1, 0)

In [6]:
data_preprocessed['targets']= targets

### A comment on the target

In [7]:
# Share of observations labelled 1; a split of roughly 45-55 is usually balanced enough
targets.mean()

0.45571428571428574

In [8]:
# Checkpoint: keep the features plus the binary target, without the raw hours column
data_with_targets= data_preprocessed.drop(columns=['Absenteeism Time in Hours'])
data_with_targets

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Values,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,targets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,0


In [9]:
# data_with_targets is data_preprocessed

### Selecting inputs for the regression

In [10]:
data_with_targets.shape

(700, 15)

In [11]:
# Inputs are every column except the target. Selecting by name instead of a
# hard-coded position (:14) keeps this cell correct if features are added or removed.
unscaled_inputs= data_with_targets.drop(columns=['targets'])
unscaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Values,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


### Standardize the data

In [12]:
# This is manual Scaler method, but we use CustomScaler
# from sklearn.preprocessing import StandardScaler

# # absenteeism_scaler.fit(unscaled_inputs)
# # This is no more an empty object

# Fitting StandardScaler on every column would also standardize the dummy variables, which destroys their 0/1 meaning, so we need to standardize the non-dummy features only.
# absenteeism_scaler= StandardScaler()
# This is an empty object

# We will omit the dummies:
# standardize_inputs_columns= [ 'Day of the Week', 'Transportation Expense',
#                              'Age', 'Body Mass Index','Children', 'Pets']

# standardize_inputs= unscaled_inputs[standardize_inputs_columns]

# absenteeism_scaler.fit(standardize_inputs)
# scaled_columns_inputs= absenteeism_scaler.transform(standardize_inputs)

# scaled_inputs= unscaled_inputs.drop(standardize_inputs_columns, axis= 1)

# scaled_drops= pd.DataFrame(columns= standardize_inputs_columns, data= scaled_columns_inputs)

# scaled_inputs[standardize_inputs_columns]= scaled_drops[standardize_inputs_columns]

# columns=unscaled_inputs.columns.values

# scaled_inputs= scaled_inputs[columns]

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    Wraps ``StandardScaler`` so that the columns listed in ``columns`` are
    standardized while all other columns (e.g. 0/1 dummies) pass through
    unchanged. The column order of the input frame is preserved.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy : bool, default=True
        Forwarded to ``StandardScaler(copy=...)``.
    with_maen : bool, default=True
        Forwarded to ``StandardScaler(with_mean=...)``. The misspelled name is
        kept so that existing keyword callers keep working.
    with_std : bool, default=True
        Forwarded to ``StandardScaler(with_std=...)``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``columns`` only.
    mean, var : pandas.Series or None
        Per-column mean and population variance of ``columns`` seen in ``fit``;
        ``None`` until ``fit`` is called.
    """

    def __init__(self, columns, copy= True, with_maen= True, with_std= True):
        # BaseEstimator.get_params() / clone() / repr() read every __init__
        # argument back from an attribute of the same name, so store them all.
        self.columns= columns
        self.copy= copy
        self.with_maen= with_maen
        self.with_std= with_std
        self.scaler= StandardScaler(copy= copy, with_mean= with_maen, with_std= with_std)
        self.mean= None
        self.var= None

    def fit(self, X, y= None):
        """Learn the scaling statistics of ``self.columns`` from DataFrame ``X``.

        Returns ``self`` so calls can be chained.
        """
        selected= X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions: np.mean/np.var on a DataFrame pass
        # axis=None, whose meaning differs between pandas versions.
        # ddof=0 matches the population variance that np.var computes.
        self.mean= selected.mean(axis= 0)
        self.var= selected.var(axis= 0, ddof= 0)
        return self

    def transform(self, X, y= None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        The result keeps the row index and the column order of ``X``.
        """
        init_col_order= X.columns
        # Reuse X's index: pd.concat(axis=1) aligns on the index, so a fresh
        # RangeIndex would misalign rows (and introduce NaNs) whenever X has a
        # non-default index, e.g. after a shuffled train/test split.
        X_scaled= pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns= self.columns,
            index= X.index,
        )
        X_not_scaled= X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
        

In [14]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Values',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [15]:
#columns_to_scale= [ 'Day of the Week', 'Transportation Expense', 'Age', 'Body Mass Index','Children', 'Pets']

columns_to_omit= ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [16]:
columns_to_scale= [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [17]:
absenteeism_scaler= CustomScaler(columns_to_scale)

In [18]:
# Fit the scaler on the columns chosen above, then standardize those columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split that follows,
# so the test rows contribute to the scaling statistics — consider fitting on the training rows only.
absenteeism_scaler.fit(unscaled_inputs)
scaled_inputs=absenteeism_scaler.transform(unscaled_inputs) 

In [19]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Values,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


### Split the data into train & test and shuffle

#### import the relevant module

In [20]:
from sklearn.model_selection import train_test_split

#### Split

In [21]:
# train_test_split(inputs, targets, train_size, shuffle, random_state) splits both arrays using the same shuffled row selection.
# It returns 4 objects in this order: training inputs, test inputs, training targets, test targets.
# random_state is fixed so that the shuffle (and therefore the split) is reproducible.
x_train, x_test, y_train, y_test= train_test_split(scaled_inputs, targets, train_size= 0.8, shuffle= True, random_state= 120)

In [22]:
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [23]:
print(x_test.shape, y_test.shape)

(140, 14) (140,)


# Modeling

### Logistic regression with sklearn

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics   # useful for evaluating model

### Training the model

In [25]:
reg= LogisticRegression()
reg.fit(x_train, y_train)

LogisticRegression()

In [26]:
# Model accuracy
reg.score(x_train, y_train)

0.7678571428571429

In [27]:
# Checking accuracy manually

In [28]:
model_outputs=reg.predict(x_train)

In [29]:
# model_outputs == y_train

In [30]:
# Fraction of training rows predicted correctly; the result is exactly the same as reg.score of sklearn
np.mean(model_outputs == y_train)

0.7678571428571429

### Finding the intercept and coefficients

In [31]:
reg.intercept_

array([-1.97766912])

In [32]:
reg.coef_

array([[ 2.92224312,  1.66511081,  1.66511081,  1.2255857 ,  0.11528594,
        -0.20847351,  0.78472433, -0.07390694, -0.27591319,  0.05386436,
         0.32688999,  0.18619675,  0.46380254, -0.35843023]])

In [33]:
feature_name= unscaled_inputs.columns.values
feature_name

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Values',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [34]:
# One row per feature: its name and the fitted weight (reg.coef_ has shape (1, n_features))
summary_table= pd.DataFrame({
    'Features': feature_name,
    'Coefficients': reg.coef_.ravel(),
})

summary_table

Unnamed: 0,Features,Coefficients
0,Reason_1,2.922243
1,Reason_2,1.665111
2,Reason_3,1.665111
3,Reason_4,1.225586
4,Month Values,0.115286
5,Day of the Week,-0.208474
6,Transportation Expense,0.784724
7,Distance to Work,-0.073907
8,Age,-0.275913
9,Daily Work Load Average,0.053864


In [35]:
# Put the intercept in row 0 by shifting the feature rows down by one.
# The guard makes the cell idempotent: re-running it no longer shifts the index
# a second time or appends a duplicate 'Intercept' row.
if 'Intercept' not in summary_table['Features'].values:
    summary_table.index= summary_table.index + 1
    summary_table.loc[0]= ['Intercept', reg.intercept_[0]]
    summary_table= summary_table.sort_index()
summary_table

Unnamed: 0,Features,Coefficients
0,Intercept,-1.977669
1,Reason_1,2.922243
2,Reason_2,1.665111
3,Reason_3,1.665111
4,Reason_4,1.225586
5,Month Values,0.115286
6,Day of the Week,-0.208474
7,Transportation Expense,0.784724
8,Distance to Work,-0.073907
9,Age,-0.275913


In [36]:
summary_table['Odds_ratio']= np.exp(summary_table.Coefficients)
summary_table

Unnamed: 0,Features,Coefficients,Odds_ratio
0,Intercept,-1.977669,0.138391
1,Reason_1,2.922243,18.582924
2,Reason_2,1.665111,5.286259
3,Reason_3,1.665111,5.286259
4,Reason_4,1.225586,3.40616
5,Month Values,0.115286,1.122194
6,Day of the Week,-0.208474,0.811823
7,Transportation Expense,0.784724,2.191803
8,Distance to Work,-0.073907,0.928758
9,Age,-0.275913,0.758879


In [37]:
summary_table.sort_values('Odds_ratio', ascending= False)

Unnamed: 0,Features,Coefficients,Odds_ratio
1,Reason_1,2.922243,18.582924
2,Reason_2,1.665111,5.286259
3,Reason_3,1.665111,5.286259
4,Reason_4,1.225586,3.40616
7,Transportation Expense,0.784724,2.191803
13,Children,0.463803,1.590109
11,Body Mass Index,0.32689,1.386649
12,Education,0.186197,1.204659
5,Month Values,0.115286,1.122194
10,Daily Work Load Average,0.053864,1.055341


A feature is not particularly important if:
    1) its coefficient is close to zero,
    2) its odds ratio is close to 1.

The further a coefficient is from zero, the bigger its importance.

Education, Month Values, Daily Work Load Average, and Distance to Work are the least important features.

Reason_1 has an odds ratio of 18.58, which means that the odds of being excessively absent for somebody whose reason falls under Reason_1 (a disease) are about 19 times higher than for the baseline, reason_0 (no reason reported).

Standardization often improves accuracy, but it has the drawback of making the coefficients harder to interpret.

Machine Learning engineers: High accuracy, so they do standardization.

Econometricians and Statisticians: Less accuracy but more interpretability; they care about the underlying reasons behind different phenomena.

Data Scientists: Either case; sometimes they need higher accuracy, and sometimes high interpretability and finding the main drivers of a problem.

The intercept/bias calibrates the model.

### Backward elimination

The idea is that we can simplify our model by removing all features which have close to no contribution to the model.

When we have p-values, we get rid of all coefficients with a p-value > 0.05.

sklearn does not report p-values; the rationale is that if a weight is small enough, the feature won't make a difference anyway.

We need to omit the features (Month Values, Daily Work Load Average, Distance to Work) and then run the regression again.

A simpler model is preferable. After the omission, we check the accuracy; it does not change much.

We omitted those features and then fitted the regression once more.

### Testing the Model

When we test the model, we are not allowed to touch the model anymore.

In [38]:
# Accuracy
reg.score(x_test, y_test)

0.7142857142857143

Based on data that the model has never seen before, the model correctly predicts whether a person is going to be excessively absent in 71.4% of the cases.

The test accuracy is usually lower than the train accuracy, because the model was fitted to the training data. If it is higher, we either got lucky or made a mistake.

We can use reg.predict() to get predicts.

Instead, we use reg.predict_proba() to get probability estimates for all possible outputs(0,1).

In [39]:
predicted_proba= reg.predict_proba(x_test)
predicted_proba

array([[0.16279821, 0.83720179],
       [0.71665309, 0.28334691],
       [0.41647435, 0.58352565],
       [0.73893294, 0.26106706],
       [0.74629043, 0.25370957],
       [0.65084312, 0.34915688],
       [0.88265168, 0.11734832],
       [0.43365416, 0.56634584],
       [0.79210283, 0.20789717],
       [0.87903818, 0.12096182],
       [0.85802432, 0.14197568],
       [0.64318112, 0.35681888],
       [0.56917787, 0.43082213],
       [0.31341701, 0.68658299],
       [0.54969432, 0.45030568],
       [0.59450817, 0.40549183],
       [0.4169549 , 0.5830451 ],
       [0.61756655, 0.38243345],
       [0.1032383 , 0.8967617 ],
       [0.41510589, 0.58489411],
       [0.42027177, 0.57972823],
       [0.73442677, 0.26557323],
       [0.80022782, 0.19977218],
       [0.0873305 , 0.9126695 ],
       [0.44987282, 0.55012718],
       [0.07821319, 0.92178681],
       [0.11948094, 0.88051906],
       [0.21249891, 0.78750109],
       [0.21757158, 0.78242842],
       [0.58440919, 0.41559081],
       [0.

The output contains two columns:

The first column is the probability of the model assigning zero to the observation
The second column is for the output 1.

In reality, logistic regression models calculate these probabilities in the background. If the probability of class 1 is below 0.5, the model outputs 0, and if it is above 0.5, it outputs 1.
We need the probability of excessive absenteeism, so we take the second column, the one for output 1.


In [40]:
predicted_proba[:, 1]

array([0.83720179, 0.28334691, 0.58352565, 0.26106706, 0.25370957,
       0.34915688, 0.11734832, 0.56634584, 0.20789717, 0.12096182,
       0.14197568, 0.35681888, 0.43082213, 0.68658299, 0.45030568,
       0.40549183, 0.5830451 , 0.38243345, 0.8967617 , 0.58489411,
       0.57972823, 0.26557323, 0.19977218, 0.9126695 , 0.55012718,
       0.92178681, 0.88051906, 0.78750109, 0.78242842, 0.41559081,
       0.75662356, 0.82353666, 0.16301574, 0.46452211, 0.6973517 ,
       0.51928112, 0.38807661, 0.16527457, 0.14673797, 0.48383914,
       0.9252106 , 0.61756526, 0.40025988, 0.43955693, 0.48902291,
       0.5702002 , 0.12744706, 0.59311889, 0.91286281, 0.25392329,
       0.84392562, 0.61382676, 0.59311889, 0.92178681, 0.2717068 ,
       0.30175659, 0.35593684, 0.14414952, 0.92145802, 0.48832165,
       0.18208356, 0.14465408, 0.59872676, 0.69882836, 0.13298396,
       0.33841425, 0.28337246, 0.04450854, 0.28807105, 0.29663208,
       0.22908824, 0.21139591, 0.21984658, 0.09132196, 0.58880

### Save the model

pickle is a Python module used to convert a Python object into a byte stream (serialization), so that it can be saved to a file and loaded back later.

In [41]:
import pickle

In [42]:
# Serialize the fitted logistic regression into a file named 'model' in the working directory.
# 'wb' opens the file for binary writing, which pickle requires.
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [43]:
# Serialize the fitted CustomScaler into a file named 'scaler' so new data can be scaled the same way.
# NOTE(review): pickle stores classes by reference, so loading this file needs a CustomScaler
# class to be importable in the loading code — presumably the deployment module defines it; verify.
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)

In [44]:
# We create a module to deploy it. Storing code in a module will allow us to reuse it without trouble.