## Machine Learning

### Creating a logistic regression to predict absenteeism

In [3]:
# import relevant libraries
import numpy as np
import pandas as pd

In [4]:
# Load the preprocessed absenteeism data (the path is relative to this notebook)
data_preprocessed = pd.read_csv('../data/preprocessed.csv')
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,2015,July,Tuesday,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,2015,July,Tuesday,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,2015,July,Wednesday,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,2015,July,Thursday,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,2015,July,Thursday,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018,May,Wednesday,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,2018,May,Wednesday,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,2018,May,Thursday,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,2018,May,Thursday,235,16,32,237.656,25,1,0,0,2


### Create the targets

In [5]:
# A logistic regression needs a binary target, so two classes are created:
# moderately absent and excessively absent.
# The median value of 'Absenteeism Time in Hours' is used as the cut-off between them.
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [6]:
# Binary target built from the median cut-off:
#   0 = moderately absent  (hours <= median, i.e. <= 3 hours here)
#   1 = excessively absent (hours >  median, i.e. >= 4 hours here)
absence_hours = data_preprocessed['Absenteeism Time in Hours']
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
# Add the targets to the dataframe as a new 'Excessive Absenteeism' column.
# NOTE: this modifies data_preprocessed in place, so earlier displays of the frame are now stale.
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,2015,July,Tuesday,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,2015,July,Tuesday,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,2015,July,Wednesday,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,2015,July,Thursday,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,2015,July,Thursday,289,36,33,239.554,30,0,2,1,2,0


In [8]:
# Share of observations labelled 1 (excessively absent); a value near 0.5 means the classes are balanced
targets.mean()

0.45571428571428574

In [9]:
# Remove the raw 'Absenteeism Time in Hours' column: the target was derived from it,
# so it must not remain among the columns used as inputs
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,2015,July,Tuesday,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,2015,July,Tuesday,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,2015,July,Wednesday,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,2015,July,Thursday,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,2015,July,Thursday,289,36,33,239.554,30,0,2,1,0


## Selecting the inputs for the regression

In [10]:
# (rows, columns) of the frame that holds the inputs plus the target column
data_with_targets.shape

(700, 16)

In [11]:
# Select the inputs: every column except the last one ('Excessive Absenteeism').
# .copy() makes unscaled_inputs an independent frame, so assigning to its columns
# afterwards does not write into a slice of data_with_targets (SettingWithCopyWarning).
unscaled_inputs = data_with_targets.iloc[:, :-1].copy()
unscaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,2015,July,Tuesday,289,36,33,239.554,30,0,2,1
1,0,0,0,0,2015,July,Tuesday,118,13,50,239.554,31,0,1,0
2,0,0,0,1,2015,July,Wednesday,179,51,38,239.554,31,0,0,0
3,1,0,0,0,2015,July,Thursday,279,5,39,239.554,24,0,2,0
4,0,0,0,1,2015,July,Thursday,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018,May,Wednesday,179,22,40,237.656,22,1,2,0
696,1,0,0,0,2018,May,Wednesday,225,26,28,237.656,24,0,1,2
697,1,0,0,0,2018,May,Thursday,330,16,28,237.656,25,1,0,0
698,0,0,0,1,2018,May,Thursday,235,16,32,237.656,25,1,0,0


## Standardize the Data

In [12]:
#from sklearn.preprocessing import StandardScaler

# create an object to subtract the mean and divide by the SD featurewise
#absenteeism_scalar = StandardScaler()

In [13]:
# StandardScaler needs numeric inputs, so encode the 'Month' and 'Day' names as integers
# (January = 1 ... December = 12, Monday = 1 ... Sunday = 7).
MONTH_NAMES = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
DAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


def names_to_numbers(column, names):
    """Return `column` with each name replaced by its 1-based position in `names`.

    Values that are not in `names` are passed through unchanged before the numeric
    conversion, so already-encoded values survive and the cell can be re-run safely.
    The result has dtype int64.

    Raises
    ------
    ValueError
        If a value is neither one of `names` nor convertible to an integer.
    """
    number_of = {name: position for position, name in enumerate(names, start=1)}
    encoded = column.map(lambda value: number_of.get(value, value))
    return pd.to_numeric(encoded).astype('int64')


# assign() returns a new frame, so nothing is written into a possible slice of another frame
unscaled_inputs = unscaled_inputs.assign(
    Month=names_to_numbers(unscaled_inputs['Month'], MONTH_NAMES),
    Day=names_to_numbers(unscaled_inputs['Day'], DAY_NAMES),
)
unscaled_inputs.head(9)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,2015,7,2,289,36,33,239.554,30,0,2,1
1,0,0,0,0,2015,7,2,118,13,50,239.554,31,0,1,0
2,0,0,0,1,2015,7,3,179,51,38,239.554,31,0,0,0
3,1,0,0,0,2015,7,4,279,5,39,239.554,24,0,2,0
4,0,0,0,1,2015,7,4,289,36,33,239.554,30,0,2,1
5,0,0,0,1,2015,7,5,179,51,38,239.554,31,0,0,0
6,0,0,0,1,2015,7,5,361,52,28,239.554,27,0,1,4
7,0,0,0,1,2015,7,5,260,50,36,239.554,23,0,4,0
8,0,0,1,0,2015,7,1,155,12,34,239.554,25,0,2,0


In [27]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create a custom scaler so as to choose which input to standardize.
# this allows the exclusion of the dummie varaibles columns from being standardized
# alternatively, one can standardize the dataset before creating dummie variables.
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only selected columns of a DataFrame and leave the others untouched.

    A StandardScaler is fitted on `columns`; every other column (for example the
    dummy variables) is passed through unchanged.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default=True
        Forwarded to the underlying StandardScaler.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on `columns` only.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the fitted columns;
        None until `fit` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keep every constructor argument under its own name: BaseEstimator reads these
        # attributes for get_params(), clone() and the repr.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler only accepts its options as keyword arguments
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the selected columns of `X` and return self."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use the DataFrame methods for column-wise statistics: np.mean / np.var on a
        # DataFrame reduce over both axes in recent pandas. ddof=0 matches np.var.
        numeric = selected.astype('float64')
        self.mean_ = numeric.mean()
        self.var_ = numeric.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return `X` with the selected columns standardized.

        The original column order and the original index of `X` are preserved.
        """
        init_col_order = X.columns
        # Reuse X's index so the concat below aligns rows correctly even when X
        # does not have a default 0..n-1 index (e.g. a shuffled or filtered frame).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns], copy=copy),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [28]:
# List the input column names, to decide which of them should be standardized
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Year', 'Month',
       'Day', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [30]:
# Columns to standardize. The Reason_1..Reason_4 dummies and 'Education' are left out
# (presumably because they are 0/1 indicators — confirm for 'Education').
columns_to_scale = ['Year', 'Month', 'Day', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']

In [31]:
# Scaler that standardizes only the columns listed in columns_to_scale
absenteeism_scaler = CustomScaler(columns_to_scale)



In [32]:
# Fit the scaler on the inputs: the statistics of the columns to scale are computed and stored
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Year', 'Month', 'Day', 'Transportation Expense',
                      'Distance to Work', 'Age', 'Daily Work Load Average',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [33]:
# Transform the unscaled inputs with the statistics stored in absenteeism_scaler,
# i.e. subtract the mean and divide by the standard deviation of each scaled column
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,-1.556984,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,-1.556984,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,-1.556984,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,-1.556984,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,-1.556984,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,1.530520,-0.388293,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,1.530520,-0.388293,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,1.530520,-0.388293,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,1.530520,-0.388293,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


### Split the data into train & test and shuffle

In [34]:
#import relevant modules
from sklearn.model_selection import train_test_split


In [35]:
# Demonstration only: the result is not stored, and no random_state is given,
# so the shuffle (and therefore this output) differs on every run
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4      Year     Month       Day  \
 524         1         0         0         0  0.501352  1.039256  0.668253   
 448         0         0         0         1  0.501352 -0.102784 -1.359682   
 293         0         0         0         0 -0.527816  1.039256  0.668253   
 663         0         0         0         1  1.530520 -0.673803 -0.683704   
 518         0         0         0         1  0.501352  1.039256 -0.683704   
 ..        ...       ...       ...       ...       ...       ...       ...   
 389         0         0         0         1  0.501352 -1.244823  1.344231   
 561         1         0         0         0  0.501352  1.610276  0.668253   
 4           0         0         0         1 -1.556984  0.182726  0.668253   
 419         0         0         0         1  0.501352 -0.673803 -1.359682   
 584         0         0         0         1  1.530520 -1.530333  1.344231   
 
      Transportation Expense  Distance to Work       Age  \
 5

The output above consists of four arrays, in this order:
array 1: the training dataset with inputs
array 2: the test dataset with inputs
array 3: the training dataset with targets
array 4: the test dataset with targets

In [36]:
# Store the four outputs of the split: 80% of the rows go to training, the rest to testing.
# random_state fixes the shuffle so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20) 

In [37]:
# Check the shapes: inputs and targets must have the same number of rows within each split
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)


(560, 15) (560,)
(140, 15) (140,)


## Logistic regression with sklearn

In [38]:
from sklearn.linear_model import LogisticRegression
#for model evaluation
from sklearn import metrics

### Training the model

In [39]:
# Create a logistic regression object with scikit-learn's default settings
reg = LogisticRegression()

In [40]:
# Fit the regression on the training inputs and training targets
reg.fit(x_train,y_train)

LogisticRegression()

In [41]:
# Accuracy on the TRAINING data (share of training rows predicted correctly).
# This is not an estimate of out-of-sample performance; x_test / y_test are needed for that.
reg.score(x_train,y_train)

0.7714285714285715

### Manually check the accuracy

In [42]:
# Predict the class (0 or 1) of every training observation
model_outputs = reg.predict(x_train)
model_outputs # the predictions of the model

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [43]:
# Compare the predictions with the training targets element by element (True = correct prediction)
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [44]:
# Number of correct predictions: each True in the comparison counts as 1
(model_outputs == y_train).sum()

432

In [45]:
# Total number of predictions (one per training observation)
model_outputs.shape[0]

560

In [46]:
# Accuracy = correct predictions / number of observations,
# which is the mean of the True/False comparison
np.mean(model_outputs == y_train)

0.7714285714285715

To use this model outside of Python (for example in Tableau), we need the intercept and the coefficients, so that the model can be expressed as a function and used easily within Tableau.

#### Finding the intercept and coefficients

In [47]:
# Intercept (bias) of the fitted regression
reg.intercept_

array([-1.67495631])

In [48]:
# Coefficients (weights) of the fitted regression, one per input column
reg.coef_

array([[ 2.83349979,  0.99098895,  3.04180231,  0.85736119, -0.25656102,
         0.03664655, -0.08070659,  0.61206337, -0.02400319, -0.15411123,
        -0.07131645,  0.27805883, -0.11234437,  0.34852464, -0.27233454]])

In [49]:
# Column names of the inputs, to see which variable each coefficient refers to
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Year', 'Month',
       'Day', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [50]:
# Keep the column names in a variable so they can be reused when building the summary table
feature_name = unscaled_inputs.columns.values

In [51]:
# Pair every feature name with its fitted coefficient.
# reg.coef_ is a 2-D array with a single row, so that row holds one coefficient per feature.
summary_table = pd.DataFrame({
    'feature name': feature_name,
    'coefficient': reg.coef_[0],
})
summary_table

Unnamed: 0,feature name,coefficient
0,Reason_1,2.8335
1,Reason_2,0.990989
2,Reason_3,3.041802
3,Reason_4,0.857361
4,Year,-0.256561
5,Month,0.036647
6,Day,-0.080707
7,Transportation Expense,0.612063
8,Distance to Work,-0.024003
9,Age,-0.154111


In [52]:
# Put the intercept in row 0, ahead of the feature coefficients.
# The table is rebuilt from the fitted model instead of shifting the index in place,
# so running this cell more than once cannot add a second 'intercept' row.
summary_table = pd.DataFrame({
    'feature name': ['intercept', *feature_name],
    'coefficient': [reg.intercept_[0], *reg.coef_[0]],
})
summary_table

Unnamed: 0,feature name,coefficient
0,intercept,-1.674956
1,Reason_1,2.8335
2,Reason_2,0.990989
3,Reason_3,3.041802
4,Reason_4,0.857361
5,Year,-0.256561
6,Month,0.036647
7,Day,-0.080707
8,Transportation Expense,0.612063
9,Distance to Work,-0.024003


## Interpreting the coefficients

"""
Remember: coefficient = weight and intercept = bias. A weight shows how much importance the model gives to a certain input; the closer a weight is to zero, the smaller its influence.
This holds for models where all variables are on the same scale, like this model.
Standardized coefficients are simply the coefficients of a regression in which all variables have been standardized.
It is advisable to standardize the variables, as was done in this model.
In logistic regression, the coefficients are expressed in log(odds): the model is by default a linear function that predicts the log(odds).
These log(odds) are later transformed into zeros and ones.
"""

In [53]:
# Exponentiate the coefficients (log-odds) to obtain odds ratios, which are easier to interpret
summary_table['odds_ratio'] = np.exp(summary_table.coefficient)
summary_table

Unnamed: 0,feature name,coefficient,odds_ratio
0,intercept,-1.674956,0.187316
1,Reason_1,2.8335,17.00487
2,Reason_2,0.990989,2.693897
3,Reason_3,3.041802,20.942955
4,Reason_4,0.857361,2.356933
5,Year,-0.256561,0.773708
6,Month,0.036647,1.037326
7,Day,-0.080707,0.922464
8,Transportation Expense,0.612063,1.844233
9,Distance to Work,-0.024003,0.976283


In [54]:
# Show the table sorted by 'odds_ratio', largest first (summary_table itself is not modified)
summary_table.sort_values('odds_ratio', ascending = False)


Unnamed: 0,feature name,coefficient,odds_ratio
3,Reason_3,3.041802,20.942955
1,Reason_1,2.8335,17.00487
2,Reason_2,0.990989,2.693897
4,Reason_4,0.857361,2.356933
8,Transportation Expense,0.612063,1.844233
14,Children,0.348525,1.416975
12,Body Mass Index,0.278059,1.320564
6,Month,0.036647,1.037326
9,Distance to Work,-0.024003,0.976283
11,Daily Work Load Average,-0.071316,0.931167


If a coefficient is around zero, or equivalently its odds ratio is close to 1, the corresponding feature is not particularly important.