# Absenteeism Exercise Machine Learning with CustomScaler

#### Load the preprocessed data

In [1]:
import numpy as np
import pandas as pd

In [2]:
data_preprocessed = pd.read_csv('Absenteeism-preprocessed.csv')
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2


#### Create the targets

In [3]:
# Binary target: 1 when the hours of absence exceed the column median, otherwise 0.
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

In [4]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [5]:
data_preprocessed['Excessive Absenteeism'] = targets

In [6]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


#### A comment on the targets

In [7]:
targets.sum()

319

In [8]:
targets.shape

(700,)

In [9]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [10]:
# The raw hours column is replaced by the binary target, so drop it from the working copy.
data_with_targets = data_preprocessed.drop(columns = ['Absenteeism Time in Hours'])

In [11]:
data_with_targets is data_preprocessed

False

In [12]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


#### Select the inputs for the regression

In [13]:
data_with_targets.shape

(700, 15)

In [14]:
data_with_targets.iloc[:, :14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


In [15]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

#### Standardizing only the numerical variables

When we standardized the inputs earlier, we standardized every feature, including the dummy variables. This is bad practice, because once a dummy is standardized it loses all of its interpretability.

If we had left them as 0s and 1s, we could have said that for a unit change it is 7.92 times more likely that a person will be excessively absent. In the dummy-variable universe, a unit change means going from disregarding this dummy to taking only this dummy into account.

So if the reason given is Reason_1, we could have said that it is 7.92 times more likely that the person will be excessively absent compared to when no reason is given.

However, we standardized the reasons, and now a unit change is completely uninterpretable. The predictive power of the model is still valid and it is a good classifier, but we don't know how the different reasons compare.

This is a problem, since those are the most important features; it brings us to a correction of the code.

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# this is a CustomScaler based on the StandardScaler from sklearn
# however, when we declare the scaler object, there is an extra argument: the columns to scale
# so this CustomScaler won't standardize all inputs, only the ones we choose
# in this way we will be able to leave the dummies untouched

# in practice we would avoid this step by standardizing prior to creating the dummies

# first we have the CustomScaler class, which works no differently from the standard scaler
# second we must choose the columns to be scaled

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only the selected columns of a DataFrame.

    The columns named in ``columns`` are passed through a wrapped
    ``StandardScaler``; every other column is returned unchanged, and the
    original column order is restored in the output.

    Parameters
    ----------
    columns : list of column labels to standardize.
    copy, with_mean, with_std : forwarded to ``StandardScaler``.

    Attributes
    ----------
    scaler : the wrapped ``StandardScaler`` instance.
    mean_, var_ : per-column mean and population variance (ddof = 0) of the
        selected columns, set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy = True, with_mean = True, with_std = True):
        # Keep every constructor argument under its own name so that
        # get_params(), repr() and clone() see the real values instead of None.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass by keyword: recent scikit-learn releases reject positional
        # arguments for StandardScaler.
        self.scaler = StandardScaler(copy = copy, with_mean = with_mean, with_std = with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y = None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use explicit column-wise reductions: np.mean / np.var on a DataFrame
        # collapse to a single scalar on newer pandas versions.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof = 0)
        return self

    def transform(self, X, y = None, copy = None):
        """Return ``X`` with the selected columns standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index so that the concat below aligns row by row even when
        # X does not carry a default 0..n-1 index (e.g. a shuffled subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns = self.columns,
            index = X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis = 1)[init_col_order]

In [17]:
# this is the unscaled_inputs variable

unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [18]:
# Names of the features handed to the scaler; the Reason_* dummies are left
# out of the list so that they are not standardized.
# NOTE(review): 'Education' is scaled here, whereas the later cell that builds
# columns_to_omit keeps it unscaled - confirm that the difference is intended.
columns_to_scale = [
    'Month Value',
    'Day of the Week',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pet',
]

In [19]:
# declare the absenteeism scaler

absenteeism_scaler = CustomScaler(columns_to_scale) 

In [20]:
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month Value', 'Day of the Week',
                      'Transportation Expense', 'Distance to Work', 'Age',
                      'Daily Work Load Average', 'Body Mass Index', 'Education',
                      'Children', 'Pet'],
             copy=None, with_mean=None, with_std=None)

In [21]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [22]:
# all the dummies have remained untouched and that was the whole point

scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.030796,-0.800950,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.447980,0.880469,0.268487
1,0,0,0,0,0.030796,-0.800950,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.447980,-0.019280,-0.589690
2,0,0,0,1,0.030796,-0.232900,-0.654143,1.426749,0.248310,-0.806331,1.002633,-0.447980,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.335149,0.854936,-1.682647,0.405184,-0.806331,-0.643782,-0.447980,0.880469,-0.589690
4,0,0,0,1,0.030796,0.335149,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.568019,-0.232900,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,2.232242,0.880469,-0.589690
696,1,0,0,0,-0.568019,-0.232900,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,-0.447980,-0.019280,1.126663
697,1,0,0,0,-0.568019,0.335149,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,2.232242,-0.919030,-0.589690
698,0,0,0,1,-0.568019,0.335149,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,2.232242,-0.919030,-0.589690


#### Split data into train & test and shuffle

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  Day of the Week  \
 227         1         0         0         0    -0.268611        -1.368999   
 232         0         0         0         1     0.629611        -0.800950   
 630         1         0         0         0    -1.166834        -0.800950   
 429         0         0         0         1     0.929019         0.335149   
 197         0         0         1         0    -0.867426         0.903199   
 ..        ...       ...       ...       ...          ...              ...   
 382         0         0         0         1    -1.466241        -0.800950   
 51          0         0         0         0     0.629611        -1.368999   
 282         0         0         0         1     0.629611         0.903199   
 231         1         0         0         0     0.330204         1.471248   
 30          0         0         1         0     0.330204        -1.368999   
 
      Transportation Expense  Distance to Work       Age  \
 2

In [25]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [26]:
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [27]:
print(x_test.shape, y_test.shape)

(140, 14) (140,)


#### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [29]:
reg = LogisticRegression()

In [30]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
# this model accuracy has fallen by a bit less than 1%

reg.score(x_train, y_train)

0.7678571428571429

#### Manually check the accuracy

In [32]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [33]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [34]:
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [35]:
np.sum((model_outputs == y_train))

430

In [36]:
model_outputs.shape[0]

560

In [37]:
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.7678571428571429

#### Creating a summary table, finding the coefficients and intercept


In [38]:
reg.intercept_

array([-1.47489377])

In [39]:
reg.coef_

array([[ 2.613926  ,  0.83206771,  2.94820524,  0.6390807 ,  0.01115133,
        -0.07477497,  0.62166958, -0.02939883, -0.17595929, -0.02595869,
         0.27683349, -0.11016534,  0.35469433, -0.27491171]])

In [40]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [41]:
feature_name = unscaled_inputs.columns.values

In [42]:
# Pair every feature name with the coefficient the fitted model assigned to it.
summary_table = pd.DataFrame(data = feature_name, columns = ['Feature name'])
summary_table['Coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.613926
1,Reason_2,0.832068
2,Reason_3,2.948205
3,Reason_4,0.639081
4,Month Value,0.011151
5,Day of the Week,-0.074775
6,Transportation Expense,0.62167
7,Distance to Work,-0.029399
8,Age,-0.175959
9,Daily Work Load Average,-0.025959


In [43]:
# Shift every feature down by one position so that label 0 is free for the intercept.
summary_table.index += 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# The new row was appended at the end; sorting by label moves it to the top.
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.474894
1,Reason_1,2.613926
2,Reason_2,0.832068
3,Reason_3,2.948205
4,Reason_4,0.639081
5,Month Value,0.011151
6,Day of the Week,-0.074775
7,Transportation Expense,0.62167
8,Distance to Work,-0.029399
9,Age,-0.175959


In [44]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.474894,0.228803
1,Reason_1,2.613926,13.652546
2,Reason_2,0.832068,2.298066
3,Reason_3,2.948205,19.071694
4,Reason_4,0.639081,1.894738
5,Month Value,0.011151,1.011214
6,Day of the Week,-0.074775,0.927952
7,Transportation Expense,0.62167,1.862034
8,Distance to Work,-0.029399,0.971029
9,Age,-0.175959,0.838652


In [45]:
summary_table.sort_values('Odds_ratio')

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.474894,0.228803
14,Pet,-0.274912,0.759639
9,Age,-0.175959,0.838652
12,Education,-0.110165,0.895686
6,Day of the Week,-0.074775,0.927952
8,Distance to Work,-0.029399,0.971029
10,Daily Work Load Average,-0.025959,0.974375
5,Month Value,0.011151,1.011214
11,Body Mass Index,0.276833,1.318947
13,Children,0.354694,1.425745


In [46]:
summary_table.sort_values('Odds_ratio', ascending = False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,2.948205,19.071694
1,Reason_1,2.613926,13.652546
2,Reason_2,0.832068,2.298066
4,Reason_4,0.639081,1.894738
7,Transportation Expense,0.62167,1.862034
13,Children,0.354694,1.425745
11,Body Mass Index,0.276833,1.318947
5,Month Value,0.011151,1.011214
10,Daily Work Load Average,-0.025959,0.974375
8,Distance to Work,-0.029399,0.971029


#### Interpreting the coefficients

The further away from 0 a coefficient is, the bigger its importance.

Looking at the summary table, we notice that the most strongly pronounced features seem to be the four reasons for absence, the transportation expense, and whether a person has children, a pet, or an education.

Note that pet and education are at the bottom of the table, but their weights are still far from 0, so they are indeed important.

We can carry on in this way, finishing with the daily work load average, distance to work and day of the week, which seem to have the smallest impact: their weights are almost 0, so regardless of their particular values they will barely affect the model.

We said that the base model includes no reason, but what is the impact of the various reasons? Reason 0, or no reason, is the baseline model; reason 1 comprises various diseases; reason 2 relates to pregnancy and giving birth; reason 3 covers poisoning and peculiar reasons not categorized elsewhere; and reason 4 relates to light diseases. In the light of this clarification, we can more easily understand the coefficients.

We can see that the most crucial reason for excessive absence is poisoning: the weight means the odds of someone being excessively absent after being poisoned are about 19 times higher (odds ratio ≈ 19.07) than when no reason was reported.

Another very important reason seems to be reason 1, various diseases: a person who has reported it is about 14 times more likely to be excessively absent than a person who didn't specify a reason.

Then we have pregnancy and giving birth: it is a prominent cause of absenteeism, but at the same time it is far less pronounced than reasons 1 and 3.

Transportation expense is the most important non-dummy feature in the model, but here's the problem: it is one of the standardized variables, so we don't have direct interpretability of it. Its odds ratio implies that for an increase of one standardized unit (one standard deviation) in transportation expense, a person is close to twice as likely to be excessively absent. This is the main drawback of standardization. Standardized models almost always yield higher accuracy, because the optimization algorithms work better this way.

Machine learning engineers prefer models with higher accuracy, so they normally go for standardization; econometricians and statisticians prefer less accurate but more interpretable models, because they care about the underlying reasons behind different phenomena; data scientists may be in either position: sometimes they need higher accuracy, other times they must find the main drivers of a problem.

So it makes sense to create two different models, one with standardized features and one without them, and then draw insights from both.

However, should we opt for predicting values, we definitely prefer higher accuracy, so standardization is more often the norm.

Pet is a continuous variable; its odds ratio is 0.76, so for each additional standardized unit of pet, the odds are (1 - 0.759) × 100% ≈ 24% lower than in the base model.

The intercept is used to get more accurate predictions, but there is no specific meaning attached to it. That is why in machine learning you can say that it calibrates the model, and you can also call it a bias.

Nevertheless, without an intercept, each prediction would be off the mark by precisely that value.

#### Backward elimination or how to simplify the model

The daily work load average, distance to work and day of the week seem to have the lowest impact, if we can even call their contribution an impact, as their weights are almost 0. Month value is useful even if it does not add predictive power.

So what can we do about it? There is a concept called backward elimination: the idea is that we can simplify the model by removing all features which have close to no contribution to the model.

Usually, when we have the p-values of the variables, we get rid of all coefficients with p-values above 0.05. When we use sklearn we don't have p-values, because we don't necessarily need them.

The reasoning of the engineers who created the package is that if the weight is small enough, it won't make a difference anyway.

So if we remove these variables, the rest of our model shouldn't really change in terms of coefficient values.

#### Change the targets

In [48]:
# Drop the raw target column together with the three features whose weights were close to 0.
data_with_targets = data_preprocessed.drop(
    columns = [
        'Absenteeism Time in Hours',
        'Day of the Week',
        'Daily Work Load Average',
        'Distance to Work',
    ]
)

In [49]:
data_with_targets is data_preprocessed

False

In [50]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [51]:
data_with_targets.shape

(700, 12)

In [52]:
data_with_targets.iloc[:, :14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [53]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

In [54]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only the selected columns of a DataFrame.

    The columns named in ``columns`` are passed through a wrapped
    ``StandardScaler``; every other column is returned unchanged, and the
    original column order is restored in the output.

    Parameters
    ----------
    columns : list of column labels to standardize.
    copy, with_mean, with_std : forwarded to ``StandardScaler``.

    Attributes
    ----------
    scaler : the wrapped ``StandardScaler`` instance.
    mean_, var_ : per-column mean and population variance (ddof = 0) of the
        selected columns, set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy = True, with_mean = True, with_std = True):
        # Keep every constructor argument under its own name so that
        # get_params(), repr() and clone() see the real values instead of None.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass by keyword: recent scikit-learn releases reject positional
        # arguments for StandardScaler.
        self.scaler = StandardScaler(copy = copy, with_mean = with_mean, with_std = with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y = None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use explicit column-wise reductions: np.mean / np.var on a DataFrame
        # collapse to a single scalar on newer pandas versions.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof = 0)
        return self

    def transform(self, X, y = None, copy = None):
        """Return ``X`` with the selected columns standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index so that the concat below aligns row by row even when
        # X does not carry a default 0..n-1 index (e.g. a shuffled subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns = self.columns,
            index = X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis = 1)[init_col_order]

In [55]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [56]:
# Features that are excluded from scaling and keep their original values.
columns_to_omit = [
    'Reason_1',
    'Reason_2',
    'Reason_3',
    'Reason_4',
    'Education',
]

In [57]:
# Every input column that is not explicitly omitted gets standardized.
columns_to_scale = [
    column
    for column in unscaled_inputs.columns.values
    if column not in columns_to_omit
]

In [58]:
absenteeism_scaler = CustomScaler(columns_to_scale) 

In [59]:
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pet'],
             copy=None, with_mean=None, with_std=None)

In [60]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [61]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.030796,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.568019,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.568019,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.568019,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.568019,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [62]:
from sklearn.model_selection import train_test_split

In [63]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 311         0         0         0         0     0.929019   
 467         0         0         0         1     0.030796   
 131         0         0         0         1    -1.765648   
 432         1         0         0         0    -0.568019   
 76          0         0         0         1     0.929019   
 ..        ...       ...       ...       ...          ...   
 525         1         0         0         0     0.929019   
 110         0         0         0         1     1.228426   
 371         1         0         0         0    -1.765648   
 342         0         0         0         1    -0.568019   
 273         0         0         0         0     0.330204   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 311                0.568211 -0.065439        -0.878984          0  2.679969   
 467               -1.574681  0.091435         0.297027          0 -0.919030   
 131               -1.5746

In [64]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [65]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [66]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [68]:
# create the logistic regression estimator; no arguments are passed,
# so every hyperparameter keeps its sklearn default value
reg = LogisticRegression()

In [69]:
# train the model on the training inputs and targets only;
# the test data is not used here
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [70]:
# accuracy of the model on the same data it was trained on (train accuracy)
reg.score(x_train, y_train)

0.775

In [71]:
# the classes (0 or 1) the model predicts for the training observations
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [72]:
# the true targets of the training observations, to compare with model_outputs
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [73]:
# element-wise comparison: True where the prediction equals the target
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [74]:
# number of correct predictions: every True in the comparison counts as 1
(model_outputs == y_train).sum()

434

In [75]:
# total number of predictions, i.e. the number of training observations
model_outputs.shape[0]

560

In [76]:
# manual accuracy: the fraction of predictions that match the targets
# (correct predictions divided by the number of observations)
np.mean(model_outputs == y_train)

0.775

In [77]:
# the intercept (bias) of the fitted model
reg.intercept_

array([-1.43138127])

In [78]:
# the coefficients (weights) of the fitted model, one per input feature
reg.coef_

array([[ 2.60237227,  0.84350002,  2.94078723,  0.63723433,  0.00565051,
         0.61953401, -0.17635497,  0.28410321, -0.26372527,  0.35195032,
        -0.27369766]])

In [79]:
# the names of the input columns, needed to label the coefficients
# NOTE(review): assumes scaled_inputs kept the column order of unscaled_inputs - confirm
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [80]:
# store the feature names so they can be used in the summary table
feature_name = unscaled_inputs.columns.values

In [81]:
# Build a table that pairs every feature name with its coefficient.
summary_table = pd.DataFrame({'Feature name': feature_name})

# reg.coef_ is a 2-D array with a single row; transposing it gives one row
# per feature, so it lines up with the feature names.
summary_table['Coefficient'] = reg.coef_.T

summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.602372
1,Reason_2,0.8435
2,Reason_3,2.940787
3,Reason_4,0.637234
4,Month Value,0.005651
5,Transportation Expense,0.619534
6,Age,-0.176355
7,Body Mass Index,0.284103
8,Education,-0.263725
9,Children,0.35195


In [82]:
# Make room for the intercept at the top of the table:
# shift every existing row down by one so that index 0 becomes free
summary_table.index = summary_table.index + 1

# add the intercept as the row with index 0
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

# the new row was appended at the bottom, so sort by index to move it to the top
summary_table = summary_table.sort_index()

# NOTE: this cell is not idempotent - running it a second time shifts the index
# again and adds another 'Intercept' row
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.431381
1,Reason_1,2.602372
2,Reason_2,0.8435
3,Reason_3,2.940787
4,Reason_4,0.637234
5,Month Value,0.005651
6,Transportation Expense,0.619534
7,Age,-0.176355
8,Body Mass Index,0.284103
9,Education,-0.263725


In [83]:
# the coefficients of a logistic regression are log-odds,
# so exponentiating each one gives the corresponding odds ratio
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.431381,0.238979
1,Reason_1,2.602372,13.495716
2,Reason_2,0.8435,2.324489
3,Reason_3,2.940787,18.930743
4,Reason_4,0.637234,1.891243
5,Month Value,0.005651,1.005667
6,Transportation Expense,0.619534,1.858062
7,Age,-0.176355,0.83832
8,Body Mass Index,0.284103,1.32857
9,Education,-0.263725,0.768185


In [84]:
# sort by odds ratio in ascending order (smallest odds ratio first)
summary_table.sort_values('Odds_ratio')

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.431381,0.238979
11,Pet,-0.273698,0.760562
9,Education,-0.263725,0.768185
7,Age,-0.176355,0.83832
5,Month Value,0.005651,1.005667
8,Body Mass Index,0.284103,1.32857
10,Children,0.35195,1.421838
6,Transportation Expense,0.619534,1.858062
4,Reason_4,0.637234,1.891243
2,Reason_2,0.8435,2.324489


In [85]:
# sort by odds ratio in descending order (largest odds ratio first)
# a coefficient close to 0 (odds ratio close to 1) means the feature
# barely changes the odds, e.g. Month Value
summary_table.sort_values('Odds_ratio', ascending = False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,2.940787,18.930743
1,Reason_1,2.602372,13.495716
2,Reason_2,0.8435,2.324489
4,Reason_4,0.637234,1.891243
6,Transportation Expense,0.619534,1.858062
10,Children,0.35195,1.421838
8,Body Mass Index,0.284103,1.32857
5,Month Value,0.005651,1.005667
7,Age,-0.176355,0.83832
9,Education,-0.263725,0.768185


After dropping the three variables, the train accuracy is 77.5% and the number of correctly predicted instances is 434. This shows us that the three variables we dropped were practically useless: with or without them, we obtain practically the same results.

#### Testing the model

The train accuracy is around 77.5%, but on its own that doesn't mean much: the algorithm has seen this training data many times (in fact, thousands of times) during the training process, so it has learned to model it quite well.

However, it may fail miserably when provided with new data. As we said earlier, we should test it on data it has never seen.

It is time to use the test data. This will also be the end of the machine learning process, because testing is done once, at the very end of the machine learning process.

Some researchers look at the testing accuracy and then tweak the model a bit to get a better test accuracy. However, if you repeat this operation enough times, what does it become? An iterative process in which you change some parameters based on a function (the accuracy, in this case). That is basically the definition of the machine learning training process, so instead of testing, you would be using the test data to train a bit more.

The only difference is that this time the training would be done manually. This makes the test dataset useless, because you are not really testing.

The takeaway is that once we test, we are not conceptually allowed to touch the model anymore.

In [86]:
# the first task is to find the accuracy
# unlike before, this time we use the test dataset

# so, based on data that the model has never seen before,
# the accuracy is about 73.6%
# we can say that in roughly 74% of the cases the model correctly predicts
# whether a person is going to be excessively absent

# the test accuracy is usually (but not necessarily) lower than the train accuracy,
# because the model was fitted to the train data

# when it is dramatically lower than the train accuracy,
# maybe something like 10% - 20% lower,

# this would mean that the model is overfit:
# the model learned the train data very well
# but is prone to fail in real life

# here the difference between the train accuracy (77.5%) and the test accuracy is small,
# so this does not look like overfitting

reg.score(x_test, y_test)

0.7357142857142858

In [87]:
# instead of the predicted classes 0 and 1
# we can get the probability the model assigns to an output being 0 or 1

predicted_proba = reg.predict_proba(x_test)

In [88]:
# the result is a 140 by 2 array
# meaning there are 140 observations (the test rows) and 2 columns

# the first column shows the probability the model assigned to the observation being 0
# the second column shows the probability the model assigned to the observation being 1

# so, summing the two numbers in any row
# gives 1

# what we are interested in is the probability of excessive absenteeism,
# i.e. the probability of the output being 1 (the second column)

predicted_proba

array([[0.75308922, 0.24691078],
       [0.60926091, 0.39073909],
       [0.4859575 , 0.5140425 ],
       [0.7552847 , 0.2447153 ],
       [0.0839675 , 0.9160325 ],
       [0.30192695, 0.69807305],
       [0.30166774, 0.69833226],
       [0.1151045 , 0.8848955 ],
       [0.73775967, 0.26224033],
       [0.75403176, 0.24596824],
       [0.50719215, 0.49280785],
       [0.19719276, 0.80280724],
       [0.06163196, 0.93836804],
       [0.70917025, 0.29082975],
       [0.29280547, 0.70719453],
       [0.5241047 , 0.4758953 ],
       [0.50676929, 0.49323071],
       [0.50888352, 0.49111648],
       [0.367008  , 0.632992  ],
       [0.06355661, 0.93644339],
       [0.73644831, 0.26355169],
       [0.7552847 , 0.2447153 ],
       [0.47457156, 0.52542844],
       [0.47288443, 0.52711557],
       [0.22026535, 0.77973465],
       [0.73808685, 0.26191315],
       [0.51184512, 0.48815488],
       [0.87683579, 0.12316421],
       [0.23445563, 0.76554437],
       [0.7552847 , 0.2447153 ],
       [0.

In [89]:
# so we can simply slice out all values from the second column
# in reality, the logistic regression model calculates these probabilities in the background:
# if the probability is below 0.5, it predicts 0
# if the probability is above 0.5, it predicts 1

predicted_proba[:,1]

array([0.24691078, 0.39073909, 0.5140425 , 0.2447153 , 0.9160325 ,
       0.69807305, 0.69833226, 0.8848955 , 0.26224033, 0.24596824,
       0.49280785, 0.80280724, 0.93836804, 0.29082975, 0.70719453,
       0.4758953 , 0.49323071, 0.49111648, 0.632992  , 0.93644339,
       0.26355169, 0.2447153 , 0.52542844, 0.52711557, 0.77973465,
       0.26191315, 0.48815488, 0.12316421, 0.76554437, 0.2447153 ,
       0.38912926, 0.71585927, 0.70056321, 0.49365359, 0.2447153 ,
       0.59546518, 0.26256777, 0.78560289, 0.43689507, 0.60455576,
       0.24440274, 0.49889237, 0.26125957, 0.44701216, 0.80689602,
       0.60831063, 0.72248525, 0.24534123, 0.24855634, 0.24409045,
       0.50227596, 0.32586203, 0.69807305, 0.24697976, 0.82067559,
       0.39154489, 0.90719507, 0.26697852, 0.35791316, 0.35830205,
       0.70399998, 0.6991417 , 0.26797295, 0.78118404, 0.24619646,
       0.24659633, 0.09025875, 0.26289548, 0.7674276 , 0.29222745,
       0.26060705, 0.35283656, 0.88319236, 0.43606282, 0.59383

#### Save the model

Saving a model is the process of creating a file that will contain all the information regarding the machine learning model.

Roughly speaking we want to create a file that will store the following information.

This machine learning model is logistic regression, it has the coefficient and intercept, the random state that was chosen for the shuffling was 20 and so on.

The object reg, which is an instance of the sklearn LogisticRegression class, contains all this information; in fact, this is the object we used to find the intercept, coefficients and accuracy.

Therefore it should not come as a surprise that saving the model is equivalent to saving the reg object.

So saving the model means saving the reg object.

In [90]:
# pickle is a Python module used to convert a Python object into a byte stream
# the main idea is that this byte stream contains sufficient information to rebuild the object
# later, when we would like to convert the byte stream back into a Python object in another notebook,
# we will unpickle it

# this means we will save the reg variable into a file
# this file will then be loaded in a new notebook
# thus we will be able to use the trained machine learning model there

# the file size will be less than 1 kb

# NOTE: only unpickle files that come from a trusted source,
# because loading a pickle can execute arbitrary code

import pickle

In [91]:
# first, the file name - 'model'

# second, 'wb', which stands for write bytes
# conversely, when we unpickle we will use 'rb', or read bytes

# third, we have the dump method - it saves the object
# when we pickle, we dump the information into a file
# when we unpickle, we load it
# in the dump method, the first argument is the object to be dumped (reg)
# and the second argument is the open file it is written to

with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [92]:
# finally, we must also save the absenteeism_scaler
# the absenteeism_scaler object was used to standardize the numerical variables
# it stores the columns to scale as well as the mean and standard deviation of each feature
# to further explain why we need to pickle the scaler:
# up until now, the code was heavily dependent on the training data
# without the training data, the machine learning could not be executed at all
# but once the model is trained and we have obtained the coefficients, we can save it, as we just did
# in this way, we are basically separating the model from the training data
# logically, the information in the absenteeism_scaler is needed to preprocess any new data
# using the same rules as the ones applied to the training data
# thus we must pickle the scaler too

with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)