# We will fit a logistic regression that uses the revised variables as features to predict our target
## Target = excessive absenteeism (derived from 'Absenteeism Time in Hours')

In [1]:
# Define moderately absent vs. excessively absent (our target)
# We use the median of 'Absenteeism Time in Hours' as the cutoff because
# the median is robust to outliers and splits the samples roughly in half
# at or below the median = moderate (0) ; above the median = excessive (1)

In [2]:
import pandas as pd
# Load the preprocessed absenteeism data from a CSV file in the working directory
preprocessed_data = pd.read_csv('Absenteeism_preprocessed.csv')
# Preview the first rows to confirm the columns were read as expected
preprocessed_data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2


In [3]:
# Median of the raw absenteeism hours; this value is used as the cutoff for the target
preprocessed_data['Absenteeism Time in Hours'].median()

3.0

In [4]:
# Cutoff: absenteeism time above the median is considered excessive.
# The threshold is computed from the data instead of being hard-coded, so it
# stays correct if the input file changes.

import numpy as np

median_hours = preprocessed_data['Absenteeism Time in Hours'].median()

# np.where(condition, value_if_true, value_if_false)
targets = np.where(preprocessed_data['Absenteeism Time in Hours'] > median_hours, 1, 0)

# New column flagging whether each sample counts as excessive absenteeism
preprocessed_data['Excessive Absenteeism'] = targets

# Splitting at the median keeps the two classes roughly balanced (not exactly
# half/half, because samples equal to the median all fall into class 0).
# This helps prevent the model from learning to output only 0s or only 1s.

preprocessed_data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4,1
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2,0
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4,1
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2,0


In [5]:
# Share of samples labelled 1 (excessive); a value near 0.5 means the classes are balanced
targets.mean()

0.45571428571428574

### About 46% of the targets are 1 (excessive) and 54% are 0 (moderate), so the classes are roughly balanced

# Drop Unnecessary Columns

In [6]:
# we drop "Daily Work Load Average," "Day of the Week," and "Distance to Work," columns before standardizing
# Furthermore, we have to exclude dummy variables from standardization to retain interpretability

In [7]:
# Checkpoint: build a new frame without the raw hours column (now replaced by the
# binary target) and without the three features we chose to leave out.
data_with_targets = preprocessed_data.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Daily Work Load Average',
        'Day of the Week',
        'Distance to Work',
    ]
)
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,False,False,False,True,7,289,33,30,0,2,1,1
1,False,False,False,False,7,118,50,31,0,1,0,0
2,False,False,False,True,7,179,38,31,0,0,0,0
3,True,False,False,False,7,279,39,24,0,2,0,1
4,False,False,False,True,7,289,33,30,0,2,1,0


In [8]:
# Checkpoint check: test whether both names refer to the very same DataFrame object
data_with_targets is preprocessed_data
# False means they are different objects, so data_with_targets is a genuine checkpoint

False

# Select inputs for Logistic Regression

In [9]:
# Inputs are every column except the target.
# Excluding the target by name (rather than "everything but the last column")
# keeps this correct even if the column order changes upstream.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

# Standardize appropriate variables

In [10]:
# create a custom scaler 

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    Wraps a ``StandardScaler`` that is fit on ``columns`` only; every other
    column (e.g. dummy variables) passes through ``transform`` unchanged, and
    the original column order is preserved.

    Parameters
    ----------
    columns : list of column labels to standardize.
    copy, with_mean, with_std : forwarded to ``StandardScaler``.

    Attributes set by ``fit``
    -------------------------
    scaler : the fitted ``StandardScaler``.
    mean_ : per-column mean of the scaled columns (pandas Series).
    var_ : per-column population variance (ddof=0) of the scaled columns (pandas Series).
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store constructor arguments unchanged under their own names so that
        # BaseEstimator.get_params / clone keep working.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Fit the underlying StandardScaler on ``X[self.columns]`` and return self."""
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use explicit per-column pandas reductions. np.mean / np.var on a
        # DataFrame forward axis=None to pandas, whose handling of axis=None
        # depends on the pandas version and emits warnings.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for API compatibility and are not used.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # Scale the selected features. The result must carry X's own index:
        # with a default RangeIndex the concat below would align rows by label
        # and produce misaligned rows / NaNs whenever X is not indexed 0..n-1
        # (for example a shuffled train/test split or filtered new data).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # everything that was not scaled passes through unchanged
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # recombine and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]



In [11]:
# Split the inputs into columns to standardize and columns to leave as they are.
# The columns listed here are excluded from standardization (they are treated
# as dummy variables) to preserve their interpretability.
columns_to_exclude = [
    'Reason_1',
    'Reason_2',
    'Reason_3',
    'Reason_4',
    'Education',
]

In [12]:
# Every input column that is not explicitly excluded gets standardized
# (order follows the column order of unscaled_inputs)
columns_to_scale = [
    column
    for column in unscaled_inputs.columns
    if column not in columns_to_exclude
]

In [13]:
# Create the scaler for the selected columns (nothing is fitted yet)
absenteeism_scaler = CustomScaler(columns=columns_to_scale)

In [14]:
# Compute the mean and standard deviation of the selected columns and store
# them inside the absenteeism_scaler object.
# NOTE(review): the scaler is fitted on all rows of unscaled_inputs, i.e. before
# the train/test split done later in this notebook, so test rows influence the
# scaling statistics. Consider fitting on the training split only.
absenteeism_scaler.fit(X=unscaled_inputs)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [15]:
# Transforming applies the parameters learned during fitting to the data.
# The same fitted scaler object can later be used to transform new data.

scaled_inputs = absenteeism_scaler.transform(X=unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,False,False,False,True,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,False,False,False,False,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,False,False,False,True,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,True,False,False,False,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,False,False,False,True,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,True,False,False,False,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,True,False,False,False,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,False,False,False,True,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [16]:
# (rows, columns) of the scaled inputs
scaled_inputs.shape

(700, 11)

# Train Test Split + Shuffle

In [17]:
from sklearn.model_selection import train_test_split

# Keep 80% of the samples for training and hold out the rest for testing;
# the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=42,
)

In [18]:
# shapes of the training inputs and training targets
x_train.shape, y_train.shape

((560, 11), (560,))

In [19]:
# shapes of the test inputs and test targets
x_test.shape, y_test.shape

((140, 11), (140,))

# Sklearn Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

# we use metrics to evaluate the model
from sklearn import metrics

In [21]:
# create the classifier with its default settings
logreg = LogisticRegression()
# fit the model on the training split
logreg.fit(x_train,y_train)

In [22]:
# training accuracy: score() is evaluated on the same data the model was fitted on
logreg.score(x_train,y_train)

0.7732142857142857

## Manually check accuracy for enhanced understanding of the model's performance

In [23]:
# predicted class (0 or 1) for every training sample
logreg_outputs = logreg.predict(x_train)
logreg_outputs

array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,

In [24]:
# element-wise comparison with the targets in y_train: True where the prediction is correct
logreg_outputs == y_train

array([ True,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True, False,  True, False,  True,
        True, False, False,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True, False, False,  True,
        True,  True, False,  True,  True, False,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [25]:
# number of correctly classified training samples (True counts as 1)

(logreg_outputs == y_train).sum()

433

In [26]:
# accuracy = correct predictions / total number of predictions
(logreg_outputs == y_train).sum() / len(logreg_outputs)

0.7732142857142857

## Intercept and Coefficients

In [27]:
# intercept = bias, coef = weight of each feature (one weight per input column)
logreg.intercept_, logreg.coef_

(array([-1.7023427]),
 array([[ 2.91326793,  0.75549599,  3.1001126 ,  0.96422678,  0.07433973,
          0.65058443, -0.2512894 ,  0.25268283, -0.24135999,  0.39883784,
         -0.29441037]]))

In [28]:
# Pair every feature name with its fitted weight.
feature_name = unscaled_inputs.columns.values

# logreg.coef_ is a 2-D array with a single row; flatten it so that each
# weight lines up with its feature name in one table column.
feature_table = pd.DataFrame({
    'Feature name': feature_name,
    'Coefficient': logreg.coef_.ravel(),
})

feature_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.913268
1,Reason_2,0.755496
2,Reason_3,3.100113
3,Reason_4,0.964227
4,Month Value,0.07434
5,Transportation Expense,0.650584
6,Age,-0.251289
7,Body Mass Index,0.252683
8,Education,-0.24136
9,Children,0.398838


In [29]:
# Put the intercept at the top of the table (index 0), followed by the features (1..n).
# The guard makes this cell safe to re-run: without it, running the cell again
# would shift the index a second time and overwrite a row with another
# intercept (or raise once more columns have been added to the table).
intercept_label = 'Intercept (Bias)'

if intercept_label not in feature_table['Feature name'].values:
    intercept_row = pd.DataFrame({
        'Feature name': [intercept_label],
        'Coefficient': [logreg.intercept_[0]],
    })
    # ignore_index renumbers the rows 0..n with the intercept first
    feature_table = pd.concat([intercept_row, feature_table], ignore_index=True)

feature_table

Unnamed: 0,Feature name,Coefficient
0,Intercept (Bias),-1.702343
1,Reason_1,2.913268
2,Reason_2,0.755496
3,Reason_3,3.100113
4,Reason_4,0.964227
5,Month Value,0.07434
6,Transportation Expense,0.650584
7,Age,-0.251289
8,Body Mass Index,0.252683
9,Education,-0.24136


## Odds ratios

In [30]:
# Odds ratio of each feature = exp(coefficient).
# Note: for the intercept row, exp(coefficient) is the baseline odds rather
# than an odds ratio.
feature_table['Odds_ratio'] = np.exp(feature_table['Coefficient'])

# Largest odds ratios first
feature_table = feature_table.sort_values(by='Odds_ratio', ascending=False)
feature_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,3.100113,22.200451
1,Reason_1,2.913268,18.416885
4,Reason_4,0.964227,2.622759
2,Reason_2,0.755496,2.128667
6,Transportation Expense,0.650584,1.916661
10,Children,0.398838,1.490092
8,Body Mass Index,0.252683,1.287475
5,Month Value,0.07434,1.077173
9,Education,-0.24136,0.785559
7,Age,-0.251289,0.777797


# Testing the Logistic Regression

In [31]:
# test accuracy: measured on the held-out split, so it estimates how well the model generalizes
logreg.score(x_test,y_test)

0.7714285714285715

In [32]:
# predicted probability of each class for every test sample
# first column: probability that the observation is 0; second column: probability that it is 1
predicted_probability = logreg.predict_proba(x_test)

predicted_probability

array([[0.82964265, 0.17035735],
       [0.85180287, 0.14819713],
       [0.79808463, 0.20191537],
       [0.59298483, 0.40701517],
       [0.57366649, 0.42633351],
       [0.08038046, 0.91961954],
       [0.67593146, 0.32406854],
       [0.35565538, 0.64434462],
       [0.71607461, 0.28392539],
       [0.73999951, 0.26000049],
       [0.86470554, 0.13529446],
       [0.67524315, 0.32475685],
       [0.26571911, 0.73428089],
       [0.44938817, 0.55061183],
       [0.72608771, 0.27391229],
       [0.49239272, 0.50760728],
       [0.8944988 , 0.1055012 ],
       [0.21436647, 0.78563353],
       [0.86716946, 0.13283054],
       [0.58401458, 0.41598542],
       [0.7303233 , 0.2696767 ],
       [0.74406231, 0.25593769],
       [0.68192133, 0.31807867],
       [0.67988009, 0.32011991],
       [0.86089273, 0.13910727],
       [0.16080983, 0.83919017],
       [0.59428914, 0.40571086],
       [0.59656673, 0.40343327],
       [0.7598924 , 0.2401076 ],
       [0.59939621, 0.40060379],
       [0.

In [33]:
# one row per test sample, one column per class
predicted_probability.shape

(140, 2)

In [34]:
# keep only the second column: the probability that each test sample belongs to class 1
predicted_probability[:,1]

array([0.17035735, 0.14819713, 0.20191537, 0.40701517, 0.42633351,
       0.91961954, 0.32406854, 0.64434462, 0.28392539, 0.26000049,
       0.13529446, 0.32475685, 0.73428089, 0.55061183, 0.27391229,
       0.50760728, 0.1055012 , 0.78563353, 0.13283054, 0.41598542,
       0.2696767 , 0.25593769, 0.31807867, 0.32011991, 0.13910727,
       0.83919017, 0.40571086, 0.40343327, 0.2401076 , 0.40060379,
       0.11740886, 0.14124928, 0.599468  , 0.56109137, 0.27815375,
       0.64956431, 0.31807867, 0.13370072, 0.854817  , 0.19924262,
       0.51821218, 0.25191677, 0.63455821, 0.12801669, 0.2249606 ,
       0.72595153, 0.76914193, 0.8839772 , 0.30894311, 0.13040474,
       0.26000049, 0.31349281, 0.42633351, 0.93283037, 0.13869432,
       0.2249606 , 0.97638405, 0.27815375, 0.87185858, 0.22511208,
       0.59267817, 0.11962617, 0.49917611, 0.64434462, 0.13529446,
       0.41369029, 0.68015887, 0.05803339, 0.26971134, 0.50504807,
       0.26000049, 0.2401076 , 0.69842446, 0.31349281, 0.13410

# Save and Export Model

In [35]:
import pickle

# Persist the fitted model and the fitted scaler as pickle files
# ('model' and 'scaler') in the working directory.
for filename, artifact in (('model', logreg), ('scaler', absenteeism_scaler)):
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)