# Absenteeism - Applying Machine Learning

#### Importing the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
# We can override the default matplotlib styles with those of Seaborn
sns.set()## Importing the relevant libraries

In [2]:
# Load the data from a .csv
data_preprocessed = pd.read_csv(
    os.path.join(os.path.pardir,'data','processed','Absenteeism_data_preprocessed.csv'), index_col=0)

data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3


In [3]:
df = data_preprocessed.copy()

#### Create targets

- For this case we are going to classify the workers into two categories, `Moderately absent` and `Excessively absent`. 
- We will use `median` as the basis. 
- Our task will be to predict whether we will obtain a 0 (Moderately absent) or a 1 (Excessively absent).

In [4]:
# Get the median
df['Absenteeism Time in Hours'].median()

3.0

In [5]:
# Moderately absent: <=3
# Excessively absent >=4

In [6]:
# create targets for our logistic regression
# they have to be categories and we must find a way to say if someone is 'being absent too much' or not
# what we've decided to do is to take the median of the dataset as a cut-off line
# in this way the dataset will be balanced (there will be roughly equal number of 0s and 1s for the logistic regression)
# as balancing is a great problem for ML, this will work great for us
# alternatively, if we had more data, we could have found other ways to deal with the issue 
# for instance, we could have assigned some arbitrary value as a cut-off line, instead of the median

# note that what line does is to assign 1 to anyone who has been absent 4 hours or more (more than 3 hours)
# that is the equivalent of taking half a day off

# initial code from the lecture
# targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)

# parameterized code
targets = np.where(df['Absenteeism Time in Hours']>
                   df['Absenteeism Time in Hours'].median(), 1, 0)

In [7]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [8]:
df['Excessive Absenteeism'] = targets

In [9]:
df

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of week,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3,0
5,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,4,0
6,0,0,0,1,361,52,28,239.554,27,0,1,4,8,7,4,1
7,0,0,0,1,260,50,36,239.554,23,0,4,0,4,7,4,1
8,0,0,1,0,155,12,34,239.554,25,0,2,0,40,7,0,1
9,0,0,0,1,235,11,37,239.554,29,1,1,1,8,7,0,1


**Note: About our target**
***
using the `median` as a cutoff line is numerically stable and rigid.
That's because by using the median we have implicitly balanced the dataset roughly half of the targets are zeros while the other half ones. This will prevent our model from learning to output one of the two classes exclusively.
Thinking it did very well

In [10]:
# Get the ratio of Excessive Absenteeism
targets.sum()/targets.shape[0]
# As we can see around 46% of the targets are 1s

0.45571428571428574

In [11]:
# Drop the Absenteeism Time in Hours feature
data_with_targets = df.drop(['Absenteeism Time in Hours', 'Distance to Work',
                             'Daily Work Load Average', 'Day of week'], axis=1)

In [12]:
data_with_targets

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value,Excessive Absenteeism
0,0,0,0,1,289,33,30,0,2,1,7,1
1,0,0,0,0,118,50,31,0,1,0,7,0
2,0,0,0,1,179,38,31,0,0,0,7,0
3,1,0,0,0,279,39,24,0,2,0,7,1
4,0,0,0,1,289,33,30,0,2,1,7,0
5,0,0,0,1,179,38,31,0,0,0,7,0
6,0,0,0,1,361,28,27,0,1,4,7,1
7,0,0,0,1,260,36,23,0,4,0,7,1
8,0,0,1,0,155,34,25,0,2,0,7,1
9,0,0,0,1,235,37,29,1,1,1,7,1


In [13]:
data_with_targets is df

False

In [14]:
1 is 2

False

#### Select the inputs for the regresion

In [15]:
data_with_targets.shape

(700, 12)

In [16]:
# The inputs will be all features except `Excessive Absenteeism`
# data_with_targets.iloc[:,:14]
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7
5,0,0,0,1,179,38,31,0,0,0,7
6,0,0,0,1,361,28,27,0,1,4,7
7,0,0,0,1,260,36,23,0,4,0,7
8,0,0,1,0,155,34,25,0,2,0,7
9,0,0,0,1,235,37,29,1,1,1,7


In [17]:
unscalled_inputs = data_with_targets.iloc[:,:-1]

In [18]:
unscalled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7
5,0,0,0,1,179,38,31,0,0,0,7
6,0,0,0,1,361,28,27,0,1,4,7
7,0,0,0,1,260,36,23,0,4,0,7
8,0,0,1,0,155,34,25,0,2,0,7
9,0,0,0,1,235,37,29,1,1,1,7


#### Standardize the inputs

In [19]:
# Import the relevant libraries
# from sklearn.preprocessing import StandardScaler

In [20]:
# absenteeism_scalar = StandardScaler()

## NOTE: 
***
Since the dummy variables are either in 0s or 1s, we do not have to include them when standardizing. Hence we have to create a custom scalar

The idea is that this is a custom scalar based on the `StandardScalar` from sklearn.

However when we declare the scalar object there's an extra argument, `columns to scale`.

So our custom scalar will not standardise all inputs but only the ones we choose.

In this way we will be able to preserve the dummys untouched.

In practice we would avoid this step by standardizing prior to creating the dummies but we didn't do

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScalar(BaseEstimator, TransformerMixin):
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scalar = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        self.scalar.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scalar.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
        

In [63]:
unscalled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7
5,0,0,0,1,179,38,31,0,0,0,7
6,0,0,0,1,361,28,27,0,1,4,7
7,0,0,0,1,260,36,23,0,4,0,7
8,0,0,1,0,155,34,25,0,2,0,7
9,0,0,0,1,235,37,29,1,1,1,7


In [22]:
unscalled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value'], dtype=object)

In [23]:
columns_to_ommit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

columns_to_scale = [column for column in unscalled_inputs.columns.values if column not in columns_to_ommit]

In [24]:
absenteeism_scalar = CustomScalar(columns_to_scale)

In [25]:
# absenteeism_scalar will contain information about the mean and standard deviation.
absenteeism_scalar.fit(unscalled_inputs)

CustomScalar(columns=['Transportation Expense', 'Age', 'Body Mass Index',
                      'Education', 'Children', 'Pets', 'Month Value'],
             copy=None, with_mean=None, with_std=None)

In [26]:
scaled_inputs = absenteeism_scalar.transform(unscalled_inputs)

In [27]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487,0.182726
1,0,0,0,0,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690,0.182726
2,0,0,0,1,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690,0.182726
3,1,0,0,0,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690,0.182726
4,0,0,0,1,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487,0.182726
5,0,0,0,1,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690,0.182726
6,0,0,0,1,2.092381,-1.320435,0.061825,-0.447980,-0.019280,2.843016,0.182726
7,0,0,0,1,0.568211,-0.065439,-0.878984,-0.447980,2.679969,-0.589690,0.182726
8,0,0,1,0,-1.016322,-0.379188,-0.408580,-0.447980,0.880469,-0.589690,0.182726
9,0,0,0,1,0.190942,0.091435,0.532229,2.232242,-0.019280,0.268487,0.182726


Whenever we get new data we will just apply `absenteeism_scalar.transform(new_data)` to reach the same transformation
```
new_data = pd.read_csv('new_data.csv)
new_data_scaled = absenteeism_scalar.transform(new_data)
```

In [28]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487,0.182726
1,0,0,0,0,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690,0.182726
2,0,0,0,1,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690,0.182726
3,1,0,0,0,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690,0.182726
4,0,0,0,1,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487,0.182726
5,0,0,0,1,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690,0.182726
6,0,0,0,1,2.092381,-1.320435,0.061825,-0.447980,-0.019280,2.843016,0.182726
7,0,0,0,1,0.568211,-0.065439,-0.878984,-0.447980,2.679969,-0.589690,0.182726
8,0,0,1,0,-1.016322,-0.379188,-0.408580,-0.447980,0.880469,-0.589690,0.182726
9,0,0,0,1,0.190942,0.091435,0.532229,2.232242,-0.019280,0.268487,0.182726


In [29]:
scaled_inputs.shape

(700, 11)

### Train Test Split

We want to shuffle the data so that we remove all types of dependencies that come from the order of
the data set like Day of the week

In [30]:
# import the relevant library
from sklearn.model_selection import train_test_split

##### Split

In [31]:
# train_test_split(inputs, target) will split the data into 4 arrays
# array 1: A training dataset with inputs
# array 2: A training dataset with targets
# array 3: A testing dataset with inputs
# array 4: A testing dataset with targets

# Split the variables with an 80-20 split and some random state
# To have the same split as mine, use random_state = 365

train_test_split(scaled_inputs, targets, test_size=0.2, random_state=365)

[     Reason_1  Reason_2  Reason_3  Reason_4  Transportation Expense       Age  \
 334         1         0         0         0                0.568211 -0.065439   
 43          0         0         1         0                0.190942  1.032682   
 686         1         0         0         0               -1.574681  2.130803   
 551         0         0         0         0                0.190942  1.032682   
 245         0         0         0         1               -0.986140 -1.163560   
 109         0         0         0         1                2.092381 -1.320435   
 71          0         0         0         1                1.036026  0.562059   
 652         0         0         1         0                1.005844 -0.536062   
 611         1         0         0         0                0.040034 -1.320435   
 44          0         0         0         1               -1.016322 -0.379188   
 333         0         0         0         1               -0.654143  0.248310   
 600         1  

In [32]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, test_size=0.2, random_state=365)

In [33]:
print('\n x_train: ',x_train.shape, '\n x_test: ',
      x_test.shape, '\n y_train:', y_train.shape, '\n y_test: ',y_test.shape)


 x_train:  (560, 11) 
 x_test:  (140, 11) 
 y_train: (560,) 
 y_test:  (140,)


##### Logistic regression with sklearn

In [34]:
# Import the relevant libraries
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [35]:
reg = LogisticRegression(solver='liblinear')

In [36]:
# Train the model
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

###### accuracy

In [37]:
# To get the model accuracy we use .score(inputs, targets)
reg.score(x_train, y_train)
# Our model has an accuracy of around 76%

0.7678571428571429

##### Manually checking the accuracy
***

***
Accuracy means that x% of the model outputs match the targets.

So if we want to find the accuracy of a model manually we should find the outputs and compare them with

the targets.

In [38]:
# .predict()method will find the predicted outputs of the regression
# The model itself is contained in the variable reg and we are choosing to predict the outputs associated
# with the training inputs and contained an X train.

model_outputs = reg.predict(x_train)

In [39]:
model_outputs

array([1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,

In [40]:
y_train

array([1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [41]:
model_outputs == y_train
# we get an array which compares the elements of the two variables. If there is a match the result is true.
# otherwise it is false.
# Now we can clearly see which elements have been guessed correctly and which haven't.

# If we divide the number of matches by the total number of elements we will get the accuracy 

array([ True,  True, False,  True,  True,  True, False,  True, False,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True, False,  True,  True,  True,  True, False,
       False, False,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True, False,
        True,  True,  True,  True, False,  True,  True, False, False,
       False,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True, False, False, False,  True, False,
       False, False,  True,  True, False,  True,  True,  True,  True,
        True,  True,

In [42]:
# total number of true entries.
np.sum(model_outputs == y_train) 

430

In [43]:
# total number of elements
model_outputs.shape[0]

560

In [44]:
# Get the accuracy = Correct predictions / # observations
np.sum(model_outputs == y_train)/model_outputs.shape[0]

0.7678571428571429

###### intercept and coefficients

In [45]:
reg.intercept_

array([-1.40770453])

In [46]:
reg.coef_.shape

(1, 11)

In [47]:
feature_name = unscalled_inputs.columns.values

In [48]:
summary_table = pd.DataFrame(columns=['Feature name'], data=feature_name)
summary_table

Unnamed: 0,Feature name
0,Reason_1
1,Reason_2
2,Reason_3
3,Reason_4
4,Transportation Expense
5,Age
6,Body Mass Index
7,Education
8,Children
9,Pets


In [49]:
# we must transpose this array because by default and the arrays are rows and not columns
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.51399
1,Reason_2,1.258447
2,Reason_3,2.81172
3,Reason_4,0.695239
4,Transportation Expense,0.49964
5,Age,-0.252112
6,Body Mass Index,0.324878
7,Education,-0.039164
8,Children,0.430363
9,Pets,-0.303175


In [50]:
# Adding the intercept to the table
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.407705
1,Reason_1,2.51399
2,Reason_2,1.258447
3,Reason_3,2.81172
4,Reason_4,0.695239
5,Transportation Expense,0.49964
6,Age,-0.252112
7,Body Mass Index,0.324878
8,Education,-0.039164
9,Children,0.430363


##### Weight (coefficient) and bias (intercept)

another notion we must emphasize is that whenever we are dealing with a logistic regression the

coefficients we are predicting or the so-called log(odds).

This is a consequence of the choice of model logistic regression.

By default these models are nothing but a linear function predicting log(odds).

These log odds are later transformed into zeros and ones.

In [51]:
summary_table['Odds Ratio'] = np.exp(summary_table.Coefficient)
# The Odds Ratio feature will hold the exponentials of the coefficients

In [52]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds Ratio
0,Intercept,-1.407705,0.244704
1,Reason_1,2.51399,12.354126
2,Reason_2,1.258447,3.519952
3,Reason_3,2.81172,16.638505
4,Reason_4,0.695239,2.004188
5,Transportation Expense,0.49964,1.648128
6,Age,-0.252112,0.777158
7,Body Mass Index,0.324878,1.383862
8,Education,-0.039164,0.961593
9,Children,0.430363,1.537816


In [53]:
summary_table.sort_values('Odds Ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds Ratio
3,Reason_3,2.81172,16.638505
1,Reason_1,2.51399,12.354126
2,Reason_2,1.258447,3.519952
4,Reason_4,0.695239,2.004188
5,Transportation Expense,0.49964,1.648128
9,Children,0.430363,1.537816
7,Body Mass Index,0.324878,1.383862
11,Month Value,0.142003,1.15258
8,Education,-0.039164,0.961593
6,Age,-0.252112,0.777158


## Interpretation
***
If a `coefficient` is around 0 or it's `odds ratio` is close to 1.

This means that the corresponding feature is not particularly important.

It is important to note that `the further away from 0 a coefficient is, the bigger its importance`.

So by looking at the coefficients table we will notice that the most strongly pronounced features seem

to be the four reasons for absence, transportation expense, whether a person has children pets, and education 

Note that pet and education are at the `bottom of the table but their weights are still far away from zero`. They are indeed important.

The daily work load average distance to work and day of the week would seem to have the smallest impact their weight is almost zero so regardless of the particular values they will barely affect our model

***
***

I'll quickly recap what the 5 reason variable stand for 
```
Reason 0 or no reason which is the baseline model.
Reason 1 which comprises of various diseases.

Reason 2 relating to pregnancy and giving birth 
Reason 3 regarding poisoning and peculiar reasons not categorized elsewhere.
Reason 4 which relates to light diseases in the light of this clarification.
```

**We can easily understand our coefficients.**

1. The most crucial reason for excessive absence is `poisoning`. Not much of a surprise there. If you are poisoned you just won't go to work. The weight means the odds of someone being excessively absent after being poisoned are 20 times higher than when no reason was reported

1. Another very important reason seems to be number one or various diseases. I'd call this the normal absenteeism case you got sick, you skipped work. No drama. A person who has reported this is 14 times more likely to be excessively absent than a person who didn't specify a reason.

1. Then we have pregnancy and giving birth. I particularly like this one because it's a prominent cause of absenteeism but at the same time it is way less pronounced than reasons 1 and 3. My explanation for this is a woman is pregnant. She goes to the gynecologist gets a regular pregnancy check and comes back to work. Nothing excessive about that from time to time. There are some emergencies but the odds ratio we can verify that it's only around two times more likely to be excessively absent than the base model.

1. Transportation expense. This is the most important non-dom feature in the model.But here's the problem it is one of our standardized variables. We don't have direct interpretability of it it's odds ratio implies that for one standardized unit or for one standard deviation increase in transportation expense it is close to twice as likely to be excessively absent.

```
This is the main drawback of standardization standardized models almost always yield higher accuracy because the optimization algorithms work better in this way.

Machine learning engineers prefer models with higher accuracy. So they normally go for standardization.

Econometricians and statisticians however prefer less accurate but more interpretable models because they care about the underlying reasons behind different phenomena.

Data scientists may be in either position. Sometimes they need higher accuracy. Other times they must find the main drivers of a problem.

So it makes sense to create two different models one with standardized features and one without them and then draw insights from both. However should we opt for predicting values. We definitely prefer higher accuracy. So standardization is more often the norm.

```
***
***
The **reasoning in terms of weights** is that, a weight of zero implies that no matter the feature value, we will multiply it by 0 and the whole result will be zero.

The **meaning in terms of odds ratios** is the following. For one unit change in the standardized feature the odds increase by a multiple equal to the odds ratio.

So if the odds ratio is 1 then the odds don't change at all.

For example if the odds are 5 to 1 and the odds ratio is two we would say that for one unit change

the odds change from 5 to 1 to 10 to 1 because we multiply them by the odds ratio.

Alternatively if the odds ratio is 0.2 the odds would change to 1 to 1

when the odds ratio is 1. We don't have a change as multiplication with the number one keeps things equal.

This makes sense as the odds ratio was one whenever the weight is zero.
***
***
Consider the daily work load. Its average weight is -0.03 so almost zero and it's odd ratio is 0.97. So almost 1.

So this feature is almost useless for our model and with or without it the result would likely be the same.
***

#### Backward Elimination
***

The idea of backward elimination is that we can simplify our model by removing all features which have close to no contribution to the model.

Usually when we have the p-values of variables we get rid of all coefficients with p-values above 0.05.

When learning with sklearn we don't have p-values because we don't necessarily need them.

The reasoning of the engineers who created the package is that if the weight is small enough it will make a difference anyway. And we trust their work.

So if we remove these variables the rest of our model should not really change in terms of coefficient values.

### Testing our model

###### Accuracy

In [54]:
reg.score(x_test, y_test)

# So based on data that the model has never seen before we can say that and 76% of the cases the
# model will predict of a person is going to be excessively absent

0.7642857142857142

In [55]:
# instead of 0 and 1 we can get the probability of an output being 0 or 1. There is an S.K. learn method 
# called predict_proba() which returns the probability estimates for all possible outputs.
predicted_proba = reg.predict_log_proba(x_test)
predicted_proba

array([[-0.63807864, -0.7514259 ],
       [-0.54755105, -0.86361399],
       [-0.86176532, -0.54890091],
       [-1.83504826, -0.17388421],
       [-1.51941703, -0.24697455],
       [-0.83960609, -0.56542369],
       [-1.6732593 , -0.20780493],
       [-0.64507484, -0.74364769],
       [-0.62523247, -0.76601257],
       [-1.16300809, -0.37475833],
       [-0.16230698, -1.89832191],
       [-0.15633651, -1.93289458],
       [-0.33133037, -1.26573451],
       [-0.88538249, -0.53197494],
       [-1.20396252, -0.35667935],
       [-0.41960702, -1.07091466],
       [-0.67714678, -0.70940777],
       [-0.67714678, -0.70940777],
       [-0.3970913 , -1.11557324],
       [-0.27456864, -1.42669912],
       [-0.11433765, -2.22522355],
       [-0.35041205, -1.21874058],
       [-1.14305277, -0.38396418],
       [-0.69729683, -0.68901468],
       [-0.16677316, -1.87334865],
       [-0.74339036, -0.64530807],
       [-0.1470854 , -1.98938335],
       [-0.33163621, -1.26495636],
       [-1.04008814,

In [56]:
predicted_proba.shape

(140, 2)

We get is a 140 x 2 array. There are 140 test observations and 2 columns.

The first column shows the probability our model assigned to the observation being zero and the second, the probability the model assigned to the observation being one.
That's why summing any two numbers horizontally will give you an output of one.


What we're interested in is the probability of excessive absenteeism right. So the probability of getting one.
Therefore we can simply slice out all values from the second column. This will give us the probabilities of excessive absenteeism

In [57]:
predicted_proba[:,1]

array([-0.7514259 , -0.86361399, -0.54890091, -0.17388421, -0.24697455,
       -0.56542369, -0.20780493, -0.74364769, -0.76601257, -0.37475833,
       -1.89832191, -1.93289458, -1.26573451, -0.53197494, -0.35667935,
       -1.07091466, -0.70940777, -0.70940777, -1.11557324, -1.42669912,
       -2.22522355, -1.21874058, -0.38396418, -0.68901468, -1.87334865,
       -0.64530807, -1.98938335, -1.26495636, -0.4360638 , -1.15586208,
       -0.3457037 , -1.50151655, -0.5203248 , -0.37730495, -0.90933166,
       -0.29438864, -0.43858182, -0.54890091, -0.54504456, -0.34635211,
       -0.1078956 , -0.37687875, -1.37073547, -1.37073547, -1.34064378,
       -0.0791364 , -0.45325656, -1.02323291, -0.28255332, -1.16231558,
       -1.42669912, -0.81022087, -0.16494857, -1.56968865, -0.73365407,
       -0.35011434, -0.88726269, -1.36568988, -1.42669912, -0.88531724,
       -1.19035415, -1.02149644, -0.39988884, -0.63863159, -0.97205407,
       -0.5203248 , -0.29085532, -1.56968865, -1.41267967, -1.16

#### Saving the model

We must save:
***
1. The model object which in our case is `reg`. This object has the 
    - Type of regression
    - coefficient
    - Intercept
1. Scalar object. What it did was store the:
    - columns to scale, 
    - mean and, 
    - standard deviation of each feature.
 ***
The information in the absenteeism_scalar is needed to preprocess any new data using the same rules as the ones apply to training data.

In [58]:
import pickle

In [59]:
# Save the model
with open(os.path.join(os.path.pardir, 'src', 'models', 'absenteeism_model_1.pickle'), 'wb') as file:
    pickle.dump(reg, file)

In [60]:
# Save the scalar
with open(os.path.join(os.path.pardir, 'src', 'models', 'absenteeism_scalar_1.pickle'), 'wb') as file:
    pickle.dump(absenteeism_scalar, file)

***A Note on Pickling***
 

There are several popular ways to save (and finalize) a model. To name some, you can use Joblib (a part of the SciPy ecosystem), and JSON. Certainly, each of those choices has its pros and cons. Pickle is probably the most intuitive and definitely our preferred choice.

Once again, ‘pickle’ is the standard Python tool for serialization and deserialization. In simple words, pickling means: converting a Python object (no matter what) into a string of characters. Logically, unpickling is about converting a string of characters (that has been pickled) into a Python object.



There are some potential issues you should be aware of, though!

###### Pickle and Python version.

Pickling is strictly related to Python version. It is not recommended to (de)serialize objects across different Python versions. Logically, if you’re working on your own this will never be an issue (unless you upgrade/downgrade your Python version). 



###### Pickle is slow.

Well, you will barely notice that but for complex structures it may take loads of time to pickle and unpickle.



###### Pickle is not secure.

This is evident from the documentation of pickle, quote: “Never unpickle data received from an untrusted or unauthenticated source.” The reason is that just about anything can be pickled, so you can easily unpickle malicious code.



Now, if you are unpickling your own code, you are more or less safe.



If, however, you receive pickled objects from someone you don’t fully trust, you should be very cautious. That’s how viruses affect your operating system.



Finally, even your own file may be changed by an attacker. Thus, the next time you unpickle, you can unpickle just about anything (that this unethical person put there).



Certainly, all these cases are very rare, but you must be aware of them. Generally, it is recommended to use JSON

## Creating a module to automate the process

In [61]:
predict_absenteeism_script_file = os.path.join(os.path.pardir,'src','data','predict_absenteeism.py')

In [62]:
%%writefile predict_absenteeism_script_file
# -*- coding: utf-8 -*-

# Import the relevant libraries
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
import pickle
import os

class CustomScalar(BaseEstimator, TransformerMixin):
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scalar = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        self.scalar.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scalar.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]


# Create a class that will be used to predict new data
class AbsenteeismModel:
    def __init__(self, model_file, scalar_file):
        # read the saved model and scalar file
        with open(model_file, 'rb') as model_file, open(scalar_file, 'rb') as scalar_file:
            self.reg = pickle.load(model_file)
            self.scalar = pickle.load(scalar_file)
            self.data = None
    
    # Take the *.csv file and preprocess it
    def load_and_clean_data(self, data_file):
        # import the data
        df = pd.read_csv(data_file, delimiter=',')
        # store the data in a new variable for later use
        self.df_with_predictions = df.copy()
        # drop the ID column
        df = df.drop(['ID'], axis=1)
        # to preserve the code we've created, we will add a column with NAN strings
        df['Absenteeism Time in Hours'] = 'NaN'
        
        # create a separate dataframe containing dummy variables for all the available Reasons
        reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first=True)
        
        # split reason_columns into 4 types
        reason_type_1 = reason_columns.loc[:, 1:14].max(axis=1)
        reason_type_2 = reason_columns.loc[:, 15:17].max(axis=1)
        reason_type_3 = reason_columns.loc[:, 18:21].max(axis=1)
        reason_type_4 = reason_columns.loc[:, 22:].max(axis=1)
        
        # to avoid multicollinnearity, drop the `Reason for Absence` column from df
        df = df.drop(['Reason for Absence'], axis=1)
        
        # concatenate df with the 4 Reasons for Absence
        df = pd.concat([df, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis=1)
        
        # assign names to the 4 reason types columns
        # Note: There is a more universal version of this code. However, this will best suit our current purpose
        column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']
        
        df.columns = column_names
        
        # reorder the columns
        column_names_reordered = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date',
                          'Transportation Expense', 'Distance to Work', 'Age',
                          'Daily Work Load Average', 'Body Mass Index', 'Education',
                          'Children', 'Pets', 'Absenteeism Time in Hours']
        df = df[column_names_reordered]
        
        # conver the date column into datetime
        df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
        
        # create a list with month values retrieved from the 'Date' column
        list_months = []
        list_months = list(map(lambda x: df['Date'][x].month, list(range(df.shape[0]))))
        
        # insert the values into a new colun in the df called 'Month Value'
        df['Month Value'] = list_months
        
        # create a new feature called 'Day of the Week'
        day_of_week = lambda x: x.weekday()
        df['Day of the Week'] = df['Date'].apply(day_of_week)
        
        # drop the 'Date' column from the df
        df = df.drop('Date', axis=1)
        
        # reorder the columns
        column_names_updated = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
                                'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age',
                                'Daily Work Load Average', 'Body Mass Index', 'Education',
                                'Children', 'Pets', 'Absenteeism Time in Hours']
        df = df[column_names_updated]
        
        # map 'Education' variables; the results is a dummy
        df['Education'] = df['Education'].map({1:0, 2:1, 3:1, 4:1})
        
        # replace the NaN values
        df = df.fillna(value=0)
        
        # drop the original 'Absenteeism Time in Hours'
        df = df.drop(['Absenteeism Time in Hours'], axis=1)
        
        # drop the variables we decided we will not use
        df = df.drop(['Distance to Work', 'Daily Work Load Average', 'Day of the Week'], axis=1)
        
        # declare a new variable called processed_data
        self.preprocessed_data = df.copy()
        
        # scale the data
        self.data = self.scalar.transform(self.preprocessed_data)
        
    # a function which outputs the probability of a data point to be 1
    def predict_probability(self):
        if self.data is not None:
            pred = self.reg.predict_proba(self.data)[:,1]
            return pred
    
    # a function that outputs 0 or 1 based on the model
    def predicted_output_category():
        if self.data is not None:
            pred_outputs = self.reg.predict(self.data)
            return pred_outputs
    
    # predict the outputs and the probability and
    # add add columns with these values at the end of the df
    def predicted_outputs(self):
        if self.data is not None:
            self.preprocessed_data['Probability'] = self.predict_probability()
            self.preprocessed_data['Prediction'] = self.predicted_output_category()
            return self.preprocessed_data
        
        

Overwriting predict_absenteeism_script_file
