## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
# Read the preprocessed absenteeism dataset (path relative to the working
# directory) and preview the resulting frame.
csv_path = "Absenteeism_preprocessed.csv"
data = pd.read_csv(csv_path)
data

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Months value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,5,2,179,22,40,237.656,22,1,2,0,8
696,True,False,False,False,5,2,225,26,28,237.656,24,0,1,2,3
697,True,False,False,False,5,3,330,16,28,237.656,25,1,0,0,8
698,False,False,False,True,5,3,235,16,32,237.656,25,1,0,0,2


## Create the targets

In [3]:
# Median absence duration in hours; the next step uses this value as the
# cut-off between the two target classes.
absence_hours = data["Absenteeism Time in Hours"]
absence_hours.median()

3.0

In [4]:
# Binary target: 0 when the absence is at or below the median number of hours,
# 1 when it is above. The median is computed from the data rather than being
# hard-coded, so the cut-off stays correct if the CSV contents change
# (for the current data it evaluates to 3.0, the value used previously).
absence_hours = data["Absenteeism Time in Hours"]
targets = np.where(absence_hours <= absence_hours.median(), 0, 1)

In [5]:
# Display the target array to check the 0/1 encoding.
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
# The raw hours are already encoded in `targets`, so remove that column from
# the frame. Note that this rebinds `data`: from here on it no longer holds
# the hours column.
data = data.drop(columns="Absenteeism Time in Hours")

In [7]:
# Checkpoint: store the binary target as a column of `data`, then snapshot the
# frame so that later steps work on a separate copy.
target_column = "Excessively Absenteeism"
data[target_column] = targets
data_with_targets = data.copy(deep=True)

In [8]:
# Inspect the checkpoint frame: inputs plus the binary target column.
data_with_targets

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Months value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessively Absenteeism
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,1
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,0
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,1
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,5,2,179,22,40,237.656,22,1,2,0,1
696,True,False,False,False,5,2,225,26,28,237.656,24,0,1,2,0
697,True,False,False,False,5,3,330,16,28,237.656,25,1,0,0,1
698,False,False,False,True,5,3,235,16,32,237.656,25,1,0,0,0


## Select the inputs for the model

In [9]:
# Inputs are every column except the target. Dropping the target by name,
# rather than slicing off the last column by position, keeps the selection
# correct even if the column order of data_with_targets changes.
unscaled_inputs = data_with_targets.drop(columns=["Excessively Absenteeism"])

In [10]:
# Inspect the input columns before any scaling is applied.
unscaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Months value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,5,2,179,22,40,237.656,22,1,2,0
696,True,False,False,False,5,2,225,26,28,237.656,24,0,1,2
697,True,False,False,False,5,3,330,16,28,237.656,25,1,0,0
698,False,False,False,True,5,3,235,16,32,237.656,25,1,0,0


## Standardizing

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Create the scaler and fit it on every input column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows influence the scaling
# statistics; consider fitting on the training split only.
# NOTE(review): the boolean "Reason" columns are scaled together with the
# numeric ones — confirm that this is intended.
absenteeism_scaler = StandardScaler()
absenteeism_scaler.fit(unscaled_inputs)

In [13]:
# Apply the fitted scaler to the same inputs it was fitted on.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [14]:
# Inspect the scaled inputs (now a plain array rather than a DataFrame).
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

## Split the data into train and test

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Keep 80% of the rows for training and hold out the rest for testing; the
# fixed random_state makes the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [17]:
# Shapes of the input arrays for the training and test splits.
train_input_shape, test_input_shape = x_train.shape, x_test.shape
print(train_input_shape, test_input_shape)

(560, 14) (140, 14)


In [18]:
# Shapes of the target arrays; their lengths should match the input splits.
train_target_shape, test_target_shape = y_train.shape, y_test.shape
print(train_target_shape, test_target_shape)

(560,) (140,)


## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [20]:
# Create the logistic regression model; no arguments are passed, so the
# library's default settings are used.
reg = LogisticRegression()

In [21]:
# Train the model on the training split only.
reg.fit(x_train, y_train)

## Check the accuracy of the model

In [22]:
# Accuracy on the training split (the data the model was fitted on).
reg.score(x_train, y_train)

0.7839285714285714

In [23]:
# Accuracy on the held-out test split; compare with the training accuracy
# to gauge overfitting.
reg.score(x_test, y_test)

0.7357142857142858

## Coefficients

In [24]:
# Fitted intercept of the logistic regression (a one-element array).
reg.intercept_

array([-0.22206736])

In [25]:
# Fitted coefficients: a 2-D array with a single row, one value per input column.
reg.coef_

array([[ 2.07601767,  0.33504757,  1.56162303,  1.32927434,  0.18793677,
        -0.07062253,  0.70639316, -0.03986811, -0.20089491, -0.00456366,
         0.31933564, -0.135508  ,  0.38172443, -0.3332426 ]])

In [26]:
# Start from an empty frame; the feature names and coefficients are added as
# columns in the following cell.
summary_table = pd.DataFrame()

In [27]:
# One row per input feature: its column name next to the fitted coefficient.
# reg.coef_ is 2-D with a single row, so it is flattened to a 1-D array before
# being stored; this avoids relying on pandas to accept an (n, 1) column
# vector as the values of a single column.
summary_table["Features name"] = unscaled_inputs.columns.values
summary_table["Coefficients"] = reg.coef_.ravel()

In [28]:
# Inspect the feature/coefficient table (the intercept is not included yet).
summary_table

Unnamed: 0,Features name,Coefficients
0,Reason 1,2.076018
1,Reason 2,0.335048
2,Reason 3,1.561623
3,Reason 4,1.329274
4,Months value,0.187937
5,Day of the week,-0.070623
6,Transportation Expense,0.706393
7,Distance to Work,-0.039868
8,Age,-0.200895
9,Daily Work Load Average,-0.004564


In [29]:
# Shift every row label up by one so that label 0 is free for the intercept.
# Running this cell more than once shifts the labels again.
summary_table.index = summary_table.index + 1

In [30]:
# Store the model intercept under row label 0, alongside the feature rows.
intercept_value = reg.intercept_[0]
summary_table.loc[0] = ["Intercept", intercept_value]

In [31]:
# Order the rows by label so the intercept (label 0) comes first.
summary_table = summary_table.sort_index(axis=0, ascending=True)

In [32]:
# Inspect the table again: the intercept should now be the first row.
summary_table

Unnamed: 0,Features name,Coefficients
0,Intercept,-0.222067
1,Reason 1,2.076018
2,Reason 2,0.335048
3,Reason 3,1.561623
4,Reason 4,1.329274
5,Months value,0.187937
6,Day of the week,-0.070623
7,Transportation Expense,0.706393
8,Distance to Work,-0.039868
9,Age,-0.200895


In [33]:
# Exponentiate each coefficient to express it as an odds ratio. The intercept
# row is exponentiated as well, because it is part of the same column.
coefficients = summary_table["Coefficients"]
summary_table["Odds ratio"] = np.exp(coefficients)

In [34]:
# Rank the rows from the largest odds ratio to the smallest. This only
# displays a sorted view; summary_table itself is not reassigned.
summary_table.sort_values(by="Odds ratio", ascending=False)

Unnamed: 0,Features name,Coefficients,Odds ratio
1,Reason 1,2.076018,7.972656
3,Reason 3,1.561623,4.766551
4,Reason 4,1.329274,3.778301
7,Transportation Expense,0.706393,2.026668
13,Children,0.381724,1.464808
2,Reason 2,0.335048,1.398007
11,Body Mass Index,0.319336,1.376213
5,Months value,0.187937,1.206757
10,Daily Work Load Average,-0.004564,0.995447
8,Distance to Work,-0.039868,0.960916
