### Importing libraries:

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Importing dataset

In [2]:
# Load the already-preprocessed absenteeism dataset; the path is relative to
# the notebook's working directory.
data = pd.read_csv('./df_preprocessed.csv')
# Preview the first rows as a quick sanity check.
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


Since the dataset has already been preprocessed, we can be assured that it does not need to be preprocessed again here.<br>
However, bear in mind that the main goal of this research is to ascertain whether an individual will be absent excessively; here, absenteeism is considered excessive when it exceeds the median absenteeism time. A logistic regression is well suited to this binary question, so we create a new column that categorises each record as either excessively absent (1) or not (0).


In [3]:
# Binary target: 1 when the hours absent exceed the dataset median, else 0.
# NOTE(review): the column name is misspelled, but later cells read it under
# this exact spelling, so it has to be renamed everywhere at once or not at all.
absence_hours = data['Absenteeism Time in Hours']
median_hours = absence_hours.median()
data['Excsssive Absenteesm'] = (absence_hours > median_hours).astype('int')

In [4]:
# Confirm that the new target column was appended to the frame.
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excsssive Absenteesm
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


With this new column, we can now safely remove the 'Absenteeism Time in Hours' column.

### Checkpoint

In [5]:
# Checkpoint: the raw hours column is dropped because the binary target was
# derived from it; `data` itself is left untouched.
unscaled_data = data.drop(columns=['Absenteeism Time in Hours'])
unscaled_data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excsssive Absenteesm
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


Now, in order to get good results, it is recommended to scale the inputs, so that the range of each input is not a contributing factor in its weight.

In [6]:
# Inputs are every column except the target. Dropping the target by name
# (instead of slicing off the last column by position) keeps this correct
# even if the column order of unscaled_data ever changes.
unscaled_inputs = unscaled_data.drop(columns=['Excsssive Absenteesm'])

In [7]:
# Preview the input features (the target column must no longer be present).
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1


Since the reason columns are binary dummies, they do not need to be scaled. Education is also left unscaled: it is excluded from the list of columns passed to the scaler below.

In [8]:
# List the available feature names so the columns to scale can be picked from them.
unscaled_inputs.columns

Index(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month', 'Weekday',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
       'Pets'],
      dtype='object')

In [9]:
# Features to standardize: everything except the Reason_* dummies and Education.
columns_to_scale = [
    'Month',
    'Weekday',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Children',
    'Pets',
]
data_to_scale = unscaled_inputs.loc[:, columns_to_scale]

In [10]:
# Learn the per-column mean and standard deviation used for standardization.
# NOTE(review): the scaler is fit on all rows of data_to_scale, and the
# train/test split only happens in a later cell, so test-set statistics leak
# into the scaling. Consider splitting first and fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(data_to_scale)


In [11]:
# Apply the fitted scaler; the result is a plain NumPy array (column labels are lost).
scaled_data = scaler.transform(data_to_scale)
scaled_data

array([[ 0.18272635, -0.68370352,  1.00584437, ...,  0.76743118,
         0.88046927,  0.26848661],
       [ 0.18272635, -0.68370352, -1.57468098, ...,  1.00263338,
        -0.01928035, -0.58968976],
       [ 0.18272635, -0.00772546, -0.6541427 , ...,  1.00263338,
        -0.91902997, -0.58968976],
       ...,
       [-0.3882935 ,  0.66825259,  1.62456682, ..., -0.40857982,
        -0.91902997, -0.58968976],
       [-0.3882935 ,  0.66825259,  0.19094163, ..., -0.40857982,
        -0.91902997, -0.58968976],
       [-0.3882935 ,  0.66825259,  1.03602595, ..., -0.40857982,
        -0.01928035,  0.26848661]])

In [12]:
# Wrap the standardized array back into a DataFrame.
# Column labels and the row index are taken from data_to_scale rather than
# retyped by hand: the labels can then never drift from the columns that were
# actually scaled, and the rows stay aligned with unscaled_inputs for the
# axis=1 concatenation that follows.
scaled_inputs = pd.DataFrame(
    data=scaled_data,
    columns=data_to_scale.columns,
    index=data_to_scale.index,
)
scaled_inputs.head()

Unnamed: 0,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets
0,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487
1,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.01928,-0.58969
2,0.182726,-0.007725,-0.654143,1.426749,0.24831,-0.806331,1.002633,-0.91903,-0.58969
3,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0.880469,-0.58969
4,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487


In [13]:
# Sanity check before concatenating: both pieces must have the same number of rows.
for frame in (unscaled_inputs[['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']], scaled_inputs):
    print(frame.shape)


(700, 4)
(700, 9)


In [14]:
# Reassemble the model inputs: the unscaled reason dummies, the standardized
# numeric features, then the unscaled Education column.
# Selecting data_to_scale.columns out of scaled_inputs keeps this cell
# idempotent: on a re-run it picks only the standardized columns from the
# already-concatenated frame instead of stacking the dummies and Education
# onto it a second time.
reason_dummies = unscaled_inputs[['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']]
standardized_features = scaled_inputs[data_to_scale.columns]
education = unscaled_inputs['Education']
scaled_inputs = pd.concat([reason_dummies, standardized_features, education], axis=1)
scaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets,Education
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487,0
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.01928,-0.58969,0
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.24831,-0.806331,1.002633,-0.91903,-0.58969,0
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0.880469,-0.58969,0
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487,0


In [15]:
# The 0/1 label the classifier is trained to predict.
targets = unscaled_data['Excsssive Absenteesm']

### Checkpoint

#### Data Split

In [16]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=42,
)
# Report the shape of each part in the order: train inputs, train targets,
# test inputs, test targets.
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)


(560, 14)
(560,)
(140, 14)
(140,)


### Model

In [17]:
# Fit a logistic-regression classifier with scikit-learn's default settings
# on the training split (all 14 input features).
log = LogisticRegression()
log.fit(x_train, y_train)

In [18]:
# Mean accuracy on the training split.
log.score(x_train, y_train)

0.7678571428571429

Determining the accuracy score manually:

In [19]:
# Recompute the training accuracy by hand: the share of predictions that
# equal the true label. It should match log.score on the same data.
train_outputs = log.predict(x_train)
is_correct = train_outputs == y_train
correct_outs = is_correct.sum()
acc = correct_outs / y_train.shape[0]
acc

0.7678571428571429

Generating coefficients table

In [20]:
# Feature order seen by the model; log.coef_ follows the same order.
x_train.columns

Index(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month', 'Weekday',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets',
       'Education'],
      dtype='object')

In [21]:
# One row per model parameter: the intercept first, then every feature in
# the exact order the model was trained on. The names are derived from
# x_train.columns so they cannot drift from the fitted model.
coef_table = pd.DataFrame(columns=["coefficients"], data=['intercept', *x_train.columns])

In [22]:
# Raw fitted weights: a 2-D array with one row, one value per input feature.
log.coef_

array([[ 2.93133591,  0.73381745,  3.07461822,  0.99668015,  0.07974773,
        -0.15816644,  0.67461938, -0.05693191, -0.25781684, -0.02065229,
         0.24529605,  0.41827005, -0.31142714, -0.26827384]])

In [23]:
# Intercept first, then one weight per feature, matching the row order of coef_table.
values = [log.intercept_[0], *log.coef_[0]]


In [24]:
# Attach the weights; values[0] is the intercept, followed by the feature
# weights in the same order as the names already in coef_table.
coef_table['weights'] = values
coef_table

Unnamed: 0,coefficients,weights
0,intercept,-1.714767
1,Reason_1,2.931336
2,Reason_2,0.733817
3,Reason_3,3.074618
4,Reason_4,0.99668
5,Month,0.079748
6,Weekday,-0.158166
7,Transportation Expense,0.674619
8,Distance to Work,-0.056932
9,Age,-0.257817


In [25]:
# exp(weight) turns a log-odds weight into an odds ratio: the factor by which
# the odds of the positive class are multiplied per one-unit increase of the feature.
coef_table['Odds_ratio'] = np.exp(coef_table['weights'])
# Show the strongest positive effects first (the sorted frame is displayed, not stored).
coef_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,coefficients,weights,Odds_ratio
3,Reason_3,3.074618,21.641618
1,Reason_1,2.931336,18.752666
4,Reason_4,0.99668,2.709272
2,Reason_2,0.733817,2.083017
7,Transportation Expense,0.674619,1.963286
12,Children,0.41827,1.519331
11,Body Mass Index,0.245296,1.278
5,Month,0.079748,1.083014
10,Daily Work Load Average,-0.020652,0.97956
8,Distance to Work,-0.056932,0.944658


On observing these weights, we can notice that features like:
<ul>
<li>Month</li>
<li>Daily Work Load Average</li>
<li>Distance to Work</li>
</ul>
have the weights closest to zero, or equivalently the odds ratios closest to 1.<br>
This means that these variables have a negligible effect on our model,
so we can remove them without compromising the accuracy.

In [26]:
# Reduced set of features to standardize: Month, Distance to Work and
# Daily Work Load Average are no longer included.
columns_to_scale = [
    'Weekday',
    'Transportation Expense',
    'Age',
    'Body Mass Index',
    'Children',
    'Pets',
]
data_to_scale = unscaled_inputs.loc[:, columns_to_scale]

In [27]:
# Re-fit a fresh scaler on the reduced feature set. This rebinds `scaler`, so
# from here on it refers to the six-column scaler, not the earlier nine-column one.
# NOTE(review): as before, the scaler is fit on all rows before the train/test
# split, so test-set statistics leak into the scaling.
scaler = StandardScaler()
scaler.fit(data_to_scale)

In [28]:
# Standardize the reduced feature set; the result is a plain NumPy array.
scaled_data = scaler.transform(data_to_scale)
scaled_data

array([[-0.68370352,  1.00584437, -0.53606239,  0.76743118,  0.88046927,
         0.26848661],
       [-0.68370352, -1.57468098,  2.13080317,  1.00263338, -0.01928035,
        -0.58968976],
       [-0.00772546, -0.6541427 ,  0.24830984,  1.00263338, -0.91902997,
        -0.58968976],
       ...,
       [ 0.66825259,  1.62456682, -1.32043461, -0.40857982, -0.91902997,
        -0.58968976],
       [ 0.66825259,  0.19094163, -0.69293683, -0.40857982, -0.91902997,
        -0.58968976],
       [ 0.66825259,  1.03602595,  0.56205873, -0.40857982, -0.01928035,
         0.26848661]])

In [29]:
# Wrap the standardized array back into a DataFrame.
# Column labels and the row index are taken from data_to_scale rather than
# retyped by hand, so the labels always match the columns that were scaled
# and the rows stay aligned with unscaled_inputs for the axis=1 concatenation.
scaled_inputs = pd.DataFrame(
    data=scaled_data,
    columns=data_to_scale.columns,
    index=data_to_scale.index,
)

In [30]:
# Reassemble the reduced model inputs: the unscaled reason dummies, the
# standardized numeric features, then the unscaled Education column.
reason_dummies = unscaled_inputs[['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']]
education = unscaled_inputs['Education']
scaled_inputs_new = pd.concat([reason_dummies, scaled_inputs, education], axis=1)
# The target is the same column as for the first model.
targets = unscaled_data['Excsssive Absenteesm']


In [31]:
# Preview the reduced input frame used for the second model.
scaled_inputs_new.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Weekday,Transportation Expense,Age,Body Mass Index,Children,Pets,Education
0,0,0,0,1,-0.683704,1.005844,-0.536062,0.767431,0.880469,0.268487,0
1,0,0,0,0,-0.683704,-1.574681,2.130803,1.002633,-0.01928,-0.58969,0
2,0,0,0,1,-0.007725,-0.654143,0.24831,1.002633,-0.91903,-0.58969,0
3,1,0,0,0,0.668253,0.854936,0.405184,-0.643782,0.880469,-0.58969,0
4,0,0,0,1,0.668253,1.005844,-0.536062,0.767431,0.880469,0.268487,0


In [32]:
# Same 80/20 split and seed as for the first model, now on the reduced inputs.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs_new,
    targets,
    test_size=0.2,
    random_state=42,
)
# Report the shape of each part in the order: train inputs, train targets,
# test inputs, test targets.
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

(560, 11)
(560,)
(140, 11)
(140,)


In [33]:
# Refit the classifier on the reduced feature set. This rebinds `log`, so the
# first model is no longer reachable under that name.
log = LogisticRegression()
log.fit(x_train, y_train)

In [34]:
# Mean training accuracy of the reduced model, to compare with the first model's score.
log.score(x_train, y_train)

0.7678571428571429

In [35]:
# Feature order seen by the reduced model; log.coef_ follows the same order.
x_train.columns

Index(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Weekday',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Children', 'Pets',
       'Education'],
      dtype='object')

In [47]:
# One row per parameter of the reduced model: the intercept first, then every
# feature in the exact order the model was trained on. The names are derived
# from x_train.columns so they cannot drift from the fitted model.
coef_table = pd.DataFrame(columns=["coefficients"], data=['intercept', *x_train.columns])

In [48]:
# Intercept first, then one weight per feature, matching the row order of coef_table.
values = [log.intercept_[0], *log.coef_[0]]

coef_table['weights'] = values
coef_table

Unnamed: 0,coefficients,weights
0,intercept,-1.692543
1,Reason_1,2.903725
2,Reason_2,0.722558
3,Reason_3,3.043913
4,Reason_4,0.959216
5,Weekday,-0.159269
6,Transportation Expense,0.67091
7,Age,-0.246555
8,Body Mass Index,0.242408
9,Children,0.424103


In [49]:
# exp(weight) turns a log-odds weight into an odds ratio: the factor by which
# the odds of the positive class are multiplied per one-unit increase of the feature.
coef_table['Odds_ratio'] = np.exp(coef_table['weights'])
# Show the strongest positive effects first (the sorted frame is displayed, not stored).
coef_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,coefficients,weights,Odds_ratio
3,Reason_3,3.043913,20.987198
1,Reason_1,2.903725,18.241964
4,Reason_4,0.959216,2.609649
2,Reason_2,0.722558,2.059694
6,Transportation Expense,0.67091,1.956017
9,Children,0.424103,1.528219
8,Body Mass Index,0.242408,1.274315
5,Weekday,-0.159269,0.852767
11,Education,-0.221582,0.80125
7,Age,-0.246555,0.781489


#### Interpretation of table:
A feature with an odds ratio above 1 (a positive weight) raises the odds that a person will exhibit excessive absenteeism as its value increases, while an odds ratio below 1 (a negative weight) lowers those odds.<br>
For instance:
<li>Our baseline is reason 0, i.e. no stated cause for the absence. Compared with that baseline, the odds of excessive absenteeism are about 21 times higher when a person gives Reason 3 as their explanation.
<li>In a similar vein, Pets has a negative weight (about -0.31 in the first model), so a person who owns more pets has somewhat lower odds of excessive absenteeism, possibly because they are not the only one providing care for them.


### Testing

In [50]:
# Predicted class labels for the held-out test rows.
model_outputs = log.predict(x_test)


In [51]:
# Mean accuracy on the held-out test split.
log.score(x_test, y_test)

0.7785714285714286

In [52]:

# Recompute the test accuracy by hand: the share of predictions that equal
# the true label. It should match log.score on the same data.
is_correct = model_outputs == y_test
correct_outs = is_correct.sum()
acc = correct_outs / y_test.shape[0]
acc

0.7785714285714286

The accuracy on the test set comes out to be around 78%.

In [53]:
# Per-class probability estimates for each test row (one column per class).
prediction_probabilities = log.predict_proba(x_test)

In [54]:
# Keep column 1 of predict_proba, i.e. the probability of the second class in
# log.classes_ (presumably the positive class 1, given the 0/1 target —
# confirm via log.classes_).
excessive_absenteesm_probab = prediction_probabilities[:,1]
excessive_absenteesm_probab

array([0.20230053, 0.11860123, 0.24318075, 0.45598465, 0.46216239,
       0.92376966, 0.36739935, 0.63418064, 0.22893314, 0.2731386 ,
       0.11860123, 0.28514311, 0.7471054 , 0.47443983, 0.26801789,
       0.59131774, 0.08559695, 0.74614028, 0.11860123, 0.38352153,
       0.34390262, 0.21387169, 0.28965918, 0.30758612, 0.11900423,
       0.7961201 , 0.38352153, 0.42522529, 0.21387169, 0.38352153,
       0.11387977, 0.12520393, 0.60886213, 0.52821873, 0.24743029,
       0.71440025, 0.24743029, 0.13747945, 0.87109391, 0.19848161,
       0.59131774, 0.23252979, 0.52985057, 0.17149148, 0.21387169,
       0.76044697, 0.80265665, 0.88208942, 0.31230235, 0.15673088,
       0.21387169, 0.28965918, 0.40927453, 0.94947423, 0.13747945,
       0.2731386 , 0.97158985, 0.28965918, 0.87371544, 0.2520508 ,
       0.52844056, 0.13747945, 0.51695794, 0.63418064, 0.11860123,
       0.39914678, 0.70226148, 0.06032517, 0.31230235, 0.48565775,
       0.2731386 , 0.23252979, 0.74496648, 0.28965918, 0.10817

### Saving the model

In [55]:
import pickle

In [56]:
# Serialize the fitted classifier to a file named 'model' in the working directory.
# NOTE(review): only unpickle this file from a trusted source — pickle.load can
# execute arbitrary code.
with open('model', 'wb') as file:
    pickle.dump(log, file)

In [57]:
# Serialize the scaler to a file named 'scaler' in the working directory.
# At this point `scaler` is the second StandardScaler, fit on the six columns
# Weekday, Transportation Expense, Age, Body Mass Index, Children and Pets
# (in that order); it must be applied to exactly those columns of new data.
with open('scaler', 'wb') as file:
    pickle.dump(scaler, file)