#### Classification using logistic regression
* We want to classify people as excessively absent or moderately absent
* one naive and simple way to prepare the target data for such a classification is to split the target column at a threshold (the median is used below)


In [250]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to the plots produced in this notebook.
sns.set()

In [251]:
# Load the already-preprocessed absenteeism data and summarise its numeric columns.
PREPROCESSED_CSV = 'Absenteeism_preprocessed.csv'
df_preprocessed = pd.read_csv(PREPROCESSED_CSV)
df_preprocessed.describe()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.945714,6.36,2.011429,222.347143,29.892857,36.417143,271.801774,26.737143,0.167143,1.021429,0.687143,6.761429
std,0.433322,0.09225,0.286386,0.226743,3.50501,1.480396,66.31296,14.804446,6.379083,40.021804,4.254701,0.37337,1.112215,1.166095,12.670082
min,0.0,0.0,0.0,0.0,1.0,0.0,118.0,5.0,27.0,205.917,19.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,3.0,1.0,179.0,16.0,31.0,241.476,24.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,0.0,1.0,6.0,2.0,225.0,26.0,37.0,264.249,25.0,0.0,1.0,0.0,3.0
75%,0.25,0.0,0.0,1.0,10.0,3.0,260.0,50.0,40.0,294.217,31.0,0.0,2.0,1.0,8.0
max,1.0,1.0,1.0,1.0,12.0,6.0,388.0,52.0,58.0,378.884,38.0,1.0,4.0,8.0,120.0


#### Target data for classification
* set median value as the threshold to cut samples into excessive absence and regular absence
* using the median, the resulting dataset will be roughly balanced between the two classes
* But keep in mind that such a decision should be made according to the requests and questions you are trying to answer.

In [252]:
# Median of the raw absence hours; it serves as the class cut-off below.
absence_hours = df_preprocessed['Absenteeism Time in Hours']
med = absence_hours.median()
med

3.0

In [253]:
# Binary target: 1 when the absence hours exceed the median, 0 otherwise.
hours = df_preprocessed['Absenteeism Time in Hours']
targets = hours.gt(med).astype('int64')

# Attach the label, then remove the raw hours it was derived from so the
# original quantity cannot be used as a predictor.
df_preprocessed['Excessively Absenteeism'] = targets
df_with_targets = df_preprocessed.drop(columns=['Absenteeism Time in Hours'])

In [254]:
# Summary statistics of the frame that now carries the binary target
# instead of the raw absence hours.
df_with_targets.describe()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessively Absenteeism
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.945714,6.36,2.011429,222.347143,29.892857,36.417143,271.801774,26.737143,0.167143,1.021429,0.687143,0.455714
std,0.433322,0.09225,0.286386,0.226743,3.50501,1.480396,66.31296,14.804446,6.379083,40.021804,4.254701,0.37337,1.112215,1.166095,0.498391
min,0.0,0.0,0.0,0.0,1.0,0.0,118.0,5.0,27.0,205.917,19.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,3.0,1.0,179.0,16.0,31.0,241.476,24.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,6.0,2.0,225.0,26.0,37.0,264.249,25.0,0.0,1.0,0.0,0.0
75%,0.25,0.0,0.0,1.0,10.0,3.0,260.0,50.0,40.0,294.217,31.0,0.0,2.0,1.0,1.0
max,1.0,1.0,1.0,1.0,12.0,6.0,388.0,52.0,58.0,378.884,38.0,1.0,4.0,8.0,1.0


#### select the inputs

In [255]:
# Select the predictors by removing the target column by name. A positional
# slice (everything but the last column) would silently pick the wrong
# columns if the column order ever changed.
unscaled_inputs = df_with_targets.drop(columns=['Excessively Absenteeism'])
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [286]:
# 0/1 reason indicators: kept as-is (not standardised).
dummy_cols = [
    'Reason_1',
    'Reason_2',
    'Reason_3',
    'Reason_4',
]

# Remaining predictors: these are the columns passed to the scaler.
tobe_scaled_cols = [
    'Month Value',
    'Day of the Week',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
]

TypeError: bad operand type for unary -: 'list'

In [257]:
from sklearn.preprocessing import StandardScaler

# Standardise only the columns in `tobe_scaled_cols`; the reason dummies are
# passed through unchanged.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test rows contribute to the mean/std used for scaling.
# Consider fitting on the training rows only.
absenteeism_scaler = StandardScaler()
scaled_inputs = absenteeism_scaler.fit_transform(unscaled_inputs[tobe_scaled_cols])

# Reuse the source index: pd.concat(axis=1) pairs rows by index label, so a
# fresh 0..n-1 index would misalign rows whenever the source index differs.
scaled_inputs_df = pd.DataFrame(
    data=scaled_inputs,
    columns=tobe_scaled_cols,
    index=unscaled_inputs.index,
)
scaled_inputs_df = pd.concat([unscaled_inputs[dummy_cols], scaled_inputs_df], axis=1)
scaled_inputs_df.describe()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.945714,-8.120488e-17,-1.040438e-16,-2.131628e-16,5.582836e-17,1.319579e-16,-8.526513e-16,1.446462e-16,0.0,9.135549e-17,-1.2688260000000002e-17
std,0.433322,0.09225,0.286386,0.226743,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715
min,0.0,0.0,0.0,0.0,-1.530333,-1.359682,-1.574681,-1.682647,-1.477309,-1.647399,-1.819793,-0.44798,-0.91903,-0.5896898
25%,0.0,0.0,0.0,1.0,-0.9593133,-0.6837035,-0.6541427,-0.9390957,-0.8498113,-0.7582731,-0.643782,-0.44798,-0.91903,-0.5896898
50%,0.0,0.0,0.0,1.0,-0.1027836,-0.007725463,0.04003371,-0.2631399,0.09143539,-0.1888514,-0.4085798,-0.44798,-0.01928035,-0.5896898
75%,0.25,0.0,0.0,1.0,1.039256,0.6682526,0.5682114,1.359154,0.5620587,0.5604758,1.002633,-0.44798,0.8804693,0.2684866
max,1.0,1.0,1.0,1.0,1.610276,2.696187,2.499833,1.494345,3.385799,2.67751,2.649049,2.232242,2.679969,6.275721


In [258]:
# Sanity check: (rows, columns) of the combined dummy + standardised inputs.
scaled_inputs_df.shape

(700, 14)

#### Divide the data into train, validation and test
* Instead of shuffling and dividing the data manually, this time we use sklearn's ready-made function

In [278]:
from sklearn.model_selection import train_test_split

# A single call partitions the scaled inputs, the unscaled inputs and the
# targets with the same shuffled row selection (80% train, fixed seed).
(x_train, test_input,
 unscaled_x_train, unscaled_test_input,
 y_train, test_target) = train_test_split(
    scaled_inputs_df, unscaled_inputs, targets,
    train_size=0.8, random_state=20,
)

# The unscaled model trains on the same rows, hence the same labels.
unscaled_y_train = y_train.copy()
unscaled_test_target = test_target.copy()

In [260]:
# Print the shape of each split, in the order: train inputs, train targets,
# test inputs, test targets.
for split_part in (x_train, y_train, test_input, test_target):
    print(split_part.shape)

(560, 14)
(560,)
(140, 14)
(140,)


### Creating the Model with sklearn

In [261]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#### Create the model

In [262]:
# Logistic regression classifier created with the library's default settings.
reg = LogisticRegression()

#### Fit the model

In [263]:
# Train on the scaled training inputs and their binary targets.
reg.fit(x_train, y_train)

#### Show the accuracy of the model

In [264]:
# Accuracy on the training data itself (not an estimate of out-of-sample performance).
reg.score(x_train, y_train)

0.7875

In [265]:
# NOTE(review): the test-set scoring call below is left commented out,
# presumably deferred until the model is final; confirm before relying on it.
# reg.score(test_input, test_target)

#### Manually check the accuracy

In [266]:
# Predicted class (0 or 1) for every training row.
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [267]:
# Element-wise comparison: True where the prediction matches the true training label.
model_outputs == y_train

346     True
91      True
299     True
129     True
695     True
       ...  
218     True
223    False
271     True
474    False
355     True
Name: Absenteeism Time in Hours, Length: 560, dtype: bool

In [268]:
# Number of correct training predictions (each True counts as 1).
np.sum(model_outputs == y_train)

441

In [269]:
# Total number of predictions, i.e. one per training row.
model_outputs.shape[0]

560

In [270]:
# Accuracy by hand: correct predictions divided by total predictions.
n_correct = (model_outputs == y_train).sum()
n_total = len(model_outputs)
n_correct / n_total

0.7875

### We can now extract the logistic regression equation from the model and use it to make predictions elsewhere

#### Finding the intercept and coefficients

In [271]:
# Fitted intercept (bias) term of the model.
reg.intercept_

array([-3.28133526])

In [272]:
# Fitted coefficients: a single row with one value per input column.
reg.coef_

array([[ 1.96069738,  0.45912059,  2.34294784,  2.48910819,  0.18626043,
        -0.0846088 ,  0.67899083, -0.03743055, -0.17952223,  0.00847809,
         0.29874766, -0.10464221,  0.38323021, -0.31170546]])

In [273]:
# Input column names, in the order the model was fitted on.
scaled_inputs_df.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [274]:
# Keep the column names so they can be paired with the coefficients.
feature_name = scaled_inputs_df.columns.values

In [275]:
# One row per feature: its name alongside its fitted coefficient.
summary_table = pd.DataFrame({
    'Feature name': feature_name,
    'Coefficient': reg.coef_[0],  # coef_ holds a single row for this binary model
})
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,1.960697
1,Reason_2,0.459121
2,Reason_3,2.342948
3,Reason_4,2.489108
4,Month Value,0.18626
5,Day of the Week,-0.084609
6,Transportation Expense,0.678991
7,Distance to Work,-0.037431
8,Age,-0.179522
9,Daily Work Load Average,0.008478


#### intercept
* we would like to put the intercept at the beginning of the dataframe
* we can shift the index by one, and then add the new row at location 0
* after that, sort the dataframe by index again

In [276]:
# Put the intercept in row 0, followed by the feature coefficients.
# The table is rebuilt rather than shifted in place, so re-running this cell
# yields the same result instead of adding a second 'Intercept' row.
intercept_row = pd.DataFrame({
    'Feature name': ['Intercept'],
    'Coefficient': [reg.intercept_[0]],
})
# Drop any intercept row left over from a previous run of this cell.
coefficient_rows = summary_table.loc[
    summary_table['Feature name'] != 'Intercept',
    ['Feature name', 'Coefficient'],
]
summary_table = pd.concat([intercept_row, coefficient_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-3.281335
1,Reason_1,1.960697
2,Reason_2,0.459121
3,Reason_3,2.342948
4,Reason_4,2.489108
5,Month Value,0.18626
6,Day of the Week,-0.084609
7,Transportation Expense,0.678991
8,Distance to Work,-0.037431
9,Age,-0.179522


#### Calculate the odds_ratio for each feature
* logistic regression is linear in the log-odds: $\log(\text{odds}) = b_0 + b_1 x_1 + b_2 x_2 + \dots$
* exponentiating both sides gives $\text{odds} = e^{b_0} \cdot e^{b_1 x_1} \cdot e^{b_2 x_2} \cdots$ (a product, not a sum)
* so the odds ratio of feature $i$ is $e^{b_i}$: the factor by which the odds are multiplied when $x_i$ increases by one unit

In [277]:
# exp(coefficient) is the factor by which the odds change per unit increase
# of the corresponding input.
coefficients = summary_table['Coefficient']
summary_table['odds_ratio'] = np.exp(coefficients)

# The Reason_* dummies rank highest: when a reason for absence is stated,
# the odds of excessive absence are higher.
summary_table.sort_values(by='odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,odds_ratio
4,Reason_4,2.489108,12.050525
3,Reason_3,2.342948,10.411884
1,Reason_1,1.960697,7.10428
7,Transportation Expense,0.678991,1.971887
2,Reason_2,0.459121,1.582682
13,Children,0.38323,1.467016
11,Body Mass Index,0.298748,1.348169
5,Month Value,0.18626,1.204736
10,Daily Work Load Average,0.008478,1.008514
8,Distance to Work,-0.037431,0.963261


#### a second model with unscaled inputs
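* this model is easier to interpret; scaling can affect accuracy, but in this run the training accuracy (about 0.789) is essentially the same as for the scaled model (0.7875)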

In [282]:
# Same classifier fitted on the raw (unscaled) training inputs.
# max_iter is raised to 10000, presumably so the solver converges on unscaled features.
reg_unscaled = LogisticRegression(max_iter=10000).fit(unscaled_x_train, unscaled_y_train)

# Training accuracy of the unscaled model.
reg_unscaled.score(unscaled_x_train, unscaled_y_train)


0.7892857142857143

In [283]:
# Coefficient / odds-ratio table for the unscaled model:
# intercept in row 0, then one row per input column.
feature_name2 = unscaled_inputs.columns.values

intercept_row2 = pd.DataFrame({
    'Feature name': ['Intercept'],
    'Coefficient': [reg_unscaled.intercept_[0]],
})
coefficient_rows2 = pd.DataFrame({
    'Feature name': feature_name2,
    'Coefficient': reg_unscaled.coef_[0],
})
summary_table2 = pd.concat([intercept_row2, coefficient_rows2], ignore_index=True)

summary_table2['odds_ratio'] = np.exp(summary_table2['Coefficient'])
summary_table2.sort_values(by='odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,odds_ratio
4,Reason_4,2.452127,11.613019
3,Reason_3,2.360519,10.596451
1,Reason_1,1.967871,7.155425
2,Reason_2,0.506557,1.659568
13,Children,0.345648,1.412905
11,Body Mass Index,0.071798,1.074438
5,Month Value,0.05338,1.054831
7,Transportation Expense,0.010362,1.010416
10,Daily Work Load Average,0.000145,1.000145
8,Distance to Work,-0.002728,0.997275


### Backward Elimination