## Logistic Regression

### Import Libraries

In [85]:
import numpy as np
import pandas as pd

### Load Data

In [86]:
df=pd.read_csv('Absenteeism_preprocessed.csv')

In [87]:
df.sample(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
18,1,0,0,0,7,3,189,29,33,239.554,25,0,2,2,8
275,0,0,0,1,9,1,291,31,40,294.217,25,0,1,1,3
40,0,0,0,1,1,4,184,42,27,241.476,21,0,0,0,2
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4
332,1,0,0,0,11,0,246,25,41,284.031,23,0,0,0,8


### Create Targets

In [88]:
md=df['Absenteeism Time in Hours'].median()

In [89]:
targets = np.where(df['Absenteeism Time in Hours']>md,1,0)

In [90]:
targets[0:10]

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1])

In [91]:
df['Excessive Absenteeism']=targets

### A comment on the targets

In [92]:
targets.sum() / targets.shape[0]
## 45% are 1

0.45571428571428574

In [93]:
df=df.drop('Absenteeism Time in Hours',axis=1)

In [94]:
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


### Select the inputs for regression

In [95]:
unscaled_inputs=df.iloc[:,:-1]

### Standardize

In [113]:
from sklearn.preprocessing import StandardScaler

absenteeism_scaler=StandardScaler()

In [114]:
absenteeism_scaler.fit(unscaled_inputs)

In [115]:
scaled_inputs=absenteeism_scaler.transform(unscaled_inputs)

In [116]:
scaled_inputs[0:5]

array([[-0.57735027, -0.09298136, -0.31448545,  0.82136542,  0.03079619,
        -0.80094984,  1.00584437,  0.41281584, -0.53606239, -0.80633129,
         0.76743118, -0.44798003,  0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, -1.21748491,  0.03079619,
        -0.80094984, -1.57468098, -1.1418824 ,  2.13080317, -0.80633129,
         1.00263338, -0.44798003, -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545,  0.82136542,  0.03079619,
        -0.23290031, -0.6541427 ,  1.42674947,  0.24830984, -0.80633129,
         1.00263338, -0.44798003, -0.91902997, -0.58968976],
       [ 1.73205081, -0.09298136, -0.31448545, -1.21748491,  0.03079619,
         0.33514923,  0.85493646, -1.68264701,  0.40518428, -0.80633129,
        -0.64378202, -0.44798003,  0.88046927, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545,  0.82136542,  0.03079619,
         0.33514923,  1.00584437,  0.41281584, -0.53606239, -0.80633129,
         0.76743118, -0.44

In [117]:
scaled_inputs.shape

(700, 14)

### Split data into train and test and shuffle

In [118]:
from sklearn.model_selection import train_test_split

In [119]:
x_train,x_test,y_train,y_test=train_test_split(scaled_inputs,targets,test_size=0.2,random_state=20)

In [120]:
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)


(560, 14) (560,)
(140, 14) (140,)


### Logistic Regression Model

In [121]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training Model

In [122]:
reg=LogisticRegression()

In [123]:
reg.fit(x_train,y_train)

In [124]:
reg.score(x_train,y_train)

0.7803571428571429

### Manualy Check The Accuracy

In [125]:
model_outputs=reg.predict(x_train)

In [126]:
np.sum(model_outputs == y_train)

437

In [127]:
np.sum(model_outputs == y_train)/len(model_outputs)

0.7803571428571429

### Finding the intercept and coefficient

In [128]:
reg.intercept_

array([-0.21150898])

In [129]:
reg.coef_

array([[ 2.07192269,  0.33075027,  1.56390046,  1.31283678,  0.02577323,
        -0.08622837,  0.72326969, -0.06149037, -0.20628304, -0.02865604,
         0.3259002 , -0.16141702,  0.38153429, -0.32129837]])

In [130]:
feature_name=unscaled_inputs.columns.values

In [131]:
summary_table=pd.DataFrame(columns=['Feature Name'],data=feature_name)
summary_table['Coefficient']=np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Reason_1,2.071923
1,Reason_2,0.33075
2,Reason_3,1.5639
3,Reason_4,1.312837
4,Month Value,0.025773
5,Day of the Week,-0.086228
6,Transportation Expense,0.72327
7,Distance to Work,-0.06149
8,Age,-0.206283
9,Daily Work Load Average,-0.028656


In [132]:
summary_table.index=summary_table.index+1
summary_table.loc[0]=['Intercept',reg.intercept_[0]]
summary_table=summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-0.211509
1,Reason_1,2.071923
2,Reason_2,0.33075
3,Reason_3,1.5639
4,Reason_4,1.312837
5,Month Value,0.025773
6,Day of the Week,-0.086228
7,Transportation Expense,0.72327
8,Distance to Work,-0.06149
9,Age,-0.206283


### Interpreting the Coefficients

In [135]:
summary_table['Odds_Ratio']=np.exp(summary_table['Coefficient'])

In [136]:
summary_table.sort_values('Odds_Ratio',ascending=False)

Unnamed: 0,Feature Name,Coefficient,Odds_Ratio
1,Reason_1,2.071923,7.940075
3,Reason_3,1.5639,4.777419
4,Reason_4,1.312837,3.716702
7,Transportation Expense,0.72327,2.061162
13,Children,0.381534,1.46453
2,Reason_2,0.33075,1.392012
11,Body Mass Index,0.3259,1.385277
5,Month Value,0.025773,1.026108
10,Daily Work Load Average,-0.028656,0.971751
8,Distance to Work,-0.06149,0.940362


### Testing Model

In [137]:
reg.score(x_test,y_test)

0.7285714285714285

In [140]:
predicted_proba=reg.predict_proba(x_test)
predicted_proba[0:10]

array([[0.76491084, 0.23508916],
       [0.61192048, 0.38807952],
       [0.40784145, 0.59215855],
       [0.78559655, 0.21440345],
       [0.06083149, 0.93916851],
       [0.26784831, 0.73215169],
       [0.28605183, 0.71394817],
       [0.06690094, 0.93309906],
       [0.75855461, 0.24144539],
       [0.76534143, 0.23465857]])