In [64]:
#Reading the Data and including important library files needed to handle dataframe functions and plotting
import pandas as pd
import scipy.cluster.hierarchy as shc
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load the absenteeism records from the Excel workbook and preview the first rows
Absent_data = pd.read_excel(io='Absenteeism_at_work.xls')
Absent_data.head(n=5)

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,...,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,11,26,7,3,1,289,36,13,33,239554,...,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239554,...,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239554,...,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239554,...,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239554,...,0,1,2,1,0,1,90,172,30,2


In [82]:
# Data preprocessing: remove the ID column because it is irrelevant for our prediction.
# `errors` is left at its default ('raise') so a missing 'ID' column fails loudly;
# DataFrame.drop only accepts 'ignore' or 'raise' for that argument.
df = Absent_data.drop(columns=['ID'])
df.head()

Unnamed: 0,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,26,7,3,1,289,36,13,33,239554,97,0,1,2,1,0,1,90,172,30,4
1,0,7,3,1,118,13,18,50,239554,97,1,1,1,1,0,0,98,178,31,0
2,23,7,4,1,179,51,18,38,239554,97,0,1,0,1,0,0,89,170,31,2
3,7,7,5,1,279,5,14,39,239554,97,0,1,2,1,1,0,68,168,24,4
4,23,7,5,1,289,36,13,33,239554,97,0,1,2,1,0,1,90,172,30,2


In [66]:
# Count the missing values in every column of the dataset
df.isna().sum()

Reason for absence                 0
Month of absence                   0
Day of the week                    0
Seasons                            0
Transportation expense             0
Distance from Residence to Work    0
Service time                       0
Age                                0
Work load Average/day              0
Hit target                         0
Disciplinary failure               0
Education                          0
Son                                0
Social drinker                     0
Social smoker                      0
Pet                                0
Weight                             0
Height                             0
Body mass index                    0
Absenteeism time in hours          0
dtype: int64

In [67]:
# Define the predictors (X) and the outcome (y).
# 'Absenteeism time in hours' is the target: we want to see how the other
# variables affect it, so it is removed from X and kept on its own as y.
# Both are built from `df`, the preprocessed frame with the ID column dropped.
X = df.drop(columns=['Absenteeism time in hours'])
y = df['Absenteeism time in hours']
X.shape

(740, 19)

In [68]:
#Logistic Regression using selection model to choose what all things affect the most to the number of absent hours
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Build the selector around a logistic-regression estimator, then fit it on the
# X / y defined earlier; the last line shows the fitted estimator's coefficients.
Logic_model = SelectFromModel(estimator=LogisticRegression(max_iter=5000))
Logic_model.fit(X, y)
Logic_model.estimator_.coef_


array([[-5.39669026e-01, -1.37128822e+00,  1.91257054e-01,
         2.49741822e-01, -3.99582658e-01,  2.23576193e-01,
        -4.36500799e-01,  3.59216310e-01,  1.20736954e-01,
        -5.61721458e-01,  2.00707760e+00, -2.43550090e-01,
        -1.16525267e-01, -3.74983610e-01,  1.33896068e-01,
         3.64586588e-01,  1.62253029e-01, -2.95362836e-01,
         3.83047811e-01],
       [ 9.45721690e-01,  1.15597797e-01,  6.01702301e-01,
        -1.12103557e-01,  1.60817250e-01,  3.73501800e-01,
         2.52543530e-01, -3.46650365e-02,  8.33989025e-02,
        -9.61053615e-02, -6.37398488e-01,  2.97532250e-01,
        -3.40407951e-01, -4.77786972e-01,  7.38184323e-02,
         1.04261658e-01,  2.85687455e-04, -1.42883599e-01,
         4.00262516e-02],
       [ 1.99359533e-02, -2.67620998e-01,  6.11504820e-01,
         1.80326465e-02,  2.75614756e-01, -4.14548293e-02,
         4.07378075e-01, -5.08832426e-01,  3.71762849e-01,
        -3.76239076e-01, -4.43879033e-01,  4.19554384e-01,
    

In [69]:
# Get the threshold value: the fitted cutoff the selector used for feature selection
Logic_model.threshold_

2.6740547939386197

In [70]:
# Boolean mask over the columns of X: True marks a feature kept by the selector
Logic_model.get_support()

array([ True,  True,  True, False, False, False,  True, False,  True,
        True,  True, False, False, False, False, False, False, False,
       False])

In [71]:
# Reduce X to the columns selected by the logistic-regression selector,
# then show the resulting (rows, columns) shape.
X_logic = Logic_model.transform(X) #Gives the subset of features to use in analysis
X_logic.shape

(740, 7)

### The result above indicates that only 7 columns are the most important for predicting an employee's number of absent hours. To confirm this, we will use another technique and see what changes.

In [84]:
#Using a LinearSVC model to check whether logistic regression is the best-fit model for the analysis
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# L1-penalised linear SVC fitted on the full predictor set, then wrapped in a
# selector that reuses the already-fitted estimator (prefit=True).
lsvc = LinearSVC(C=0.02, penalty="l1", dual=False)
lsvc.fit(X, y)
model_svm = SelectFromModel(lsvc, prefit=True)

In [85]:
# Boolean mask over the columns of X: True marks a feature kept by the LinearSVC-based selector
model_svm.get_support()

array([ True, False,  True, False, False, False, False, False,  True,
       False,  True,  True,  True,  True, False, False, False,  True,
       False])

In [86]:
# Reduce X to the columns selected by the LinearSVC-based selector,
# then show the resulting (rows, columns) shape.
X_svm = model_svm.transform(X) #Gives the subset of features to use in analysis
X_svm.shape

(740, 8)

### The predictors selected above are more or less the same. Let's check the accuracy score for both models in order to pick the right columns and conclude which factors influence the number of absent hours.

In [75]:

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows (fixed seed) of the logistic-regression-selected features,
# fit a logistic regression on the remaining 70%, and report test-set accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X_logic,
    y,
    test_size=0.3,
    random_state=0,
)
logreg = LogisticRegression(max_iter=10000).fit(X_train, y_train)
y_pred_man = logreg.predict(X_test)
print(
    'Accuracy of logistic regression classifier on test set: {:.4f}'.format(
        logreg.score(X_test, y_test)
    )
)

Accuracy of logistic regression classifier on test set: 0.8964


In [87]:
# Same evaluation as for the logistic-regression-selected features, but on the
# features selected by the LinearSVC-based selector (X_svm): 70/30 split with a
# fixed seed, fit a logistic regression, report test-set accuracy.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_svm, y, test_size=0.3, random_state=0)
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train1, y_train1)
y_pred_man = logreg.predict(X_test1)
# Score on this cell's own hold-out set (X_test1 / y_test1); scoring on
# X_test / y_test would evaluate against the split of a different feature subset.
print('Accuracy of logistic regression classifier on test set: {:.4f}'.format(logreg.score(X_test1, y_test1)))

Accuracy of logistic regression classifier on test set: 0.8964


### I first tried two decimal places, but the accuracy is still the same at four decimal places, so it is hard to choose between the two feature sets on accuracy alone. I will add my conclusions in the report.