In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
data1 = pd.read_csv("Absenteeism-data.csv")

In [3]:
data = data1.copy()

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


In [5]:
reasons = pd.get_dummies(data["Reason for Absence"], drop_first = True,dtype=int)

In [6]:
data=data.drop(["ID","Reason for Absence"],axis=1)

In [7]:
reason_1 = reasons.loc[:,1:14].max(axis=1)
reason_2 = reasons.loc[:,15:17].max(axis=1)
reason_3 = reasons.loc[:,18:21].max(axis=1)
reason_4 = reasons.loc[:,22:].max(axis=1)

In [8]:
data = pd.concat([data, reason_1, reason_2, reason_3, reason_4], axis = 1)

In [9]:
data.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [10]:
columns_rename = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', "Reason_1", "Reason_2", "Reason_3", "Reason_4"]
data.columns = columns_rename

In [11]:
data.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_1,Reason_2,Reason_3,Reason_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [12]:
columns_reorder = [ "Reason_1", "Reason_2", "Reason_3", "Reason_4",'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']
data = data[columns_reorder]

In [13]:
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [14]:
type(data["Date"][0])

str

In [15]:
data["Date"] = pd.to_datetime(data["Date"],format="%d/%m/%Y")

In [16]:
list_months = []
for i in range(data.shape[0]):
    list_months.append(data['Date'][i].month)
data['Month Value'] = list_months
def date_to_weekday(date_value):
    return date_value.weekday()
data['Day of the Week'] = data['Date'].apply(date_to_weekday)

In [17]:
data = data.drop(['Date'], axis = 1)

In [18]:
data.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value',
       'Day of the Week'], dtype=object)

In [19]:
columns_reorder = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Month Value','Day of the Week',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

In [20]:
data = data[columns_reorder]

In [21]:
data['Education'] = data['Education'].map({1:0, 2:1, 3:1, 4:1})

In [22]:
data['Absenteeism Time in Hours'].median()

3.0

In [23]:
targets = np.where(data['Absenteeism Time in Hours'] > data['Absenteeism Time in Hours'].median(), 1, 0)

In [24]:
data['Excessive Absenteeism'] = targets

In [25]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [26]:
data = data.drop(['Absenteeism Time in Hours'],axis=1)

In [27]:
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [28]:
data.shape

(700, 15)

In [29]:
unscaled_inputs = data.iloc[:,:-1]
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1


In [30]:
class CustomScaler(BaseEstimator,TransformerMixin): 
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
    def fit(self, X, y=0):
        self.scaler.fit(X[self.columns], y)
        return self
    def transform(self, X, y=0, copy=True):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [31]:
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 
                   'Month', 'Day of the Week', 'Education', 'Children', 'Pets']
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]
absenteeism_scaler = CustomScaler(columns_to_scale)
absenteeism_scaler.fit(unscaled_inputs)
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs.head(20)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,2,1
1,0,0,0,0,0.182726,1,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,1,0
2,0,0,0,1,0.182726,2,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,0,0
3,1,0,0,0,0.182726,3,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,2,0
4,0,0,0,1,0.182726,3,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,2,1
5,0,0,0,1,0.182726,4,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,0,0
6,0,0,0,1,0.182726,4,2.092381,1.494345,-1.320435,-0.806331,0.061825,0,1,4
7,0,0,0,1,0.182726,4,0.568211,1.359154,-0.065439,-0.806331,-0.878984,0,4,0
8,0,0,1,0,0.182726,0,-1.016322,-1.209478,-0.379188,-0.806331,-0.40858,0,2,0
9,0,0,0,1,0.182726,0,0.190942,-1.277074,0.091435,-0.806331,0.532229,1,1,1


In [32]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, test_size = 0.2, random_state = 42)
print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

(560, 14) (560,)
(140, 14) (140,)


In [33]:
modal = LogisticRegression()
modal.fit(x_train,y_train)

In [34]:
modal.score(x_train,y_train)

0.7678571428571429

In [35]:
feature_name = unscaled_inputs.columns.values
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(modal.coef_)
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', modal.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.697336
1,Reason_1,2.929546
2,Reason_2,0.732485
3,Reason_3,3.072405
4,Reason_4,0.994685
5,Month Value,0.079796
6,Day of the Week,-0.107707
7,Transportation Expense,0.674758
8,Distance to Work,-0.056624
9,Age,-0.258149


In [36]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,3.072405,21.593772
1,Reason_1,2.929546,18.71913
4,Reason_4,0.994685,2.703873
2,Reason_2,0.732485,2.080244
7,Transportation Expense,0.674758,1.963557
13,Children,0.377281,1.458313
11,Body Mass Index,0.245613,1.278405
5,Month Value,0.079796,1.083066
10,Daily Work Load Average,-0.020675,0.979538
8,Distance to Work,-0.056624,0.944949


In [38]:
modal.score(x_test,y_test)

0.7785714285714286