# Absenteeism Preprocessing and Modeling

## Preprocessing

### Load the Libraries and data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw absenteeism records from a CSV file in the working directory.
raw_csv_data= pd.read_csv('Absenteeism_data.csv')
# Work on a copy so the frame that was read from disk is kept as-is.
df= raw_csv_data.copy()

In [3]:
# Rebuild `df` from the raw frame rather than from `df` itself, so this cell
# can be re-run without failing on an 'ID' column that is already gone.
df = raw_csv_data.drop(columns=['ID'])

# One indicator column per distinct reason code. dtype=int keeps the
# indicators as 0/1 integers regardless of the pandas default dtype.
reason_columns = pd.get_dummies(df['Reason for Absence'], dtype=int)
df = df.drop(columns=['Reason for Absence'])

# Collapse the individual codes into four grouped flags: a row's flag is 1
# when any code inside the (inclusive) label range is set. Code 0 belongs to
# no group, so rows with reason 0 get all four flags equal to 0.
reason_type_1 = reason_columns.loc[:, 1:14].max(axis=1).rename('Reason_1')
reason_type_2 = reason_columns.loc[:, 15:17].max(axis=1).rename('Reason_2')
reason_type_3 = reason_columns.loc[:, 18:21].max(axis=1).rename('Reason_3')
reason_type_4 = reason_columns.loc[:, 22:28].max(axis=1).rename('Reason_4')

# The flags carry their own names, so no positional relabelling of the whole
# frame is needed (that would silently mislabel columns if the CSV layout
# ever changed).
df = pd.concat([reason_type_1, reason_type_2, reason_type_3, reason_type_4, df], axis=1)

columns_names_orders = [
    'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
    'Date', 'Transportation Expense', 'Distance to Work', 'Age',
    'Daily Work Load Average', 'Body Mass Index', 'Education',
    'Children', 'Pets', 'Absenteeism Time in Hours',
]

# Selecting by name fixes the column order and raises a KeyError if an
# expected column is missing from the input file.
df = df[columns_names_orders]

In [4]:
# Snapshot the frame with grouped reasons; later steps modify this copy, not `df`.
df_reason_mod= df.copy()

In [5]:
# Start from a fresh copy of `df` so the cell is re-runnable: it never depends
# on a `df_reason_mod` that has already lost its 'Date' column.
df_reason_mod = df.copy()

# Parse the day/month/year strings into timestamps.
dates = pd.to_datetime(df_reason_mod['Date'], format='%d/%m/%Y')

# Calendar features, extracted with the vectorised .dt accessor instead of a
# row-by-row Python loop. The values are written straight onto
# `df_reason_mod`; the earlier frame `df` is no longer modified.
df_reason_mod['Month Values'] = dates.dt.month
df_reason_mod['Day of the Week'] = dates.dt.weekday  # pandas: Monday=0 ... Sunday=6

# Education codes - 1: High school, 2: Graduate, 3: Postgraduate, 4: Master or Doctor.
# Collapse to a binary flag: 0 = high school, 1 = any higher level. Codes
# outside 1-4 become NaN, as Series.map does for unmapped values.
# (Named `education_map` so the builtin `map` is not shadowed.)
education_map = {1: 0, 2: 1, 3: 1, 4: 1}
df_reason_mod['Education'] = df_reason_mod['Education'].map(education_map)

# Final layout. 'Date' is intentionally absent from the list, so selecting by
# name both drops it and puts the new calendar features after the reasons.
columns = [
    'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
    'Month Values', 'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age',
    'Daily Work Load Average', 'Body Mass Index', 'Education',
    'Children', 'Pets', 'Absenteeism Time in Hours',
]

df_reason_mod = df_reason_mod[columns]

In [6]:
# Checkpoint: snapshot the preprocessed frame under a new name.

data_preprocessed= df_reason_mod.copy()

# Optional, currently disabled: persist the checkpoint and reload it later.
# NOTE(review): the save line writes an .xlsx file while the reload line reads a
# .csv file, so the two do not refer to the same file — confirm which format is
# intended before enabling. The file names also spell "Abseteeism".
# data_preprocessed.to_excel('Abseteeism_preprocessed.xlsx')
# data_preprocessed= pd.read_csv('Abseteeism_preprocessed.csv')
# data_preprocessed= data_preprocessed.drop(['Unnamed: 0'], axis=1)

In [7]:
# Binary target: 1 when a record's absence exceeds the median absence, else 0.
hours_absent = data_preprocessed['Absenteeism Time in Hours']
median = hours_absent.median()
targets = np.where(hours_absent.gt(median), 1, 0)

# Share of positive targets. The classes should be roughly balanced;
# a split of about 45%-55% is good enough.
targets.sum() / len(targets)

# Remove the raw hours column so the target cannot leak into the inputs.
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

# Keep the first 14 columns as model inputs.
unscaled_inputs = data_with_targets.iloc[:, :14]

### Standardizing the data

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy : bool, default=True
        Forwarded to ``StandardScaler(copy=...)``.
    with_maen : bool, default=True
        Forwarded to ``StandardScaler(with_mean=...)``. The misspelled name is
        kept so existing keyword callers keep working.
    with_std : bool, default=True
        Forwarded to ``StandardScaler(with_std=...)``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``columns`` only.
    mean, var : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the scaled
        columns; ``None`` until ``fit`` is called.
    """

    def __init__(self, columns, copy= True, with_maen= True, with_std= True):
        # Every constructor argument is stored under its own name because
        # BaseEstimator.get_params (used by repr and clone) reads them back
        # from attributes.
        self.columns = columns
        self.copy = copy
        self.with_maen = with_maen
        self.with_std = with_std
        # Pass the options through instead of silently ignoring them.
        self.scaler = StandardScaler(copy=copy, with_mean=with_maen, with_std=with_std)
        self.mean = None
        self.var = None

    def fit(self, X, y= None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions: np.mean/np.var on a DataFrame change
        # meaning across pandas versions (per-column vs. over the whole frame).
        self.mean = selected.mean()
        self.var = selected.var(ddof=0)
        return self

    def transform(self, X, y= None):
        """Return ``X`` with ``self.columns`` standardized, original column order kept."""
        init_col_order = X.columns
        # Reuse X's index: concat aligns on index labels, so a fresh
        # RangeIndex would misalign rows (and introduce NaNs) whenever X has
        # a non-default index, e.g. after a shuffled train/test split.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [9]:
# These features are excluded from standardization; every other input column is scaled.
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']
columns_to_scale = [
    name
    for name in unscaled_inputs.columns.values
    if name not in columns_to_omit
]

# fit() returns the scaler itself, so construction and fitting can be chained.
absenteeism_scaler = CustomScaler(columns_to_scale).fit(unscaled_inputs)
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

### Split the data into train & test and shuffle 

In [10]:
from sklearn.model_selection import train_test_split

# 80/20 shuffled split; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    shuffle=True,
    random_state=120,
)

## Logistic Regression Model

### Training the Model, Prediction

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
reg= LogisticRegression()
reg.fit(x_train, y_train)

LogisticRegression()

In [13]:
# Accuracy of the fitted model on the data it was trained on.
reg.score(x_train, y_train)

0.7678571428571429

In [14]:
# Recompute the training accuracy by hand: the fraction of predictions that
# match the true training labels.
model_outputs = reg.predict(x_train)

(model_outputs == y_train).mean()

0.7678571428571429

### Finding the intercept and coefficients

In [15]:
# Show the fitted intercept followed by the coefficient array.
print(reg.intercept_, reg.coef_)

[-1.97766912] [[ 2.92224312  1.66511081  1.66511081  1.2255857   0.11528594 -0.20847351
   0.78472433 -0.07390694 -0.27591319  0.05386436  0.32688999  0.18619675
   0.46380254 -0.35843023]]


In [16]:
# Pair every input column name with its fitted coefficient.
feature_name = unscaled_inputs.columns.values

summary_table = pd.DataFrame({
    'Features': feature_name,
    'Coefficients': reg.coef_[0],  # coef_ holds a single row; take it as a flat vector
})

In [17]:
# Shift every row label up by one so that label 0 is free for the intercept row.
# Note: this cell is not idempotent — each re-run shifts the labels again.
summary_table.index= summary_table.index + 1
# Add the intercept as the row labelled 0 (it lands at the bottom until the index is sorted).
summary_table.loc[0]= ['Intercept', reg.intercept_[0]]

In [18]:
# Put the intercept row (label 0) first, then add exp(coefficient) as the odds ratio.
summary_table = (
    summary_table
    .sort_index()
    .assign(Odds_ratio=lambda table: np.exp(table['Coefficients']))
)

# Display the table ordered from the largest odds ratio to the smallest.
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,Features,Coefficients,Odds_ratio
1,Reason_1,2.922243,18.582924
2,Reason_2,1.665111,5.286259
3,Reason_3,1.665111,5.286259
4,Reason_4,1.225586,3.40616
7,Transportation Expense,0.784724,2.191803
13,Children,0.463803,1.590109
11,Body Mass Index,0.32689,1.386649
12,Education,0.186197,1.204659
5,Month Values,0.115286,1.122194
10,Daily Work Load Average,0.053864,1.055341


### Testing the Model

In [19]:
# Accuracy on the held-out test split, which the model did not see during fitting.
reg.score(x_test, y_test)

0.7142857142857143

In [20]:
# Per-row class probabilities for the test split
# (presumably one column per class, ordered as in reg.classes_ — confirm against the scikit-learn docs).
predicted_proba= reg.predict_proba(x_test)