In [None]:
#Importing libraries
import os

import numpy as np
import pandas as pd
from fancyimpute import KNN
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

#Libraries for plots
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Folder that holds the input workbook. Set the ABSENTEEISM_DATA_DIR environment
# variable to run the notebook on another machine; the original author's folder
# is only the fallback so existing runs keep working.
DATA_DIR = os.environ.get("ABSENTEEISM_DATA_DIR",
                          "C:/Users/Rishabh/Desktop/All/edwisor/Project2")

# Setting working directory (kept because later cells read and write relative paths)
os.chdir(DATA_DIR)

# Loading data
emp_absent = pd.read_excel("Absenteeism_at_work.xls")

# Exploratory Data Analysis

In [None]:
# Dimensions of the raw data as (rows, columns)
emp_absent.shape

In [None]:
# First 5 rows of data (quick sanity check that the workbook loaded as expected)
emp_absent.head()

In [None]:
# Data types of all the variables, as inferred by pandas when reading the workbook
emp_absent.dtypes

In [None]:
# Number of unique values present in each variable
# (low counts hint at which columns are categorical rather than continuous)
emp_absent.nunique()

In [None]:
# Transform data types.
# Zeros are recoded before casting: a 'Reason for absence' of 0 becomes 20,
# and a 'Month of absence' of 0 becomes a missing value.
emp_absent['Reason for absence'] = emp_absent['Reason for absence'].replace(0, 20)
emp_absent['Month of absence'] = emp_absent['Month of absence'].replace(0, np.nan)

# Every nominal column is stored with pandas' category dtype.
for col in ('ID', 'Reason for absence', 'Month of absence', 'Day of the week',
            'Seasons', 'Disciplinary failure', 'Education', 'Son',
            'Social drinker', 'Social smoker', 'Pet'):
    emp_absent[col] = emp_absent[col].astype('category')

In [None]:
# Work on a copy so the typed raw frame `emp_absent` stays untouched by the
# imputation and outlier steps that follow
df = emp_absent.copy()

In [None]:
# Column names grouped by measurement type; later cells iterate over these lists.

# Continuous (numeric) variables, including the target 'Absenteeism time in hours'.
# Note: 'Work load Average/day ' carries a trailing space in the source data.
continuous_vars = [
    'Distance from Residence to Work',
    'Service time',
    'Age',
    'Work load Average/day ',
    'Transportation expense',
    'Hit target',
    'Weight',
    'Height',
    'Body mass index',
    'Absenteeism time in hours',
]

# Categorical (nominal) variables
categorical_vars = [
    'ID',
    'Reason for absence',
    'Month of absence',
    'Day of the week',
    'Seasons',
    'Disciplinary failure',
    'Education',
    'Social drinker',
    'Social smoker',
    'Pet',
    'Son',
]

# Missing Value Analysis

In [None]:
# Percentage of missing entries per variable, largest first.
# Result columns: 'Variables' (column name) and 'Missing_perc' (percent of rows).
missing_val = (
    df.isnull()
      .sum()
      .div(len(df))
      .mul(100)
      .rename_axis('Variables')
      .reset_index(name='Missing_perc')
      .sort_values('Missing_perc', ascending=False)
      .reset_index(drop=True)
)

# Save the table in the working directory
missing_val.to_csv("Missing_perc.csv", index=False)

missing_val

# Impute missing values


In [None]:
# Experiment to pick an imputation method: blank out one known value and see
# which method recovers it best. Reference values recorded by the author:
#Actual value = 29
#Mean = 26.68
#Median = 25
#KNN = 29
print(df['Body mass index'].iloc[1])

# Set 'Body mass index' at row position 1 (the second row) to NaN.
# A single .iloc call with (row, column) positions writes into `df` itself;
# the chained form df[col].iloc[1] = ... may assign to a temporary copy and
# silently leave `df` unchanged.
df.iloc[1, df.columns.get_loc('Body mass index')] = np.nan

In [None]:
# KNN imputation (k = 3) over the whole frame.
# Mean and median fills of 'Body mass index' were the alternatives considered;
# KNN is the one applied here.
knn_imputer = KNN(k=3)
imputed_values = knn_imputer.fit_transform(df)

# fit_transform returns a bare array, so the column labels are re-attached
df = pd.DataFrame(imputed_values, columns=df.columns)

# Imputed 'Body mass index' at row position 1
df['Body mass index'].iloc[1]

In [None]:
# Categorical values rounding.
# After imputation the categorical codes are floats; snap them to whole
# numbers and restore the category dtype.
# Plain column assignment (df[col] = ...) replaces the column outright, whereas
# df.loc[:, col] = ... writes into the existing float column and can discard
# the category dtype.
for i in categorical_vars:
    df[i] = df[i].round().astype('category')

In [None]:
# Rechecking missing values: count of NaN cells per column after imputation
df.isnull().sum()

# Distribution of data using graphs

In [None]:
# Plotting bar graphs (row counts per level) of selected categorical variables.
# sns.catplot with height= is the current name of the former
# sns.factorplot / size= API.
sns.set_style("whitegrid")
for col in ['Reason for absence', 'Seasons', 'Education', 'Disciplinary failure']:
    sns.catplot(data=df, x=col, kind='count', height=4, aspect=2)

In [None]:
# Distribution of numeric data: histogram of 'Weight' with automatically chosen bins
ax = plt.gca()
ax.hist(df['Weight'], bins='auto', label='Weight')
ax.set_xlabel('Weight')
ax.set_title("Weight Distribution")

In [None]:
# Distribution of numeric data: histogram of 'Age' with automatically chosen bins
ax = plt.gca()
ax.hist(df['Age'], bins='auto', label='Age')
ax.set_xlabel('Age')
ax.set_title("Age Distribution")

In [None]:
# Box plots to spot outliers (first group of continuous variables)
box_cols = ['Absenteeism time in hours','Body mass index','Height','Weight']
sns.boxplot(data=df.loc[:, box_cols])

# Enlarge the figure seaborn just drew on
fig = plt.gcf()
fig.set_size_inches(8, 8)

In [None]:
# Box plots to spot outliers (second group of continuous variables)
box_cols = ['Hit target','Service time','Age','Transportation expense']
sns.boxplot(data=df.loc[:, box_cols])

# Enlarge the figure seaborn just drew on
fig = plt.gcf()
fig.set_size_inches(8, 8)

# Outlier Analysis

In [None]:
# Blank out every value lying more than 1.5 * IQR outside the quartiles,
# one continuous variable at a time.
for col in continuous_vars:
    upper_q, lower_q = np.percentile(df[col], [75, 25])
    spread = upper_q - lower_q

    # Lower and upper fences
    low_fence = lower_q - (spread * 1.5)
    high_fence = upper_q + (spread * 1.5)

    # Replace all the outliers with NA
    out_of_range = (df[col] < low_fence) | (df[col] > high_fence)
    df.loc[out_of_range, col] = np.nan


# Impute the blanked-out cells with KNN (k = 3)
knn_filled = KNN(k=3).fit_transform(df)
df = pd.DataFrame(knn_filled, columns=df.columns)

# Rechecking missing values
df.isnull().sum()

In [None]:
# Box plots after outlier treatment (first group of continuous variables)
box_cols = ['Absenteeism time in hours','Body mass index','Height','Weight']
sns.boxplot(data=df.loc[:, box_cols])

# Enlarge the figure seaborn just drew on
fig = plt.gcf()
fig.set_size_inches(8, 8)

In [None]:
# Box plots after outlier treatment (second group of continuous variables)
box_cols = ['Hit target','Service time','Age','Transportation expense']
sns.boxplot(data=df.loc[:, box_cols])

# Enlarge the figure seaborn just drew on
fig = plt.gcf()
fig.set_size_inches(8, 8)

# Feature Selection

In [None]:
# Keep only the continuous variables for the correlation analysis
df_corr = df[continuous_vars]

In [None]:
# Multicollinearity test: annotated heatmap of pairwise correlations
# between the continuous variables.
f, ax = plt.subplots(figsize=(10, 10))

# Generating correlation matrix
corr = df_corr.corr()

# Plot matrix. The mask is all False, so every cell is drawn.
# The builtin `bool` is used because the `np.bool` alias no longer exists
# in current NumPy releases.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
            cmap=sns.diverging_palette(220, 50, as_cmap=True),
            square=True, ax=ax, annot=True)
plt.show()

In [None]:
# Variable reduction: remove 'Weight' from the modelling frame
# (presumably because the heatmap shows it strongly correlated with another
# variable -- confirm against the plot).
# errors='ignore' lets the cell be re-run after the column is already gone.
to_drop = ['Weight']
df = df.drop(columns=to_drop, errors='ignore')

In [None]:
# Keep the continuous-variable list in step with the reduced frame.
# Filtering (rather than list.remove) makes the cell safe to re-run:
# it is a no-op when 'Weight' has already been taken out.
continuous_vars = [name for name in continuous_vars if name != 'Weight']

In [None]:
# Snapshot of the cleaned data, taken before scaling and dummy encoding
clean_data = df.copy()

In [None]:
# Display the continuous variables that remain after variable reduction
continuous_vars

# Feature Scaling

In [None]:
# Normality check: one distribution plot per continuous predictor.
# The target 'Absenteeism time in hours' is left out.
predictors = [name for name in continuous_vars if name != 'Absenteeism time in hours']
for name in predictors:
    sns.distplot(df[name], bins='auto')
    plt.title("Checking Distribution for Variable "+str(name))
    plt.ylabel("Density")
    plt.show()

In [None]:
# Min-max normalisation of the continuous predictors onto [0, 1].
# The target 'Absenteeism time in hours' keeps its original scale.
for name in (v for v in continuous_vars if v != 'Absenteeism time in hours'):
    lowest = df[name].min()
    highest = df[name].max()
    df[name] = (df[name] - lowest) / (highest - lowest)

# Machine Learning Models

In [None]:
# Creating dummy (one-hot) variables for the categorical variables;
# the listed columns are replaced by one indicator column per level
df = pd.get_dummies(data = df, columns = categorical_vars)
# Copy of the encoded frame (not referenced again in the cells shown here)
df1 = df.copy()

In [None]:
# Dimensions of the frame after dummy encoding
df.shape

In [None]:
# First row of the encoded frame, to inspect the generated column names
df.head(1)

In [None]:
# Splitting data into train (80%) and test (20%).
# The target is selected by name instead of by column position, so the split
# stays correct if the column order changes.
target_col = 'Absenteeism time in hours'
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[target_col]),
    df[target_col],
    test_size=0.20,
    random_state=1,
)

# Dimension Reduction using PCA

In [None]:
# Getting target variable (the column the models are trained to predict)
target = df['Absenteeism time in hours']

In [None]:
# Dimensions of the encoded frame going into PCA
df.shape

In [None]:
# Scree plot: cumulative share of variance explained by the principal components.

# Feature matrix only. The target column is excluded so it cannot leak into
# the components that the models are later trained on.
X = df.drop(columns=['Absenteeism time in hours']).values

# Keep every component (n_components left at its default) instead of a
# hard-coded count that must match the number of columns.
pca = PCA()
pca.fit(X)

# Variance proportion per component
var = pca.explained_variance_ratio_

# Cumulative explained variance in percent
var1 = np.cumsum(np.round(var, decimals=4)*100)

# Plotting
plt.plot(var1)
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance (%)")
plt.show()

In [None]:
# Reduce the predictors to 45 principal components and build the train/test sets.
# NOTE(review): 45 was chosen by the author as covering 95%+ of the variance;
# re-check that figure against the scree plot.

# Predictors only: leaving the target in the feature matrix would hand the
# answer to every model.
features = df.drop(columns=[target.name]).values

# Splitting data into train and test
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=1)

# Fit the projection on the training rows only, then apply it to both sets,
# so the models are actually trained on the reduced representation.
pca = PCA(n_components=45)
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

# Decision Tree
#RMSE: 0.0353
#R-squared: 0.9998

In [None]:
# Training Decision tree model
dt_model = DecisionTreeRegressor(random_state = 1).fit(X_train,y_train)

# Predicting for test
dt_predictions = dt_model.predict(X_test)

# Creating data frame for actual and predicted values
df_dt = pd.DataFrame({'actual': y_test, 'pred': dt_predictions})
print(df_dt.head())

# Calculating RMSE and R-squared value.
# RMSE is the square root of the mean squared error; it is computed here
# because no RMSE helper is defined in this notebook.
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_predictions))
print("Root Mean Squared Error: "+str(dt_rmse))
print("R^2 Score(coefficient of determination) = "+str(r2_score(y_test, dt_predictions)))

# Random Forest
#RMSE: 0.04453
#R-squared: 0.9998

In [None]:
# Training Random forest model (500 trees, fixed seed for reproducibility)
rf_model = RandomForestRegressor(n_estimators = 500, random_state = 1).fit(X_train,y_train)

# Predicting for test
rf_predictions = rf_model.predict(X_test)

# Creating data frame for actual and predicted values
df_rf = pd.DataFrame({'actual': y_test, 'pred': rf_predictions})
print(df_rf.head())

# Calculating RMSE and R-squared value.
# RMSE is the square root of the mean squared error; it is computed here
# because no RMSE helper is defined in this notebook.
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))
print("Root Mean Squared Error: "+str(rf_rmse))
print("R^2 Score(coefficient of determination) = "+str(r2_score(y_test, rf_predictions)))

# Linear Regression
#RMSE: 0.0013
#R-squared: 0.9999

In [None]:
# Training the model (ordinary least squares)
lr_model = LinearRegression().fit(X_train , y_train)

# Predicting for test
lr_predictions = lr_model.predict(X_test)

# Creating data frame for actual and predicted values
df_lr = pd.DataFrame({'actual': y_test, 'pred': lr_predictions})
print(df_lr.head())

# Calculating RMSE and R-squared value.
# RMSE is the square root of the mean squared error; it is computed here
# because no RMSE helper is defined in this notebook.
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_predictions))
print("Root Mean Squared Error: "+str(lr_rmse))
print("R^2 Score(coefficient of determination) = "+str(r2_score(y_test, lr_predictions)))