In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [2]:
# Load the HR dataset from a CSV file into the DataFrame `df`.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv('HR_capstone_dataset.csv')

Variable Description of the dataset:

satisfaction_level-Job satisfaction level reported by the employee [0–1]

last_evaluation-Score of employee's last performance review [0–1]

number_project-Number of projects employee contributes to

average_montly_hours-Average number of hours employee worked per month (the column name is misspelled this way in the dataset)

time_spend_company-How long the employee has been with the company (years)

Work_accident-Whether or not the employee experienced an accident while at work

left-Whether or not the employee left the company

promotion_last_5years-Whether or not the employee was promoted in the last 5 years

Department-The employee's department

salary-The employee's salary band (low, medium, or high)

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
# Column names of the dataset as a plain Python list.
df.columns.tolist()

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'left',
 'promotion_last_5years',
 'Department',
 'salary']

In [5]:
# Count of missing (null/NaN) values in each column.
df.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
Department               0
salary                   0
dtype: int64

# Data Preprocessing

In [6]:
# One-hot encode `Department` into boolean `Department_*` indicator columns.
# drop_first=True omits the first category's column, so that department is
# represented by all remaining indicators being False.
categorical_columns = ['Department']
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

In [7]:
# Display the frame to confirm `Department` was replaced by the `Department_*` indicator columns.
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.38,0.53,2,157,3,0,1,0,low,False,False,False,False,False,False,True,False,False
1,0.80,0.86,5,262,6,0,1,0,medium,False,False,False,False,False,False,True,False,False
2,0.11,0.88,7,272,4,0,1,0,medium,False,False,False,False,False,False,True,False,False
3,0.72,0.87,5,223,5,0,1,0,low,False,False,False,False,False,False,True,False,False
4,0.37,0.52,2,159,3,0,1,0,low,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,low,False,False,False,False,False,False,False,True,False
14995,0.37,0.48,2,160,3,0,1,0,low,False,False,False,False,False,False,False,True,False
14996,0.37,0.53,2,143,3,0,1,0,low,False,False,False,False,False,False,False,True,False
14997,0.11,0.96,6,280,4,0,1,0,low,False,False,False,False,False,False,False,True,False


In [8]:
# Ordinal encoding for the salary column: map the salary bands onto ordered
# integers so that low < medium < high.
labels = {'low':1, 'medium':2, 'high':3}
# Guard against re-running this cell: once the column is numeric, mapping it
# again would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['salary']):
    salary_encoded = df['salary'].map(labels)
    # Fail loudly on any value missing from `labels` instead of leaving NaN behind.
    unmapped = df.loc[salary_encoded.isna(), 'salary'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped salary values: {unmapped.tolist()}")
    df['salary'] = salary_encoded.astype('int64')

In [9]:
# Rows per encoded salary level (1 = low, 2 = medium, 3 = high).
df['salary'].value_counts()

salary
1    7316
2    6446
3    1237
Name: count, dtype: int64

In [10]:
# Display the frame to confirm `salary` now holds the integer codes.
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.38,0.53,2,157,3,0,1,0,1,False,False,False,False,False,False,True,False,False
1,0.80,0.86,5,262,6,0,1,0,2,False,False,False,False,False,False,True,False,False
2,0.11,0.88,7,272,4,0,1,0,2,False,False,False,False,False,False,True,False,False
3,0.72,0.87,5,223,5,0,1,0,1,False,False,False,False,False,False,True,False,False
4,0.37,0.52,2,159,3,0,1,0,1,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,1,False,False,False,False,False,False,False,True,False
14995,0.37,0.48,2,160,3,0,1,0,1,False,False,False,False,False,False,False,True,False
14996,0.37,0.53,2,143,3,0,1,0,1,False,False,False,False,False,False,False,True,False
14997,0.11,0.96,6,280,4,0,1,0,1,False,False,False,False,False,False,False,True,False


In [14]:
# Min-max scale the listed features so that each one spans the range [0, 1].
# NOTE(review): the scaler is fitted on the whole frame; if a train/test split
# follows later, fitting on the training rows only would avoid leakage — confirm.
scaled_columns = [
    'satisfaction_level',
    'last_evaluation',
    'average_montly_hours',
    'time_spend_company',
    'number_project',
]
scaled = MinMaxScaler()
scaled.fit(df[scaled_columns])
df[scaled_columns] = scaled.transform(df[scaled_columns])

In [15]:
# Display the frame to confirm the scaled columns now lie in the 0–1 range.
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.318681,0.265625,0.0,0.285047,0.125,0,1,0,1,False,False,False,False,False,False,True,False,False
1,0.780220,0.781250,0.6,0.775701,0.500,0,1,0,2,False,False,False,False,False,False,True,False,False
2,0.021978,0.812500,1.0,0.822430,0.250,0,1,0,2,False,False,False,False,False,False,True,False,False
3,0.692308,0.796875,0.6,0.593458,0.375,0,1,0,1,False,False,False,False,False,False,True,False,False
4,0.307692,0.250000,0.0,0.294393,0.125,0,1,0,1,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.340659,0.328125,0.0,0.257009,0.125,0,1,0,1,False,False,False,False,False,False,False,True,False
14995,0.307692,0.187500,0.0,0.299065,0.125,0,1,0,1,False,False,False,False,False,False,False,True,False
14996,0.307692,0.265625,0.0,0.219626,0.125,0,1,0,1,False,False,False,False,False,False,False,True,False
14997,0.021978,0.937500,0.8,0.859813,0.250,0,1,0,1,False,False,False,False,False,False,False,True,False


# Exploratory Data Analysis

In [16]:
# Summary statistics for the numeric columns. This runs after the MinMax
# scaling cell, so the scaled features are summarised on the 0–1 scale rather
# than in their original units.
df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.574542,0.556409,0.360611,0.490889,0.187279,0.14461,0.238083,0.021268,1.594706
std,0.27322,0.267452,0.246518,0.233379,0.182517,0.351719,0.425924,0.144281,0.637183
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.384615,0.3125,0.2,0.280374,0.125,0.0,0.0,0.0,1.0
50%,0.604396,0.5625,0.4,0.485981,0.125,0.0,0.0,0.0,2.0
75%,0.802198,0.796875,0.6,0.696262,0.25,0.0,0.0,0.0,2.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0


In [17]:
# Rows per value of `left`, to see how balanced the two classes are.
df['left'].value_counts()

left
0    11428
1     3571
Name: count, dtype: int64