In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Problem Statement

**Employee Attrition is a huge problem across industries and generally costs the company a lot for
hiring, retraining, productivity and work loss for each employee who leaves.**

**To understand weather an employee quits at the earliest, we need to build model to predict whether an employee would quit in the near future.**

# Reading the data sets & knowing about each attribute.


In [0]:
import pandas as pd
import numpy as np

In [0]:
#Reading train data 
train_attrition = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Data/Traindata/train_attrition.csv")
train_work = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Data/Traindata/train_work.csv")


In [0]:
#Reading the test data 
test_attrition = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Data/Testdata/test_attrition.csv")
test_work = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Data/Testdata/test_work.csv")

In [0]:
#Reading the employee_data
emp_data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Data/employee_data/employee_data.csv")

In [0]:
#read the top 5 rows of the train_attrition data
train_attrition.head() 

Unnamed: 0,EmployeeID,Left_Company
0,emp-1,0.0
1,emp-10,1.0
2,emp-100,0.0
3,emp-101,0.0
4,emp-102,0.0


## Shape of the Raw training data.

### No of Rows & Column in the train attrition & train work data.

In [0]:
#dimensions of the train_attrition data.

print("The train_attrition has {} rows & {} columns." .format(train_attrition.shape[0], train_attrition.shape[1]))
print("The train_work has {} rows & {} columns." .format(train_work.shape[0], train_work.shape[1]))


The train_attrition has 245 rows & 2 columns.
The train_work has 3983 rows & 7 columns.


In [0]:
train_work.head() #Reading the top 5 rows of train_work file

Unnamed: 0,EmployeeID,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent
0,emp-1,184.0,144.0,40.0,144.0,3,77
1,emp-1,176.0,136.0,40.0,136.0,2,98
2,emp-1,200.0,120.0,72.0,120.0,1,14
3,emp-1,160.0,144.0,8.0,144.0,3,147
4,emp-1,171.0,135.0,0.0,135.0,1,42


In [0]:
#let's understand what is the employee_data
emp_data.head()

Unnamed: 0,EmployeeID,Job_History,Joining_Date,Designation,Sex
0,emp-10,"Worked at Company - 816 , Company - 724 , Comp...",2006_02,VP,0
1,emp-108,"Worked at Company - 205 , Company - 373 , Comp...",2012_01,VP,1
2,emp-115,"Worked at Company - 690 , Company - 280 , Comp...",2012_02,Senior,1
3,emp-121,"Worked at Company - 684 , Company - 159 , Comp...",2012_03,VP,1
4,emp-138,"Worked at Company - 443 , Company - 121 , Comp...",2012_05,Senior,1


## Employee data of both Train & Test data.


### Shape of the Employee Data.

In [0]:
print("The employee data has {} rows & {} columns." .format(emp_data.shape[0], emp_data.shape[1]))

The employee data has 296 rows & 5 columns.


In [0]:
#Let's see the top rows of the test data 

test_attrition.head()

Unnamed: 0,EmployeeID,Left_Company
0,emp-106,
1,emp-112,
2,emp-114,
3,emp-118,
4,emp-12,


### No of Rows & Column in th test attrition & test work data.

In [0]:
print("The Test Attrition data has {} rows & {} columns." .format(test_attrition.shape[0], test_attrition.shape[1]))
print("The Test work has {} rows & {} columns." .format(test_work.shape[0], test_work.shape[1]))


The Test Attrition data has 51 rows & 2 columns.
The Test work has 874 rows & 7 columns.


**We need to predict weather these will leave the company**

In [0]:
test_work.head()

Unnamed: 0,EmployeeID,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent
0,emp-2,170.0,124.0,8.0,0.0,4,77
1,emp-2,164.0,114.0,24.0,0.0,4,105
2,emp-2,201.0,139.0,48.0,0.0,4,112
3,emp-2,153.0,114.0,24.0,0.0,2,91
4,emp-2,122.0,106.0,0.0,0.0,2,98


# Description of the features

● Left_Company (Target) : Whether the employee left the  company or not (1 - Yes, 0 - No).

● EmployeeID : A unique identification key for every individual employee.

● TotalWorkingHours : The total working hours logged for the employee at the location.

● Billable_Hours : The number of hours that are used to charge the Client.

● Hours_off_Duty : Number of hours the employee took off.

● Touring_Hours : Number of hours the employee spent working at an offsite location.

● NoOfProjects : Number of Projects the employee is assigned to.

● ActualTimeSpent : Actual time the employee spent working according to the timesheets.

● Job_History : A feature containing the previous companies where the employee was employed.

● Joining_Date : The date on which the employee Joined the organisation.

● Designation : The role of the employee in the company, with the following levels: EVP, Junior, MD, Senior, VP.

● Sex : The gender of the employee.

## Merging the datasets based on the employee data.

In [0]:
train_attrition.head()

Unnamed: 0,EmployeeID,Left_Company
0,emp-1,0.0
1,emp-10,1.0
2,emp-100,0.0
3,emp-101,0.0
4,emp-102,0.0


In [0]:
emp_data.head()

Unnamed: 0,EmployeeID,Job_History,Joining_Date,Designation,Sex
0,emp-10,"Worked at Company - 816 , Company - 724 , Comp...",2006_02,VP,0
1,emp-108,"Worked at Company - 205 , Company - 373 , Comp...",2012_01,VP,1
2,emp-115,"Worked at Company - 690 , Company - 280 , Comp...",2012_02,Senior,1
3,emp-121,"Worked at Company - 684 , Company - 159 , Comp...",2012_03,VP,1
4,emp-138,"Worked at Company - 443 , Company - 121 , Comp...",2012_05,Senior,1


## Merging the train data into a single datafrfame

In [0]:
train_emp_data = pd.merge(train_attrition, emp_data, how='left', on='EmployeeID')

In [0]:
train_emp_data.shape

(245, 6)

In [0]:
train_emp_data.head()

Unnamed: 0,EmployeeID,Left_Company,Job_History,Joining_Date,Designation,Sex
0,emp-1,0.0,"Worked at Company - 639 , Company - 212 , Comp...",2011_01,MD,1
1,emp-10,1.0,"Worked at Company - 816 , Company - 724 , Comp...",2006_02,VP,0
2,emp-100,0.0,"Worked at Company - 562 , Company - 319",2011_12,Senior,1
3,emp-101,0.0,"Worked at Company - 212 , Company - 668 , Comp...",2012_01,Senior,1
4,emp-102,0.0,Worked at Company - 234,2011_12,Senior,1


In [0]:
train_final = pd.merge(train_emp_data, train_work, on='EmployeeID', how='left')

In [0]:
train_final.shape

(3983, 12)

In [0]:
train_final.head()

Unnamed: 0,EmployeeID,Left_Company,Job_History,Joining_Date,Designation,Sex,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent
0,emp-1,0.0,"Worked at Company - 639 , Company - 212 , Comp...",2011_01,MD,1,184.0,144.0,40.0,144.0,3,77
1,emp-1,0.0,"Worked at Company - 639 , Company - 212 , Comp...",2011_01,MD,1,176.0,136.0,40.0,136.0,2,98
2,emp-1,0.0,"Worked at Company - 639 , Company - 212 , Comp...",2011_01,MD,1,200.0,120.0,72.0,120.0,1,14
3,emp-1,0.0,"Worked at Company - 639 , Company - 212 , Comp...",2011_01,MD,1,160.0,144.0,8.0,144.0,3,147
4,emp-1,0.0,"Worked at Company - 639 , Company - 212 , Comp...",2011_01,MD,1,171.0,135.0,0.0,135.0,1,42


In [0]:
#considering only the columns in the train_attrition using groupby  & agg function.
train = train_final.groupby('EmployeeID').agg('max')

In [0]:
train.head()

Unnamed: 0_level_0,Left_Company,Job_History,Joining_Date,Designation,Sex,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
emp-1,0.0,"Worked at Company - 639 , Company - 212 , Comp...",2011_01,MD,1,205.0,189.0,72.0,189.0,3,231
emp-10,1.0,"Worked at Company - 816 , Company - 724 , Comp...",2006_02,VP,0,241.0,233.0,120.0,186.0,2,147
emp-100,0.0,"Worked at Company - 562 , Company - 319",2011_12,Senior,1,278.0,264.0,40.0,264.0,3,308
emp-101,0.0,"Worked at Company - 212 , Company - 668 , Comp...",2012_01,Senior,1,345.5,233.5,64.0,0.0,2,217
emp-102,0.0,Worked at Company - 234,2011_12,Senior,1,270.0,249.0,88.0,213.0,2,231


In [0]:
train.shape

(245, 11)

## Merging the test data into Single data frame. 

In [0]:
#merging the test data 

test_emp_data = pd.merge(test_attrition, emp_data, how='left', on='EmployeeID')


In [0]:
test_emp_data.head()

Unnamed: 0,EmployeeID,Left_Company,Job_History,Joining_Date,Designation,Sex
0,emp-106,,"Worked at Company - 510 , Company - 658 , Comp...",2012_01,Senior,0
1,emp-112,,"Worked at Company - 42 , Company - 432 , Compa...",2012_02,VP,1
2,emp-114,,"Worked at Company - 99 , Company - 88 , Compan...",2012_02,EVP,1
3,emp-118,,"Worked at Company - 554 , Company - 304",2012_04,EVP,1
4,emp-12,,"Worked at Company - 548 , Company - 405",2006_09,VP,1


In [0]:
test_final = pd.merge(test_emp_data,test_work, how='left', on='EmployeeID')

In [0]:
test_final.head()

Unnamed: 0,EmployeeID,Left_Company,Job_History,Joining_Date,Designation,Sex,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent
0,emp-106,,"Worked at Company - 510 , Company - 658 , Comp...",2012_01,Senior,0,96.0,16.0,40.0,2.0,2,42
1,emp-106,,"Worked at Company - 510 , Company - 658 , Comp...",2012_01,Senior,0,80.0,0.0,8.0,0.0,0,0
2,emp-106,,"Worked at Company - 510 , Company - 658 , Comp...",2012_01,Senior,0,200.0,16.0,0.0,0.0,1,0
3,emp-106,,"Worked at Company - 510 , Company - 658 , Comp...",2012_01,Senior,0,165.0,114.0,40.0,0.0,1,28
4,emp-106,,"Worked at Company - 510 , Company - 658 , Comp...",2012_01,Senior,0,251.0,219.0,8.0,0.0,1,63


In [0]:
#using Groupby & agg functin merging the column based on the sample submission data.
test = test_final.groupby(by='EmployeeID').agg('max')

In [0]:
test.shape

(51, 11)

In [0]:
#Writing the files into csv for furter use 
train.to_csv("train.csv", index=True)
test.to_csv("test.csv", index=True)

# Feature Engineering

In [0]:
train_final = pd.read_csv("/content/train.csv")
test_final = pd.read_csv("/content/test.csv")

In [0]:
#Get the list of column in the training data
print((train_final.columns))

Index(['EmployeeID', 'Left_Company', 'Job_History', 'Joining_Date',
       'Designation', 'Sex', 'TotalWorkingHours', 'Billable_Hours',
       'Hours_off_Duty', 'Touring_Hours', 'NoOfProjects', 'ActualTimeSpent'],
      dtype='object')


In [0]:
print(test_final.columns)

Index(['EmployeeID', 'Left_Company', 'Job_History', 'Joining_Date',
       'Designation', 'Sex', 'TotalWorkingHours', 'Billable_Hours',
       'Hours_off_Duty', 'Touring_Hours', 'NoOfProjects', 'ActualTimeSpent'],
      dtype='object')


In [0]:
#Dropping the unwanted columns in the data
train_final.drop(columns=['EmployeeID'], axis=1,inplace=True)
test_final.drop(columns=['EmployeeID','Left_Company'], axis=1, inplace=True)

In [0]:
print("Train data shape : {} " .format(train_final.shape))
print("Test data shape: {} ".format(test_final.shape))

Train data shape : (245, 11) 
Test data shape: (51, 10) 


In [0]:
train_final.head()

Unnamed: 0,Left_Company,Job_History,Joining_Date,Designation,Sex,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent
0,0.0,"Worked at Company - 639 , Company - 212 , Comp...",2011_01,MD,1,205.0,189.0,72.0,189.0,3,231
1,1.0,"Worked at Company - 816 , Company - 724 , Comp...",2006_02,VP,0,241.0,233.0,120.0,186.0,2,147
2,0.0,"Worked at Company - 562 , Company - 319",2011_12,Senior,1,278.0,264.0,40.0,264.0,3,308
3,0.0,"Worked at Company - 212 , Company - 668 , Comp...",2012_01,Senior,1,345.5,233.5,64.0,0.0,2,217
4,0.0,Worked at Company - 234,2011_12,Senior,1,270.0,249.0,88.0,213.0,2,231


In [0]:
# Splitting the Joining_date column into Joining year & Joining month.

#convert on training data. 
train_final[['joining_year','joining_month']] = train_final['Joining_Date'].str.split("_", expand=True)

#convert on test data
test_final[['joining_year','joining_month']] = test_final['Joining_Date'].str.split("_", expand=True)

In [0]:
#check if it has been converted
train_final.head()

Unnamed: 0,Left_Company,Job_History,Joining_Date,Designation,Sex,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent,joining_year,joining_month
0,0.0,"Worked at Company - 639 , Company - 212 , Comp...",2011_01,MD,1,205.0,189.0,72.0,189.0,3,231,2011,1
1,1.0,"Worked at Company - 816 , Company - 724 , Comp...",2006_02,VP,0,241.0,233.0,120.0,186.0,2,147,2006,2
2,0.0,"Worked at Company - 562 , Company - 319",2011_12,Senior,1,278.0,264.0,40.0,264.0,3,308,2011,12
3,0.0,"Worked at Company - 212 , Company - 668 , Comp...",2012_01,Senior,1,345.5,233.5,64.0,0.0,2,217,2012,1
4,0.0,Worked at Company - 234,2011_12,Senior,1,270.0,249.0,88.0,213.0,2,231,2011,12


In [0]:
train_final['job_count'] = train_final['Job_History'].str.split(',').str.len()

test_final['job_count'] = test_final['Job_History'].str.split(',').str.len()

In [0]:
#drop the Joining date column in train & test data

train_final.drop(columns=['Joining_Date'], axis=1, inplace=True)


test_final.drop(columns=['Joining_Date'], axis=1, inplace=True)

In [0]:
train_final.head()

Unnamed: 0,Left_Company,Job_History,Designation,Sex,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent,joining_year,joining_month,job_count
0,0.0,"Worked at Company - 639 , Company - 212 , Comp...",MD,1,205.0,189.0,72.0,189.0,3,231,2011,1,3
1,1.0,"Worked at Company - 816 , Company - 724 , Comp...",VP,0,241.0,233.0,120.0,186.0,2,147,2006,2,4
2,0.0,"Worked at Company - 562 , Company - 319",Senior,1,278.0,264.0,40.0,264.0,3,308,2011,12,2
3,0.0,"Worked at Company - 212 , Company - 668 , Comp...",Senior,1,345.5,233.5,64.0,0.0,2,217,2012,1,3
4,0.0,Worked at Company - 234,Senior,1,270.0,249.0,88.0,213.0,2,231,2011,12,1


In [0]:
train_final.dtypes #check the data type of train data 

Left_Company         float64
Job_History           object
Designation           object
Sex                    int64
TotalWorkingHours    float64
Billable_Hours       float64
Hours_off_Duty       float64
Touring_Hours        float64
NoOfProjects           int64
ActualTimeSpent        int64
joining_year          object
joining_month         object
job_count              int64
dtype: object

In [0]:
test_final.dtypes #check the data types of the test data.

Job_History           object
Designation           object
Sex                    int64
TotalWorkingHours    float64
Billable_Hours       float64
Hours_off_Duty       float64
Touring_Hours        float64
NoOfProjects           int64
ActualTimeSpent        int64
joining_year          object
joining_month         object
job_count              int64
dtype: object

In [0]:
#creating a copy for using with label encoding

train_copy = train_final.copy()
test_copy = test_final.copy()

In [0]:
#convert floats into int

for col in ['TotalWorkingHours','Billable_Hours','Hours_off_Duty','Touring_Hours']:
  train_final[col] = train_final[col].astype('float')

In [0]:
#Do the same on the test data 
for col in ['TotalWorkingHours','Billable_Hours','Hours_off_Duty','Touring_Hours']:
  test_final[col] = test_final[col].astype('float')

In [0]:
#creating the list of numeric attributes

### Converting objects to categories 

In [0]:
#on training data

for col in ['Sex','Designation','NoOfProjects','joining_year','joining_month','Job_History']:
  train_final[col] = train_final[col].astype('category')
#on testing data 

for col in ['Sex','Designation','NoOfProjects','joining_year','joining_month','Job_History']:
  test_final[col] = test_final[col].astype('category')

train_final['Left_Company'] = train_final['Left_Company'].astype('category')

In [0]:
#creating a list of categorical & numerical attributes

cat_attr = list(train_final.select_dtypes("category").columns) #exclude target column inthe list
num_attr = list(test_final.columns.difference(cat_attr))

# cat_attr.pop()

In [0]:
cat_attr.remove('Left_Company')

In [0]:
cat_attr

['Job_History',
 'Designation',
 'Sex',
 'NoOfProjects',
 'joining_year',
 'joining_month']

In [0]:
num_attr

['ActualTimeSpent',
 'Billable_Hours',
 'Hours_off_Duty',
 'TotalWorkingHours',
 'Touring_Hours',
 'job_count']

### Columns with missing values

In [0]:
# Missing values in train data  & test data
missing_cols_train = train_final.columns[train_final.isnull().any()]
print(missing_cols_train)

missing_cols_test = test_final.columns[test_final.isnull().any()]
print(missing_cols_test)

Index([], dtype='object')
Index([], dtype='object')


### Importing the Required packages

In [0]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


import warnings
warnings.filterwarnings('ignore')



## Dummification

In [0]:
y= train_final['Left_Company']

del train_final['Left_Company']
X = train_final

In [0]:

#One hot encoding on the train data

dummy_train = pd.get_dummies(data= X, columns= cat_attr, drop_first=True)

In [0]:
dummy_train.shape

(245, 283)

In [0]:
#One hot encoding on the test data

test_final = pd.get_dummies(data= test_final, columns= cat_attr, drop_first=True)

In [0]:
print(test_final.shape)

(51, 86)


In [0]:
# Align the columns in the test data with train data 

dummy_train, test_final = dummy_train.align(test_final, join='left', axis=1)

In [0]:
#check the missing values in the test
print(test_final.isnull().sum().sum())

test_final.fillna(value=0, inplace=True)

12597


In [0]:
from sklearn.preprocessing import StandardScaler


In [0]:
#standardize the train dat a& test data

std = StandardScaler() #Instantiating an object. 
std.fit(dummy_train) #Fittin gon th train data

std_x = std.transform(dummy_train) #transform on th etrain data
std_test = std.transform(test_final) #transform on the test data

In [0]:
print(std_x.shape)
print(std_test.shape)

(245, 283)
(51, 283)


# Models On complete std data without splitting : 
 

## Model-1 -Logistic Regression : 


In [0]:
log_reg = LogisticRegression()

log_reg.fit(std_x, y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
log_pred = log_reg.predict(std_x)

print(f1_score(y, log_pred))

1.0


##  Model- 2 - Decision tree :

In [0]:
%%time
clf_dt =  DecisionTreeClassifier()

dt_param_grid = {'criterion': ['entropy', 'gini'], 
                 'max_depth': [3,4,5],
                 "min_samples_split": [2,5],
                 "min_samples_leaf": [1,3,5]}

dt_grid = GridSearchCV(clf_dt, param_grid=dt_param_grid, n_jobs=-1, cv=10,return_train_score=True)

dt_grid.fit(std_x,y)

print(dt_grid.best_params_)

dt_pred = dt_grid.predict(std_x)

print(f1_score(y, dt_pred))

{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 5}
0.851063829787234
CPU times: user 373 ms, sys: 44.7 ms, total: 418 ms
Wall time: 3.24 s


## Model-3 --Build Gradient Boosting 

In [0]:

%%time
clf_gbc = GradientBoostingClassifier()

gbm_param_grid = {'max_depth': [2,3,4],
                  'subsample': [0.8,0.6],
                  'max_features':[0.3], 
                  'n_estimators': [10, 20, 30],
                  'learning_rate':[0.1]}

gbm_grid = GridSearchCV(clf_gbc, param_grid=gbm_param_grid, n_jobs =-1, cv=5)

gbm_grid.fit(std_x,y)

print("The Best parameters are: {} .".format(gbm_grid.best_params_))

train_pred_gb = gbm_grid.predict(std_x)

print(f1_score(y, train_pred_gb))



The Best parameters are: {'learning_rate': 0.1, 'max_depth': 2, 'max_features': 0.3, 'n_estimators': 20, 'subsample': 0.8} .
0.851063829787234
CPU times: user 155 ms, sys: 9.21 ms, total: 164 ms
Wall time: 1.57 s


# Building model after outholding the dataset.

In [0]:
x_train, x_test, y_train, y_test = train_test_split(std_x, y, test_size=0.3, random_state=11, stratify=y)


In [0]:
sm = SMOTE(random_state=2)

x_train_re, y_train_re = sm.fit_sample(x_train, y_train.ravel())
x_test_re, y_test_re = sm.fit_sample(x_test,y_test.ravel())


In [0]:
print(x_train_re.shape)
print(x_test_re.shape)
print(y_train_re.shape)
print(y_test_re.shape)


(304, 283)
(132, 283)
(304,)
(132,)


## Model-1 Gradient Boosting : 

In [0]:

%%time
clf_gbc = GradientBoostingClassifier()

gbm_param_grid = {'max_depth': [2,3,4],
                  'subsample': [0.8,0.6],
                  'max_features':[0.3, 0.4], 
                  'n_estimators': [1,2,3,4,5,6,7,8,9,10],
                  'learning_rate':[0.1, 0.01, 0.001]} #making learning rate lesser gave the best results

gbm_grid_1 = GridSearchCV(clf_gbc, param_grid=gbm_param_grid, n_jobs =-1, cv=5)

gbm_grid_1.fit(x_train_re,y_train_re)

print("The Best parameters are: {} .".format(gbm_grid_1.best_params_))

train_pred_gb_2 = gbm_grid_1.predict(x_train_re)
test_pred_gb_2 = gbm_grid_1.predict(x_test_re)
print(f1_score(y_train_re, train_pred_gb_2)*100)
print(f1_score(y_test_re, test_pred_gb_2)*100)



The Best parameters are: {'learning_rate': 0.1, 'max_depth': 4, 'max_features': 0.3, 'n_estimators': 6, 'subsample': 0.8} .
99.33774834437085
96.06299212598425
CPU times: user 1.78 s, sys: 43.3 ms, total: 1.82 s
Wall time: 12.5 s


## Model-2  Logit Model :  


In [0]:
log_reg_2 = LogisticRegression()

log_reg_2.fit(x_train_re,y_train_re)

log_reg_pred_train = log_reg_2.predict(x_train_re)
log_reg_pred_test = log_reg_2.predict(x_test_re)

print(f1_score(y_train_re, log_reg_pred_train))
print(f1_score(y_test_re, log_reg_pred_test))

#prediction on test data 



1.0
0.8148148148148148


## Model-3 xgBoost : 

In [0]:

%%time
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
xgb = xgb.XGBClassifier()


kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=143)

param_grid = {"n_estimators" : [10,15],
              "max_depth" : [3,5,6,7],
              "colsample_bytree":[0.7,.8],
              "learning_rate": [0.001,0.01,0.1],
              "subsample":[0.8,0.6]}

xg_grid = GridSearchCV(xgb, param_grid=param_grid, n_jobs=-1, cv=kfold)


xg_grid.fit(x_train_re,y_train_re)

print(xg_grid.best_params_)

train_pred = xg_grid.predict(x_train_re)
test_pred = xg_grid.predict(x_test_re)

print("Train Score: {}." .format(f1_score(y_train_re, train_pred)))
print("Validation Score: {}." .format(f1_score(y_test_re, test_pred)))


{'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.6}
Train Score: 0.9764309764309764.
Validation Score: 0.9606299212598425.
CPU times: user 940 ms, sys: 228 ms, total: 1.17 s
Wall time: 13.8 s


## Model- 4 RF

In [0]:
%%time
clf_rf = RandomForestClassifier()

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=143)

param_grid = {"n_estimators" : [3,4,5,10,12,13,14],
              "max_depth" : [2,3,5,6],
              "max_features" : [3,4, 5, 7],
              "min_samples_leaf" : [4, 6, 8, 10]}

rf_grid = GridSearchCV(clf_rf, param_grid=dt_param_grid, cv=kfold)


rf_grid.fit(x_train_re,y_train_re)

print(rf_grid.best_params_)

train_pred_rf = rf_grid.predict(x_train_re)
test_pred_rf = rf_grid.predict(x_test_re)

print("Train Score: {} ." .format(f1_score(y_train_re, train_pred_rf)))
print("Validation Score: {} ." .format(f1_score(y_test_re, test_pred_rf)))



{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5}
Train Score: 0.9706840390879479 .
Validation Score: 0.9076923076923077 .
CPU times: user 1.98 s, sys: 14.1 ms, total: 1.99 s
Wall time: 2 s


In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
knn = KNeighborsClassifier()

param_grid_knn = {'n_neighbors':[4,5,6]}

knn_grid = GridSearchCV(knn, param_grid=param_grid_knn, cv=5)


knn_grid.fit(x_train_re,y_train_re)

print(knn_grid.best_params_)

train_pred_rf = knn_grid.predict(x_train_re)
test_pred_rf = knn_grid.predict(x_test_re)

print("Train Score: {} ." .format(f1_score(y_train_re, train_pred_rf)))
print("Validation Score: {} ." .format(f1_score(y_test_re, test_pred_rf)))



{'n_neighbors': 4}
Train Score: 0.9967213114754099 .
Validation Score: 0.6666666666666666 .


# Models using Label Encoding.

In [0]:
#convert floats into int

for col in ['TotalWorkingHours','Billable_Hours','Hours_off_Duty','Touring_Hours']:
  train_copy[col] = train_copy[col].astype('int')

In [0]:
#Do the same on the test data 
for col in ['TotalWorkingHours','Billable_Hours','Hours_off_Duty','Touring_Hours']:
  test_copy[col] = test_copy[col].astype('int')

In [0]:
#creating the list of numeric attributes

### Converting objects to categories 

In [0]:
#on training data

for col in ['Sex','Designation','NoOfProjects','joining_year','joining_month']:
  train_copy[col] = train_copy[col].astype('object')
#on testing data 

for col in ['Sex','Designation','NoOfProjects','joining_year','joining_month']:
  test_copy[col] = test_copy[col].astype('object')

train_copy['Left_Company'] = train_copy['Left_Company'].astype('category')

In [0]:
del train_copy['Left_Company']

### Encoding the data using Label encoder

In [0]:
#pass the label encoded data to the random forest Model

for x in train_copy.columns:
  if train_copy[x].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_copy[x].values))
        train_copy[x] = lbl.transform(list(train_copy[x].values))
        

In [0]:
train_copy.head()

Unnamed: 0,Job_History,Designation,Sex,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent,joining_year,joining_month,job_count
0,156,2,1,205,189,72,189,3,231,8,0,3
1,217,4,0,241,233,120,186,2,147,3,1,4
2,129,3,1,278,264,40,264,3,308,8,11,2
3,28,3,1,345,233,64,0,2,217,9,0,3
4,34,3,1,270,249,88,213,2,231,8,11,1


In [0]:
x_train_l, x_test_l, y_train_l, y_test_l = train_test_split(train_copy, y, test_size= 0.3, random_state=12, stratify= y)

In [0]:
test_copy.head()

Unnamed: 0,Job_History,Designation,Sex,TotalWorkingHours,Billable_Hours,Hours_off_Duty,Touring_Hours,NoOfProjects,ActualTimeSpent,joining_year,joining_month,job_count
0,"Worked at Company - 510 , Company - 658 , Comp...",Senior,0,278,263,90,2,2,203,2012,1,4
1,"Worked at Company - 42 , Company - 432 , Compa...",VP,1,273,256,72,176,3,182,2012,2,3
2,"Worked at Company - 99 , Company - 88 , Compan...",EVP,1,275,179,48,176,3,154,2012,2,3
3,"Worked at Company - 554 , Company - 304",EVP,1,238,215,40,160,3,238,2012,4,2
4,"Worked at Company - 548 , Company - 405",VP,1,293,228,88,228,4,315,2006,9,2


## Model 1 Random forest :

In [0]:
%%time
clf_rf = RandomForestClassifier()

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=143)

param_grid = {"n_estimators" : [3,4,5,10,12,13,14],
              "max_depth" : [2,3,5,6],
              "max_features" : [3, 5, 7],
              "min_samples_leaf" : [4, 6, 8, 10]}

rf_grid = GridSearchCV(clf_rf, param_grid=dt_param_grid, cv=kfold)


rf_grid.fit(x_train_l,y_train_l)

print(rf_grid.best_params_)

train_pred_rf = rf_grid.predict(x_train_l)
test_pred_rf = rf_grid.predict(x_test_l)

print("Train Score: {} ." .format(f1_score(y_train_l, train_pred_rf)))
print("Validation Score: {} ." .format(f1_score(y_test_l, test_pred_rf)))



{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 5}
Train Score: 0.9142857142857143 .
Validation Score: 0.6666666666666666 .
CPU times: user 2.49 s, sys: 3.06 ms, total: 2.5 s
Wall time: 2.5 s


## Model 2 xgboost : 

In [0]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [0]:

%%time
xgb = xgb.XGBClassifier()


kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=143)

param_grid = {"n_estimators" : [3,4,5,6,7,8,10],
              "max_depth" : [3,5,6,7],
              "colsample_bytree":[0.4,0.5],
              "learning_rate": [0.001,0.01,0.1],
              "subsample":[0.8,0.6],
              'objective':['binary:logistic']}

xg_grid = GridSearchCV(xgb, param_grid=param_grid, n_jobs=-1, cv=kfold)


xg_grid.fit(x_train_l,y_train_l)

print(xg_grid.best_params_)

train_pred_xgb = xg_grid.predict(x_train_l)
test_pred_xgb = xg_grid.predict(x_test_l)

print("Train Score: {}." .format(f1_score(y_train_l, train_pred_xgb)))
print("Validation Score: {}." .format(f1_score(y_test_l, test_pred_xgb)))


{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 6, 'objective': 'binary:logistic', 'subsample': 0.8}
Train Score: 0.8823529411764706.
Validation Score: 0.5454545454545454.
CPU times: user 1.93 s, sys: 19.4 ms, total: 1.95 s
Wall time: 9.55 s


## Model 3 Gradient Boosting : 

In [0]:

%%time
clf_gb = GradientBoostingClassifier()

gbm_param_grid = {'max_depth': [2,3,4],
                  'subsample': [0.8,0.6],
                  'max_features':[0.3], 
                  'n_estimators': [5,7,9,11],
                  'learning_rate':[0.1]}

gbm_grid_l = GridSearchCV(clf_gb, param_grid=gbm_param_grid, n_jobs =-1, cv=5)

gbm_grid_l.fit(x_train_l, y_train_l)

print("The Best parameters are: {} .".format(gbm_grid.best_params_))

train_pred_gb_l = gbm_grid_l.predict(x_train_l)
test_pred_gb_l = gbm_grid_l.predict(x_test_l)
print(f1_score(y_train_l, train_pred_gb_l))
print(f1_score(y_test_l, test_pred_gb_l))


The Best parameters are: {'learning_rate': 0.1, 'max_depth': 2, 'max_features': 0.3, 'n_estimators': 20, 'subsample': 0.8} .
0.5384615384615384
0.0
CPU times: user 179 ms, sys: 4.05 ms, total: 183 ms
Wall time: 1.16 s


### The training data is insufficient & it is underfitting when building the models. 


### The target labelled data is class imbalanced & should perform SMOTE method to over sample the data, It may be helpful in building the model without underfitting. 

### Feature Engineering can be done by using the Job_History column by counting the no of companies he worked earlier & make better predictions. 

# Deep Learning Techniques : 

In [0]:
#importing packages 
import tensorflow as tf 
import keras
from keras import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.layers import Activation

In [0]:
# chceking the splitted data 
print(x_train_re.shape)
print(x_test_re.shape)
print(y_train_re.shape)
print(y_test_re.shape)


(304, 283)
(132, 283)
(304,)
(132,)


In [0]:
#chcecking the label encoded  data 

print(x_train_l.shape)
print(x_test_l.shape)
print(y_train_l.shape) 
print(y_test_l.shape)

(171, 12)
(74, 12)
(171,)
(74,)


In [0]:
x_train_l.dtypes

Job_History          int64
Designation          int64
Sex                  int64
TotalWorkingHours    int64
Billable_Hours       int64
Hours_off_Duty       int64
Touring_Hours        int64
NoOfProjects         int64
ActualTimeSpent      int64
joining_year         int64
joining_month        int64
job_count            int64
dtype: object

In [0]:
model = Sequential()

#layer-1
model.add(Dense(6,kernel_initializer='glorot_uniform', input_dim=283))
# model.add(BatchNormalization())
model.add(Activation('relu'))

#layer -2 

#model.add(Dense(6, kernel_initializer='glorot_uniform', activation='relu'))
#model.add(BatchNormalization())

#layer-3 -Output layer 

model.add(Dense(1, activation='softmax', kernel_initializer='glorot_normal'))

#compiling the model 

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [0]:
model_1 = model.fit(std_x, y, epochs=100, batch_size=22, validation_split=0.1)

Train on 220 samples, validate on 25 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Ep