In [1]:
# Importing Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Reading the train and test dataset 
train_model = pd.read_csv('train.csv')
X_Test = pd.read_csv('test.csv')
Y_Test = pd.read_csv('Sample_submission.csv')

In [3]:
# Checking the data to look for similarities and correlations
train_model.head()

Unnamed: 0,Id,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,CommunicationSkill,Behaviour
0,1,30,0,Non-Travel,Research & Development,2,3,Medical,571,3,...,3,0,12,2,11,7,6,7,4,1
1,2,36,0,Travel_Rarely,Research & Development,12,4,Life Sciences,1614,3,...,3,2,7,2,3,2,1,1,2,1
2,3,55,1,Travel_Rarely,Sales,2,1,Medical,842,3,...,3,0,12,3,9,7,7,3,5,1
3,4,39,0,Travel_Rarely,Research & Development,24,1,Life Sciences,2014,1,...,3,0,18,2,7,7,1,7,4,1
4,5,37,0,Travel_Rarely,Research & Development,3,3,Other,689,3,...,3,1,10,2,10,7,7,8,1,1


In [5]:
# It contains 29 columns and out of that our target variable is Attrition
X_Test.head()

Unnamed: 0,Id,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,CommunicationSkill,Behaviour
0,1,28,Travel_Rarely,Research & Development,9,3,Medical,377,4,Male,...,4,1,5,3,5,2,0,4,5,1
1,2,31,Travel_Rarely,Sales,6,4,Medical,653,1,Male,...,4,2,13,4,7,7,5,7,3,1
2,3,37,Travel_Rarely,Research & Development,6,3,Medical,474,3,Male,...,3,2,13,2,7,7,6,7,4,1
3,4,42,Travel_Rarely,Research & Development,1,2,Life Sciences,827,4,Female,...,3,1,8,4,4,3,0,2,5,1
4,5,45,Non-Travel,Research & Development,4,2,Life Sciences,972,3,Male,...,3,0,9,5,9,7,0,8,2,1


In [6]:
# Test model does not contain the target variable in it.
# Now we will split the target variable from the train model.
train_model.columns

Index(['Id', 'Age', 'Attrition', 'BusinessTravel', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'CommunicationSkill',
       'Behaviour'],
      dtype='object')

In [7]:
# We will look for columns which can affect the probability of attrition. 
# and get rid of those columns which do not affect the attrition to get the clear picture and visualisation.
# From the train dataset it is quite clear that ID and Behvaiour does not affect the attrition
train_model2 = train_model.drop(['EmployeeNumber', 'Behaviour'],axis = 1)

In [8]:
# We have to transform these required columns to number format for calculation
for column in train_model2.columns:
    if(isinstance(train_model2[column],int)):
        continue
    else:
        train_model2[column] = LabelEncoder().fit_transform(train_model2[column])

In [9]:
train_model2.head()

Unnamed: 0,Id,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,CommunicationSkill
0,0,12,0,0,1,1,2,3,2,0,...,3,0,0,12,2,11,7,6,7,3
1,1,18,0,2,1,11,3,1,2,0,...,1,0,2,7,2,3,2,1,1,1
2,2,37,1,2,2,1,0,3,2,1,...,5,0,0,12,3,9,7,7,3,4
3,3,21,0,2,1,23,0,1,0,1,...,2,0,0,18,2,7,7,1,7,3
4,4,19,0,2,1,2,2,4,2,1,...,4,0,1,10,2,10,7,7,8,0


In [10]:
X_data = train_model2.drop(['Attrition'],axis=1)
y_data = train_model2['Attrition']
# Splitting the data training data into training set and validation set
X_train,X_test,Y_train,Y_test = train_test_split(X_data,y_data,test_size = 0.3,random_state=1)

In [58]:
# Predicting the data using Logistic Regression
LR = LogisticRegression(solver = 'lbfgs', penalty = 'l2', max_iter = 10000, random_state = 1)
LR.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
y_pred = LR.predict(X_test)

In [60]:
score = roc_auc_score(Y_train, LR.predict_proba(X_train)[:,1])
score

0.9206054874619097

In [61]:
score = roc_auc_score(Y_test, LR.predict_proba(X_test)[:,1])
score

0.9434741878282982

In [15]:
# Predicting the data using Decision Tree Classifier
DT = DecisionTreeClassifier(criterion = 'gini' ,max_depth = 9, random_state = 1)
DT.fit(X_train, Y_train)
# Trying the outputs score with different levels of maximum depth of tree
# It performs the best with maximum depth of 9 and criterion as 'gini' instead of 'entropy'

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1, splitter='best')

In [16]:
scoreDT = roc_auc_score(Y_train, DT.predict_proba(X_train)[:,1])
scoreDT

0.9947291809601513

In [17]:
scoreDT = roc_auc_score(Y_test, DT.predict_proba(X_test)[:,1])
scoreDT

0.8921844156713172

In [37]:
# Predicting the data using Random Forest Classifier
RF = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy' ,max_depth = 11, random_state = 1)
RF.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=11, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [38]:
scoreRF = roc_auc_score(Y_train, RF.predict_proba(X_train)[:,1])
scoreRF

1.0

In [39]:
scoreRF = roc_auc_score(Y_test, RF.predict_proba(X_test)[:,1])
scoreRF

0.9907156478972197

In [38]:
# Predicting the data using SVC
SVM = SVC(probability = True, kernel = 'rbf', C = 1.0, gamma = 0.0001)
SVM.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [39]:
# Overfitting
scoreSVM = roc_auc_score(Y_train, SVM.predict_proba(X_train)[:,1])
scoreSVM

0.9731359750783438

In [40]:
scoreSVM = roc_auc_score(Y_test, SVM.predict_proba(X_test)[:,1])
scoreSVM

0.9222790993342033

In [50]:
# Predicting the data using Neural networks 
NN = MLPClassifier(hidden_layer_sizes = (100,1000,), activation = 'logistic',random_state = 1, max_iter = 500)
NN.fit(X_train, Y_train)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100, 1000), learning_rate='constant',
              learning_rate_init=0.001, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [51]:
scoreNN = roc_auc_score(Y_train, NN.predict_proba(X_train)[:,1])
scoreNN

0.998569106294078

In [52]:
scoreNN = roc_auc_score(Y_test, NN.predict_proba(X_test)[:,1])
scoreNN

0.9422362742145939

In [14]:
X_Test = X_Test.drop(['EmployeeNumber', 'Behaviour'],axis = 1)

In [15]:
for column in X_Test.columns:
    if(isinstance(X_Test[column],int)):
        continue
    else:
        X_Test[column] = LabelEncoder().fit_transform(X_Test[column])

In [326]:
a = NN.predict_proba(X_Test)[:,1]
b = a

In [327]:
Y_Test['Attrition'] = b

In [328]:
Y_Test.head()

Unnamed: 0,Id,Attrition
0,1,2.62087e-07
1,2,2.465939e-05
2,3,0.01565453
3,4,0.004852916
4,5,3.972707e-05


In [329]:
a = LR.predict_proba(X_Test)[:,1]
b = a

In [330]:
Y_Test['Attrition'] = b

In [331]:
Y_Test.head()

Unnamed: 0,Id,Attrition
0,1,0.013385
1,2,0.005253
2,3,0.032053
3,4,0.05479
4,5,0.002738


In [332]:
a = DT.predict_proba(X_Test)[:,1]
b = a

In [333]:
Y_Test['Attrition'] = b

In [334]:
Y_Test.head()

Unnamed: 0,Id,Attrition
0,1,1.0
1,2,1.0
2,3,0.08
3,4,0.666667
4,5,0.045802


In [335]:
a = RF.predict_proba(X_Test)[:,1]
b = a

In [336]:
Y_Test['Attrition'] = b

In [337]:
Y_Test.head()

Unnamed: 0,Id,Attrition
0,1,0.222432
1,2,0.151866
2,3,0.157031
3,4,0.274434
4,5,0.159177


In [338]:
a = SVM.predict_proba(X_Test)[:,1]
b = a

In [339]:
Y_Test['Attrition'] = b

In [340]:
Y_Test.head()

Unnamed: 0,Id,Attrition
0,1,0.464775
1,2,0.467015
2,3,0.467016
3,4,0.467024
4,5,0.467105


In [341]:
models = [RF,DT,SVM,LR,NN]
modelname = ['Random_Forest','Decision_Tree','Support_Vector','LogisticRegression','MLP_Classifier']
for model,name in zip(models,modelname):
    test_prob = model.predict_proba(X_Test)[:,1]
    result = Y_Test
    result.to_csv('Submission'+str(name)+'.csv',index=False)

In [355]:
testing = SVM.predict_proba(X_Test)[:,1]
b = testing

In [356]:
Y_Test['Attrition'] = b

In [357]:
Y_Test.to_csv('Submissionextra.csv',index=False)

In [368]:
testi = NN.predict_proba(X_Test)[:,1]
b = testi

In [369]:
Y_Test['Attrition'] = b

In [370]:
Y_Test.to_csv('Submissionextra_NN.csv',index=False)

In [386]:
testRF2 = RF.predict_proba(X_Test)[:,1]
b = testRF2

In [387]:
Y_Test['Attrition'] = b

In [388]:
Y_Test.to_csv('Submissionextra_RF2.csv',index=False)

In [75]:
testLR = LR.predict_proba(X_Test)[:,1]
b = testLR

In [76]:
Y_Test['Attrition'] = b

In [77]:
Y_Test.to_csv('Submissionextra_LR.csv',index=False)

In [16]:
testRF3 = RF.predict_proba(X_Test)[:,1]
b = testRF3

In [17]:
Y_Test['Attrition'] = b

In [18]:
Y_Test.to_csv('Submissionextra_RF3.csv',index=False)

In [25]:
testRF4 = RF.predict_proba(X_Test)[:,1]
b = testRF4

In [26]:
Y_Test['Attrition'] = b

In [27]:
Y_Test.to_csv('Submissionextra_RF4.csv',index=False)

In [40]:
testRF5 = RF.predict_proba(X_Test)[:,1]
b = testRF5

In [41]:
Y_Test['Attrition'] = b

In [43]:
Y_Test.to_csv('Submissionextra_RF5.csv',index=False)