In [45]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [47]:
def selectkbest(indep_x, dep_y, n):
    test = SelectKBest(score_func = chi2, k = n)
    fit1 = test.fit(indep_x, dep_y)
    selectk_features = fit1.transform(indep_x)
    feature_scores = fit1.scores_
    top_indices = fit1.get_support(indices = True)
    top_feature_names = indep_x.columns[top_indices]
    feature_imp_kbest = pd.DataFrame({'Feature': top_feature_names, 'Chi2_score': feature_scores[top_indices]}
                                   ).sort_values(by = 'Chi2_score', ascending = False)
    return selectk_features, top_feature_names, feature_imp_kbest

In [48]:
def split_scaler(indep_x, dep_y):
    x_train, x_test, y_train, y_test = train_test_split(indep_x, dep_y, test_size = 0.30, random_state = 0)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    return x_train, x_test, y_train, y_test

In [49]:
def cm_prediction(grid, x_test, y_test):
    grid_pred = grid.predict(x_test)
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, grid_pred)
    
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(y_test, grid_pred)
    
    from sklearn.metrics import classification_report
    report = classification_report(y_test, grid_pred)
    return grid, accuracy, report, x_test, y_test, cm

In [50]:
def logistic(x_train, y_train, x_test, y_test, feature_names):
    param_grid = {'solver':['lbfgs','newton-cg','liblinear','saga'], 'penalty':['l2','l1','elasticnet']}
    grid = GridSearchCV(LogisticRegression(), param_grid, refit = True, verbose = 3, n_jobs = -1,scoring = 'f1_weighted')
    grid.fit(x_train, y_train)
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    coefficients = classifier.coef_[0]  # coef_[0]--weights of each feature; abs(coefficients)--- taking positive values even it is negative eg: -1.5=1.5
    feature_imp_lr = pd.DataFrame({'Feature': feature_names,'Importance': abs(coefficients)}).sort_values(by='Importance', ascending=False)
    grid,accuracy,report,x_test,y_test,cm = cm_prediction(grid,x_test, y_test)
    return grid,accuracy,report,x_test,y_test,cm, feature_imp_lr

In [51]:
def svm_linear(x_train, y_train, x_test, y_test):
    from sklearn.svm import SVC
    param_grid = {'kernel':['linear'], 'gamma':['scale', 'auto'],
              'decision_function_shape':['ovo', 'ovr']}
    grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, n_jobs = -1, scoring = 'f1_weighted')
    grid.fit(x_train,y_train)
    grid, accuracy, report, x_test, y_test, cm = cm_prediction(grid, x_test, y_test) 
    return grid, accuracy, report, x_test, y_test, cm

In [52]:
def svm_nl(x_train, y_train, x_test, y_test):
    param_grid = {'kernel':['poly', 'rbf', 'sigmoid'], 'gamma':['scale', 'auto'],
              'decision_function_shape':['ovo', 'ovr']}
    grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, n_jobs = -1, scoring = 'f1_weighted')
    grid.fit(x_train, y_train)
    grid, accuracy, report, x_test, y_test, cm = cm_prediction(grid, x_test, y_test) 
    return grid, accuracy, report, x_test, y_test, cm    

In [53]:
def knn(x_train, y_train, x_test, y_test):
    from sklearn.neighbors import KNeighborsClassifier
    param_grid = {'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'], 'metric':['minkowski'],
              'weights':['uniform', 'distance']}
    grid = GridSearchCV(KNeighborsClassifier(), param_grid, refit = True, verbose = 3, n_jobs = -1, scoring = 'f1_weighted')
    grid.fit(x_train,y_train)
    grid, accuracy, report, x_test, y_test, cm = cm_prediction(grid, x_test, y_test) 
    return grid,accuracy,report,x_test,y_test,cm    

In [54]:
def naive(x_train, y_train, x_test, y_test):
    from sklearn.naive_bayes import GaussianNB
    param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}
    grid = GridSearchCV(GaussianNB(), param_grid, cv=5, scoring = 'accuracy')
    grid.fit(x_train, y_train)
    grid, accuracy, report, x_test, y_test, cm = cm_prediction(grid, x_test, y_test) 
    return grid, accuracy, report, x_test, y_test, cm    

In [55]:
def decision(x_train, y_train, x_test, y_test):
    from sklearn.tree import DecisionTreeClassifier
    param_grid = {'criterion':['gini', 'entropy', 'log_loss'],'splitter':['best', 'random'],
                  'max_features':['sqrt', 'log2']}
    grid = GridSearchCV(DecisionTreeClassifier(), param_grid, refit = True, verbose = 3, n_jobs = -1, scoring = 'f1_weighted')
    grid.fit(x_train,y_train)
    grid, accuracy, report, x_test, y_test, cm = cm_prediction(grid, x_test, y_test) 
    return grid, accuracy, report, x_test, y_test, cm    

In [56]:
def random(x_train, y_train, x_test, y_test):
    from sklearn.ensemble import RandomForestClassifier
    param_grid = {'criterion':['gini','entropy', 'log_loss'],'max_features':['sqrt', 'log2', None],
              'class_weight':['balanced', 'balanced_subsample'], 'min_samples_split':[1,2]}

    grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 3, n_jobs = -1, scoring = 'f1_weighted')
    grid.fit(x_train,y_train)
    grid, accuracy, report, x_test, y_test, cm = cm_prediction(grid, x_test, y_test) 
    return grid, accuracy, report, x_test, y_test, cm    

In [57]:
def xgboost(x_train, y_train, x_test, y_test):
    from xgboost import XGBClassifier
    grid = XGBClassifier(n_estimators=1000,max_depth=7,eta=0.1,subsample=0.7,colsample_bytree=0.8)
    grid.fit(x_train,y_train)
    grid, accuracy, report, x_test, y_test, cm = cm_prediction(grid, x_test, y_test) 
    return grid, accuracy, report, x_test, y_test, cm    

In [58]:
def gradiant(x_train, y_train, x_test, y_test):
    from sklearn.ensemble import GradientBoostingClassifier
    grid = GradientBoostingClassifier(random_state=0)
    grid.fit(x_train,y_train)
    grid, accuracy, report, x_test, y_test, cm = cm_prediction(grid, x_test, y_test) 
    return grid, accuracy, report, x_test, y_test, cm    

In [59]:
def selectkclassification(acclog, accsvml, accsvmnl, accknn, accnaive, accdeci, accrand, accxg, accgr):
    dataframe = pd.DataFrame(index = ['ChiSquare'], columns =['Logistic', 'SVML', 'SVMNL', 'KNN', 'Naive', 'Decision', 'Random', 'Xgboost', 'Gradiant'])
    for number,item in enumerate(dataframe.index):
        dataframe['Logistic'][item] = acclog[number]
        dataframe['SVML'][item] = accsvml[number]
        dataframe['SVMNL'][item] = accsvmnl[number]
        dataframe['KNN'][item] = accknn[number]
        dataframe['Naive'][item] = accnaive[number]
        dataframe['Decision'][item] = accdeci[number]
        dataframe['Random'][item] = accrand[number]
        dataframe['Xgboost'][item] = accrand[number]
        dataframe['Gradiant'][item] = accrand[number]
    return dataframe

In [60]:
dataset = pd.read_csv("Employee_Attrition_Dataset.csv")
dataset

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [61]:
dataset.drop(['EmployeeCount', 'EmployeeNumber','StandardHours'], axis = 1, inplace = True)

In [62]:
df1 = pd.get_dummies(dataset, dtype = int, drop_first = True)
df1

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1102,1,2,2,94,3,2,4,5993,...,0,0,0,0,0,1,0,0,1,1
1,49,279,8,1,3,61,2,2,2,5130,...,0,0,0,0,1,0,0,1,0,0
2,37,1373,2,2,4,92,2,1,3,2090,...,1,0,0,0,0,0,0,0,1,1
3,33,1392,3,4,4,56,3,1,3,2909,...,0,0,0,0,1,0,0,1,0,1
4,27,591,2,1,1,40,3,1,2,3468,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,3,41,4,2,4,2571,...,1,0,0,0,0,0,0,1,0,0
1466,39,613,6,1,4,42,2,3,1,9991,...,0,0,0,0,0,0,0,1,0,0
1467,27,155,4,3,2,87,4,2,2,6142,...,0,0,1,0,0,0,0,1,0,1
1468,49,1023,2,3,4,63,2,2,2,5390,...,0,0,0,0,0,1,0,1,0,0


In [63]:
# Input and Output Split
indep_x = df1.drop("Attrition_Yes", axis =1)
dep_y = df1['Attrition_Yes']

In [113]:
selectk_features, top_feature_names, feature_imp_kbest = selectkbest(indep_x, dep_y, 38)

acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnaive = []
accdeci = []
accrand = []
accxg = []
accgr = []

print("\nTop 5 Features from SelectKBest (Chi2):")
print(feature_imp_kbest.head(5))


Top 5 Features from SelectKBest (Chi2):
              Feature     Chi2_score
8       MonthlyIncome  127922.293694
9         MonthlyRate    1196.633553
1           DailyRate     956.580494
13  TotalWorkingYears     230.721618
16     YearsAtCompany     142.100054


In [114]:
x_train, x_test, y_train, y_test = split_scaler(selectk_features, dep_y)

In [115]:
grid, accuracy, report, x_test, y_test, cm, feature_imp_lr = logistic(x_train, y_train, x_test, y_test, top_feature_names)
acclog.append(accuracy)
print("\nLogistic Regression Feature Importance:")
print(feature_imp_lr)

grid, accuracy, report, x_test, y_test, cm = svm_linear(x_train, y_train, x_test, y_test)
accsvml.append(accuracy)

grid, accuracy, report, x_test, y_test, cm= svm_nl(x_train, y_train, x_test, y_test)
accsvmnl.append(accuracy)

grid, accuracy, report, x_test, y_test, cm = knn(x_train, y_train, x_test, y_test)
accknn.append(accuracy)

grid, accuracy, report, x_test, y_test, cm = naive(x_train, y_train, x_test, y_test)
accnaive.append(accuracy)

grid, accuracy, report, x_test, y_test, cm = decision(x_train, y_train, x_test, y_test)
accdeci.append(accuracy)

grid, accuracy, report, x_test, y_test, cm = random(x_train, y_train, x_test, y_test)
accrand.append(accuracy)

grid, accuracy, report, x_test, y_test, cm = xgboost(x_train, y_train, x_test, y_test)
accxg.append(accuracy)

grid, accuracy, report, x_test, y_test, cm = gradiant(x_train, y_train, x_test, y_test)
accgr.append(accuracy)

Fitting 5 folds for each of 12 candidates, totalling 60 fits

Logistic Regression Feature Importance:
                              Feature  Importance
37                       OverTime_Yes    0.831248
20   BusinessTravel_Travel_Frequently    0.638969
16                     YearsAtCompany    0.603770
18            YearsSinceLastPromotion    0.570748
17                 YearsInCurrentRole    0.566646
10                 NumCompaniesWorked    0.501146
19               YearsWithCurrManager    0.498787
36               MaritalStatus_Single    0.481338
4             EnvironmentSatisfaction    0.438008
7                     JobSatisfaction    0.383409
21       BusinessTravel_Travel_Rarely    0.377208
26             EducationField_Medical    0.363812
2                    DistanceFromHome    0.361967
30      JobRole_Laboratory Technician    0.359058
13                  TotalWorkingYears    0.355146
15                    WorkLifeBalance    0.351276
0                                 Age    0.32064

In [116]:
result = selectkclassification(acclog, accsvml, accsvmnl, accknn, accnaive, accdeci, accrand, accxg, accgr)

In [82]:
# For K = 5
result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random,Xgboost,Gradiant
ChiSquare,0.84127,0.84127,0.84127,0.807256,0.843537,0.723356,0.829932,0.829932,0.829932


In [87]:
# For K = 10
result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random,Xgboost,Gradiant
ChiSquare,0.854875,0.84127,0.854875,0.848073,0.791383,0.777778,0.845805,0.845805,0.845805


In [92]:
# For K = 15
result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random,Xgboost,Gradiant
ChiSquare,0.857143,0.84127,0.811791,0.8322,0.802721,0.748299,0.861678,0.861678,0.861678


In [97]:
# For K = 23
result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random,Xgboost,Gradiant
ChiSquare,0.886621,0.863946,0.854875,0.861678,0.587302,0.795918,0.866213,0.866213,0.866213


In [102]:
# For K = 29
result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random,Xgboost,Gradiant
ChiSquare,0.897959,0.897959,0.873016,0.85941,0.612245,0.809524,0.852608,0.852608,0.852608


In [107]:
# For K = 33
result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random,Xgboost,Gradiant
ChiSquare,0.907029,0.895692,0.875283,0.854875,0.619048,0.791383,0.85034,0.85034,0.85034


In [112]:
# For  K = 35
result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random,Xgboost,Gradiant
ChiSquare,0.900227,0.888889,0.877551,0.852608,0.623583,0.77551,0.845805,0.845805,0.845805


In [117]:
# For K = 38
result

Unnamed: 0,Logistic,SVML,SVMNL,KNN,Naive,Decision,Random,Xgboost,Gradiant
ChiSquare,0.893424,0.886621,0.873016,0.863946,0.62585,0.748299,0.852608,0.852608,0.852608
