# Import Libraries and Load Training and Testing Dataset

In [1]:
import pandas as pd
import numpy as np

In [10]:
pd.set_option('display.max_columns',None)
pd.set_option('display.float_format',lambda x : '%.2f'%x)

In [11]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Cleaning Training and Testing DataSets

In [12]:
def cleandata(df):
    df.drop(['id','Over18','StandardHours','EmployeeCount','EducationField','JobRole'],axis=1,inplace= True)
    df.MaritalStatus.replace({'Divorced':'Single'},inplace=True)
    df.OverTime.replace({'Yes':1,'No':0},inplace=True)
    df = pd.get_dummies(data=df,columns=['Gender','BusinessTravel','Department','MaritalStatus'],drop_first=True)
    return df

In [13]:
cleaned_train = cleandata(train)
cleaned_test = cleandata(test)

In [34]:
X = cleaned_train.drop('Attrition',axis=1)
y = cleaned_train['Attrition']

In [35]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42,stratify=y)

In [36]:
X_test = cleaned_test

## Importing ML Libraries

In [37]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,recall_score,precision_score,roc_auc_score,f1_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier,StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

### Modeling with imbalanced Data set

In [53]:
model_list = [('LR',LogisticRegression(solver='lbfgs')),
             ('DT',DecisionTreeClassifier()),
              ('RF',RandomForestClassifier(n_estimators=150)),
              ('GBC',GradientBoostingClassifier(n_estimators=150)),
              ('SVC',SVC(kernel='rbf')),
              ('KNN',KNeighborsClassifier(n_neighbors=11)),
              ('GNB',GaussianNB()),
              ('BNB',BernoulliNB()),
              ('CB',CatBoostClassifier(verbose=0)),
              ('XBG',XGBClassifier(verbosity=0))
             ]

In [55]:
model_dict1 = {}
for model_name,model in model_list:
    print(model_name)
    model.fit(X_train,y_train)
    print('Training Set Score',model.score(X_train,y_train))
    print('Test Set Score',model.score(X_val,y_val))
    y_pred = model.predict(X_val)
    print('Test Set Accuracy:',accuracy_score(y_val,y_pred))
    print('Confusion Matrix:\n',confusion_matrix(y_val,y_pred))
    print('\n')
    model_dict1[model_name] = {'Test ACC':round(accuracy_score(y_val,y_pred)*100,2),
                             'Recall':round(recall_score(y_val,y_pred)*100,2),
                             'Precision':round(precision_score(y_val,y_pred)*100,2),
                              'F1-Score':round(f1_score(y_val,y_pred)*100,2),
                              'ROC_AUC':round(roc_auc_score(y_val,y_pred,)*100,2)
                             }

LR
Training Set Score 0.8806767586821015
Test Set Score 0.8808664259927798
Test Set Accuracy: 0.8808664259927798
Confusion Matrix:
 [[488   0]
 [ 66   0]]


DT
Training Set Score 1.0
Test Set Score 0.7924187725631769
Test Set Accuracy: 0.7924187725631769
Confusion Matrix:
 [[419  69]
 [ 46  20]]


RF


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


Training Set Score 1.0
Test Set Score 0.8826714801444043
Test Set Accuracy: 0.8826714801444043
Confusion Matrix:
 [[483   5]
 [ 60   6]]


GBC
Training Set Score 0.9830810329474622
Test Set Score 0.8808664259927798
Test Set Accuracy: 0.8808664259927798
Confusion Matrix:
 [[472  16]
 [ 50  16]]


SVC
Training Set Score 0.8806767586821015
Test Set Score 0.8808664259927798
Test Set Accuracy: 0.8808664259927798
Confusion Matrix:
 [[488   0]
 [ 66   0]]


KNN
Training Set Score 0.8815672306322351
Test Set Score 0.8808664259927798


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test Set Accuracy: 0.8808664259927798
Confusion Matrix:
 [[488   0]
 [ 66   0]]


GNB
Training Set Score 0.8646482635796973
Test Set Score 0.8321299638989169
Test Set Accuracy: 0.8321299638989169
Confusion Matrix:
 [[433  55]
 [ 38  28]]


BNB
Training Set Score 0.8637577916295637
Test Set Score 0.851985559566787
Test Set Accuracy: 0.851985559566787
Confusion Matrix:
 [[459  29]
 [ 53  13]]


CB
Training Set Score 0.9884238646482636
Test Set Score 0.8935018050541517
Test Set Accuracy: 0.8935018050541517
Confusion Matrix:
 [[483   5]
 [ 54  12]]


XBG
Training Set Score 1.0
Test Set Score 0.8826714801444043
Test Set Accuracy: 0.8826714801444043
Confusion Matrix:
 [[473  15]
 [ 50  16]]




In [40]:
model_comp1 = pd.DataFrame(model_dict1).T
model_comp1.sort_values('ROC_AUC',ascending=False)

Unnamed: 0,Test ACC,Recall,Precision,F1-Score,ROC_AUC
GNB,83.21,42.42,33.73,37.58,65.58
XBG,88.27,24.24,51.61,32.99,60.58
GBC,88.09,24.24,50.0,32.65,60.48
DT,79.24,34.85,24.21,28.57,60.05
CB,89.35,18.18,70.59,28.92,58.58
BNB,85.2,19.7,30.95,24.07,56.88
RF,88.09,9.09,50.0,15.38,53.93
LR,88.09,0.0,0.0,0.0,50.0
SVC,88.09,0.0,0.0,0.0,50.0
KNN,88.09,0.0,0.0,0.0,50.0


In [47]:
gnb = GaussianNB()
gnb.fit(X_train,y_train.ravel())
final_preds = gnb.predict(X_test)

In [48]:
final_preds

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)