# MODEL DEVELOPMENT AND EVALUATION

In [6]:
import pandas as pd
import numpy as np
from pandas import DataFrame
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics 

In [7]:
data = pd.read_csv('Employeeattrition.csv')
data = data.drop(columns=['StandardHours','EmployeeCount','Over18','EmployeeNumber','StockOptionLevel'])

le = preprocessing.LabelEncoder()
categorial_variables = ['Attrition','BusinessTravel','Department','EducationField',
                        'Gender','JobRole','MaritalStatus','OverTime']
for i in categorial_variables:
    data[i] = le.fit_transform(data[i])
data.head(5)
data.to_csv('LabelEncoded_CleanData.csv')

In [8]:
target = data['Attrition']
train = data.drop('Attrition',axis = 1)
train.shape

(1470, 29)

# Implementation of all the popular classifiers in scikit-learn
 1.Logistic Regression
 2.SVM
 3.KNN
 4.Decision Tree


In [9]:
train_accuracy = []
test_accuracy = []
models = ['Logistic Regression','SVM','KNN','Decision Tree','Naive bayes']

In [10]:
#Defining a function which will give us train and test accuracy for each classifier.
def train_test_error(y_train,y_test):
    train_error = ((y_train==Y_train).sum())/len(y_train)*100
    test_error = ((y_test==Y_test).sum())/len(Y_test)*100
    train_accuracy.append(train_error)
    test_accuracy.append(test_error)
    print('{}'.format(train_error) + " is the train accuracy")
    print('{}'.format(test_error) + " is the test accuracy")

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(train, target, test_size=0.33, random_state=42)

# Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

log_reg = LogisticRegression()
log_reg.fit(X_train,Y_train)
train_predict = log_reg.predict(X_train)
test_predict = log_reg.predict(X_test)
y_prob = log_reg.predict(train)
y_pred = np.where(y_prob > 0.5, 1, 0)
train_test_error(train_predict , test_predict)
print("Classification Report for Test Set:\n")
print(classification_report(Y_test, test_predict))

print("Confusion Matrix for Test Set:\n")
print(confusion_matrix(Y_test, test_predict))


83.53658536585365 is the train accuracy
85.39094650205762 is the test accuracy
Classification Report for Test Set:

              precision    recall  f1-score   support

           0       0.86      1.00      0.92       416
           1       0.00      0.00      0.00        70

    accuracy                           0.85       486
   macro avg       0.43      0.50      0.46       486
weighted avg       0.73      0.85      0.79       486

Confusion Matrix for Test Set:

[[415   1]
 [ 70   0]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

nb = GaussianNB()
nb.fit(X_train, Y_train)

train_pred = nb.predict(X_train)
test_pred = nb.predict(X_test)
train_test_error(train_pred, test_pred)

print("Classification Report for Test Set:\n")
print(classification_report(Y_test, test_pred))

print("Confusion Matrix for Test Set:\n")
print(confusion_matrix(Y_test, test_pred))


78.04878048780488 is the train accuracy
78.18930041152264 is the test accuracy
Classification Report for Test Set:

              precision    recall  f1-score   support

           0       0.92      0.82      0.87       416
           1       0.34      0.56      0.42        70

    accuracy                           0.78       486
   macro avg       0.63      0.69      0.64       486
weighted avg       0.83      0.78      0.80       486

Confusion Matrix for Test Set:

[[341  75]
 [ 31  39]]


# SVM

In [14]:
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix

# Fit the SVM model
SVM = svm.SVC(probability=True)
SVM.fit(X_train, Y_train)

# Make predictions on training and test sets
train_predict = SVM.predict(X_train)
test_predict = SVM.predict(X_test)

# Calculate predicted probabilities and predicted classes for the entire dataset
y_prob = SVM.predict(train)
y_pred = np.where(y_prob > 0.5, 1, 0)
train_test_error(train_predict , test_predict)

# Print classification report and confusion matrix for the test set
print("Classification Report for Test Set:\n")
print(classification_report(Y_test, test_predict))

print("Confusion Matrix for Test Set:\n")
print(confusion_matrix(Y_test, test_predict))

83.02845528455285 is the train accuracy
85.59670781893004 is the test accuracy
Classification Report for Test Set:

              precision    recall  f1-score   support

           0       0.86      1.00      0.92       416
           1       0.00      0.00      0.00        70

    accuracy                           0.86       486
   macro avg       0.43      0.50      0.46       486
weighted avg       0.73      0.86      0.79       486

Confusion Matrix for Test Set:

[[416   0]
 [ 70   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# KNN

In [15]:
from sklearn import neighbors
from sklearn.metrics import classification_report, confusion_matrix

# Fit the k-NN model
n_neighbors = 15
knn = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
knn.fit(X_train, Y_train)

# Make predictions on training and test sets
train_predict = knn.predict(X_train)
test_predict = knn.predict(X_test)

# Calculate predicted probabilities and predicted classes for the entire dataset
y_prob = knn.predict(train)
y_pred = np.where(y_prob > 0.5, 1, 0)
train_test_error(train_predict , test_predict)

# Print classification report and confusion matrix for the test set
print("Classification Report for Test Set:\n")
print(classification_report(Y_test, test_predict))

print("Confusion Matrix for Test Set:\n")
print(confusion_matrix(Y_test, test_predict))

100.0 is the train accuracy
84.5679012345679 is the test accuracy
Classification Report for Test Set:

              precision    recall  f1-score   support

           0       0.86      0.98      0.92       416
           1       0.22      0.03      0.05        70

    accuracy                           0.85       486
   macro avg       0.54      0.51      0.48       486
weighted avg       0.77      0.85      0.79       486

Confusion Matrix for Test Set:

[[409   7]
 [ 68   2]]


# Decision Tree

In [16]:
from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix
dec = tree.DecisionTreeClassifier()
dec.fit(X_train, Y_train)
train_predict = dec.predict(X_train)
test_predict = dec.predict(X_test)
y_prob = dec.predict(train)
y_pred = np.where(y_prob > 0.5, 1, 0)
train_test_error(train_predict , test_predict)
# Print classification report and confusion matrix for the test set
print("Classification Report for Test Set:\n")
print(classification_report(Y_test, test_predict))
print("Confusion Matrix for Test Set:\n")
print(confusion_matrix(Y_test, test_predict))


100.0 is the train accuracy
80.24691358024691 is the test accuracy
Classification Report for Test Set:

              precision    recall  f1-score   support

           0       0.89      0.88      0.88       416
           1       0.32      0.33      0.32        70

    accuracy                           0.80       486
   macro avg       0.60      0.61      0.60       486
weighted avg       0.80      0.80      0.80       486

Confusion Matrix for Test Set:

[[367  49]
 [ 47  23]]


In [17]:
results = pd.DataFrame({"Test Accuracy" : test_accuracy[:5], "Train Accuracy" : train_accuracy[:5]} , index = models[:5])

In [18]:
results

Unnamed: 0,Test Accuracy,Train Accuracy
Logistic Regression,85.390947,83.536585
SVM,78.1893,78.04878
KNN,85.596708,83.028455
Decision Tree,84.567901,100.0
Naive bayes,80.246914,100.0
