In [231]:
#importing basic things
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

In [232]:
#importing modules common to all models
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [233]:
#importing model building things for Naive Bayes
from sklearn.naive_bayes import ComplementNB

In [234]:
#importing model building things for Logistic Regression
from sklearn.linear_model import LogisticRegression

In [235]:
#importing model building things for Random Forest
from sklearn.ensemble import RandomForestClassifier

In [236]:
#importing model building things for KNN
from sklearn.neighbors import KNeighborsClassifier

In [237]:
#reading the dataset
dataframe = pd.read_csv("Churn_Modelling.csv")

In [238]:
dataframe.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [239]:
#removing string columns (including columns which can be converted)
cols3 = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary','Exited']
dataframe2 = dataframe[cols3]

In [240]:
dataframe2.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [241]:
#making another dataframe for one hot encoding
dataframe3 = dataframe2[cols3]

In [242]:
dataframe3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [243]:
#label encoding for Gender
labelEncoder = LabelEncoder()
dataframe3['Gender'] = labelEncoder.fit_transform(dataframe3['Gender'])
dataframe3.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [244]:
#using one hot encoding for Geography field
dataframe3 = pd.get_dummies(dataframe3, columns=['Geography'], prefix="Country")
dataframe3.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Country_France,Country_Germany,Country_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1,0,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [245]:
x = dataframe3.iloc[:,:-1].values
y = dataframe3.iloc[:,-1].values

In [246]:
#Testing Model Building Using Naive Bayes

In [247]:
#splitting the database
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 23)

#Complement Naive Bayes is used as it is better on imbalanced datasets
cnb = ComplementNB()
cnb.fit(X_train, y_train) 
  
# Evaluating the classifier
y_predicted = cnb.predict(X_test)

In [248]:
cm = confusion_matrix(y_test,y_predicted)
print("Confusion Matrix:\n",cm)

Confusion Matrix:
 [[1302  987]
 [ 299  412]]


In [249]:
print("Accuracy:", accuracy_score(y_test, y_predicted))
print("Precision:", precision_score(y_test, y_predicted))
print("Recall:", recall_score(y_test, y_predicted))
print("F1 Score:", f1_score(y_test, y_predicted))

Accuracy: 0.5713333333333334
Precision: 0.2944960686204432
Recall: 0.5794655414908579
F1 Score: 0.39052132701421804


In [250]:
#Testing Model Building Using Logistic Regression

In [251]:
#creating x and y
x = dataframe3.iloc[:,0:11].values
y = dataframe3.iloc[:,11].values

In [252]:
#scaling x
norm = StandardScaler()
x = norm.fit_transform(x)

In [253]:
#splitting the database
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.20, random_state = 23)

#applying logistic regression
log = LogisticRegression()
log.fit(x_train,y_train)

#predicting output from test dataset
y_pred = log.predict(x_test)



In [254]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n",cm)

Confusion Matrix:
 [[1320  169]
 [ 103  408]]


In [255]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.864
Precision: 0.707105719237435
Recall: 0.7984344422700587
F1 Score: 0.75


In [256]:
#Testing Model Building Using Random Forest

In [257]:
#splitting the database
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.20, random_state = 23)

#applying random forest
rf = RandomForestClassifier()
rf.fit(x_train,y_train)

#predicting output from test dataset
y_pred = rf.predict(x_test)



In [258]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n",cm)

Confusion Matrix:
 [[1352  137]
 [ 138  373]]


In [259]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.8625
Precision: 0.7313725490196078
Recall: 0.7299412915851272
F1 Score: 0.7306562193927522


In [260]:
#Testing Model Building Using KNN

In [261]:
#splitting the database
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.20, random_state = 23)

#applying KNN
knn = KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2)
knn.fit(x_train,y_train)

#Evaluating the classifier
y_pred = knn.predict(x_test)

In [262]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n",cm)

Confusion Matrix:
 [[1321  168]
 [ 129  382]]


In [263]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.8515
Precision: 0.6945454545454546
Recall: 0.7475538160469667
F1 Score: 0.7200754005655042
