# Churn_Modelling dataset

In [51]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# Importing churn_modeling_dataset for classification problem

In [64]:
dataset = pd.read_csv("Churn_Modelling.csv")
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [65]:
# Drop the unwanted variables
dataset1 = dataset.iloc[:,3:]

In [66]:
dataset1.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Pre-Processing Part

In [67]:
# part 1 - missing value
dataset1.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [68]:
# part 2 - encoding
dataset1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [69]:
# Encoding

dataset1 = pd.get_dummies(dataset1, columns=['Geography','Gender'],drop_first=True)

In [70]:
dataset1.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [71]:
# part 3 - outlier 
Q1 = dataset1.quantile(0.25)
Q3 = dataset1.quantile(0.75)
IQR = Q3 - Q1

df = dataset1[~((dataset1<(Q1-1.5*IQR)) | (dataset1>(Q3+1.5*IQR))).any(axis=1)]

In [72]:
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
6,822,50,7,0.0,2,1,1,10062.8,0,0,0,1
8,501,44,4,142051.07,2,0,1,74940.5,0,0,0,1
9,684,27,2,134603.88,1,1,1,71725.73,0,0,0,1
10,528,31,6,102016.72,2,0,0,80181.12,0,0,0,1


In [73]:
print(dataset1.shape)
print(df.shape)

(10000, 12)
(5689, 12)


In [74]:
print('###############################')

###############################


In [75]:
# preprocessing part 4 - feature scaling
# instead of handling outlier and then will do feature scaling. the good idea is to do feature scaling directly
# Since, we are using RF, Bagging method, hence this step is also not required

In [76]:
# part 5 - check imbalance dataset
dataset1['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [77]:
# split the data into x and y basis idependent and dependent variable
x = dataset1.drop(['Exited'], axis=1)
y = dataset1['Exited']

In [78]:
x.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


In [79]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [80]:
# Since we do have imbalance dataset, so we have to use oversampling method to make our data balance

import imblearn
# status of imbalance
# method - oversampling
# majority class - 0 = 7963
# minority class - 1 = 2037

# status of balance
# class 0 = 7963
# class 1 = 7963

from imblearn.over_sampling import RandomOverSampler
over = RandomOverSampler()
x_over, y_over = over.fit_resample(x,y)

In [81]:
print(x_over.shape)
print(y_over.shape)

(15926, 11)
(15926,)


In [82]:
7963*2

15926

# split the data into training and test for model building and evaluation

In [83]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_over, y_over,train_size=0.75,random_state=501)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(11944, 11)
(3982, 11)
(11944,)
(3982,)


# Bagging model

In [84]:
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier()
bagging.fit(x_train, y_train)

In [85]:
y_pred_train = bagging.predict(x_train)
y_pred_test = bagging.predict(x_test)

In [86]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [87]:
print(classification_report(y_train, y_pred_train))
print("######"*20)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5963
           1       1.00      1.00      1.00      5981

    accuracy                           1.00     11944
   macro avg       1.00      1.00      1.00     11944
weighted avg       1.00      1.00      1.00     11944

########################################################################################################################
              precision    recall  f1-score   support

           0       0.97      0.90      0.93      2000
           1       0.90      0.97      0.94      1982

    accuracy                           0.93      3982
   macro avg       0.94      0.93      0.93      3982
weighted avg       0.94      0.93      0.93      3982



In [88]:
print(confusion_matrix(y_train, y_pred_train))
print("######"*20)
print(confusion_matrix(y_test, y_pred_test))

[[5948   15]
 [  16 5965]]
########################################################################################################################
[[1794  206]
 [  55 1927]]


In [89]:
print(accuracy_score(y_train, y_pred_train))
print("######"*20)
print(accuracy_score(y_test, y_pred_test))

0.9974045545880776
########################################################################################################################
0.9344550477147162


# RandomForest model

In [90]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200,criterion='entropy')
rf.fit(x_train,y_train)

In [91]:
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier(n_estimators=200,criterion='gini')
rf1.fit(x_train,y_train)

In [92]:
y_pred_rf_train = rf.predict(x_train)
y_pred_rf_test = rf.predict(x_test)

In [93]:
y_pred_rf_train1 = rf1.predict(x_train)
y_pred_rf_test1 = rf1.predict(x_test)

In [94]:
# entropy
print(accuracy_score(y_train, y_pred_rf_train))
print("######"*20)
print(accuracy_score(y_test, y_pred_rf_test))

1.0
########################################################################################################################
0.9515318935208438


In [95]:
# gini
print(accuracy_score(y_train, y_pred_rf_train1))
print("######"*20)
print(accuracy_score(y_test, y_pred_rf_test1))

1.0
########################################################################################################################
0.9515318935208438


# Decistion Tree Classifier

In [96]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

In [97]:
y_pred_dt_train = dt.predict(x_train)
y_pred_dt_test = dt.predict(x_test)

In [98]:
print(accuracy_score(y_train, y_pred_dt_train))
print("######"*20)
print(accuracy_score(y_test, y_pred_dt_test))

1.0
########################################################################################################################
0.9083375188347564


# LogisticRegression

In [99]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)

In [100]:
y_pred_lr_train = lr.predict(x_train)
y_pred_lr_test = lr.predict(x_test)

In [50]:
print(accuracy_score(y_train, y_pred_lr_train))
print("######"*20)
print(accuracy_score(y_test, y_pred_lr_test))

0.6625920964501004
########################################################################################################################
0.6438975389251632
