In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("churn_modelling.csv")

### This dataset contains 14 features for about 10K customers of a bank of which 20% of them are churn customers. This dataset can be used for predicting customer churn. We can build a machine learning model to predict if a customer will quit the service of the bank in the next 6 months or not. Predicting customer churn will help banks develop retention compaign and loyalty programs to retain customers.

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,?,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,?,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### We will check the info of the dataset

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  object 
 8   Balance          10000 non-null  object 
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 1.1+ MB


### From the above info it will seen that tenure and balance are actually numerical column but their data type are showing object. That means there are missing value in both column. We can check that with value_counts.

In [5]:
df['Tenure'].value_counts()

2     1048
1     1035
7     1028
8     1025
5     1012
3     1009
4      989
9      984
6      967
10     490
?      413
Name: Tenure, dtype: int64

In [6]:
df['Balance'].value_counts()

?            3617
130170.82       2
105473.74       2
85304.27        1
159397.75       1
             ... 
81556.89        1
112687.69       1
108698.96       1
238387.56       1
130142.79       1
Name: Balance, Length: 6382, dtype: int64

### we will replace the missing value of above two columns with np.nan.

In [7]:
df['Tenure'].replace('?',np.nan,inplace=True)
df['Balance'].replace('?',np.nan,inplace=True)

In [8]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Now we will change the datatype of above two column.

In [9]:
df['Tenure']=df['Tenure'].astype(float)
df['Balance']=df['Balance'].astype(float)

### We will check the central tendancy.

In [10]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,9587.0,6383.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.228747,119827.493793,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.756048,30095.056462,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,1.0,3768.69,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,100181.975,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,119839.69,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,8.0,139512.29,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


### AS there is not much difference in mean and median of Tenure and Balance, we can replace null values of above column with mean.

In [11]:
df['Tenure'].fillna(df['Tenure'].mean(),inplace=True)
df['Balance'].fillna(df['Balance'].mean(),inplace=True)

### We will once again check for null value.

In [12]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [13]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,119827.493793,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,119827.493793,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


### We will check corelation between the variable.

In [14]:
df.corr()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,1.0,0.004202,0.00584,0.000783,-0.006079,-0.003796,0.007246,0.000599,0.012044,-0.005988,-0.016571
CustomerId,0.004202,1.0,0.005308,0.009497,-0.01316,-0.006942,0.016972,-0.014025,0.001665,0.015271,-0.006248
CreditScore,0.00584,0.005308,1.0,-0.003965,0.002157,-0.003801,0.012238,-0.005458,0.025651,-0.001384,-0.027094
Age,0.000783,0.009497,-0.003965,1.0,-0.008695,-0.010232,-0.03068,-0.011721,0.085472,-0.007201,0.285323
Tenure,-0.006079,-0.01316,0.002157,-0.008695,1.0,0.004355,0.002424,0.014268,-0.027969,0.011521,-0.009775
Balance,-0.003796,-0.006942,-0.003801,-0.010232,0.004355,1.0,-0.000814,0.005403,-0.016314,-0.00148,0.014595
NumOfProducts,0.007246,0.016972,0.012238,-0.03068,0.002424,-0.000814,1.0,0.003183,0.009612,0.014204,-0.04782
HasCrCard,0.000599,-0.014025,-0.005458,-0.011721,0.014268,0.005403,0.003183,1.0,-0.011866,-0.009933,-0.007138
IsActiveMember,0.012044,0.001665,0.025651,0.085472,-0.027969,-0.016314,0.009612,-0.011866,1.0,-0.011421,-0.156128
EstimatedSalary,-0.005988,0.015271,-0.001384,-0.007201,0.011521,-0.00148,0.014204,-0.009933,-0.011421,1.0,0.012097


### From above information RowNumber, CustomerId, Tenure and HasCrCard are low corelated to target column so we can drop those column as well with domain knowlede Surname column also do not having any effect on target column so we can drop that column also. 

In [15]:
df.drop(['RowNumber','CustomerId','Surname','Tenure','HasCrCard'],axis=1,inplace=True)

In [16]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,119827.493793,1,1,101348.88,1
1,608,Spain,Female,41,83807.86,1,1,112542.58,0
2,502,France,Female,42,159660.8,3,0,113931.57,1
3,699,France,Female,39,119827.493793,2,0,93826.63,0
4,850,Spain,Female,43,125510.82,1,1,79084.1,0


### We will seperate categoracal and numerical column.

In [17]:
df_num=df.select_dtypes(['int','float'])
df_cat=df.select_dtypes(object)

In [18]:
df_num.head()

Unnamed: 0,CreditScore,Age,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited
0,619,42,119827.493793,1,1,101348.88,1
1,608,41,83807.86,1,1,112542.58,0
2,502,42,159660.8,3,0,113931.57,1
3,699,39,119827.493793,2,0,93826.63,0
4,850,43,125510.82,1,1,79084.1,0


In [19]:
df_cat.head()

Unnamed: 0,Geography,Gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female


### We will perform labelencoding on categorical column

In [20]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for i in df_cat:
    df_cat[i]=le.fit_transform(df_cat[i])

In [21]:
df_cat.head()

Unnamed: 0,Geography,Gender
0,0,0
1,2,0
2,0,0
3,0,0
4,2,0


### We will concatnate cat and num column

In [22]:
df=pd.concat([df_cat,df_num],axis=1)

In [23]:
df.head()

Unnamed: 0,Geography,Gender,CreditScore,Age,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited
0,0,0,619,42,119827.493793,1,1,101348.88,1
1,2,0,608,41,83807.86,1,1,112542.58,0
2,0,0,502,42,159660.8,3,0,113931.57,1
3,0,0,699,39,119827.493793,2,0,93826.63,0
4,2,0,850,43,125510.82,1,1,79084.1,0


### We will check whether dataset is balance or not.

In [26]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

### We will balance the dataset using smot technique.

In [27]:
df_majority = df[df['Exited']==0] 
df_minority = df[df['Exited']==1]

In [28]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy='minority', random_state=1)
oversampled_X, oversampled_Y = sm.fit_resample(df.drop('Exited', axis=1), df['Exited'])
df_upsampled = pd.concat([pd.DataFrame(oversampled_Y), pd.DataFrame(oversampled_X)], axis=1)

In [29]:
df_upsampled

Unnamed: 0,Exited,Geography,Gender,CreditScore,Age,Balance,NumOfProducts,IsActiveMember,EstimatedSalary
0,1,0,0,619,42,119827.493793,1,1,101348.880000
1,0,2,0,608,41,83807.860000,1,1,112542.580000
2,1,0,0,502,42,159660.800000,3,0,113931.570000
3,0,0,0,699,39,119827.493793,2,0,93826.630000
4,0,2,0,850,43,125510.820000,1,1,79084.100000
...,...,...,...,...,...,...,...,...,...
15921,1,1,0,458,42,104455.048209,1,1,183689.618873
15922,1,1,0,632,34,174531.728512,1,0,172868.200417
15923,1,1,0,687,30,78824.350975,1,0,121731.816514
15924,1,0,0,636,28,119827.493793,1,0,100398.241511


In [30]:
df_upsampled['Exited'].value_counts()

1    7963
0    7963
Name: Exited, dtype: int64

### The dataset is balance now. We will seperate x and y.

In [31]:
y=df_upsampled.iloc[:,0]
y

0        1
1        0
2        1
3        0
4        0
        ..
15921    1
15922    1
15923    1
15924    1
15925    1
Name: Exited, Length: 15926, dtype: int64

In [32]:
x=df_upsampled.iloc[:,1:]
x

Unnamed: 0,Geography,Gender,CreditScore,Age,Balance,NumOfProducts,IsActiveMember,EstimatedSalary
0,0,0,619,42,119827.493793,1,1,101348.880000
1,2,0,608,41,83807.860000,1,1,112542.580000
2,0,0,502,42,159660.800000,3,0,113931.570000
3,0,0,699,39,119827.493793,2,0,93826.630000
4,2,0,850,43,125510.820000,1,1,79084.100000
...,...,...,...,...,...,...,...,...
15921,1,0,458,42,104455.048209,1,1,183689.618873
15922,1,0,632,34,174531.728512,1,0,172868.200417
15923,1,0,687,30,78824.350975,1,0,121731.816514
15924,0,0,636,28,119827.493793,1,0,100398.241511


### We will slpit the data and build the model

In [33]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.30,random_state=1)

In [34]:
from sklearn.metrics import classification_report,confusion_matrix

### We will check accuracy for different classification model.

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost
from xgboost import XGBClassifier

In [52]:
def mymodel(model):
    model.fit(xtrain,ytrain)
    ypred=model.predict(xtest)
    print(classification_report(ytest,ypred))
    print(confusion_matrix(ytest,ypred))

In [53]:
lr=LogisticRegression()
knn=KNeighborsClassifier()
svm=SVC()
dt=DecisionTreeClassifier()
rf=RandomForestClassifier()
ad=AdaBoostClassifier()
gb=GradientBoostingClassifier()
xgb=XGBClassifier()

In [54]:
mymodel(lr)

              precision    recall  f1-score   support

           0       0.66      0.69      0.67      2322
           1       0.69      0.67      0.68      2456

    accuracy                           0.68      4778
   macro avg       0.68      0.68      0.68      4778
weighted avg       0.68      0.68      0.68      4778

[[1599  723]
 [ 821 1635]]


In [55]:
mymodel(knn)

              precision    recall  f1-score   support

           0       0.67      0.60      0.63      2322
           1       0.65      0.72      0.68      2456

    accuracy                           0.66      4778
   macro avg       0.66      0.66      0.66      4778
weighted avg       0.66      0.66      0.66      4778

[[1393  929]
 [ 699 1757]]


In [56]:
mymodel(svm)

              precision    recall  f1-score   support

           0       0.49      0.94      0.65      2322
           1       0.61      0.09      0.15      2456

    accuracy                           0.50      4778
   macro avg       0.55      0.51      0.40      4778
weighted avg       0.55      0.50      0.39      4778

[[2183  139]
 [2242  214]]


In [57]:
mymodel(dt)

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      2322
           1       0.78      0.80      0.79      2456

    accuracy                           0.78      4778
   macro avg       0.78      0.78      0.78      4778
weighted avg       0.78      0.78      0.78      4778

[[1771  551]
 [ 494 1962]]


In [58]:
mymodel(rf)

              precision    recall  f1-score   support

           0       0.85      0.84      0.84      2322
           1       0.85      0.86      0.85      2456

    accuracy                           0.85      4778
   macro avg       0.85      0.85      0.85      4778
weighted avg       0.85      0.85      0.85      4778

[[1951  371]
 [ 354 2102]]


In [59]:
mymodel(ad)

              precision    recall  f1-score   support

           0       0.81      0.81      0.81      2322
           1       0.82      0.82      0.82      2456

    accuracy                           0.81      4778
   macro avg       0.81      0.81      0.81      4778
weighted avg       0.81      0.81      0.81      4778

[[1870  452]
 [ 436 2020]]


In [60]:
mymodel(gb)

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      2322
           1       0.84      0.83      0.83      2456

    accuracy                           0.83      4778
   macro avg       0.83      0.83      0.83      4778
weighted avg       0.83      0.83      0.83      4778

[[1920  402]
 [ 413 2043]]


In [61]:
mymodel(xgb)

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      2322
           1       0.84      0.86      0.85      2456

    accuracy                           0.85      4778
   macro avg       0.85      0.85      0.85      4778
weighted avg       0.85      0.85      0.85      4778

[[1924  398]
 [ 340 2116]]


### As Randomforest classifier has given best accuracy we will further improve it by hyperparameter tunning.

In [47]:
for i in range(1,50):
    rf1=RandomForestClassifier(max_depth=i)
    rf1.fit(xtrain,ytrain)
    ypred=rf1.predict(xtest)
    print(i,"----",accuracy_score(ytest,ypred))

1 ---- 0.7840100460443701
2 ---- 0.797614064462118
3 ---- 0.8049393051485978
4 ---- 0.8041021347844286
5 ---- 0.8112180828798661
6 ---- 0.8139388865634156
7 ---- 0.8172875680200921
8 ---- 0.8250313938886563
9 ---- 0.8325659271661783
10 ---- 0.834658853076601
11 ---- 0.8409376308078694
12 ---- 0.8417748011720385
13 ---- 0.8482628714943491
14 ---- 0.8480535789033068
15 ---- 0.8493093344495605
16 ---- 0.8478442863122645
17 ---- 0.8491000418585182
18 ---- 0.8461699455839263
19 ---- 0.8507743825868564
20 ---- 0.8484721640853914
21 ---- 0.8503557974047719
22 ---- 0.8484721640853914
23 ---- 0.8465885307660109
24 ---- 0.8478442863122645
25 ---- 0.8463792381749686
26 ---- 0.8507743825868564
27 ---- 0.84742570113018
28 ---- 0.84742570113018
29 ---- 0.8491000418585182
30 ---- 0.8516115529510255
31 ---- 0.8516115529510255
32 ---- 0.8470071159480954
33 ---- 0.8486814566764337
34 ---- 0.8499372122226874
35 ---- 0.8461699455839263
36 ---- 0.8482628714943491
37 ---- 0.8480535789033068
38 ---- 0.849937

In [48]:
for i in range(1,50):
    rf2=RandomForestClassifier(min_samples_leaf=i)
    rf2.fit(xtrain,ytrain)
    ypred=rf2.predict(xtest)
    print(i,"----",accuracy_score(ytest,ypred))

1 ---- 0.848890749267476
2 ---- 0.8467978233570531
3 ---- 0.8428212641272499
4 ---- 0.8411469233989117
5 ---- 0.8373796567601507
6 ---- 0.8371703641691084
7 ---- 0.837588949351193
8 ---- 0.8354960234407702
9 ---- 0.8354960234407702
10 ---- 0.8359146086228547
11 ---- 0.8319380493930515
12 ---- 0.8325659271661783
13 ---- 0.8317287568020092
14 ---- 0.8313101716199247
15 ---- 0.8321473419840938
16 ---- 0.8273336123901214
17 ---- 0.8254499790707409
18 ---- 0.8275429049811637
19 ---- 0.8252406864796986
20 ---- 0.8225198827961491
21 ---- 0.8235663457513605
22 ---- 0.8241942235244872
23 ---- 0.8227291753871913
24 ---- 0.8200083717036417
25 ---- 0.818752616157388
26 ---- 0.8208455420678108
27 ---- 0.8229384679782336
28 ---- 0.8229384679782336
29 ---- 0.8177061532021767
30 ---- 0.8197990791125994
31 ---- 0.8166596902469653
32 ---- 0.8177061532021767
33 ---- 0.8174968606111344
34 ---- 0.8174968606111344
35 ---- 0.8166596902469653
36 ---- 0.8170782754290498
37 ---- 0.8133110087902888
38 ---- 0.811

In [49]:
for i in range(2,50):
    rf3=RandomForestClassifier(min_samples_split=i)
    rf3.fit(xtrain,ytrain)
    ypred=rf3.predict(xtest)
    print(i,"----",accuracy_score(ytest,ypred))

2 ---- 0.8495186270406028
3 ---- 0.8480535789033068
4 ---- 0.8467978233570531
5 ---- 0.8491000418585182
6 ---- 0.8463792381749686
7 ---- 0.8484721640853914
8 ---- 0.845123482628715
9 ---- 0.8465885307660109
10 ---- 0.8438677270824613
11 ---- 0.8421933863541231
12 ---- 0.8453327752197572
13 ---- 0.8444956048555882
14 ---- 0.8419840937630808
15 ---- 0.8421933863541231
16 ---- 0.8405190456257848
17 ---- 0.8405190456257848
18 ---- 0.8415655085809962
19 ---- 0.8424026789451653
20 ---- 0.837588949351193
21 ---- 0.8386354123064044
22 ---- 0.8401004604437003
23 ---- 0.8403097530347425
24 ---- 0.8392632900795312
25 ---- 0.8377982419422353
26 ---- 0.8388447048974467
27 ---- 0.8388447048974467
28 ---- 0.8386354123064044
29 ---- 0.8403097530347425
30 ---- 0.8390539974884889
31 ---- 0.8380075345332775
32 ---- 0.8380075345332775
33 ---- 0.8357053160318125
34 ---- 0.8390539974884889
35 ---- 0.8367517789870239
36 ---- 0.8369610715780661
37 ---- 0.8350774382586856
38 ---- 0.8344495604855588
39 ---- 0.8

In [62]:
rf4=RandomForestClassifier(max_depth=31,min_samples_leaf=1,min_samples_split=2)
mymodel(rf4)

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      2322
           1       0.85      0.86      0.85      2456

    accuracy                           0.85      4778
   macro avg       0.85      0.85      0.85      4778
weighted avg       0.85      0.85      0.85      4778

[[1961  361]
 [ 354 2102]]


### Best accuracy by Randomforest is 85%

### We will check the prediction of model

In [64]:
a=int(input("enter the value of Geography "))                      # range (0-2)
b= int(input("enter the value of Gender "))                        # range (0-1)
c=float(input("enter the value of creditscore "))                  # range (350-850)
d=int(input("enter the value of Age "))                            # range (18-92)
e=float(input("enter the value of Balance "))                      # range (3768-250898)
f=int(input("enter the value of number of product "))              # range (1-4)
g=float(input("enter the value of estmsted salary "))              # range (11-200000)
h=int(input("enter the value of isactivemember "))                 # range (0-1)

m=rf4.predict([[a,b,c,d,e,f,g,h]])[0]
print("The new customer will classify in ",m,"Class")   

enter the value of Geography 1
enter the value of Gender 1
enter the value of creditscore 450.5
enter the value of Age 45
enter the value of Balance 75550.45
enter the value of number of product 3
enter the value of estmsted salary 155575.45
enter the value of isactivemember 1
The new customer will classify in  1 Class


## The customer will quit the service of the bank on above condition.