# Churn modeling dataset

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Churn_Modelling.csv')
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


# drop and check missing values

In [4]:
data = df.iloc[:,3:]
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
data.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

#### We conclude no missing data present

In [6]:
data.Geography = data.Geography.astype('category')
data.Geography = data.Geography.cat.codes
data.Gender= data.Gender.astype('category')
data.Gender = data.Gender.cat.codes

In [7]:
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [8]:
data = pd.get_dummies(data,columns=['Geography','Gender'],drop_first=True)
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_1,Geography_2,Gender_1
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


part 3 is outlier treatment

don't perform OUTLIER treatment unnescesirily in classification problem

BUT

in Regression problem must rid of outliers

#### Instead of handling outliers perform FEATURE scalinng 

But

we are working on Ensable technique RF feature scalling also not required
* part 4 is feature scalling

part 5 is check imbalance dataset

In [9]:
data.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

maj_Class >= 2* min_class is the condition
7963 > 2*2037
* it's Data imbalance problem
so go with oversampling

do data split into independent and dependent variable

In [10]:
x = data.drop(['Exited'],axis=1)

y = data['Exited']

In [13]:
# So we do have imbalance dataset, so have to use oversampling method to make our data balance
import imblearn

# Status of imbalance
# method is oversampling
# majority class 0 -- 7963
# minority class 1 -- 2037

# status of balnce
# majority class 0 -- 7963
# minority class 1 -- 7963

from imblearn.over_sampling import RandomOverSampler
over = RandomOverSampler()
x_over,y_over = over.fit_resample(x,y)



In [15]:
x.shape, x_over.shape

((10000, 11), (15926, 11))

In [24]:
# split the data trainig and testing for model building and evaluation
from sklearn.model_selection import train_test_split

xtrain,xtest,ytrain,ytest = train_test_split(x_over,y_over,random_state=102,test_size=0.25)

print(xtrain.shape,ytrain.shape)

(11944, 11) (11944,)


### Bagging model building

In [26]:
from sklearn.ensemble import BaggingClassifier
bag = BaggingClassifier()

bag.fit(xtrain,ytrain)

In [27]:
y_pred_train = bag.predict(xtrain)
y_pred_test = bag.predict(xtest)

In [34]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

print('Classification report on Training:\n',classification_report(ytrain,y_pred_train))
print('Classification report on Test:\n',classification_report(ytest,y_pred_test))

print('Matrix:\n',confusion_matrix(ytest,y_pred_test))

print('\nTrain Accuracy',accuracy_score(ytrain,y_pred_train))
print('Test Accuracy:',accuracy_score(ytest,y_pred_test))

Classification report on Training:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5944
           1       1.00      1.00      1.00      6000

    accuracy                           1.00     11944
   macro avg       1.00      1.00      1.00     11944
weighted avg       1.00      1.00      1.00     11944

Classification report on Test:
               precision    recall  f1-score   support

           0       0.97      0.89      0.93      2019
           1       0.90      0.97      0.93      1963

    accuracy                           0.93      3982
   macro avg       0.94      0.93      0.93      3982
weighted avg       0.94      0.93      0.93      3982

Matrix:
 [[1804  215]
 [  51 1912]]

Train Accuracy 0.9974882786336235
Test Accuracy: 0.9331993972877951


### Random Forest modelling

In [64]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200,criterion='entropy')

# fit the train dataset into model
rf.fit(xtrain,ytrain)

In [65]:
# Model prediction
rf_pred_train = rf.predict(xtrain)
rf_pred_test = rf.predict(xtest)

In [67]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

# Classification report on train and test
print("Classification report of Train:\n",classification_report(ytrain,rf_pred_train))
print("Classification report of Test:\n",classification_report(ytest,rf_pred_test))

# Accuracy of train and test
print("Accuracy of Train:\n",accuracy_score(ytrain,rf_pred_train))
print("Accuracy of Test:\n",accuracy_score(ytest,rf_pred_test))

# Confusion matrix of test
print("\nConfusion Matrix:\n",confusion_matrix(ytest,rf_pred_test))

Classification report of Train:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5944
           1       1.00      1.00      1.00      6000

    accuracy                           1.00     11944
   macro avg       1.00      1.00      1.00     11944
weighted avg       1.00      1.00      1.00     11944

Classification report of Test:
               precision    recall  f1-score   support

           0       0.98      0.91      0.95      2019
           1       0.92      0.98      0.95      1963

    accuracy                           0.95      3982
   macro avg       0.95      0.95      0.95      3982
weighted avg       0.95      0.95      0.95      3982

Accuracy of Train:
 1.0
Accuracy of Test:
 0.946258161727775

Confusion Matrix:
 [[1841  178]
 [  36 1927]]


### Decision Tree model building

In [79]:
from sklearn.tree import DecisionTreeClassifier

tr = DecisionTreeClassifier(criterion='gini')
tr.fit(xtrain,ytrain)


In [80]:
# predection of decesion tree
tr_predict_train = tr.predict(xtrain)
tr_predict_test =tr.predict(xtest)

In [81]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

# Classification report on train and test 

print('classificatin report on test\n',classification_report(ytest,tr_predict_test))
print('classificatin report on train\n',classification_report(ytrain,tr_predict_train))

# accuracy score on train and test

print('accuracy score on train\n',accuracy_score(ytrain,tr_predict_train))
print('accuracy score on test\n',accuracy_score(ytest,tr_predict_test))

# confusion matrix on test dataset

print('\nconfusion matrix on test:\n',confusion_matrix(ytest,tr_predict_test))

classificatin report on test
               precision    recall  f1-score   support

           0       0.98      0.85      0.91      2019
           1       0.86      0.98      0.92      1963

    accuracy                           0.91      3982
   macro avg       0.92      0.91      0.91      3982
weighted avg       0.92      0.91      0.91      3982

classificatin report on train
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5944
           1       1.00      1.00      1.00      6000

    accuracy                           1.00     11944
   macro avg       1.00      1.00      1.00     11944
weighted avg       1.00      1.00      1.00     11944

accuracy score on train
 1.0
accuracy score on test
 0.9116022099447514

confusion matrix on test:
 [[1709  310]
 [  42 1921]]


### Logistic model building

In [82]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(xtrain,ytrain)

In [83]:
lr_pred_train = lr.predict(xtrain)
lr_pred_test = lr.predict(xtest)

In [88]:
from sklearn.metrics import accuracy_score

# Classification report
print('classification_report for Train:\n',classification_report(ytrain,lr_pred_train))
print('classification_report for test:\n',classification_report(ytest,y_pred_test))

# Accuracy Score
print('Accuracy score for Train:\n',accuracy_score(ytrain,lr_pred_train))
print('accuracy_score for test:\n',accuracy_score(ytest,y_pred_test))


classification_report for Train:
               precision    recall  f1-score   support

           0       0.67      0.65      0.66      5944
           1       0.66      0.68      0.67      6000

    accuracy                           0.66     11944
   macro avg       0.66      0.66      0.66     11944
weighted avg       0.66      0.66      0.66     11944

classification_report for test:
               precision    recall  f1-score   support

           0       0.97      0.89      0.93      2019
           1       0.90      0.97      0.93      1963

    accuracy                           0.93      3982
   macro avg       0.94      0.93      0.93      3982
weighted avg       0.94      0.93      0.93      3982

Accuracy score for Train:
 0.6632618888144675
accuracy_score for test:
 0.9331993972877951


## It's clear that we reject Logistic regression model for this dataset beacuse pathetic result nad overfit.

# Conclusion: RF is best model.