# Predict customer churn or attrition using machine learning techniques. 

# --> Importing Required Libraires

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# -->Importing Dataset

In [2]:
data = pd.read_csv('Churn_Modelling.csv')
data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


# -->Apply Preprocessing for data cleaning

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [4]:
data.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [5]:
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


# -->Applying Label Encoding

In [6]:
label_encoder = LabelEncoder()
data['Geography'] = data[['Geography']].apply(label_encoder.fit_transform)
data['Gender'] = data[['Gender']].apply(label_encoder.fit_transform)
data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,0,0,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,0,0,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,0,0,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,2,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,0,1,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,0,1,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,0,0,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,1,1,42,3,75075.31,2,1,0,92888.52,1


# -->Features Extraction

In [7]:
X = data[['Geography','Age','Tenure','Balance','IsActiveMember','EstimatedSalary']]
y = data['Exited']

In [8]:
X

Unnamed: 0,Geography,Age,Tenure,Balance,IsActiveMember,EstimatedSalary
0,0,42,2,0.00,1,101348.88
1,2,41,1,83807.86,1,112542.58
2,0,42,8,159660.80,0,113931.57
3,0,39,1,0.00,0,93826.63
4,2,43,2,125510.82,1,79084.10
...,...,...,...,...,...,...
9995,0,39,5,0.00,0,96270.64
9996,0,35,10,57369.61,1,101699.77
9997,0,36,7,0.00,1,42085.58
9998,1,42,3,75075.31,0,92888.52


In [9]:
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

# -->Spliting Data for Train and Test

In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=33,random_state=43)

In [11]:
X_train

Unnamed: 0,Geography,Age,Tenure,Balance,IsActiveMember,EstimatedSalary
10,0,31,6,102016.72,0,80181.12
1242,1,30,4,114027.70,1,193716.56
4395,0,20,6,167685.56,0,57929.90
4079,0,31,4,158978.79,0,12538.92
7672,0,30,1,0.00,0,88146.86
...,...,...,...,...,...,...
8499,1,35,2,121968.11,1,188343.05
2064,0,44,6,0.00,1,159899.97
7985,1,27,2,96129.32,1,5983.70
2303,2,45,0,124693.48,1,187194.15


In [12]:
y_test

9415    0
6377    1
8019    0
7754    1
4961    0
7364    0
5316    0
4265    0
6302    0
8581    0
1667    0
42      0
6841    0
4794    0
9649    0
694     0
9677    1
5918    0
4908    1
1554    0
5728    0
466     0
7984    0
6772    0
2995    0
6347    0
1911    0
5149    0
6184    1
1377    0
4587    0
7205    0
4644    0
Name: Exited, dtype: int64

# -->Initalizing ML Models

# 1. Logistic Regression

In [13]:
lr = LogisticRegression()

In [25]:
lr.fit(X_train,y_train)

LogisticRegression()

In [29]:
lr.score(X_test,y_test)

0.8484848484848485

In [15]:
lr_predict = lr.predict(X_test)

In [16]:
print(classification_report(lr_predict,y_test))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        33
           1       0.00      0.00      0.00         0

    accuracy                           0.85        33
   macro avg       0.50      0.42      0.46        33
weighted avg       1.00      0.85      0.92        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 2. Random Forest

In [21]:
rf = RandomForestClassifier()

In [30]:
rf.fit(X_train,y_train)

RandomForestClassifier()

In [31]:
rf.score(X_test,y_test)

0.7272727272727273

In [23]:
rf_predict = rf.predict(X_test)

In [24]:
print(classification_report(rf_predict,y_test))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85        26
           1       0.40      0.29      0.33         7

    accuracy                           0.76        33
   macro avg       0.61      0.59      0.59        33
weighted avg       0.73      0.76      0.74        33

