In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# Load the raw churn dataset; the path is relative to the working directory.

data_frame = pd.read_csv("Churn_Modelling.csv")
# Fixed seed so the five-row preview is identical on every Restart & Run All.
data_frame.sample(5, random_state=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
7044,7045,15738487,Leworthy,678,France,Male,26,3,0.0,2,1,0,4989.33,0
4914,4915,15679062,Morrison,734,Germany,Female,47,10,91522.04,2,1,1,138835.91,0
7572,7573,15567919,Lazarev,586,Germany,Male,37,8,167735.69,2,0,1,104665.79,0
3710,3711,15601796,Chizuoke,645,France,Male,30,1,125739.26,1,1,1,193441.23,0
5184,5185,15677146,Obiajulu,728,France,Female,28,4,142243.54,2,1,0,33074.51,0


In [3]:
# Summarise the frame: one line per column with its dtype and non-null count
# (a non-null count below the total number of rows would mean missing values).

data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [4]:
# RowNumber, CustomerId and Surname only identify a row/customer and are not
# used as features, so they are removed.
# A non-inplace drop with errors="ignore" keeps the cell re-runnable: running it
# a second time is a no-op instead of raising KeyError on the missing columns.

data_frame = data_frame.drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")

In [5]:
# Geography is a text (object) column: list its distinct values and how often
# each occurs before deciding whether to drop it or encode it as numbers.

geography_counts = data_frame["Geography"].value_counts()
geography_counts

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [6]:
# Geography has only three distinct values, so each one is encoded as an
# integer code.
# Series.map is used rather than Series.replace: replace's implicit
# object -> int64 downcast is deprecated in recent pandas (FutureWarning).
# NOTE(review): codes 1/2/3 impose an artificial order on a nominal feature;
# consider one-hot encoding, since a linear model is fitted further down.

geography_codes = {"France": 1, "Germany": 2, "Spain": 3}
# Skip when the column is already numeric so re-running the cell is a no-op.
# astype raises if an unmapped value produced a NaN, rather than letting an
# unknown country slip through silently.
if not pd.api.types.is_numeric_dtype(data_frame["Geography"]):
    data_frame["Geography"] = data_frame["Geography"].map(geography_codes).astype("int64")

In [7]:
# Inspect Geography after the encoding step to confirm it now holds integer codes.
data_frame["Geography"]

0       1
1       3
2       1
3       1
4       3
       ..
9995    1
9996    1
9997    1
9998    2
9999    1
Name: Geography, Length: 10000, dtype: int64

In [8]:
# Gender is a text (object) column: list its distinct values and how often
# each occurs before deciding whether to drop it or encode it as numbers.

gender_counts = data_frame["Gender"].value_counts()
gender_counts

Male      5457
Female    4543
Name: Gender, dtype: int64

In [9]:
# Gender has only two distinct values, so each one is encoded as an integer code.
# Series.map is used rather than Series.replace: replace's implicit
# object -> int64 downcast is deprecated in recent pandas (FutureWarning).

gender_codes = {"Male": 1, "Female": 2}
# Skip when the column is already numeric so re-running the cell is a no-op.
# astype raises if an unmapped value produced a NaN, rather than letting an
# unexpected label slip through silently.
if not pd.api.types.is_numeric_dtype(data_frame["Gender"]):
    data_frame["Gender"] = data_frame["Gender"].map(gender_codes).astype("int64")

In [10]:
# Inspect Gender after the encoding step to confirm it now holds integer codes.
data_frame["Gender"] 

0       2
1       2
2       2
3       2
4       2
       ..
9995    1
9996    1
9997    2
9998    1
9999    2
Name: Gender, Length: 10000, dtype: int64

In [11]:
# List the dtype of every remaining column to check that none is still object/text.
data_frame.dtypes

CreditScore          int64
Geography            int64
Gender               int64
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [12]:
# Work on an independent copy. A plain assignment would only alias data_frame,
# so the in-place scaling applied to data_frame2 in the next step would also
# overwrite the unscaled values held in data_frame.
data_frame2 = data_frame.copy()

In [13]:
# Feature scaling: rescale the wide-range numeric columns with min-max scaling
# so that they share a common range before a model is fitted.
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale; every other column is left as it is.
cols_to_scale = [
    'CreditScore',
    'Age',
    'Balance',
    'EstimatedSalary',
    'Tenure',
    'NumOfProducts',
]

# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split that happens later in this notebook, so the fitted min/max also
# reflect rows that end up in the test set.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_frame2[cols_to_scale])

# Overwrite the original columns with their scaled counterparts.
data_frame2[cols_to_scale] = scaled_values

data_frame2.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0.538,1,2,0.324324,0.2,0.0,0.0,1,1,0.506735,1
1,0.516,3,2,0.310811,0.1,0.334031,0.0,0,1,0.562709,0
2,0.304,1,2,0.324324,0.8,0.636357,0.666667,1,0,0.569654,1
3,0.698,1,2,0.283784,0.1,0.0,0.333333,0,0,0.46912,0
4,1.0,3,2,0.337838,0.2,0.500246,0.0,1,1,0.3954,0


In [14]:
# Separate the target from the predictors: y is the Exited column, X is every
# other column of the frame.

y = data_frame2['Exited']
X = data_frame2.drop(columns=['Exited'])

In [15]:
# Now split the features (X) and target (y) from the previous cell into train
# and test partitions: 20% of the rows are held out, and random_state pins the
# shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)

# The MinMaxScaler used earlier was fitted on all rows, so its min/max include
# the rows that just became the test set (data leakage). Refit min-max scaling
# on the training rows only and apply that fit to both partitions. Min-max
# scaling is unaffected by a preceding positive linear rescale, so the result
# equals scaling the raw columns with training-only statistics. Test values
# may consequently fall slightly outside [0, 1]; that is expected.
X_train, X_test = X_train.copy(), X_test.copy()  # own the data before assigning into it
train_scaler = MinMaxScaler().fit(X_train[cols_to_scale])
X_train[cols_to_scale] = train_scaler.transform(X_train[cols_to_scale])
X_test[cols_to_scale] = train_scaler.transform(X_test[cols_to_scale])

X_train

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
7751,0.800,3,2,0.283784,0.6,0.000000,0.333333,0,0,0.096273
4154,0.752,1,1,0.216216,0.3,0.000000,0.333333,1,0,0.981478
3881,0.476,1,2,0.621622,0.3,0.000000,0.000000,1,1,0.948551
9238,0.846,1,2,0.432432,0.4,0.000000,0.333333,1,0,0.646869
5210,0.402,1,1,0.229730,0.7,0.517012,0.333333,0,0,0.434670
...,...,...,...,...,...,...,...,...,...,...
3046,0.616,2,2,0.175676,0.3,0.530108,0.000000,0,1,0.731708
9917,0.570,2,1,0.472973,0.8,0.466347,0.000000,0,1,0.618234
4079,0.466,1,2,0.175676,0.4,0.633639,0.000000,1,0,0.062643
2254,0.658,1,1,0.243243,0.6,0.588819,0.000000,0,1,0.862531


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit a logistic-regression classifier with scikit-learn's default settings.
model2 = LogisticRegression()
model2.fit(X_train, y_train)

# Predict on both partitions so the training fit can be compared with the
# held-out performance.
train_pred = model2.predict(X_train)
test_pred = model2.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but keeping the conventional order
# avoids wrong results if an asymmetric metric (precision, recall, ...) is
# swapped in later.
train_score = metrics.accuracy_score(y_train, train_pred)
test_score = metrics.accuracy_score(y_test, test_pred)

# One-line formatted summary of both scores as percentages; it is only built
# here, not printed.
logistic_regression_score = f"{'Logistic Regression score'.center(25)} | {'Train score'.center(15)} -> {round(train_score * 100,3)}% {'Test score'.center(15)} -> {round(test_score * 100,3)}%"

print("Train score -> ", train_score )
print("Test score -> " , test_score )

Train score ->  0.809875
Test score ->  0.8085


______  ___      
___   |/  /_____ 
__  /|_/ /_  __ \
_  /  / / / /_/ /
/_/  /_/  \____/ 
                 

