#### Churn Modelling

### 1. Loading the dataset

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import joblib
import os


In [12]:
df = pd.read_csv('Churn_Modelling.csv')

### 2. Exploratory  Data Analysis

#### 2.1 Quick look at the dataset 

In [13]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


There are no missing values and  correct type of data

In [15]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [16]:
### Drop the unnecessary columns
columns_to_drop = ['RowNumber','CustomerId','Surname']
df = df.drop(columns=columns_to_drop) # inplace=True, drop the column from the df


### Data Preprocessing

In [17]:
df.head(2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [29]:
import os
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from pathlib import Path
import sys

FEATURES_DROP  = ['RowNumber','CustomerId','Surname']

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, variables_to_drop=None):
        # If variables_to_drop is None, default to config.FEATURES_DROP
        self.variables_to_drop = variables_to_drop or FEATURES_DROP
    
    def fit(self, X, y=None):
        # No fitting needed for this transformer
        return self
    
    def transform(self, X):
        # Drop the specified columns from the DataFrame
        X = X.drop(columns=self.variables_to_drop)
        return X

# Transformer to encode and create dummy variables
class EncodeAndBind(BaseEstimator, TransformerMixin):
    def __init__(self, encode=None, dummy=None):
        self.encode = encode
        self.dummy = dummy
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Replace 'Male' with 0 and 'Female' with 1
        X[self.encode].replace({'Male': 0, 'Female': 1}, inplace=True)
        # Create dummy variables
        X = pd.get_dummies(X, columns=[self.dummy])
        # Replace boolean values with 1 and 0
        X.replace({True: 1, False: 0}, inplace=True)
        return X

# Transformer to normalize specified variables
class Scale(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None):
        self.variables = variables or config.FEATURES_TO_SCALE
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # Scale each variable to range 0-1
        for variable in self.variables:
            X[variable] = (X[variable] - X[variable].min()) / (X[variable].max() - X[variable].min())
        return X

In [34]:
DropColumns(variables_to_drop=['RowNumber', 'CustomerId', 'Surname'])

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
classification_pipeline = Pipeline(
    steps =  [(
    'Drop_columns', DropColumns(variables_to_drop =['RowNumber','CustomerId','Surname'])),
    ('Encode_and_bind', EncodeAndBind(encode='Gender', dummy='Geography')),
    ('Scale',Scale(variables=['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary'])),
    ('Model', LogisticRegression(random_state=12))])

Pipeline(steps=[('Drop_columns',
                 DropColumns(variables_to_drop=['RowNumber', 'CustomerId',
                                                'Surname'])),
                ('Encode_and_bind',
                 EncodeAndBind(dummy='Geography', encode='Gender')),
                ('Scale',
                 Scale(variables=['CreditScore', 'Age', 'Tenure', 'Balance',
                                  'NumOfProducts', 'EstimatedSalary'])),
                ('Model', LogisticRegression(random_state=12))])


In [40]:
from sklearn.model_selection import train_test_split
import joblib
def perform_pipeline():
    df = pd.read_csv('Churn_Modelling.csv')
    X  = df.drop(columns=['Exited'])
    y = df['Exited']
    X_train, X_test, y_train, y_test = train_test_split(X,y)
    test_data = X_test.copy()
    test_data['Exited'] = y_test
    test_data.to_csv('Testing.csv')
    classification_pipeline.fit(X_train,y_train)
    y_pred = classification_pipeline.predict(X_test)
    ## save model to working directory
    joblib.dump(classification_pipeline, 'classification_pipeline_model.pkl')
    print("Model saved to classification_pipeline_model.pkl")
if __name__=='__main__':
    perform_pipeline()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[self.encode].replace({'Male': 0, 'Female': 1}, inplace=True)
  X[self.encode].replace({'Male': 0, 'Female': 1}, inplace=True)
  X.replace({True: 1, False: 0}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[self.encode].replace({'Male': 0, 'Female': 1}, inplace=

In [2]:

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, variables_to_drop=None):
        self.variables_to_drop = variables_to_drop or config.FEATURES_DROP
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.drop(columns=self.variables_to_drop)
        return X

        return self
# Transformer to encode and create dummy variables
class EncodeAndBind(BaseEstimator, TransformerMixin):
    def __init__(self, encode=None, dummy=None):
        self.encode = encode
        self.dummy = dummy
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.encode].replace({'Male': 0, 'Female': 1}, inplace=True)
        X = pd.get_dummies(X, columns=[self.dummy])
        X.replace({True: 1, False: 0}, inplace=True)
        return X

# Transformer to normalize specified variables
class Scale(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None):
        self.variables = variables or config.FEATURES_TO_SCALE
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        for variable in self.variables:
            X[variable] = (X[variable] - X[variable].min()) / (X[variable].max() - X[variable].min())
        return X


In [31]:
df = pd.get_dummies(data = df, columns=['Geography'])