In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv("Churn_Modelling.csv")

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
X = df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Exited'])
y = df['Exited']

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
categorical_features = ['Geography', 'Gender']

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [14]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [15]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])


In [16]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [17]:
from sklearn.compose import ColumnTransformer

In [18]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [19]:
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [21]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)


In [22]:
y_val_pred = log_reg.predict(X_val)
print("Logistic Regression Validation Report:")
print(classification_report(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))

Logistic Regression Validation Report:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1607
           1       0.50      0.14      0.22       393

    accuracy                           0.80      2000
   macro avg       0.66      0.55      0.55      2000
weighted avg       0.76      0.80      0.76      2000

[[1552   55]
 [ 338   55]]


In [23]:
from sklearn.ensemble import RandomForestClassifier


rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

In [24]:
y_val_pred = rf_clf.predict(X_val)
print("Random Forest Validation Report:")
print(classification_report(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))

Random Forest Validation Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1607
           1       0.68      0.43      0.53       393

    accuracy                           0.85      2000
   macro avg       0.78      0.69      0.72      2000
weighted avg       0.84      0.85      0.84      2000

[[1528   79]
 [ 223  170]]


In [25]:
from sklearn.ensemble import GradientBoostingClassifier

In [26]:
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_clf.fit(X_train, y_train)


In [27]:
y_val_pred = gb_clf.predict(X_val)
print("Gradient Boosting Validation Report:")
print(classification_report(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))

Gradient Boosting Validation Report:
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1607
           1       0.71      0.44      0.54       393

    accuracy                           0.85      2000
   macro avg       0.79      0.70      0.73      2000
weighted avg       0.84      0.85      0.84      2000

[[1538   69]
 [ 221  172]]


In [30]:
def prediction_fun(cs, age, tenure, balance, num_of_products, has_card, am, es, geo, gender, cols):
    geo_spain = 0
    geo_germany = 0
    gender_male = 0
    
    if geo == 'Germany':
        geo_germany = 1
    elif geo == 'Spain':
        geo_spain = 1
        
    if gender == 'Male':
        gender_male = 1
    
    data_points = pd.DataFrame(columns = cols)
    vals = [cs, age, tenure, balance, num_of_products, has_card, am, es, geo_germany, geo_spain, gender_male]
    data_points.loc[len(data_points.index)] = vals
    
    scaler = StandardScaler()
    features = list(X_train.columns)

    for col in features:
        data_points[col] = scaler.fit_transform(data_points[col].to_numpy().reshape(-1,1))
        
    predict = svm_classifier.predict(data_points)[0]
    
    if predict == 0:
        
        print("This Customer is more likely to Stay")
    
    else:
        
        print("This Customer is more likely to exit the bank")
 