In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [5]:
# Read the cleaned dataset (path is relative to the notebook's working directory)
# and display it
df = pd.read_csv(filepath_or_buffer='Cleaned_Data.csv')
df


Unnamed: 0,CustomerId,Surname,CreditScore,Country,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,0,42,2,119808.30,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,15701354,Boni,699,France,0,39,1,119808.30,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9621,15606229,Obijiaku,771,France,1,39,5,119808.30,2,1,0,96270.64,0
9622,15569892,Johnstone,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9623,15584532,Liu,709,France,0,36,7,119808.30,1,0,1,42085.58,1
9624,15682355,Sabbatini,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [7]:
# Remove every row that has at least one missing value (a no-op when the file
# has none), then display the result
df = df.dropna(axis=0, how='any')
df



Unnamed: 0,CustomerId,Surname,CreditScore,Country,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,0,42,2,119808.30,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,15701354,Boni,699,France,0,39,1,119808.30,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9621,15606229,Obijiaku,771,France,1,39,5,119808.30,2,1,0,96270.64,0
9622,15569892,Johnstone,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9623,15584532,Liu,709,France,0,36,7,119808.30,1,0,1,42085.58,1
9624,15682355,Sabbatini,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [9]:
# Feature engineering: derive ratio, bucket and flag columns from the raw fields.
# Every new column is computed from the original columns only, so the order of
# the entries below affects only the resulting column order.
# The "+ 1" in each denominator avoids division by zero.
# NOTE(review): the decile bins and the mean balance are computed over the whole
# frame, i.e. before any train/test split.
df = df.assign(
    Balance_to_Salary_Ratio=lambda d: d['Balance'] / (d['EstimatedSalary'] + 1),
    Age_to_Tenure_Ratio=lambda d: d['Age'] / (d['Tenure'] + 1),
    TotalValue=lambda d: d['Balance'] + d['EstimatedSalary'],
    ProductsPerTenure=lambda d: d['NumOfProducts'] / (d['Tenure'] + 1),
    # Decile index (0-9) of the credit score
    CreditScoreGroup=lambda d: pd.qcut(d['CreditScore'], q=10, labels=False),
    # Ranking first makes the values unique, so repeated balances cannot
    # produce duplicate bin edges
    BalanceGroup=lambda d: pd.qcut(d['Balance'].rank(method='first'), q=10, labels=False),
    # 1 when the credit score is below 600 and the balance is above average
    IsHighRisk=lambda d: (
        (d['CreditScore'] < 600) & (d['Balance'] > d['Balance'].mean())
    ).astype(int),
    # Weighted blend of balance, salary and tenure
    CustomerValue=lambda d: (
        d['Balance'] * 0.4 + d['EstimatedSalary'] * 0.3 + d['Tenure'] * 0.2
    ),
    IsRetirementAge=lambda d: (d['Age'] > 60).astype(int),
    BalancePerAge=lambda d: d['Balance'] / (d['Age'] + 1),
)
df


Unnamed: 0,CustomerId,Surname,CreditScore,Country,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,...,Balance_to_Salary_Ratio,Age_to_Tenure_Ratio,TotalValue,ProductsPerTenure,CreditScoreGroup,BalanceGroup,IsHighRisk,CustomerValue,IsRetirementAge,BalancePerAge
0,15634602,Hargrave,619,France,0,42,2,119808.30,1,1,...,1.182126,14.000000,221157.18,0.333333,3,3,0,78328.384,0,2786.239535
1,15647311,Hill,608,Spain,0,41,1,83807.86,1,0,...,0.744670,20.500000,196350.44,0.500000,3,0,0,67286.118,0,1995.425238
2,15619304,Onio,502,France,0,42,8,159660.80,3,1,...,1.401362,4.666667,273592.37,0.333333,0,9,1,98045.391,0,3713.041860
3,15701354,Boni,699,France,0,39,1,119808.30,2,0,...,1.276898,19.500000,213634.93,1.000000,6,3,0,76071.509,0,2995.207500
4,15737888,Mitchell,850,Spain,0,43,2,125510.82,1,1,...,1.587035,14.333333,204594.92,0.333333,9,7,0,73929.958,0,2852.518636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9621,15606229,Obijiaku,771,France,1,39,5,119808.30,2,1,...,1.244482,6.500000,216078.94,0.333333,8,6,0,76805.512,0,2995.207500
9622,15569892,Johnstone,516,France,1,35,10,57369.61,1,1,...,0.564102,3.181818,159069.38,0.090909,0,0,0,53459.775,0,1593.600278
9623,15584532,Liu,709,France,0,36,7,119808.30,1,0,...,2.846710,4.500000,161893.88,0.125000,7,6,0,60550.394,0,3238.062162
9624,15682355,Sabbatini,772,Germany,1,42,3,75075.31,2,1,...,0.808222,10.500000,167963.83,0.500000,8,0,0,57897.280,0,1745.937442


In [11]:
# Define target and features.
# 'Unnamed: 0' (the row index that was saved into the CSV), 'CustomerId' and
# 'Surname' identify rows rather than describe customers. Left in X, the first
# two would be scaled and fed to the model as numeric features, and 'Surname'
# would be one-hot encoded into one column per distinct name. They are removed
# here; errors='ignore' keeps the cell working if a column is already absent.
TARGET_COL = 'Exited'
ID_COLS = ['Unnamed: 0', 'CustomerId', 'Surname']

# The target is dropped strictly (a missing target column should still raise).
X = df.drop(columns=[TARGET_COL]).drop(columns=ID_COLS, errors='ignore')
y = df[TARGET_COL]



In [13]:
# Identify categorical and numerical columns.
# 'number' matches every numeric dtype (int32, int64, float32, float64, ...).
# Listing only int64/float64 would leave out integer columns of another width
# (e.g. when `.astype(int)` yields int32), and ColumnTransformer drops any
# column that is not listed in a transformer.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Column transformer for preprocessing: standardise the numeric features and
# one-hot encode the categorical ones.
# `drop='first'` is intentionally not used: together with
# handle_unknown='ignore', a category unseen during fit is encoded as all
# zeros, which would be indistinguishable from the dropped first category.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


In [15]:

# Chain the preprocessing step and a seeded random forest into one estimator,
# so both are fitted together
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)



In [17]:
# Train-test split: hold out 20% of the rows for the final evaluation.
# stratify=y keeps the class proportions of the target the same in the train
# and test sets, so the held-out metrics are not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



In [19]:
# Search space for GridSearchCV. The `classifier__` prefix addresses the
# parameters of the pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[10, 20, None],
    classifier__min_samples_split=[2, 5],
)



In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, scored by
# accuracy. n_jobs=-1 runs the candidate fits in parallel on all available
# cores instead of one after another; it changes only how the fits are
# scheduled, not what is fitted.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)





In [None]:
# Evaluation: predict on the held-out test set with the refitted best estimator
y_pred = grid_search.predict(X_test)

# Compute all metrics first, then report them together
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Best parameters found:", grid_search.best_params_)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", conf_matrix)
