In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score


In [22]:
# Load cleaned dataset
Customer_Churn = pd.read_csv(r"D:\Bank_Churn_Project\data\cleaned_data.csv")
print("Dataset shape:", Customer_Churn.shape)
Customer_Churn.head()

Dataset shape: (10000, 18)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


In [23]:
# Drop unnecessary columns
Customer_Churn = Customer_Churn.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
Customer_Churn.head(2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456


Insight: The cleaned dataset is loaded. We will now convert categorical columns into numerical format and scale features.

### Encoding Categorical Features

In [24]:
# Encode Gender (binary → label encoding)
le = LabelEncoder()
Customer_Churn['Gender'] = le.fit_transform(Customer_Churn['Gender'])  # Female=0, Male=1

# OneHotEncode Geography & Card Type
categorical_cols = ['Geography', 'Card Type']
Customer_Churn = pd.get_dummies(Customer_Churn, columns=categorical_cols, drop_first=True)
print("Columns after encoding:\n", Customer_Churn.columns)


Columns after encoding:
 Index(['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited', 'Complain',
       'Satisfaction Score', 'Point Earned', 'Geography_Germany',
       'Geography_Spain', 'Card Type_GOLD', 'Card Type_PLATINUM',
       'Card Type_SILVER'],
      dtype='object')


Insight:
- Gender is converted to 0/1.
- Geography dummy columns: Germany, Spain → France is implied when both =0.
- Card Type dummy columns: GOLD, PLATINUM, SILVER → Diamond is implied when all =0.

### Feature Scaling

In [25]:
num_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                'EstimatedSalary', 'Satisfaction Score', 'Point Earned']

scaler = StandardScaler()
Customer_Churn[num_features] = scaler.fit_transform(Customer_Churn[num_features])
Customer_Churn.head()


Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Point Earned,Geography_Germany,Geography_Spain,Card Type_GOLD,Card Type_PLATINUM,Card Type_SILVER
0,-0.326221,0,0.293517,-1.04176,-1.225848,-0.911583,1,1,0.021886,1,1,-0.72113,-0.630839,False,False,False,False,False
1,-0.440036,0,0.198164,-1.387538,0.11735,-0.911583,0,1,0.216534,0,1,-0.009816,-0.666251,False,True,False,False,False
2,-1.536794,0,0.293517,1.032908,1.333053,2.527057,1,0,0.240687,1,1,-0.009816,-1.015942,False,False,False,False,False
3,0.501521,0,0.007457,-1.387538,-1.225848,0.807737,0,0,-0.108918,0,0,1.412812,-1.135457,False,False,True,False,False
4,2.063884,0,0.388871,-1.04176,0.785728,-0.911583,1,1,-0.365276,0,0,1.412812,-0.803472,False,True,True,False,False


Insight:
- Numerical features are scaled to mean=0, std=1.
- Scaling ensures features with large ranges (e.g., Balance, Salary) don’t dominate ML models.


### Define Features and Target

In [28]:
X = Customer_Churn.drop('Exited', axis=1)  # Features
y = Customer_Churn['Exited']               # Target

print("Feature shape:", X.shape)
print("Target distribution:\n", y.value_counts(normalize=True))

Feature shape: (10000, 17)
Target distribution:
 Exited
0    0.7962
1    0.2038
Name: proportion, dtype: float64


Insight:
- Features (X) include all processed numerical & categorical columns.
- Target (y) is binary: 1=Exited, 0=Stayed.
- Class imbalance may exist → consider in model evaluation.


### Train-Test Split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (8000, 17)
Test set shape: (2000, 17)


Insight:
- Split data into 80% train, 20% test.
- stratify=y ensures same churn ratio in both sets.


### Save Processed Dataset

In [30]:
processed_path = "../data/processed_data.csv"
pd.concat([X, y], axis=1).to_csv(processed_path, index=False)
print(f"Processed dataset saved to {processed_path}")

Processed dataset saved to ../data/processed_data.csv


Final Insight:
- Processed dataset is now ready for Step 4: Model Building.
- All categorical features safely encoded.
- Numerical features scaled.
- Train-test split done.