# Feature Engineering

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
df=pd.read_csv('Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


#### Feature encoding

In [11]:
df = pd.get_dummies(df, columns=['Geography'], drop_first=True)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,1,15634602,Hargrave,619,Female,42,2,0.0,1,1,1,101348.88,1,False,False
1,2,15647311,Hill,608,Female,41,1,83807.86,1,0,1,112542.58,0,False,True
2,3,15619304,Onio,502,Female,42,8,159660.8,3,1,0,113931.57,1,False,False
3,4,15701354,Boni,699,Female,39,1,0.0,2,0,0,93826.63,0,False,False
4,5,15737888,Mitchell,850,Female,43,2,125510.82,1,1,1,79084.1,0,False,True


In [13]:
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,1,15634602,Hargrave,619,0,42,2,0.0,1,1,1,101348.88,1,False,False
1,2,15647311,Hill,608,0,41,1,83807.86,1,0,1,112542.58,0,False,True
2,3,15619304,Onio,502,0,42,8,159660.8,3,1,0,113931.57,1,False,False
3,4,15701354,Boni,699,0,39,1,0.0,2,0,0,93826.63,0,False,False
4,5,15737888,Mitchell,850,0,43,2,125510.82,1,1,1,79084.1,0,False,True


Dropping "RowNumber","CustomerId" and "Surname" as we don't need them as they don’t help the model and can introduce noise 

In [15]:
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

#### Feature Scaling

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_cols = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']

df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

#### Creating new Features

In [21]:
df['BalanceSalaryRatio'] = df['Balance'] / (df['EstimatedSalary'] + 1)
df['AgeTenureRatio'] = df['Age'] / (df['Tenure'] + 1)

In [23]:
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,BalanceSalaryRatio,AgeTenureRatio
0,-0.326221,0,0.293517,2,-1.225848,1,1,1,0.021886,1,False,False,-1.199593,0.097839
1,-0.440036,0,0.198164,1,0.11735,1,0,1,0.216534,0,False,True,0.096463,0.099082
2,-1.536794,0,0.293517,8,1.333053,3,1,0,0.240687,1,False,False,1.074448,0.032613
3,0.501521,0,0.007457,1,-1.225848,2,0,0,-0.108918,0,False,False,-1.375684,0.003728
4,2.063884,0,0.388871,2,0.785728,1,1,1,-0.365276,0,False,True,1.237904,0.129624


Final dataset after the feature engineering which contains only numeric columns which is helpfull while training and testing the dataset

#### Spliting into Train/Test

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

X = df.drop(columns=['Exited'])
y = df['Exited']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
# Save the cleaned data after feature engineering
df.to_csv("processed_churn_data.csv", index=False)