#### 1. Import Dependencies

In [14]:
import os
import pandas as pd 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the feature-encoded churn CSV into `df` and preview its first five rows.
encoded_csv_path = "../Week 01/Data/processed/churnModelling_feature_encoded.csv"
df = pd.read_csv(encoded_csv_path)
df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Gender_Female,Geography_France,Geography_Germany,Geography_Spain
0,0,0,0,42.0,2,0.0,1,1,1,101348.88,1,1,True,True,False,False
1,1,1,1,41.0,1,83807.86,1,0,1,112542.58,0,1,True,False,False,True
2,2,2,2,42.0,8,159660.8,3,1,0,113931.57,1,0,True,True,False,False
3,3,3,3,38.91,1,0.0,2,0,0,93826.63,0,2,True,True,False,False
4,4,4,4,43.0,2,125510.82,1,1,1,79084.1,0,4,True,False,False,True


#### 2. Important Concepts

##### 2.1 Normalization vs Standardization

##### 2.1.1 What is Normalization

Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as Min-Max Scaling.

##### 2.1.2 What is Standardization

Standardization is a scaling technique in which the values are centered around the mean and scaled to unit standard deviation. This means that the mean of the attribute becomes zero and the resulting distribution has a standard deviation of one.


#### 3. Basic Processing

| Condition                                             | Min-Max Scaling                                | Standardization (Z-score)                         |
|-------------------------------------------------------|--------------------------------------------------|--------------------------------------------------|
| Data has a known, fixed range                         | ✅ Yes                                           | ❌ Not ideal                                     |
| Data contains outliers                                | ❌ Sensitive to outliers                         | ✅ More robust to outliers                        |
| Data is normally distributed                          | ❌ Not necessary                                 | ✅ Preferred                                      |
| Data is not normally distributed (e.g., skewed)       | ✅ If shape needs to be preserved                | ✅ Often works well after log-transform          |
| Model is distance-based (KNN, SVM)                    | ✅ Recommended                                   | ✅ Also acceptable                                |
| Model is neural network                               | ✅ Strongly recommended                          | ❌ May slow training                              |
| Model is linear or uses regularization                | ❌ Not ideal                                     | ✅ Helps with convergence                         |
| Input features need bounded values (0–1)              | ✅ Required                                      | ❌ Not bounded                                    |
| Applying PCA or LDA                                   | ❌ May distort variance                          | ✅ Required (centering needed)                   |
| Want to preserve original distribution shape          | ✅ Maintains feature shape                       | ✅ Maintains shape but centers data              |
| Working with tree-based models                        | ❌ Not needed                                    | ❌ Not needed                                     |


In [18]:
# Columns that are standardized (z-score) in place on `df`.
columns_need_to_be_scaled = ['Age','Tenure', 'Balance','EstimatedSalary']

# NOTE(review): the scaler is fit on every row of `df`; if a train/test split
# happens later, fitting on the training rows only would avoid leaking
# test-set statistics -- confirm against the downstream notebooks.
for col in columns_need_to_be_scaled:
    # A fresh scaler per column, so each column is centered and scaled with
    # its own mean and standard deviation.
    standard_scalar = StandardScaler()
    # StandardScaler expects a 2-D (n_samples, n_features) array. reshape(-1, 1)
    # lets NumPy infer the row count instead of hard-coding 10000, so the cell
    # works for a dataset of any length.
    column_2d = df[col].to_numpy().reshape(-1, 1)
    # Flatten the (n, 1) result back to 1-D before assigning it to the column.
    df[col] = standard_scalar.fit_transform(column_2d).ravel()

df

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Gender_Female,Geography_France,Geography_Germany,Geography_Spain
0,0,0,0,0.302983,-1.041760,-1.225848,1,1,1,0.021886,1,1,True,True,False,False
1,1,1,1,0.204867,-1.387538,0.117350,1,0,1,0.216534,0,1,True,False,False,True
2,2,2,2,0.302983,1.032908,1.333053,3,1,0,0.240687,1,0,True,True,False,False
3,3,3,3,-0.000196,-1.387538,-1.225848,2,0,0,-0.108918,0,2,True,True,False,False
4,4,4,4,0.401100,-1.041760,0.785728,1,1,1,-0.365276,0,4,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,9995,9995,0.008634,-0.004426,-1.225848,2,1,0,-0.066419,0,3,True,True,False,False
9996,9996,9996,9996,-0.383831,1.724464,-0.306379,1,1,1,0.027988,0,0,True,True,False,False
9997,9997,9997,9997,-0.285715,0.687130,-1.225848,1,0,1,-1.008643,1,2,True,True,False,False
9998,9998,9998,9998,0.302983,-0.695982,-0.022608,2,1,0,-0.125231,1,3,True,False,True,False


In [19]:
# Drop every column whose label starts with "Unnamed"
# (presumably index columns written by earlier to_csv calls -- verify upstream).
is_unnamed = df.columns.str.startswith("Unnamed")
df = df.loc[:, ~is_unnamed]
df

Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Gender_Female,Geography_France,Geography_Germany,Geography_Spain
0,0.302983,-1.041760,-1.225848,1,1,1,0.021886,1,1,True,True,False,False
1,0.204867,-1.387538,0.117350,1,0,1,0.216534,0,1,True,False,False,True
2,0.302983,1.032908,1.333053,3,1,0,0.240687,1,0,True,True,False,False
3,-0.000196,-1.387538,-1.225848,2,0,0,-0.108918,0,2,True,True,False,False
4,0.401100,-1.041760,0.785728,1,1,1,-0.365276,0,4,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.008634,-0.004426,-1.225848,2,1,0,-0.066419,0,3,True,True,False,False
9996,-0.383831,1.724464,-0.306379,1,1,1,0.027988,0,0,True,True,False,False
9997,-0.285715,0.687130,-1.225848,1,0,1,-1.008643,1,2,True,True,False,False
9998,0.302983,-0.695982,-0.022608,2,1,0,-0.125231,1,3,True,False,True,False


In [20]:
# Write the scaled frame to disk; index=False keeps the row index out of the file.
final_csv_path = "../Week 01/Data/processed/churnModelling_final_dataset.csv"
df.to_csv(final_csv_path, index=False)