## 02_Data_Preparation & Cleaning

### Objectives
- Prepare data for modeling
- Handle categorical variables
- Scale numeric features
- Split data into training and test sets

In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw dataset.
# A path relative to the notebook is used instead of an absolute drive path so
# the notebook runs on any machine; '../data/' is the same folder the later
# cells write their CSV outputs to.
Customer_Churn = pd.read_csv('../data/Customer-Churn-Records.csv')
print("Dataset shape:", Customer_Churn.shape)
Customer_Churn.head()

Dataset shape: (10000, 18)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


## Drop Irrelevant Columns
- Remove the identifier columns `RowNumber`, `CustomerId` and `Surname`, which identify a customer but carry no predictive signal

In [3]:
# Drop the identifier columns.
# The result is reassigned rather than mutated with inplace=True, and
# errors='ignore' skips labels that are already gone, so re-running this cell
# does not raise a KeyError.
Customer_Churn = Customer_Churn.drop(
    columns=['RowNumber', 'CustomerId', 'Surname'],
    errors='ignore',
)

# Verify columns
print(Customer_Churn.columns)

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited', 'Complain', 'Satisfaction Score', 'Card Type',
       'Point Earned'],
      dtype='object')


### Keep an unscaled copy of the numeric columns

In [4]:
# Snapshot the numeric columns as they are at this point in the notebook
# (before the encoding and scaling cells run) and write the copy to disk.
numeric_cols = [
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
    'EstimatedSalary', 'Complain', 'Satisfaction Score', 'Point Earned',
]
Customer_Churn_original = Customer_Churn.loc[:, numeric_cols].copy()
Customer_Churn_original.to_csv(
    '../data/Customer_Churn_original_numeric.csv',
    index=False,
)

## Encode Categorical Variables
- `Gender`, `Geography` and `Card Type` are categorical  
- Use one-hot encoding via `pd.get_dummies` with `drop_first=True` to avoid the dummy variable trap

In [5]:
 # OneHotEncoding

# Replace each categorical column with indicator columns; drop_first=True
# omits the first level of every category.
categorical_cols = ['Gender', 'Geography', 'Card Type']
Customer_Churn = pd.get_dummies(
    data=Customer_Churn,
    columns=categorical_cols,
    drop_first=True,
)
print("Columns after encoding:\n", Customer_Churn.columns)

Columns after encoding:
 Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'Complain',
       'Satisfaction Score', 'Point Earned', 'Gender_Male',
       'Geography_Germany', 'Geography_Spain', 'Card Type_GOLD',
       'Card Type_PLATINUM', 'Card Type_SILVER'],
      dtype='object')


## Separate Features and Target
- Target: `Exited`  
- Features: all remaining columns

In [6]:
# 'Exited' is the label; every other column is a feature.
y = Customer_Churn['Exited']
X = Customer_Churn.drop(columns='Exited')

## Train-Test Split
- 80% training, 20% test
- Random state for reproducibility

In [7]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the proportion of Exited=0 / Exited=1 the same in the
# training and test sets, so evaluation is not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Scale Numeric Features
- Numeric columns: `CreditScore`, `Age`, `Tenure`, `Balance`, `NumOfProducts`, `EstimatedSalary`, `Satisfaction Score`, `Point Earned`
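- Standardization puts the numeric features on a comparable scale, which helps scale-sensitive models; the scaler is fit on the training set only and then applied to the test set, so no test-set statistics leak into training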

In [8]:
# Columns to standardize.
num_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
            'EstimatedSalary', 'Satisfaction Score', 'Point Earned']

# Work on explicit copies: assigning columns onto the frames returned by
# train_test_split can raise SettingWithCopyWarning on some pandas /
# scikit-learn versions, and copying also guarantees X itself is not modified.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then reuse the fitted scaler to
# transform the test rows.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


## Save Prepared Data for Step 3

- X_train, X_test: scaled and encoded features  
- y_train, y_test: target labels  
- This ensures Step 3 can load prepared data directly

In [9]:
# Write each prepared split to its own CSV file (row index omitted).
prepared_outputs = {
    '../data/X_train_prepared.csv': X_train,
    '../data/X_test_prepared.csv': X_test,
    '../data/y_train.csv': y_train,
    '../data/y_test.csv': y_test,
}
for csv_path, split in prepared_outputs.items():
    split.to_csv(csv_path, index=False)

print("Prepared data saved successfully.")

Prepared data saved successfully.
