In [3]:
# imports
# standard libraries
import pandas as pd
import numpy as np
import pickle  # For saving the scaler object
import os

# sklearn for train-test split, scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler 

# imbalanced data handling
from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Render DataFrames without truncating columns or rows, and print floats with
# thousands separators and three decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = '{:,.3f}'.format

In [5]:
# Load the cleaned churn CSV; the relative path resolves against the kernel's working directory.
df = pd.read_csv("../data/cleaned/CustomerChurnCleaned.csv")

In [6]:
# Preview the first rows to confirm the expected columns were loaded.
df.head()


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Satisfaction Score,CardType,PointEarned,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,2,3,464,True,False,False,True,False
1,608,41,1,83807.86,1,0,1,112542.58,0,3,3,456,False,False,True,True,False
2,502,42,8,159660.8,3,1,0,113931.57,1,3,3,377,True,False,False,True,False
3,699,39,1,0.0,2,0,0,93826.63,0,5,1,350,True,False,False,True,False
4,850,43,2,125510.82,1,1,1,79084.1,0,5,1,425,False,False,True,True,False


### 1. Train Test Split

In [12]:
def split_data(df: pd.DataFrame, target: str, test_size: float = 0.1, val_size: float = 0.1, random_state: int = 42) -> tuple:
    """
    Splits the dataset into training and testing sets, stratified on the target variable.

    No validation set is produced. If one is needed, split the returned
    training data again with `train_test_split`.

    Parameters:
    - df (pd.DataFrame): The input dataframe; must contain the `target` column.
    - target (str): The name of the target column.
    - test_size (float): The proportion of the data to include in the test split.
    - val_size (float): Currently unused. Kept in the signature so existing calls that pass it keep working.
    - random_state (int): Controls the shuffling applied to the data before applying the split.

    Returns:
    - X_train, X_test, y_train, y_test: The training and test sets for features and target, in that order.
    """
    # Separate the features from the target column.
    features = df.drop(columns=[target])
    labels = df[target]

    # Stratify on the target so both splits keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, stratify=labels, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [13]:
# Stratified train/test split on the "Exited" target, using split_data's defaults (test_size=0.1, random_state=42).
X_train, X_test, y_train, y_test = split_data(df, "Exited")

### 2. Scaling Numeric Features

In [9]:
# Numeric features to scale. Binary flags (HasCrCard, IsActiveMember), the
# encoded CardType and the one-hot Geography_*/Gender_* columns are left out.
# NOTE(review): names are taken from the df.head() preview above; confirm
# against df.columns before running scale_data.
features_to_scale = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
    'Satisfaction Score',
    'PointEarned',
]

In [14]:
def scale_data(X_train: pd.DataFrame, X_val: pd.DataFrame, X_test: pd.DataFrame, features_to_scale: list) -> tuple:
    """
    Applies MinMaxScaler to the selected columns of the train, validation, and test features.

    The scaler is fitted on the training data only and then reused to
    transform the validation and test data. The input DataFrames are not
    modified; scaled copies are returned.

    Parameters:
    - X_train (pd.DataFrame): The training data features.
    - X_val (pd.DataFrame): The validation data features.
    - X_test (pd.DataFrame): The testing data features.
    - features_to_scale (list): Names of the columns to scale.

    Returns:
    - X_train_scaled, X_val_scaled, X_test_scaled, scaler: The scaled copies and the fitted scaler object.
    """
    scaler = MinMaxScaler()

    def _scaled_copy(frame: pd.DataFrame, transform) -> pd.DataFrame:
        # Work on a copy so the caller's DataFrame stays untouched.
        result = frame.copy()
        result[features_to_scale] = transform(frame[features_to_scale])
        return result

    # Fit on train only; validation and test reuse the fitted parameters.
    train_scaled = _scaled_copy(X_train, scaler.fit_transform)
    val_scaled = _scaled_copy(X_val, scaler.transform)
    test_scaled = _scaled_copy(X_test, scaler.transform)

    return train_scaled, val_scaled, test_scaled, scaler

In [16]:
#X_train_scaled, X_val_scaled, X_test_scaled, scaler = scale_data(X_train, X_val, X_test, features_to_scale)


### 3. Saving the Scaler

In [17]:
def save_scaler(scaler, filename: str):
    """
    Saves the scaler object as a pickle file for later use.

    The parent directory of `filename` is created if it does not exist yet,
    so saving into a fresh folder does not fail.

    Parameters:
    - scaler: The scaler object to save (any picklable object).
    - filename (str): Path of the pickle file to write.
    """
    # A bare filename has no directory component; only create one when present.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'wb') as file:
        pickle.dump(scaler, file)

In [None]:
## save_scaler(scaler, "../models/scaler_minmax.pkl")

### 4. Handling Data Imbalance

In [18]:
def handle_imbalance(X_train: pd.DataFrame, y_train: pd.Series, random_state: int = 42) -> tuple:
    """
    Balances the training data by oversampling with SMOTE.

    Parameters:
    - X_train (pd.DataFrame): The training data features.
    - y_train (pd.Series): The training data target.
    - random_state (int): Seed passed to SMOTE so the resampling is reproducible.

    Returns:
    - X_res, y_res: The resampled training data.
    """
    resampler = SMOTE(random_state=random_state)

    X_res, y_res = resampler.fit_resample(X_train, y_train)

    return X_res, y_res

### 5. Save Processed Data to Pickle and CSV

In [21]:
def save_data(X: pd.DataFrame, y: pd.Series, data_type: str, stage: str, base_directory: str = '../data'):
    """
    Saves features and target variables to the specified stage in both CSV and Pickle format.

    Four files are written into `<base_directory>/<stage>/`:
    `X_<data_type>.pkl`, `X_<data_type>.csv`, `y_<data_type>.pkl` and
    `y_<data_type>.csv`. The directory is created if it does not exist.
    CSV files are written without the index.

    Parameters:
    - X (pd.DataFrame): Features to save.
    - y (pd.Series): Target variable to save.
    - data_type (str): The type of data (e.g., 'train', 'val', 'test').
    - stage (str): The processing stage (e.g., 'transformed', 'processed').
    - base_directory (str): Base directory for saving data.
    """
    directory = os.path.join(base_directory, stage)

    # exist_ok avoids the check-then-create race and is a no-op when the folder already exists.
    os.makedirs(directory, exist_ok=True)

    for prefix, payload in (('X', X), ('y', y)):
        stem = os.path.join(directory, f'{prefix}_{data_type}')

        # Save as Pickle
        with open(f'{stem}.pkl', 'wb') as f:
            pickle.dump(payload, f)

        # Save as CSV
        payload.to_csv(f'{stem}.csv', index=False)


In [22]:
# Persist the train/test splits (pickle + CSV) under ../data/processed.
# Note: in the cells shown above, scaling is commented out and handle_imbalance is never called,
# so these are the unscaled, un-resampled splits.
save_data(X_train, y_train, 'train', 'processed')
save_data(X_test, y_test, 'test', 'processed')

Why save as Pickle (pkl)?

Pickle preserves Python objects exactly as they are, including:

- DataFrame dtypes, categories, index names, etc.

- Series/DataFrame metadata that CSV might lose.

It is usually much faster to save and load than CSV (especially for large datasets).

It avoids potential issues such as CSV mis-parsing, delimiter problems, or dtype conversion (e.g., float64 becoming object).

It is handy for intermediate pipeline stages where you want to quickly reload and continue.