In [None]:
""" All modules for this step of the pipeline are defined here. """

from _2_FeatureEngineeringAndSelection import run_feature_select
from dotenv import load_dotenv
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, RandomOverSampler

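# drop_columns: customerID
# NOTE(review): the line above looks like a stray .env-style entry. It is kept as a
# comment because, as a bare annotation, it would evaluate the undefined name
# `customerID` when the cell runs.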


In [None]:
""" Loading environment variables from .env file"""
# Populate os.environ from the .env file before any setting is read.
load_dotenv()

# "0" selects the built-in default split, "1" the custom ratios below
# (the value itself is validated inside split_data).
split_option = os.getenv("custom_split")

# The three ratios are only consumed by the custom split. A missing or malformed
# value is therefore fatal only when that mode is selected; otherwise the ratios
# are left as None instead of aborting a default-split run.
try:
    train_ratio = float(os.getenv("training_split"))
    val_ratio = float(os.getenv("validation_split"))
    test_ratio = float(os.getenv("testing_split"))
except (TypeError, ValueError) as exc:
    if split_option == "1":
        raise ValueError("Custom split ratios must be valid numbers in the .env file.") from exc
    train_ratio = val_ratio = test_ratio = None

# Balancing strategy code ("1"-"5"), interpreted by balance_data.
balance_option = os.getenv("balance_option")
# Name of the label column used for balancing and for the X/Y export.
target_column = os.getenv("target_column")

# CSV locations of the three splits, plus the prefix that SplitXandY prepends
# verbatim to the exported X_*/Y_* file names.
train_path = os.getenv("train_path")
test_path = os.getenv("test_path")
validation_path = os.getenv("validation_path")
base_path = os.getenv("base_path")

In [None]:
""" Function to split data into training, validation, and testing sets based on defined ratios. """

def split_data(df):
    """Split ``df`` into training, validation and test subsets.

    The mode is taken from the module-level ``split_option`` (``custom_split``
    in the .env file):

    * ``"0"`` - fixed 70% / 20% / 10% train / validation / test split.
    * ``"1"`` - the module-level ``train_ratio``, ``val_ratio`` and
      ``test_ratio`` values.

    Both modes split in two stages (train vs. remainder, then the remainder
    into validation vs. test) using ``random_state=42``.

    Args:
        df: The data to split; passed straight to ``train_test_split``.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.

    Raises:
        ValueError: If ``split_option`` is not ``"0"`` or ``"1"``, or if the
            custom ratios are missing, not strictly positive, or do not sum
            to 1.0.
    """
    if split_option not in ["0", "1"]:
        raise ValueError("Invalid value for 'custom_split' in .env file. Use '0' or '1'.")

    if split_option == "0":
        print("------------------------")
        print("User chose default split")
        print("------------------------")
        # Default split: 70% train, 20% validation, 10% test.
        # The remaining 30% is divided 2:1, hence test_size=1/3 in the second stage.
        train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
        val_df, test_df = train_test_split(temp_df, test_size=1/3, random_state=42)
        return train_df, val_df, test_df

    print("-------------------------------------------------------------------------------------------------------------")
    print(f"User chose custom split with Training: {train_ratio}; Testing: {test_ratio}; Validation: {val_ratio}")
    print("-------------------------------------------------------------------------------------------------------------")

    ratios = (train_ratio, val_ratio, test_ratio)

    # Guard against unset ratios so the failure is a clear ValueError rather
    # than a TypeError from the arithmetic below.
    if any(ratio is None for ratio in ratios):
        raise ValueError("Custom split ratios must be valid numbers in the .env file.")

    if not abs(sum(ratios) - 1.0) < 1e-6:
        raise ValueError("Custom split ratios must sum to 1.0")

    # A zero or negative share would otherwise only surface as an opaque
    # error raised from inside train_test_split.
    if min(ratios) <= 0:
        raise ValueError("Custom split ratios must each be greater than 0.")

    # First stage: training set versus everything else.
    train_df, temp_df = train_test_split(df, test_size=(1 - train_ratio), random_state=42)

    # Second stage: share of the remainder that belongs to validation.
    val_proportion = val_ratio / (val_ratio + test_ratio)
    val_df, test_df = train_test_split(temp_df, test_size=(1 - val_proportion), random_state=42)

    return train_df, val_df, test_df

In [None]:
""" Function to balance the training data based on user-defined method. """

def balance_data():
    """Rebalance the training CSV in place according to ``balance_option``.

    Reads the file at the module-level ``train_path``, applies the selected
    strategy to the classes found in ``target_column`` and writes the result
    back to ``train_path``:

    * ``"1"`` - no balancing; the data is written back unchanged.
    * ``"2"`` - undersampling: every class larger than the smallest class is
      randomly reduced to the smallest class size; other rows are kept.
    * ``"3"`` - SMOTE oversampling.
    * ``"4"`` - draw smallest-class-size rows from every class.
    * ``"5"`` - random oversampling.

    Raises:
        ValueError: If ``train_path``, ``target_column`` or ``balance_option``
            is unset, or ``balance_option`` is not one of ``"1"``-``"5"``.
    """
    if not train_path or not target_column or not balance_option:
        raise ValueError("Missing required .env variables: train_path, target_column, or balance_option.")

    df = pd.read_csv(train_path)

    if balance_option not in ["1", "2", "3", "4", "5"]:
        raise ValueError("Invalid value for 'balance_option' in .env file. Use 1-5.")

    if balance_option == "1":
        print("-----------------------")
        print("User chose no Balancing")
        print("-----------------------")
        balanced_df = df  # No balancing

    else:
        X = df.drop(columns=[target_column])
        y = df[target_column]

        if balance_option == "2":
            print("---------------------------------------")
            print("User chose Undersampling majority class")
            print("---------------------------------------")

            # Shrink every class that is larger than the smallest one.
            # Looking classes up via idxmax()/idxmin() breaks when counts tie
            # (both return the same label, duplicating one class and dropping
            # the other) and discards intermediate classes when there are
            # more than two.
            class_counts = y.value_counts()  # ordered from largest to smallest
            smallest_size = int(class_counts.min())
            class_frames = []
            for label in class_counts.index:
                class_rows = df[df[target_column] == label]
                if len(class_rows) > smallest_size:
                    class_rows = resample(class_rows,
                                          replace=False,
                                          n_samples=smallest_size,
                                          random_state=42)
                class_frames.append(class_rows)
            balanced_df = pd.concat(class_frames)
            # Shuffle so the classes are not stored in contiguous runs.
            balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

        elif balance_option == "3":
            print("------------------------------")
            print("User chose SMOTE oversampling")
            print("------------------------------")
            # SMOTE oversampling
            smote = SMOTE(random_state=42)
            X_resampled, y_resampled = smote.fit_resample(X, y)
            # Re-attach the target as the last column.
            balanced_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name=target_column)], axis=1)

        elif balance_option == "4":
            print("------------------------------")
            print("User chose Stratified sampling")
            print("------------------------------")
            # Stratified sampling (equal number of samples from each class)
            min_class_size = y.value_counts().min()
            dfs = [df[df[target_column] == label].sample(n=min_class_size, random_state=42) for label in y.unique()]
            balanced_df = pd.concat(dfs).sample(frac=1, random_state=42).reset_index(drop=True)

        elif balance_option == "5":
            print("------------------------------")
            print("User chose Random oversampling")
            print("------------------------------")
            # Random oversampling
            ros = RandomOverSampler(random_state=42)
            X_resampled, y_resampled = ros.fit_resample(X, y)
            # Re-attach the target as the last column.
            balanced_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name=target_column)], axis=1)

    # Overwrite the training CSV with the (possibly) rebalanced data.
    balanced_df.to_csv(train_path, index=False)
    print(f"Balanced training data saved to {train_path}")


In [None]:
""" Function to split features and target variable into separate CSV files for train, validation, and test datasets."""

def SplitXandY(train_path, validation_path, test_path, target_column):
    """Export feature (X) and target (Y) CSVs for the train, validation and test sets.

    Each input CSV is read, ``target_column`` is separated from the remaining
    columns, and the two parts are written without the index to
    ``{base_path}X_<Name>.csv`` and ``{base_path}Y_<Name>.csv``, where
    ``<Name>`` is ``Train``, ``Val`` or ``Test``. ``base_path`` is the
    module-level setting and is prepended verbatim.

    Args:
        train_path: CSV holding the training split.
        validation_path: CSV holding the validation split.
        test_path: CSV holding the test split.
        target_column: Name of the column to export as Y.
    """
    sources = (
        ("Train", train_path),
        ("Val", validation_path),
        ("Test", test_path),
    )

    # Load everything first so an unreadable input fails before any output is written.
    loaded = [(tag, pd.read_csv(csv_path)) for tag, csv_path in sources]

    # Separate features from the target for every dataset, still without writing.
    exports = []
    for tag, frame in loaded:
        exports.append((f"X_{tag}", frame.drop(columns=[target_column])))
        exports.append((f"Y_{tag}", frame[[target_column]]))

    # Write order: X_Train, Y_Train, X_Val, Y_Val, X_Test, Y_Test.
    for stem, part in exports:
        part.to_csv(f"{base_path}{stem}.csv", index=False)

    print("Split and saved all datasets successfully.")

# NOTE(review): this call runs whenever the cell/module is executed, i.e. before
# run_data_prep_part2() below has (re)written the split CSVs — confirm it is
# intended rather than a leftover manual test.
SplitXandY(train_path, validation_path, test_path, target_column)


Split and saved all datasets successfully.


In [None]:
""" Main function for data preparation part 2: runs feature selection, then splits, balances, and exports the data. """

def run_data_prep_part2():
    """Run the second data-preparation stage end to end.

    Obtains the frame returned by ``run_feature_select``, splits it with
    ``split_data``, writes the three splits to ``train_path``,
    ``validation_path`` and ``test_path``, rebalances the training file via
    ``balance_data`` and finally exports the X/Y CSVs through ``SplitXandY``.

    Returns:
        None.
    """
    selected_frame = run_feature_select()

    for banner_line in (
        "##################################",
        "Running data preparation part 2...",
        "##################################",
    ):
        print(banner_line)

    train_part, val_part, test_part = split_data(selected_frame)

    # Persist each split; the later steps read them back from these paths.
    split_outputs = (
        (train_part, train_path),
        (val_part, validation_path),
        (test_part, test_path),
    )
    for part, destination in split_outputs:
        part.to_csv(destination, index=False)

    balance_data()
    SplitXandY(train_path, validation_path, test_path, target_column)

    print("Data preparation part 2 completed.")

In [None]:
# Run the whole stage only when this file is executed directly rather than imported.
if __name__ == "__main__":
    
    run_data_prep_part2()
    