### Import modules

In [None]:
""" All modules for this step of the pipeline are imported here. """

import os
from dotenv import load_dotenv
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np


### Import Paths

In [None]:
""" Loading environment variables from .env file """
load_dotenv()

# Path of the input dataset; the file itself is read later by drop_columns_from_df().
original_dataset = os.getenv('ORIGINAL_DATASET')

# Comma-separated names of the columns to remove right after loading.
drop_columns = os.getenv("drop_columns", '')
print(f"drop_columns: {drop_columns}")

# Missing-data strategy: "1" = drop rows, "2" = fill with default_value,
# "3" = fill with per-column mean/mode.
userOption = os.getenv('missing_choice', "1")
default_value = os.getenv('DefaultMissingValue', 'Unknown')

# "1" enables the categorical-to-numerical conversion step.
convert_categorical = os.getenv('convert_categorical', '1')

# Columns whose missing values are imputed with the mean / the mode (option "3").
mean_cols = os.getenv('statistical_measure_mean', '')
mode_cols = os.getenv('statistical_measure_mode', '')

mean_cols = [col.strip() for col in mean_cols.split(',') if col.strip()]
mode_cols = [col.strip() for col in mode_cols.split(',') if col.strip()]

# Columns left untouched by the categorical conversion. Empty tokens are
# filtered out, like for every other comma-separated setting, so an unset
# variable yields [] rather than [''].
ignore_columns = os.getenv("ignore_columns", "").split(",")
ignore_columns = [col.strip() for col in ignore_columns if col.strip()]

# Scaling mode: "0" = none, "1" = Min-Max, "2" = Z-score, "3" = both.
Scaling = os.getenv("Scaling", "0")
NormalScalingColumns = os.getenv("NormalScalingColumns", "")
StandardScalingColumns = os.getenv("StandardScalingColumns", "")

normal_cols = [col.strip() for col in NormalScalingColumns.split(",") if col.strip()]
standard_cols = [col.strip() for col in StandardScalingColumns.split(",") if col.strip()]


### Data Preparation and Cleaning

In [None]:
""" Function to drop specified columns from the dataframe."""

def drop_columns_from_df(filePath):
    """Load the dataset at *filePath* and drop the columns listed in ``drop_columns``.

    Args:
        filePath: Path to a ``.xlsx`` or ``.csv`` file. The extension is
            matched case-insensitively.

    Returns:
        The loaded DataFrame without the requested columns and with a fresh
        0..n-1 index, or ``None`` when the file format is unsupported or the
        drop step fails.
    """
    # Compare the extension case-insensitively so e.g. "DATA.CSV" is accepted.
    lowered_path = filePath.lower()
    if lowered_path.endswith('.xlsx'):
        df_dropped = pd.read_excel(filePath)
    elif lowered_path.endswith('.csv'):
        df_dropped = pd.read_csv(filePath)
    else:
        print("Unsupported file format. Please use a .xlsx or .csv file.")
        return None

    print("Shape after loading:", df_dropped.shape)

    # Process drop_columns from global .env-style string
    if drop_columns:
        try:
            # Ignore empty tokens (e.g. a trailing comma) and repeated names.
            requested = list(dict.fromkeys(
                col.strip() for col in drop_columns.split(",") if col.strip()
            ))
            # Names that are not in the file are skipped rather than raising.
            columns_to_drop = [col for col in requested if col in df_dropped.columns]

            if columns_to_drop:
                df_dropped = df_dropped.drop(columns=columns_to_drop)
                print(f"Columns dropped: {columns_to_drop}")
            else:
                print("No matching columns found to drop.")

        except Exception as e:
            print(f"Error while dropping columns: {e}")
            return None
    else:
        print("No columns specified for dropping.")

    # Reset index to avoid any alignment issues downstream
    df_dropped = df_dropped.reset_index(drop=True)
    print("Shape after dropping columns and resetting index:", df_dropped.shape)

    return df_dropped


In [None]:
""" Function to handle missing data in the dataframe using various techniques."""

def handle_missing_data(df):
    """Clean missing values in *df* according to the global ``userOption``.

    Blank / whitespace-only cells are first turned into NaN, and text columns
    whose non-missing values are *all* numeric are converted to numbers.
    A column that mixes numbers and text is left untouched so that its text
    values are not silently turned into NaN.

    Strategies:
        "1": drop every row that contains a missing value.
        "2": fill missing values with ``default_value``.
        "3": fill per column with the mean (columns in ``mean_cols``), the
             mode (columns in ``mode_cols``), or a fallback: mean for numeric
             columns, mode for everything else. Columns with no usable fill
             value (e.g. entirely missing) are left as they are.

    Args:
        df: DataFrame to clean. The caller's frame is not modified.

    Returns:
        A cleaned DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: if ``userOption`` is not "1", "2" or "3".
    """
    print(f"User Option: {userOption}")

    # Replace empty strings or whitespaces with NaN first
    df = df.replace(r'^\s*$', np.nan, regex=True)

    conversion_feedback = []
    for col in df.columns:
        is_text = (
            pd.api.types.is_object_dtype(df[col])
            or pd.api.types.is_string_dtype(df[col])
        )
        if not is_text:
            continue

        converted = pd.to_numeric(df[col], errors='coerce')
        present = df[col].notna()
        # Convert only when no existing value would be lost to coercion.
        if present.any() and converted[present].notna().all():
            df[col] = converted
            conversion_feedback.append(f"Converted column '{col}' to numeric (float).")
        else:
            conversion_feedback.append(f"Column '{col}' remains as object (non-numeric values detected).")

    for message in conversion_feedback:
        print(message)

    if userOption == "1":
        # Drop rows with any missing values
        print("Dropping rows with empty or missing values")
        df_cleaned = df.dropna()

    elif userOption == "2":
        # Use default value from .env
        print(f"Using default value selected by user: {default_value}")
        df_cleaned = df.fillna(default_value)

    elif userOption == "3":
        print("Using statistical measures from .env")
        df_cleaned = df.copy()

        # Impute missing values column by column
        for col in df.columns:
            if not df[col].isnull().any():
                continue  # Skip if no missing values

            if col in mean_cols:
                use_mean, method = True, 'mean'
            elif col in mode_cols:
                use_mean, method = False, 'mode'
            elif pd.api.types.is_numeric_dtype(df[col]):
                use_mean, method = True, 'mean (fallback)'
            else:
                use_mean, method = False, 'mode (fallback)'

            if use_mean:
                fill_val = df[col].astype(float).mean()
            else:
                # mode() is empty when the column has no non-missing value.
                modes = df[col].mode()
                fill_val = modes.iloc[0] if not modes.empty else None

            if fill_val is None or pd.isna(fill_val):
                print(f"No usable {method} for column '{col}'; missing values left unchanged")
                continue

            df_cleaned[col] = df[col].fillna(fill_val)
            print(f"Filling missing values in column '{col}' with {method}: {fill_val}")

    else:
        raise ValueError("Option must be 1 (drop), 2 (default), or 3 (statistical)")

    # Reset index to avoid downstream misalignment
    df_cleaned = df_cleaned.reset_index(drop=True)

    return df_cleaned


In [None]:
""" Function to check missing values """

def FindMissingValues(df):
    """Report every missing cell of *df* and clean the frame when any is found.

    Each missing cell is printed as a (row, column, cell) triple, where row is
    the 0-based position + 2 and column is the 0-based position + 1
    (presumably spreadsheet numbering with a header row - confirm).

    Args:
        df: DataFrame to inspect.

    Returns:
        The result of ``handle_missing_data(df)`` when at least one missing
        value exists, otherwise *df* itself, unchanged.
    """
    # NOTE(review): only NaN/None count as missing here; whitespace-only
    # strings are normalised by handle_missing_data, which runs only when
    # this scan already found something.
    null_mask = df.isnull()
    column_labels = list(null_mask.columns)

    # Row-major scan of the boolean mask, one tuple per missing cell.
    missingLocations = [
        (row_pos + 2, col_pos + 1, f"{column_labels[col_pos]}{row_pos + 2}")
        for row_pos, row_flags in enumerate(null_mask.to_numpy())
        for col_pos, is_missing in enumerate(row_flags)
        if is_missing
    ]

    if not missingLocations:
        print("No missing value found in the file")
        return df

    print("Missing values found at the following locations: ")
    for sheet_row, column_number, cell_label in missingLocations:
        print(f"Row: {sheet_row}, Column: {column_number}, Cell: {cell_label}")

    return handle_missing_data(df)

### Feature Values

In [None]:
""" Function to convert categorical variables to numerical """

def convertCategoricalToNumerical(df):
    """Encode the categorical columns of *df* as numbers.

    Columns listed in the global ``ignore_columns`` and non-categorical
    columns are copied unchanged. Text / categorical columns with exactly two
    distinct values are label-encoded in place; all other text / categorical
    columns are replaced by one-hot columns named ``<column>_<value>``.

    Args:
        df: DataFrame to convert. It is not modified.

    Returns:
        A new DataFrame that shares *df*'s index.
    """
    # Start from df's index: label-encoded columns are assigned as positional
    # arrays, so an index-less frame would get a fresh 0..n-1 index and later
    # label-aligned assignments would turn into NaN for a non-default index.
    df_converted = pd.DataFrame(index=df.index)

    for column in df.columns:
        series = df[column]
        is_categorical = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype)
        )

        if column in ignore_columns or not is_categorical:
            # Keep ignored and numeric columns as-is
            df_converted[column] = series
        elif series.nunique() == 2:
            # Label encode binary columns (a fresh encoder per column)
            df_converted[column] = LabelEncoder().fit_transform(series)
        else:
            # One-hot encode and add all dummy columns at once
            dummies = pd.get_dummies(series, prefix=column)
            df_converted = pd.concat([df_converted, dummies], axis=1)

    # Optional: defragment memory
    df_converted = df_converted.copy()

    return df_converted

### Scaling

In [None]:
""" Function to scale data """

def ScaleData(df):
    """Return a scaled copy of *df* as configured by the global ``Scaling``.

    ``Scaling`` values: "0" leaves the data untouched, "1" applies Min-Max
    scaling to ``normal_cols``, "2" applies Z-score standardisation to
    ``standard_cols`` and "3" applies both. Configured columns that are not
    present in the frame are skipped.

    Args:
        df: DataFrame to scale. It is not modified.

    Returns:
        A copy of *df* with the selected columns scaled.
    """
    result = df.copy()

    if Scaling == "0":
        print("No scaling applied.")
        return result

    # Min-Max scaling
    wants_minmax = Scaling in ("1", "3") and bool(normal_cols)
    if wants_minmax:
        scaler = MinMaxScaler()
        present = [name for name in normal_cols if name in result.columns]
        if not present:
            print("No matching columns found for Min-Max scaling.")
        else:
            result[present] = scaler.fit_transform(result[present])
            print(f"Min-Max scaling applied to columns: {present}")

    # Z-score standardisation
    wants_zscore = Scaling in ("2", "3") and bool(standard_cols)
    if wants_zscore:
        scaler = StandardScaler()
        present = [name for name in standard_cols if name in result.columns]
        if not present:
            print("No matching columns found for Z-score scaling.")
        else:
            result[present] = scaler.fit_transform(result[present])
            print(f"Z-score scaling applied to columns: {present}")

    return result

### Reading Data

In [None]:
""" Function to read data from the original dataset path """

def readData():
    """Return the configured dataset path as a string and echo it to stdout.

    The value comes from the global ``original_dataset``; when that is
    ``None`` the literal string "None" is returned.
    """
    filePath = str(original_dataset)
    print(filePath)
    return filePath

### Main Function

In [None]:
""" Main function to run data preparation steps """

def run_data_prep():
    """Run the full data-preparation pipeline.

    Steps: resolve the dataset path, load it and drop unwanted columns,
    handle missing values, optionally convert categorical columns (when
    ``convert_categorical`` is "1") and finally apply the configured scaling.

    Returns:
        The prepared DataFrame, or ``None`` when the dataset could not be
        loaded (unsupported file format or a failure while dropping columns).
    """
    filePath = readData()

    dataFrame = drop_columns_from_df(filePath)
    if dataFrame is None:
        # drop_columns_from_df already printed the reason; stop here instead
        # of failing later with an AttributeError on None.
        print("Data preparation aborted: dataset could not be loaded.")
        return None

    # Replaces / Removes missing values
    dataFrame = FindMissingValues(dataFrame)

    # Convert categorical variables to numerical
    if convert_categorical == "1":
        dataFrame = convertCategoricalToNumerical(dataFrame)
        print("Categorical variables converted to numerical.")

    # Scale data if specified
    dataFrame = ScaleData(dataFrame)

    print("Data preparation completed.")
    return dataFrame
    

In [None]:
# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    run_data_prep()