# In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Function to handle missing values
def handle_missing_values(df, strategy='mean'):
    """
    Fill missing values in the dataframe and return the filled frame.

    Available strategies: 'mean', 'median', 'mode'.

    Args:
        df: DataFrame whose missing values should be filled. It is not
            modified; a new DataFrame is returned.
        strategy: 'mean' or 'median' fill numeric columns with the column
            mean/median (non-numeric columns are left untouched, since
            those statistics are undefined for them). 'mode' fills every
            column with its most frequent value.

    Returns:
        A new DataFrame with missing values filled where a fill value
        could be computed.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    if strategy == 'mean':
        # numeric_only=True: without it, recent pandas raises TypeError as
        # soon as the frame holds a string/categorical column.
        fill_values = df.mean(numeric_only=True)
    elif strategy == 'median':
        fill_values = df.median(numeric_only=True)
    elif strategy == 'mode':
        modes = df.mode()
        # mode() of a frame with no rows is empty, so there is no first row
        # to take; in that case there is nothing to fill either.
        fill_values = modes.iloc[0] if not modes.empty else None
    else:
        raise ValueError(f"Strategy {strategy} not recognized")

    if fill_values is not None and not fill_values.empty:
        df = df.fillna(fill_values)
    else:
        # Keep the "always returns a new frame" contract of fillna().
        df = df.copy()

    print(f"Missing values handled using {strategy} strategy.")
    return df

# Function to handle outliers
def handle_outliers(df, method='IQR'):
    """
    Remove rows containing outliers, using the specified method.

    Available methods: 'IQR', 'Z-Score'.

    Only numeric columns take part in outlier detection; non-numeric
    columns are carried through unchanged. Missing values and columns
    with zero variance never mark a row as an outlier.

    Args:
        df: Input DataFrame. It is not modified.
        method: 'IQR' drops rows where any numeric value lies more than
            1.5 * IQR below the first or above the third quartile.
            'Z-Score' drops rows where any numeric value has an absolute
            z-score (population standard deviation) of 3 or more.

    Returns:
        A new DataFrame containing only the rows without outliers.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ('IQR', 'Z-Score'):
        raise ValueError(f"Outlier method {method} not recognized")

    # Quantiles and z-scores are undefined for strings/categories, so the
    # statistics are computed on the numeric columns only.
    numeric = df.select_dtypes(include=[np.number])
    if numeric.empty:
        # No rows or no numeric columns: nothing can be an outlier.
        print(f"Outliers removed using {method} method.")
        return df.copy()

    if method == 'IQR':
        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        is_outlier = (numeric < (q1 - 1.5 * iqr)) | (numeric > (q3 + 1.5 * iqr))
    else:
        # ddof=0 matches the population standard deviation that
        # scipy.stats.zscore uses by default.
        z_scores = ((numeric - numeric.mean()) / numeric.std(ddof=0)).abs()
        # A constant column gives 0/0 = NaN here; NaN >= 3 is False, so such
        # columns (and missing cells) do not cause rows to be discarded.
        is_outlier = z_scores >= 3

    df = df[~is_outlier.any(axis=1)]
    print(f"Outliers removed using {method} method.")
    return df

# Function to handle class imbalance (SMOTE)
def handle_class_imbalance(X_train, y_train):
    """
    Handle class imbalance using SMOTE (Synthetic Minority Over-sampling Technique).

    Args:
        X_train: Training features passed to ``SMOTE.fit_resample``.
        y_train: Training labels passed to ``SMOTE.fit_resample``.

    Returns:
        A tuple ``(X_res, y_res)`` with the resampled features and labels,
        exactly as returned by ``SMOTE.fit_resample``.
    """
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_train, y_train)

    # y_res is not guaranteed to be a pandas object (e.g. when y_train is a
    # NumPy array), so only call value_counts() directly when it exists and
    # otherwise wrap the labels in a Series just for the report.
    if hasattr(y_res, "value_counts"):
        class_counts = y_res.value_counts()
    else:
        class_counts = pd.Series(np.ravel(y_res)).value_counts()

    print(f"Class imbalance handled. Resampled data: {class_counts}")
    return X_res, y_res

# Function to optimize memory usage
def optimize_memory(df):
    """
    Shrink the dataframe's memory footprint in place and return it.

    64-bit float columns are downcast to the smallest float type that
    pandas' ``to_numeric`` picks, 64-bit integer columns to the smallest
    integer type, and object columns are converted to ``category``.
    The passed-in dataframe itself is modified.
    """
    # Source dtype -> `downcast` argument understood by pd.to_numeric.
    downcast_targets = (('float64', 'float'), ('int64', 'integer'))

    wide_columns = df.select_dtypes(include=['float64', 'int64']).columns
    for name in wide_columns:
        current_dtype = df[name].dtype
        for dtype_name, target in downcast_targets:
            if current_dtype == dtype_name:
                df[name] = pd.to_numeric(df[name], downcast=target)
                break

    # Object columns become categoricals (optional optimization).
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        df[name] = df[name].astype('category')

    print("Memory optimization complete.")
    return df

# Function to scale features
def scale_features(df):
    """
    Scales numerical features using StandardScaler.

    All numeric columns are scaled regardless of bit width, so columns
    that were previously downcast (e.g. float32 or int8) are included.
    The passed-in dataframe is modified and also returned.

    Args:
        df: DataFrame whose numeric columns should be standardized.

    Returns:
        The same DataFrame with its numeric columns replaced by their
        standardized values. If there are no numeric columns or no rows,
        it is returned unchanged.
    """
    # 'number' selects every numeric dtype; restricting to float64/int64
    # would silently skip columns that have already been downcast.
    numerical_features = df.select_dtypes(include='number').columns

    # StandardScaler rejects input with zero features or zero samples, so
    # there is nothing to do in either case.
    if numerical_features.empty or df.empty:
        print("No numerical features to scale.")
        return df

    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    # Column labels are not necessarily strings; str() keeps join() safe.
    print(f"Features scaled: {', '.join(map(str, numerical_features))}")
    return df

# Function to preprocess the data (combines all preprocessing steps)
def preprocess_data(df, missing_value_strategy='mean', outlier_method='IQR'):
    """
    A combined function to preprocess data by handling missing values, outliers, and scaling.

    Steps, in order: fill missing values, drop outlier rows, scale the
    numerical features, then downcast dtypes to save memory.

    Args:
        df: Raw input DataFrame.
        missing_value_strategy: Strategy forwarded to
            ``handle_missing_values`` ('mean', 'median' or 'mode').
        outlier_method: Method forwarded to ``handle_outliers``
            ('IQR' or 'Z-Score').

    Returns:
        The preprocessed DataFrame.

    Raises:
        ValueError: Propagated from the helpers when a strategy or method
            name is not recognized.
    """
    # Handle missing values
    df = handle_missing_values(df, strategy=missing_value_strategy)

    # Handle outliers. The result is a row-filtered frame; take an explicit
    # copy so the column assignments in the next steps operate on an
    # independent frame rather than on a slice of the previous one.
    df = handle_outliers(df, method=outlier_method).copy()

    # Scale features BEFORE optimizing memory: downcasting first turns the
    # float64/int64 columns into narrower dtypes, which would leave the
    # scaler with no float64/int64 columns to work on.
    df = scale_features(df)

    # Optimize memory (also downcasts the freshly scaled columns)
    df = optimize_memory(df)

    return df
