# In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import PowerTransformer
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler

# Splitting the dataset

def create_sets(df, train_temp_ratio, val_test_ratio, target):
    """Split ``df`` into train, validation and test features and labels.

    Two consecutive ``train_test_split`` calls are made, both seeded with
    ``random_state=42``. The first separates the training rows from a
    held-out part; the second divides that held-out part into validation
    and test rows.

    Args:
        df: Frame holding the feature columns and the ``target`` column.
        train_temp_ratio: Passed as ``test_size`` to the first split, i.e.
            the size of the held-out (validation + test) part.
        val_test_ratio: Passed as ``test_size`` to the second split, i.e.
            the size of the test set within the held-out part.
        target: Name of the label column.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    features = df.drop(columns=[target])
    labels = df[target]

    # Stage 1: training rows vs. everything that is held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, test_size=train_temp_ratio, random_state=42
    )

    # Stage 2: the held-out rows become validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=val_test_ratio, random_state=42
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


# Missing Values inspection

def handle_missing_values(df):
    """Print a per-column missing-value report and return ``df`` unchanged.

    Despite the name, nothing is imputed or dropped here: the function only
    reports. For every column with at least one null it prints the count and
    the percentage of rows affected, followed by the grand total.

    Args:
        df: Frame to inspect.

    Returns:
        The same ``df`` object that was passed in.
    """
    null_totals = df.isnull().sum()
    null_totals = null_totals[null_totals > 0]

    if null_totals.empty:
        print("No missing values found!")
        return df

    n_rows = len(df)
    for col, count in null_totals.items():
        percent = 100 * count / n_rows
        print(f"{col}: {count} missing values ({percent:.2f}%)")
    print(f"Total missing values: {null_totals.sum()}")
    return df



# Numerical + Categorical Imputation

def fit_impute_data(train_df, n_neighbors=2):
    """Fit imputers on the training frame and return it with gaps filled.

    Numeric columns are imputed together by a ``KNNImputer``. Every other
    column is filled with its most frequent value; when a column has no
    mode at all (no non-missing values), ``"Unknown"`` is used instead.

    Args:
        train_df: Training frame; it is copied, not modified.
        n_neighbors: Neighbour count handed to ``KNNImputer``.

    Returns:
        ``(knn_imputer, cat_imputers, imputed_train)`` where ``knn_imputer``
        is the fitted imputer (``None`` if there are no numeric columns),
        ``cat_imputers`` maps each non-numeric column to its fill value and
        ``imputed_train`` is the filled copy of ``train_df``.
    """
    imputed_train = train_df.copy()
    num_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
    other_cols = train_df.select_dtypes(exclude=[np.number]).columns.tolist()

    # Numeric columns: one KNN imputer fitted across all of them.
    knn_imputer = None
    if num_cols:
        knn_imputer = KNNImputer(n_neighbors=n_neighbors)
        filled_values = knn_imputer.fit_transform(imputed_train[num_cols])
        imputed_train[num_cols] = pd.DataFrame(
            filled_values, columns=num_cols, index=imputed_train.index
        )

    # Remaining columns: remember and apply the per-column mode.
    cat_imputers = {}
    for col in other_cols:
        modes = imputed_train[col].mode()
        fill_value = "Unknown" if modes.empty else modes[0]
        imputed_train[col] = imputed_train[col].fillna(fill_value)
        cat_imputers[col] = fill_value

    print("Imputations fitted and applied (numerical → KNN, categorical → mode)")
    return knn_imputer, cat_imputers, imputed_train


def transform_impute_data(test_df, knn_imputer, cat_imputers):
    """Apply previously fitted imputers to a new frame.

    Args:
        test_df: Frame to impute; it is copied, not modified.
        knn_imputer: Fitted numeric imputer as returned by
            ``fit_impute_data``, or ``None`` to skip numeric imputation.
        cat_imputers: Mapping of column name to the fill value learned on
            the training data.

    Returns:
        A copy of ``test_df`` with missing values filled.

    Raises:
        KeyError: If a column the imputers were fitted on is absent from
            ``test_df``.
    """
    imputed_test = test_df.copy()

    # Numerical
    if knn_imputer is not None:
        # Use the exact columns (and order) the imputer was fitted on when
        # it recorded them. Re-deriving the list from this frame's dtypes
        # breaks as soon as the frame has an extra numeric column, a
        # different column order, or a column whose dtype drifted.
        fitted_columns = getattr(knn_imputer, "feature_names_in_", None)
        if fitted_columns is not None:
            numerical_features = list(fitted_columns)
        else:
            numerical_features = test_df.select_dtypes(
                include=[np.number]
            ).columns.tolist()

        if numerical_features:
            imputed_numerical = knn_imputer.transform(imputed_test[numerical_features])
            imputed_test[numerical_features] = pd.DataFrame(
                imputed_numerical, columns=numerical_features, index=imputed_test.index
            )

    # Categorical
    for col, mode_val in cat_imputers.items():
        imputed_test[col] = imputed_test[col].fillna(mode_val)

    print("Imputations applied to test set")
    return imputed_test



# Feature Construction (row-wise, safe before split)

def feature_construct(df, transformations):
    """Add derived columns to ``df`` according to ``transformations``.

    Each transformation is a dict with the keys ``'new_feature'`` (name of
    the column to create), ``'columns'`` (list of source column names) and
    ``'operation'``. The operation is either a callable that receives the
    list of source Series, or one of these names:

    * ``'add'``, ``'sub'``, ``'mul'``, ``'div'`` - applied to the first two
      source columns;
    * ``'mean'`` - element-wise mean of all source columns;
    * ``'max'``, ``'min'`` - row-wise maximum / minimum of all source
      columns.

    Transformations are applied in order, so a later one may use a column
    created by an earlier one.

    Args:
        df: Frame to extend. It is modified in place.
        transformations: Iterable of transformation dicts as described above.

    Returns:
        The same ``df`` object, with the new columns added.

    Raises:
        ValueError: If an operation is neither callable nor a known name.
            Nothing is added to ``df`` in that case.
    """
    operations = {
        'add': lambda cols: cols[0] + cols[1],
        'sub': lambda cols: cols[0] - cols[1],
        'mul': lambda cols: cols[0] * cols[1],
        'div': lambda cols: cols[0] / cols[1],
        'mean': lambda cols: sum(cols) / len(cols),
        'max': lambda cols: pd.concat(cols, axis=1).max(axis=1),
        'min': lambda cols: pd.concat(cols, axis=1).min(axis=1),
    }

    # Resolve every operation before touching df: a misspelled operation
    # name must fail loudly rather than be skipped, and it must not leave
    # the frame half-updated.
    resolved = []
    for t in transformations:
        new_feature = t['new_feature']
        columns = t['columns']
        operation = t['operation']

        if callable(operation):
            func = operation
        elif isinstance(operation, str) and operation in operations:
            func = operations[operation]
        else:
            raise ValueError(
                f"Unsupported operation {operation!r} for feature "
                f"{new_feature!r}; expected a callable or one of "
                f"{sorted(operations)}"
            )
        resolved.append((new_feature, columns, func))

    for new_feature, columns, func in resolved:
        # Source columns are read at apply time so that features created
        # earlier in this call are available to later transformations.
        df[new_feature] = func([df[col] for col in columns])

    print("New features successfully constructed")
    return df



# Adaptive Transform (fit/transform)

def fit_adaptive_transform(train_df, skew_threshold=0.75, log1p_pos_threshold=0.9,
                           target='isFraud'):
    """Choose and apply a skew-reducing transform for each numeric feature.

    For every numeric column except ``target`` the sample skewness of the
    non-missing values decides what happens:

    * ``|skew| < skew_threshold`` (or skew undefined, e.g. a constant or
      empty column): the column is left as it is;
    * right-skewed, at least ``log1p_pos_threshold`` of the rows strictly
      positive, and every value above -1: ``np.log1p``;
    * anything else: a Yeo-Johnson ``PowerTransformer`` (not standardised)
      fitted on that column.

    Args:
        train_df: Training frame; it is copied, not modified.
        skew_threshold: Absolute skewness below which no transform is used.
        log1p_pos_threshold: Minimum share of strictly positive rows
            required to pick ``log1p``.
        target: Label column to leave untouched.

    Returns:
        ``(transformers, transformed_train)`` where ``transformers`` maps
        each processed feature to ``None``, the string ``"log1p"`` or a
        fitted ``PowerTransformer``, and ``transformed_train`` is the
        transformed copy of ``train_df``.
    """
    transformers = {}
    transformed_train = train_df.copy()

    numerical_features = train_df.select_dtypes(include=[np.number]).columns.tolist()
    if target in numerical_features:
        numerical_features.remove(target)

    for feature in numerical_features:
        feature_data = transformed_train[feature].dropna()
        feature_skew = skew(feature_data)

        # Skewness is undefined (NaN) for constant or empty columns; there
        # is nothing to correct, and fitting a power transform on such a
        # column is meaningless.
        if not np.isfinite(feature_skew) or abs(feature_skew) < skew_threshold:
            transformers[feature] = None
            continue

        positive_ratio = (transformed_train[feature] > 0).sum() / len(transformed_train[feature])
        # log1p is only defined for values > -1; a mostly-positive column
        # may still contain a few values at or below -1, which would turn
        # into -inf / NaN.
        log1p_is_defined = feature_data.min() > -1

        if (
            feature_skew > skew_threshold
            and positive_ratio >= log1p_pos_threshold
            and log1p_is_defined
        ):
            transformed_train[feature] = np.log1p(transformed_train[feature])
            transformers[feature] = "log1p"
        else:
            pt = PowerTransformer(method='yeo-johnson', standardize=False)
            transformed_train[[feature]] = pt.fit_transform(transformed_train[[feature]])
            transformers[feature] = pt

    print("Adaptive transform fitted and applied to train")
    return transformers, transformed_train


def transform_adaptive_transform(test_df, transformers):
    """Apply the per-feature transforms chosen by ``fit_adaptive_transform``.

    Args:
        test_df: Frame to transform; it is copied, not modified.
        transformers: Mapping of feature name to ``None`` (leave as is),
            the string ``"log1p"``, or a fitted object exposing
            ``transform``.

    Returns:
        The transformed copy of ``test_df``.
    """
    transformed_test = test_df.copy()

    for feature, transformer in transformers.items():
        if transformer is None:
            # Feature was judged symmetric enough at fit time.
            continue

        if transformer == "log1p":
            transformed_test[feature] = np.log1p(transformed_test[feature])
            continue

        # Anything else is a fitted transformer working on a 2-D selection.
        single_column = transformed_test[[feature]]
        transformed_test[[feature]] = transformer.transform(single_column)

    print("Adaptive transform applied to test")
    return transformed_test


# Sampling

def create_sample_size(df, fraud_ratio, sample_ratio, target='isFraud'):
    """Draw a shuffled sub-sample of ``df`` with a chosen positive-class share.

    The requested sample has ``int(len(df) * sample_ratio)`` rows, of which
    ``fraud_ratio`` should have ``target == 1`` and the rest ``target == 0``.
    Rows are drawn without replacement, so when a class has fewer rows than
    requested all of its rows are taken and the sample ends up smaller than
    the target size (and its class ratio shifts); the printed summary shows
    the actual figures.

    Args:
        df: Source frame containing the ``target`` column.
        fraud_ratio: Desired share of positive rows in the sample, in
            ``[0, 1]``.
        sample_ratio: Desired sample size as a fraction of ``len(df)``;
            must not be negative.
        target: Name of the binary label column.

    Returns:
        The sampled rows, shuffled, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``fraud_ratio`` or ``sample_ratio`` is out of range.
    """
    # Out-of-range ratios would otherwise surface as a negative row count
    # deep inside ``DataFrame.sample``.
    if not 0 <= fraud_ratio <= 1:
        raise ValueError(f"fraud_ratio must be within [0, 1], got {fraud_ratio}")
    if sample_ratio < 0:
        raise ValueError(f"sample_ratio must not be negative, got {sample_ratio}")

    total_sample_size = int(len(df) * sample_ratio)
    n_fraud = int(total_sample_size * fraud_ratio)
    n_nonfraud = total_sample_size - n_fraud

    class1 = df[df[target] == 1]
    class0 = df[df[target] == 0]

    # Sampling is without replacement, so cap each request at what exists.
    n_fraud = min(n_fraud, len(class1))
    n_nonfraud = min(n_nonfraud, len(class0))

    sample1 = class1.sample(n=n_fraud, random_state=42, replace=False)
    sample0 = class0.sample(n=n_nonfraud, random_state=42, replace=False)

    sampled_df = pd.concat([sample1, sample0])
    sampled_df = shuffle(sampled_df, random_state=42).reset_index(drop=True)

    print(f"Sampled dataset size: {len(sampled_df)} (target was {total_sample_size})")
    print(f"Fraud ratio in sample: {sampled_df[target].mean():.4f}")
    return sampled_df

# Dropping features

def drop_features(df, irrel_features):
    """Return ``df`` without the columns listed in ``irrel_features``.

    Names that are not columns of ``df`` are ignored. The input frame is
    not modified.
    """
    trimmed = df.drop(columns=irrel_features, errors='ignore')
    print("Dropping of features successful")
    return trimmed

# Encoding categorical features (fit/transform)

def fit_encode_cat(train_df, cat_features):
    """One-hot encode ``cat_features`` and record the categories seen.

    Args:
        train_df: Training frame; it is not modified.
        cat_features: Names of the columns to encode.

    Returns:
        ``(all_categories, train_encoded_df)`` where ``all_categories`` maps
        each encoded feature to the list of its categories, in the order in
        which the dummy columns appear, and ``train_encoded_df`` is the
        encoded frame (dummy columns are integer 0/1).
    """
    # Take the categories from the categorical view of each column instead
    # of sorting its unique values: missing values are not categories (and
    # cannot be sorted against strings), and this is the same category set
    # and order that ``get_dummies`` turns into columns.
    all_categories = {
        feature: train_df[feature].astype('category').cat.categories.tolist()
        for feature in cat_features
    }

    # ``get_dummies`` returns a new frame, so the input stays untouched.
    train_encoded_df = pd.get_dummies(train_df, columns=cat_features, dtype=int)

    print("Categorical encoding applied to train")

    return all_categories, train_encoded_df


def transform_encode_cat(test_df, cat_features, all_categories):
    """One-hot encode a frame so its dummy columns match the training ones.

    Args:
        test_df: Frame to encode; it is copied, not modified.
        cat_features: Names of the columns to encode.
        all_categories: Mapping of feature name to the categories recorded
            on the training data.

    Returns:
        A frame holding the non-encoded columns of ``test_df`` followed by
        one ``"<feature>_<category>"`` column per recorded category.
        Categories absent from this frame yield all-zero columns; categories
        that were not recorded get no column at all.
    """
    frame = test_df.copy()

    dummies = pd.get_dummies(frame, columns=cat_features, dtype=int)

    # Dummy columns expected from training, in feature -> category order.
    dummy_names = [
        f"{feature}_{category}"
        for feature, categories in all_categories.items()
        for category in categories
    ]

    # Keep only the expected dummies; any that are missing become zeros.
    aligned_dummies = dummies.reindex(columns=dummy_names, fill_value=0)

    # Columns that were never meant to be encoded pass through unchanged.
    passthrough = [col for col in frame.columns if col not in cat_features]
    result = pd.concat([frame[passthrough], aligned_dummies], axis=1)

    print("Categorical encoding applied to test set")

    return result


def normalize_x(X_train, X_val, X_test):
    """Standardise the numeric feature columns of the three splits.

    A ``StandardScaler`` is fitted on the training split only and then
    applied to validation and test. Columns are scaled when they have a
    numeric dtype and are not two-valued signed-integer columns (the shape
    of the 0/1 indicators produced with ``dtype=int`` by the encoders in
    this file); everything else is passed through unchanged.

    Args:
        X_train: Training features; used to pick the columns and fit the
            scaler.
        X_val: Validation features with the same columns as ``X_train``.
        X_test: Test features with the same columns as ``X_train``.

    Returns:
        ``(X_train_normalized, X_val_normalized, X_test_normalized)`` -
        copies of the inputs with the selected columns standardised.
    """
    # Create copies to prevent modifying the original dataframes
    X_train_normalized = X_train.copy()
    X_val_normalized = X_val.copy()
    X_test_normalized = X_test.copy()

    # Only numeric dtypes can be scaled; string / categorical / datetime
    # columns would make the scaler raise. Two-valued signed-integer
    # columns are treated as indicators and left as 0/1.
    numeric_cols = [
        col for col in X_train.columns
        if pd.api.types.is_numeric_dtype(X_train[col])
        and not (X_train[col].dtype.kind == 'i' and X_train[col].nunique() == 2)
    ]

    # Fitting a scaler on zero columns is an error, so stop early.
    if not numeric_cols:
        print("No numeric features to normalize")
        return X_train_normalized, X_val_normalized, X_test_normalized

    # Initialize the scaler
    scaler = StandardScaler()

    # Fit on train only, then reuse the same statistics for val and test
    X_train_normalized[numeric_cols] = scaler.fit_transform(X_train_normalized[numeric_cols])
    X_val_normalized[numeric_cols] = scaler.transform(X_val_normalized[numeric_cols])
    X_test_normalized[numeric_cols] = scaler.transform(X_test_normalized[numeric_cols])

    print("Normalization applied on numeric features successful")

    return X_train_normalized, X_val_normalized, X_test_normalized
