# **compare_methods.ipynb**

In [1]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation notices raised by the
# libraries used below; consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

In [2]:
%load_ext cudf.pandas

# Standard libraries
import numpy as np
import pandas as pd

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# XGBoost
from xgboost import XGBClassifier

In [3]:
def prepare_data(df):
    """
    Builds a class-balanced training set and an untouched test set from a labelled dataset.

    Steps, in order:
        1. Separate the features from the target column.
        2. Integer-encode the target with a LabelEncoder.
        3. Replace +/-inf in the features with NaN.
        4. Stratified 80-20 train-test split (random_state=42).
        5. Median-impute missing values, with the medians learned from the
           training portion only and then applied to the test portion.
        6. Oversample the training portion with SMOTE (random_state=42).

    Parameters:
        df (pd.DataFrame): Input dataset. The target column is named " Label"
            (note the leading space); every other column is used as a feature.

    Returns:
        tuple: (X_train, X_test, y_train, y_test), where the training pair has been
        resampled by SMOTE and the test pair is left at its original class balance.
    """
    # The target column name carries a leading space in these CSV exports.
    X = df.drop(columns=[" Label"])

    # Convert categorical labels to numeric
    y = LabelEncoder().fit_transform(df[" Label"])

    # Replace infinite values with NaN so the imputer treats them as missing.
    # Assignment (not inplace) keeps the caller's frame and this one clearly separate.
    X = X.replace([np.inf, -np.inf], np.nan)

    # Split BEFORE imputing: fitting the imputer on the full dataset would let
    # test-set statistics leak into the training features.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Learn the medians on the training portion only, then reuse them for the test portion.
    imputer = SimpleImputer(strategy="median")
    train_values = imputer.fit_transform(X_train)
    test_values = imputer.transform(X_test)

    # SimpleImputer discards features whose fitted statistic is NaN (no observed
    # training values), so take the column names from the features it kept rather
    # than assuming the width is unchanged.
    kept_columns = X_train.columns[~np.isnan(imputer.statistics_)]
    X_train = pd.DataFrame(train_values, columns=kept_columns, index=X_train.index)
    X_test = pd.DataFrame(test_values, columns=kept_columns, index=X_test.index)

    # Apply SMOTE to balance the training set; the test set is never resampled.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    return X_train_resampled, X_test, y_train_resampled, y_test


def train_xgboost(X_train, X_test, y_train, y_test):
    """
    Trains an XGBoost classifier and scores it on the held-out test set.

    Parameters:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        tuple: (model, accuracy, f1, recall, precision), where model is the fitted
        XGBClassifier and f1, recall and precision are support-weighted averages
        over the classes.
    """
    # Initialize XGBoost classifier with default hyper-parameters and a fixed seed
    model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate performance. All averaged metrics use support weighting so each class
    # contributes in proportion to its frequency in y_test.
    # Note: support-weighted recall is mathematically equal to accuracy, so those
    # two values always coincide.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    precision = precision_score(y_test, y_pred, average="weighted")

    return model, accuracy, f1, recall, precision


In [4]:
# Columns of the summary table, in display order.
RESULT_COLUMNS = ["Dataset", "Accuracy", "F1-Score", "Recall", "Precision"]

# One CSV per outlier-handling method, all exported from the same source day.
dataset_paths = [
    "./data/exported/Tuesday/Winsorized.csv",
    "./data/exported/Tuesday/Log-transformed.csv",
    "./data/exported/Tuesday/Box-Cox transformed.csv",
    "./data/exported/Tuesday/Robust-scaled.csv",
    "./data/exported/Tuesday/Isolation Forest.csv",
    "./data/exported/Tuesday/Median-imputed.csv",
]

# Collect one record per dataset and build the frame once after the loop;
# growing a DataFrame with pd.concat on every iteration is quadratic and, when
# seeded from an empty frame, can degrade the metric columns to object dtype.
result_rows = []

for dataset_path in dataset_paths:
    dataset_name = dataset_path.split("/")[-1]
    print(f"\n\nDataset: {dataset_name}")

    # Load the dataset
    df = pd.read_csv(dataset_path)
    print("\t✅ (1/4) Dataset loaded successfully.")

    # Prepare the dataset
    X_train, X_test, y_train, y_test = prepare_data(df)
    print("\t✅ (2/4) Dataset prepared successfully.")

    # Train the XGBoost model
    model, accuracy, f1, recall, precision = train_xgboost(X_train, X_test, y_train, y_test)
    print("\t✅ (3/4) XGBoost model trained successfully.")

    # Store the results
    result_rows.append({
        "Dataset": dataset_name,
        "Accuracy": accuracy,
        "F1-Score": f1,
        "Recall": recall,
        "Precision": precision,
    })

    print("\t✅ (4/4) Results stored successfully.")

# Passing the column list keeps the schema stable even if no dataset was processed.
results_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)

print("\n Final Results:")
results_df



Dataset: Winsorized.csv
	✅ (1/4) Dataset loaded successfully.
	✅ (2/4) Dataset prepared successfully.
	✅ (3/4) XGBoost model trained successfully.
	✅ (4/4) Results stored successfully.


Dataset: Log-transformed.csv
	✅ (1/4) Dataset loaded successfully.
	✅ (2/4) Dataset prepared successfully.
	✅ (3/4) XGBoost model trained successfully.
	✅ (4/4) Results stored successfully.


Dataset: Box-Cox transformed.csv
	✅ (1/4) Dataset loaded successfully.
	✅ (2/4) Dataset prepared successfully.
	✅ (3/4) XGBoost model trained successfully.
	✅ (4/4) Results stored successfully.


Dataset: Robust-scaled.csv
	✅ (1/4) Dataset loaded successfully.
	✅ (2/4) Dataset prepared successfully.
	✅ (3/4) XGBoost model trained successfully.
	✅ (4/4) Results stored successfully.


Dataset: Isolation Forest.csv
	✅ (1/4) Dataset loaded successfully.
	✅ (2/4) Dataset prepared successfully.
	✅ (3/4) XGBoost model trained successfully.
	✅ (4/4) Results stored successfully.


Dataset: Median-imputed.csv
	✅ (1/4) Dat

Unnamed: 0,Dataset,Accuracy,F1-Score,Recall,Precision
0,Winsorized.csv,1.0,1.0,1.0,1.0
1,Log-transformed.csv,1.0,1.0,1.0,1.0
2,Box-Cox transformed.csv,0.999989,0.999989,0.999989,0.999989
3,Robust-scaled.csv,1.0,1.0,1.0,1.0
4,Isolation Forest.csv,0.999989,0.999989,0.999989,0.999989
5,Median-imputed.csv,0.999989,0.999989,0.999989,0.999989
