In [1]:
import pandas as pd
import numpy as np

# Load the dataset from the parquet file; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_parquet('../data/cleaned_data.parquet')

In [2]:
# (seller_price - seller_earning) expressed as a fraction of seller_price.
# NOTE(review): a seller_price of 0 would produce inf/NaN here — confirm the
# upstream cleaning guarantees strictly positive prices.
df['margin_rate'] = (df['seller_price'] - df['seller_earning']) / df['seller_price']

# Remove columns that are not used further down.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
df.drop(
    columns=[
        'product_color',
        'product_id',
        'product_category_encoded',
        'has_cross_border_fees_encoded',
    ],
    inplace=True,
    errors='ignore',
)

# NOTE: df1 is another name for the same DataFrame object as df (no copy is made),
# so the derived columns added below are visible through both names.
df1 = df
# Create Derived Features
# The +1 in each denominator keeps it non-zero when the underlying value is 0.
df1['price_to_earning_ratio'] = df1['price_usd'] / (df1['seller_earning'] + 1)
df1['price_per_like'] = df1['price_usd'] / (df1['product_like_count'] + 1)
df1['seller_activity_ratio'] = df1['seller_products_sold'] / (df1['seller_num_products_listed'] + 1)


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import optuna
from sklearn.model_selection import train_test_split, learning_curve, StratifiedShuffleSplit
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, recall_score,
    f1_score, log_loss, roc_curve, precision_recall_curve, confusion_matrix
)
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import shap
import time
import tempfile
import os

# Log to the local MLflow file store at "../mlruns" (relative to the working
# directory) under a single experiment shared by every run in this notebook.
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("Vestiaire_Model_Comparison")
# Make sure no run is still open before new runs are started below.
if mlflow.active_run():
    mlflow.end_run()

# Columns fed to the models
important_features = [
    'seller_price',
    'seller_badge_encoded',
    'should_be_gone',
    'seller_pass_rate',
    'price_to_earning_ratio',
    'seller_products_sold',
    'price_per_like',
    'brand_id',
    'product_type',
    'product_material',
    'product_like_count',
    'seller_num_products_listed',
    'seller_community_rank',
    'seller_activity_ratio',
    'product_color_encoded',
    'seller_num_followers',
    'margin_rate',
    'available',
    'seller_country',
    'in_stock',
    'product_season_encoded',
    'usually_ships_within_encoded',
    'product_condition_encoded',
    'warehouse_name_encoded',
]

# Feature matrix and target, both taken from the engineered frame `df1`
X, y = df1[important_features], df1['sold']

# Work on a stratified 30% subsample to keep experiments fast;
# the remaining 70% is discarded.
X_sample, _, y_sample, _ = train_test_split(
    X,
    y,
    train_size=0.3,
    stratify=y,
    random_state=42,
)

# Carve the subsample into train (60%) and a temporary 40% remainder ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X_sample,
    y_sample,
    test_size=0.4,
    stratify=y_sample,
    random_state=42,
)
# ... then halve the remainder into validation (20%) and test (20%).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# ─── BAYESIAN HYPERPARAMETER TUNING WITH OPTUNA ────────────────────────────────
with mlflow.start_run(run_name="Hyperparameter_Tuning") as tuning_run:
    parent_run_id = tuning_run.info.run_id

    def objective(trial):
        """Train one XGBoost candidate and return its validation ROC-AUC.

        Hyperparameters are sampled from `trial`, logged together with the
        resulting score to a nested MLflow run named ``trial_<number>``, and the
        score is returned so the study can maximise it. The model is fitted on
        `X_train`/`y_train` and scored on `X_val`/`y_val` from the enclosing
        notebook scope.
        """
        # sample hyperparameters
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 10, 100),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            # suggest_loguniform is deprecated (it emitted a warning on every
            # trial); suggest_float(..., log=True) samples the same log-uniform range.
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        }

        with mlflow.start_run(run_name=f"trial_{trial.number}",
                              nested=True,
                              parent_run_id=parent_run_id):
            mlflow.log_params(params)

            # train & eval on validation set
            model = xgb.XGBClassifier(
                **params,
                eval_metric='auc',
                random_state=42,
                tree_method='hist',
                n_jobs=-1
            )
            model.fit(X_train, y_train)
            y_pred_prob = model.predict_proba(X_val)[:, 1]
            auc = roc_auc_score(y_val, y_pred_prob)

            mlflow.log_metric("validation_roc_auc", auc)

        return auc

    # Seed the sampler so the sequence of sampled hyperparameters (and therefore
    # best_params_xgb) is repeatable across Restart & Run All, matching the
    # random_state=42 used everywhere else in this notebook.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=20)

    # log best trial on the parent tuning run
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_validation_roc_auc", study.best_value)

    # keep the best params for the downstream XGBoost model
    best_params_xgb = study.best_params

# ─── MODEL INITIALIZATION ─────────────────────────────────────────────────────
# XGBoost: fixed settings combined with the hyperparameters picked by the Optuna study
xgb_model = xgb.XGBClassifier(
    tree_method='hist',
    eval_metric='auc',
    n_jobs=-1,
    random_state=42,
    **best_params_xgb,
)

# LightGBM and CatBoost are not tuned here; both use small hand-set configurations
lgb_model = lgb.LGBMClassifier(
    n_estimators=30,
    max_depth=3,
    device='cpu',
    random_state=42,
)
cat_model = cb.CatBoostClassifier(
    iterations=30,
    depth=3,
    task_type='CPU',
    random_state=42,
    verbose=0,
)

def train_and_evaluate_with_mlflow(model, model_name, X_train, y_train, X_val, y_val, parent_run_id=None):
    """Fit `model`, score it on the validation split and log the results to MLflow.

    Everything is logged inside a nested MLflow run named `model_name`:
    the model's ``get_params()``, six validation metrics (ROC-AUC, accuracy,
    precision, recall, F1, log-loss), PNG plots (ROC curve, precision-recall
    curve, confusion matrix, SHAP summary) and a CSV with the first 20
    validation rows plus their actual / predicted values.

    Parameters
    ----------
    model
        Estimator providing ``fit``, ``predict``, ``predict_proba`` and
        ``get_params``; it is also handed to ``shap.TreeExplainer``.
    model_name : str
        MLflow run name and prefix of every artifact file name.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_val, y_val
        Validation features and labels. `X_val` is copied and extended with
        columns, so it is expected to be a DataFrame.
    parent_run_id : str, optional
        Forwarded to ``mlflow.start_run`` to attach the nested run to a parent.

    Returns
    -------
    The same `model` object after fitting.
    """
    with mlflow.start_run(run_name=model_name, nested=True, parent_run_id=parent_run_id):
        start_time = time.time()

        # Train the model on the training set
        model.fit(X_train, y_train)
        y_pred_prob = model.predict_proba(X_val)[:, 1]
        y_pred = model.predict(X_val)

        # Calculate metrics on validation set
        roc_auc = roc_auc_score(y_val, y_pred_prob)
        accuracy = accuracy_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred)
        recall = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        logloss = log_loss(y_val, y_pred_prob)

        # Elapsed time covers fitting, prediction and metric computation only
        elapsed_time = (time.time() - start_time) / 60
        print(f"{model_name} Validation ROC-AUC: {roc_auc:.4f} | Time: {elapsed_time:.2f} min")

        # Log parameters and validation metrics
        mlflow.log_params(model.get_params())
        mlflow.log_metrics({
            "roc_auc": roc_auc,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "log_loss": logloss
        })

        # Artifacts are written to a scratch directory first. mkdtemp() used to
        # leave one directory behind per call; TemporaryDirectory removes it once
        # every file has been logged.
        with tempfile.TemporaryDirectory() as temp_dir:

            def _save_log_close(suffix):
                """Save the current pyplot figure as PNG, log it, then close it."""
                path = os.path.join(temp_dir, f"{model_name}_{suffix}.png")
                plt.savefig(path)
                mlflow.log_artifact(path)
                plt.close()

            # ROC curve
            fpr, tpr, _ = roc_curve(y_val, y_pred_prob)
            plt.figure()
            plt.plot(fpr, tpr, label=f"ROC (AUC={roc_auc:.2f})")
            plt.plot([0, 1], [0, 1], 'k--')
            plt.xlabel('FPR')
            plt.ylabel('TPR')
            plt.title(f"{model_name} ROC (Validation)")
            plt.legend()
            _save_log_close("validation_roc")

            # Precision-recall curve
            precision_vals, recall_vals, _ = precision_recall_curve(y_val, y_pred_prob)
            plt.figure()
            plt.plot(recall_vals, precision_vals, label="PR Curve")
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title(f"{model_name} PR (Validation)")
            plt.legend()
            _save_log_close("validation_prc")

            # ─── EXTRA ARTIFACTS ────────────────────────────────────────────────

            # 1. Confusion Matrix
            cm = confusion_matrix(y_val, y_pred)
            plt.figure()
            plt.imshow(cm)
            plt.title(f"{model_name} Confusion Matrix")
            plt.ylabel('True Label')
            plt.xlabel('Predicted Label')
            _save_log_close("confusion_matrix")

            # 2. SHAP Feature Importance Summary Plot
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_val)
            plt.figure()
            # show=False keeps the figure open so it can be saved below
            shap.summary_plot(shap_values, X_val, show=False)
            _save_log_close("shap_summary")

            # 3. Sample Predictions CSV (first 20 validation rows)
            sample_df = X_val.copy()
            sample_df['actual'] = y_val
            sample_df['predicted'] = y_pred
            sample_df['pred_proba'] = y_pred_prob
            sample_csv = os.path.join(temp_dir, f"{model_name}_sample_predictions.csv")
            sample_df.head(20).to_csv(sample_csv, index=False)
            mlflow.log_artifact(sample_csv)

        return model
    
def plot_learning_curve_and_log(model, X_train, y_train, model_name):
    """Compute a ROC-AUC learning curve for `model` and log the plot to MLflow.

    The curve is produced by ``sklearn.model_selection.learning_curve`` with two
    stratified shuffle splits (30% held out each) at three training sizes
    (``np.linspace(0.1, 0.5, 3)``). Scores are averaged over the splits, plotted,
    and the PNG is logged with ``mlflow.log_artifact`` (no run is opened here, so
    the artifact goes to whatever run MLflow currently treats as active).

    Parameters
    ----------
    model
        Estimator passed to ``learning_curve``.
    X_train, y_train
        Data the learning curve is computed on.
    model_name : str
        Used in the plot labels/title, the artifact file name and the printed message.

    Returns
    -------
    None
    """
    start_time = time.time()
    cv = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=42)
    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=-1,
        train_sizes=np.linspace(0.1, 0.5, 3)
    )

    # Average over the CV splits (axis 1) to get one score per training size
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)

    plt.figure(figsize=(8, 6))
    plt.plot(train_sizes, train_mean, label=f"{model_name} Train")
    plt.plot(train_sizes, test_mean, label=f"{model_name} Validation")
    plt.title(f"{model_name} Learning Curve")
    plt.xlabel("Training Size")
    plt.ylabel("ROC-AUC Score")
    plt.legend()
    plt.grid()

    # mkdtemp() used to leave a directory behind on every call; the context
    # manager deletes the scratch directory after the PNG has been logged.
    with tempfile.TemporaryDirectory() as temp_dir:
        plot_path = os.path.join(temp_dir, f"{model_name}_learning_curve.png")
        plt.savefig(plot_path)
        mlflow.log_artifact(plot_path)
    plt.close()

    elapsed_time = (time.time() - start_time) / 60
    print(f"{model_name} Learning Curve completed in {elapsed_time:.2f} min")

# Parent run to group all child runs
with mlflow.start_run(run_name="Model Comparison") as parent_run:
    parent_run_id = parent_run.info.run_id

    # Validation experiments: fit each model, log its validation results in a
    # nested run, then log its learning curve.
    xgb_model = train_and_evaluate_with_mlflow(xgb_model, "XGBoost", X_train, y_train, X_val, y_val, parent_run_id)
    plot_learning_curve_and_log(xgb_model, X_train, y_train, "XGBoost")

    lgb_model = train_and_evaluate_with_mlflow(lgb_model, "LightGBM", X_train, y_train, X_val, y_val, parent_run_id)
    plot_learning_curve_and_log(lgb_model, X_train, y_train, "LightGBM")

    cat_model = train_and_evaluate_with_mlflow(cat_model, "CatBoost", X_train, y_train, X_val, y_val, parent_run_id)
    plot_learning_curve_and_log(cat_model, X_train, y_train, "CatBoost")

    # Final evaluation on the held-out test set; all three models are logged to
    # one nested run, with metric names prefixed by the model name.
    with mlflow.start_run(run_name="Test_Evaluation", nested=True, parent_run_id=parent_run_id):
        print("\nLogging Final Performance on Test Set to MLflow:")
        for model, model_name in [(xgb_model, "XGBoost"), (lgb_model, "LightGBM"), (cat_model, "CatBoost")]:
            y_test_pred_prob = model.predict_proba(X_test)[:, 1]
            y_test_pred = model.predict(X_test)

            metrics = {
                f"{model_name}_test_roc_auc": roc_auc_score(y_test, y_test_pred_prob),
                f"{model_name}_test_accuracy": accuracy_score(y_test, y_test_pred),
                f"{model_name}_test_precision": precision_score(y_test, y_test_pred),
                f"{model_name}_test_recall": recall_score(y_test, y_test_pred),
                f"{model_name}_test_f1": f1_score(y_test, y_test_pred),
                f"{model_name}_test_log_loss": log_loss(y_test, y_test_pred_prob)
            }
            print(metrics)
            mlflow.log_metrics(metrics)

            # Log test ROC and PR curves. mkdtemp() used to leak one directory
            # per model; the context manager removes the scratch directory once
            # both plots have been logged.
            with tempfile.TemporaryDirectory() as temp_dir:
                fpr, tpr, _ = roc_curve(y_test, y_test_pred_prob)
                plt.figure()
                plt.plot(fpr, tpr, label=f"ROC (AUC={metrics[f'{model_name}_test_roc_auc']:.2f})")
                plt.plot([0, 1], [0, 1], 'k--')
                plt.xlabel('FPR')
                plt.ylabel('TPR')
                plt.title(f"{model_name} ROC (Test)")
                plt.legend()
                roc_path = os.path.join(temp_dir, f"{model_name}_test_roc.png")
                plt.savefig(roc_path)
                mlflow.log_artifact(roc_path)
                plt.close()

                precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_test_pred_prob)
                plt.figure()
                plt.plot(recall_vals, precision_vals, label="PR Curve")
                plt.xlabel('Recall')
                plt.ylabel('Precision')
                plt.title(f"{model_name} PR (Test)")
                plt.legend()
                prc_path = os.path.join(temp_dir, f"{model_name}_test_prc.png")
                plt.savefig(prc_path)
                mlflow.log_artifact(prc_path)
                plt.close()

[I 2025-04-29 01:35:45,831] A new study created in memory with name: no-name-bcf7db87-57c6-4da2-9155-b1f11514ec40
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
[I 2025-04-29 01:35:46,969] Trial 0 finished with value: 0.8541265584860414 and parameters: {'n_estimators': 90, 'max_depth': 8, 'learning_rate': 0.0014334322255991764, 'subsample': 0.7437716157508607, 'colsample_bytree': 0.8286175946046745}. Best is trial 0 with value: 0.8541265584860414.
[I 2025-04-29 01:35:46,969] Trial 0 finished with value: 0.8541265584860414 and parameters: {'n_estimators': 90, 'max_depth': 8, 'learning_rate': 0.0014334322255991764, 'subsample': 0.7437716157508607, 'colsample_bytree': 0.8286175946046745}. Best is trial 0 with value: 0.8541265584860414.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
[I 2025-

XGBoost Validation ROC-AUC: 0.8832 | Time: 0.03 min
XGBoost Learning Curve completed in 0.15 min
XGBoost Learning Curve completed in 0.15 min
[LightGBM] [Info] Number of positive: 2471, number of negative: 159399
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3317
[LightGBM] [Info] Number of data points in the train set: 161870, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.015265 -> initscore=-4.166788
[LightGBM] [Info] Start training from score -4.166788
[LightGBM] [Info] Number of positive: 2471, number of negative: 159399
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3317
[LightGBM] [Info] Number of data points in the train set: 161870, number of used features: 23
[L



LightGBM Learning Curve completed in 0.08 min
CatBoost Validation ROC-AUC: 0.8684 | Time: 0.01 min
CatBoost Validation ROC-AUC: 0.8684 | Time: 0.01 min
CatBoost Learning Curve completed in 0.10 min

Logging Final Performance on Test Set to MLflow:
CatBoost Learning Curve completed in 0.10 min

Logging Final Performance on Test Set to MLflow:
{'XGBoost_test_roc_auc': 0.8907427636246119, 'XGBoost_test_accuracy': 0.9872491057694089, 'XGBoost_test_precision': 0.9245283018867925, 'XGBoost_test_recall': 0.17861482381530985, 'XGBoost_test_f1': 0.29938900203665986, 'XGBoost_test_log_loss': 0.05411891426214484}
{'XGBoost_test_roc_auc': 0.8907427636246119, 'XGBoost_test_accuracy': 0.9872491057694089, 'XGBoost_test_precision': 0.9245283018867925, 'XGBoost_test_recall': 0.17861482381530985, 'XGBoost_test_f1': 0.29938900203665986, 'XGBoost_test_log_loss': 0.05411891426214484}
{'LightGBM_test_roc_auc': 0.8605019515298695, 'LightGBM_test_accuracy': 0.9855811108845933, 'LightGBM_test_precision': 0.784