In [107]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import os

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score

In [108]:
# Cell 2: Load data
X_train = pd.read_csv("../data/raw/X_train_G3tdtEn.csv")
y_train = pd.read_csv("../data/raw/Y_train_2_XPXJDyy.csv")
X_test = pd.read_csv("../data/raw/X_test_8skS2ey.csv")

  X_train = pd.read_csv("../data/raw/X_train_G3tdtEn.csv")
  X_test = pd.read_csv("../data/raw/X_test_8skS2ey.csv")


In [109]:
# Cell 3: Format label
y_train = y_train["fraud_flag"].astype(int)

In [110]:
# Cell 4: Feature engineering
def enrich_features(df):
    price_cols = [col for col in df.columns if "cash_price" in col]
    qty_cols = [col for col in df.columns if "Nbr_of_prod_purchas" in col]

    df["price_mean"] = df[price_cols].mean(axis=1)
    df["price_std"] = df[price_cols].std(axis=1)
    df["price_sum"] = df[price_cols].sum(axis=1)

    df["qty_mean"] = df[qty_cols].mean(axis=1)
    df["qty_std"] = df[qty_cols].std(axis=1)
    df["qty_sum"] = df[qty_cols].sum(axis=1)

    df["non_null_prices"] = df[price_cols].notnull().sum(axis=1)
    df["non_null_qty"] = df[qty_cols].notnull().sum(axis=1)

    df["avg_price_per_unit"] = df["price_sum"] / (df["qty_sum"] + 1e-5)
    if "Nb_of_items" in df.columns:
        df["price_per_item"] = df["price_sum"] / (df["Nb_of_items"] + 1e-5)

    return df

X_train = enrich_features(X_train)
X_test = enrich_features(X_test)

In [111]:
# Cell 5: Uniformiser les colonnes object
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)

In [112]:
# Cell 6: Split + préparation
X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

In [113]:
# Cell 7: Column selection
num_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64'] and col != 'ID']
cat_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

In [114]:
# Cell 8: Transformers
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

In [115]:
# Cell 9: Base learners
base_learners = [
    ("rf", RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")),
    ("lgbm", LGBMClassifier(n_estimators=100, random_state=42, class_weight="balanced")),
    ("xgb", XGBClassifier(n_estimators=100, random_state=42, eval_metric="logloss", scale_pos_weight=5))
]

In [None]:
# Cell 10: Entraînement
stack_model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", StackingClassifier(
        estimators=base_learners,
        final_estimator=LogisticRegression(max_iter=1000, class_weight="balanced"),
        passthrough=True,
        n_jobs=-1
    ))
])

stack_model.fit(X_train_part, y_train_part)

[LightGBM] [Info] Number of positive: 1055, number of negative: 73177
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.358267 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6752
[LightGBM] [Info] Number of data points in the train set: 74232, number of used features: 1411
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [None]:
# Cell 11: Évaluation
y_val_proba = stack_model.predict_proba(X_val_part)[:, 1]
pr_auc = average_precision_score(y_val_part, y_val_proba)
print(f"Stacked Model Validation PR-AUC: {pr_auc:.4f}")

In [None]:
# Cell 12: Prédiction finale
y_test_pred = stack_model.predict_proba(X_test)[:, 1]

submission = pd.DataFrame({
    "ID": X_test["ID"],
    "fraud_flag": y_test_pred
})
os.makedirs("output", exist_ok=True)
submission.to_csv("../output/submission_stacking.csv", index=False)
print("Submission saved to output/submission_stacking.csv")