In [1]:
# Cell 1: Imports
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

  from pandas import MultiIndex, Int64Index


ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Cell 2: Load data
# Read the three raw CSV files (training features, training labels, test
# features) in that order and bind each to its notebook-level name.
raw_csv_paths = (
    "../data/raw/X_train_G3tdtEn.csv",
    "../data/raw/Y_train_2_XPXJDyy.csv",
    "../data/raw/X_test_8skS2ey.csv",
)
X_train, y_train, X_test = [pd.read_csv(csv_path) for csv_path in raw_csv_paths]

In [None]:
# Cell 3: Format label
# Reduce the loaded label table to its "fraud" column.
# The isinstance guard makes the cell safe to re-run: once y_train is already
# a Series, indexing it with "fraud" a second time would raise a KeyError.
# NOTE(review): this assumes the rows of y_train are in the same order as the
# rows of X_train — confirm against the data files.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train["fraud"]

In [None]:
# Cell 4: Preprocessing pipeline
# Builds `preprocessor`, a ColumnTransformer that imputes + scales the numeric
# columns and imputes + one-hot encodes the categorical columns of X_train.

# The identifier column is kept out of BOTH feature lists; in particular a
# string-typed ID must never reach the one-hot encoder.
feature_frame = X_train.drop(columns=["ID"], errors="ignore")

# Select by dtype family instead of the exact names 'int64'/'float64', so
# other numeric widths (int32, float32, ...) are picked up as well.
num_cols = feature_frame.select_dtypes(include="number").columns.tolist()
cat_cols = feature_frame.select_dtypes(include=["object", "category"]).columns.tolist()

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back
# so the cell runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical branch: fill gaps with the literal "missing", then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", onehot_encoder)
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

In [None]:
# Cell 5: Define models
# Candidate classifiers, keyed by the name printed next to each score.
# All three use the same number of estimators and the same seed.
shared_params = {"n_estimators": 100, "random_state": 42}

models = {
    "RandomForest": RandomForestClassifier(n_jobs=-1, **shared_params),
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        **shared_params,
    ),
    "LightGBM": LGBMClassifier(**shared_params),
}

In [None]:
# Cell 6: Train and evaluate models
# Hold out a stratified 20% of the training rows, fit every candidate on the
# remaining 80%, and record its average precision (PR-AUC) on the held-out part.
# `results` maps model name -> {"model": fitted pipeline, "pr_auc": float}.
results = {}
X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

for name, model in models.items():
    # Pipeline does not copy its steps. Giving each pipeline its own unfitted
    # copy of the preprocessor keeps the stored pipelines independent, so
    # refitting one of them later cannot change the transformer under another.
    clf = Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("classifier", model)
    ])
    clf.fit(X_train_part, y_train_part)
    # Column 1 of predict_proba is the score for the second class in classes_.
    y_val_proba = clf.predict_proba(X_val_part)[:, 1]
    ap_score = average_precision_score(y_val_part, y_val_proba)
    results[name] = {
        "model": clf,
        "pr_auc": ap_score
    }
    print(f"{name} - Validation PR-AUC: {ap_score:.4f}")

In [None]:
# Cell 7: Select best model
# Take the entry with the highest validation PR-AUC; on a tie the entry that
# was inserted into `results` first wins.
best_model_name, best_entry = max(
    results.items(), key=lambda name_and_entry: name_and_entry[1]["pr_auc"]
)
best_model = best_entry["model"]
print(f"\n✅ Best model selected: {best_model_name}")

In [None]:
# Cell 8: Retrain on full training set and predict
# Refit the selected pipeline in place on every labelled row, then score the
# test rows; column 1 of predict_proba is written out as the "fraud" value.
y_test_pred = best_model.fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Two-column submission: the test IDs next to their predicted scores.
submission = X_test[["ID"]].assign(fraud=y_test_pred)

output_dir = "../output"
os.makedirs(output_dir, exist_ok=True)
submission.to_csv("../output/submission.csv", index=False)
print("Submission saved to ../output/submission.csv")

In [None]:
# Cell 9: Plot Precision-Recall curve
# The curve has to come from rows the model was NOT fitted on. `best_model`
# may already have been refitted on the full training set, so fit an unfitted
# copy on the training part only and score the held-out validation part.
val_model = clone(best_model).fit(X_train_part, y_train_part)
val_proba = val_model.predict_proba(X_val_part)[:, 1]

precision, recall, _ = precision_recall_curve(y_val_part, val_proba)
# Compute the legend value from the same predictions as the curve so the two
# always describe the same data.
val_pr_auc = average_precision_score(y_val_part, val_proba)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(recall, precision, label=f"PR-AUC = {val_pr_auc:.2f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve (Best Model)")
ax.legend()
ax.grid(True)
plt.show()