# 06_model_train

Goal:

- Load model-ready datasets
- Train a LightGBM model with class weighting for imbalance
- Evaluate ROC-AUC on a validation split
- Plot ROC curve and feature importance
- Train final model on full data and generate test predictions

Inputs:
- `data/clean/model_train.csv`  (has TARGET)
- `data/clean/model_test.csv`   (same features, no TARGET)

Outputs:
- `data/clean/oof_val_preds.csv`      holdout validation-split predictions (SK_ID_CURR, TARGET, PRED_PROB)
- `data/clean/test_predictions.csv`   test predictions (SK_ID_CURR, PRED_PROB)

Metric:
- ROC-AUC on the stratified 20% validation split


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

import lightgbm as lgb  # pip install lightgbm

# Directory holding the model-ready inputs, and the two prediction files
# this notebook writes into it.
CLEAN_DIR = Path("data/clean")
OUT_VAL_PATH = CLEAN_DIR / "oof_val_preds.csv"
OUT_TEST_PATH = CLEAN_DIR / "test_predictions.csv"

# Read the training and test tables in one pass.
train_df, test_df = (
    pd.read_csv(CLEAN_DIR / csv_name)
    for csv_name in ("model_train.csv", "model_test.csv")
)

# Quick sanity check: table sizes and the share of positive labels.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Default rate in train (TARGET==1):", train_df["TARGET"].mean())


## 1. Prepare features / labels / ids

In [None]:
TARGET_COL = "TARGET"
ID_COL = "SK_ID_CURR"

# Labels as an integer array.
y = train_df[TARGET_COL].astype(int).values

# Row identifiers are kept aside so predictions can be joined back to them.
id_train = train_df[ID_COL].values
id_test = test_df[ID_COL].values

# Feature matrices: every column except the label and the identifier.
# `drop` returns a new frame, so the loaded tables are left untouched.
X = train_df.drop(columns=[TARGET_COL, ID_COL])
X_test_final = test_df.drop(columns=[ID_COL])

print("X shape:", X.shape)
print("X_test_final shape:", X_test_final.shape)


## 2. Train/validation split (stratified)

In [None]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class ratio the same in both parts; the ids are split with the same indices
# so validation predictions can be traced back to their rows.
split_parts = train_test_split(
    X,
    y,
    id_train,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_tr, X_val, y_tr, y_val, id_tr, id_val = split_parts

print("Train split shape:", X_tr.shape)
print("Valid split shape:", X_val.shape)
print("Validation default rate:", y_val.mean())


## 3. LightGBM training with imbalance handling

In [None]:
# Wrap the splits for LightGBM. The validation set references the training
# set so that it is binned with the same feature histograms.
lgb_train = lgb.Dataset(X_tr, label=y_tr)
lgb_valid = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

# scale_pos_weight ~ (#neg / #pos); the max(..., 1) guards against a split
# that happens to contain no positive rows.
pos_weight = (y_tr == 0).sum() / max((y_tr == 1).sum(), 1)
print("scale_pos_weight =", pos_weight)

params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "min_data_in_leaf": 50,
    "scale_pos_weight": pos_weight,
    "verbose": -1,
}

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments of `lgb.train`
# were removed in LightGBM 4.0 and raise a TypeError there.
# "valid" is listed last so early stopping monitors the held-out AUC
# (first_metric_only is left at its default; there is a single metric anyway).
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=["train", "valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=100),
    ],
)

print("Best iteration:", model.best_iteration)


## 4. Validation performance (ROC-AUC + ROC curve)

In [None]:
# Score the held-out rows using the iteration count recorded as best
# during training.
val_pred_proba = model.predict(X_val, num_iteration=model.best_iteration)
val_auc = roc_auc_score(y_val, val_pred_proba)
print("Validation ROC-AUC:", val_auc)

# ROC curve, with the chance diagonal drawn dashed for reference.
fpr, tpr = roc_curve(y_val, val_pred_proba)[:2]
chance_line = [0, 1]
plt.plot(fpr, tpr)
plt.plot(chance_line, chance_line, "--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title(f"ROC Curve (AUC={val_auc:.4f})")
plt.show()


## 5. Feature importance

In [None]:
# Gain-based importance: total gain contributed by the splits on each feature.
importances = model.feature_importance(importance_type="gain")
feat_imp = (
    pd.DataFrame({
        "feature": X.columns,
        "importance_gain": importances,
    })
    .sort_values("importance_gain", ascending=False)
)

print(feat_imp.head(20))

# Draw onto an explicit axes. Calling `plt.figure(...)` and then
# `DataFrame.plot()` without `ax=` makes pandas open a second figure,
# leaving the first one empty and ignoring the requested figsize.
fig, ax = plt.subplots(figsize=(6, 8))
top_features = feat_imp.head(25).sort_values("importance_gain")  # ascending so the largest bar is on top
top_features.plot(
    kind="barh",
    x="feature",
    y="importance_gain",
    ax=ax,
)
ax.set_title("Top 25 Feature Importances (gain)")
fig.tight_layout()
plt.show()


## 6. Save validation predictions for analysis / threshold tuning

In [None]:
# Paths are restated here so this cell can be read on its own.
CLEAN_DIR = Path("data/clean")
OUT_VAL_PATH = CLEAN_DIR / "oof_val_preds.csv"

# One row per validation sample: identifier, true label, predicted probability.
val_columns = {
    "SK_ID_CURR": id_val,
    "TARGET": y_val,
    "PRED_PROB": val_pred_proba,
}
val_results = pd.DataFrame(val_columns)

val_results.to_csv(OUT_VAL_PATH, index=False)
print("Wrote validation preds to:", OUT_VAL_PATH)

val_results.head()


## 7. Final model on FULL data + test predictions

In [None]:
# Train the final model on every labelled row.
full_lgb_train = lgb.Dataset(X, label=y)

# Reuse the tree count found on the validation split. If no best iteration
# was recorded (0 / None), fall back to the number of trees the earlier model
# actually built, so the final model is never trained for zero rounds.
final_num_rounds = model.best_iteration or model.current_iteration()

final_model = lgb.train(
    dict(params),  # `params` already sets verbose=-1; pass a copy to keep the original dict untouched
    full_lgb_train,
    num_boost_round=final_num_rounds,
)

# Put the test columns in the training column order before predicting:
# by default the prediction call does not reorder a DataFrame by column name,
# so a different column order in the test CSV would be scored silently wrong.
# A missing feature raises KeyError here instead.
test_features = X_test_final[list(X.columns)]
test_pred_proba = final_model.predict(test_features)

submission_like = pd.DataFrame({
    "SK_ID_CURR": id_test,
    "PRED_PROB": test_pred_proba,
})

OUT_TEST_PATH = CLEAN_DIR / "test_predictions.csv"
submission_like.to_csv(OUT_TEST_PATH, index=False)

print("Wrote test predictions to:", OUT_TEST_PATH)
submission_like.head()
