# Model Training

Trains a LightGBM model on real Lending Club data using the project's `src` modules.

In [None]:
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath('../src'))
from data_utils import load_data, split_data
from model import RiskModel

# The feature table is produced upstream by the 00_real_data_processing
# notebook (cleaning happens there); this cell only reads the result and
# refuses to continue when the file is missing.
data_path = '../data/processed/real_bnpl_features.csv'

if os.path.exists(data_path):
    df = load_data(data_path)
else:
    raise FileNotFoundError("Run notebooks/00_real_data_processing.ipynb first")

print(f"Loaded processed data: {df.shape}")

In [None]:
# Separate the label column from the feature columns.
target = 'is_default'
y = df[target]
X = df.drop(columns=[target])

# Relative class frequencies, to show how imbalanced the label is.
print(f"Target distribution:\n{y.value_counts(normalize=True)}")

# split_data is called with one frame plus the label column name, so the
# features and label are joined for the split. The joined frame is built as a
# temporary (label appended as the last column) instead of assigning the
# label into X, which keeps X features-only for any later use.
X_train_full, X_test_full = split_data(pd.concat([X, y], axis=1), target)

# Pull the label back out of each partition.
y_train = X_train_full[target]
X_train = X_train_full.drop(columns=[target])
y_test = X_test_full[target]
X_test = X_test_full.drop(columns=[target])

print(f'Train shape: {X_train.shape}, Test shape: {X_test.shape}')

In [None]:
# Carve a validation subset out of the training rows so the test set is used
# only for the final evaluation. Passing the test set as X_val/y_val and then
# scoring on it again would report a metric on data the training call had
# already been given.
# NOTE(review): assumes split_data(frame, target) returns (train, holdout)
# frames that still contain the target column, as in its call on the full
# dataset -- confirm the split ratio suits a validation set.
X_fit_full, X_val_full = split_data(X_train_full, target)
y_fit = X_fit_full[target]
X_fit = X_fit_full.drop(columns=[target])
y_val = X_val_full[target]
X_val = X_val_full.drop(columns=[target])

model = RiskModel()
model.train(X_fit, y_fit, X_val=X_val, y_val=y_val)

# Final score on the untouched test partition.
metrics = model.evaluate(X_test, y_test)
print(f'AUC: {metrics["auc"]:.4f}')

In [None]:
# Persist the trained model. The output directory is created first so the
# save does not depend on '../models' already existing.
# NOTE(review): whether RiskModel.save creates missing directories itself is
# not visible here; makedirs with exist_ok=True is harmless either way.
model_path = '../models/lightgbm_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)