# Classification Training Example
This notebook mirrors `examples/train_classification.py` for a step-by-step walkthrough of training a simple model.

In [None]:
from pathlib import Path
import pandas as pd
from joblib import dump
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve)
import matplotlib.pyplot as plt
import seaborn as sns

## Load dataset

In [None]:
# Load the processed feature matrix and target from their parquet files.
base = Path("data/processed/classification")
X = pd.read_parquet(base / "X_v1.parquet")
# squeeze("columns") collapses only the column axis, so a one-column frame
# always becomes a Series; a bare squeeze() would turn a 1x1 frame into a
# scalar and break the .loc alignment below.
y = pd.read_parquet(base / "y_v1.parquet").squeeze("columns")

# Order the features by index and reindex the target to the same labels so the
# positional split in the following cell slices both objects consistently.
X = X.sort_index()
y = y.loc[X.index]
# Duplicate index labels in y would make .loc return extra rows and silently
# misalign features and target, so fail loudly here instead.
assert X.index.equals(y.index), "X and y indexes are not aligned after sorting"
print(X.shape, y.shape)

## Train/test split and logistic regression

In [None]:
# Positional hold-out split with no shuffling: the leading 80% of rows (in X's
# current row order) form the training set and the remaining rows the test set.
split_idx = int(len(X) * 0.8)
train_rows, test_rows = slice(None, split_idx), slice(split_idx, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]

# fit() returns the estimator itself, so the fitted model is bound to `lr`
# directly; max_iter=1000 caps the solver's iteration count.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr

In [None]:
def _binary_metrics(y_true, preds, proba):
    """Compute the summary metrics for one data split.

    Args:
        y_true: True labels for the split.
        preds: Hard class predictions for the same rows.
        proba: Scores taken from column 1 of ``predict_proba``; used only
            for ROC AUC.

    Returns:
        dict mapping metric name to its value, in the order they are printed.
    """
    return {
        "accuracy": accuracy_score(y_true, preds),
        "precision": precision_score(y_true, preds),
        "recall": recall_score(y_true, preds),
        "f1": f1_score(y_true, preds),
        "roc_auc": roc_auc_score(y_true, proba),
    }


def _print_metrics(title, metrics):
    """Print a title line followed by one indented, 4-decimal line per metric."""
    print(title)
    for name, value in metrics.items():
        print(f"  {name.capitalize()}: {value:.4f}")


# Hard predictions and column-1 probabilities for both splits.
lr_train_preds = lr.predict(X_train)
lr_train_proba = lr.predict_proba(X_train)[:, 1]
lr_test_preds = lr.predict(X_test)
lr_test_proba = lr.predict_proba(X_test)[:, 1]

train_metrics = _binary_metrics(y_train, lr_train_preds, lr_train_proba)
test_metrics = _binary_metrics(y_test, lr_test_preds, lr_test_proba)

_print_metrics("LogisticRegression Train Metrics:", train_metrics)
_print_metrics("LogisticRegression Test Metrics:", test_metrics)

# The newline must be an escape sequence inside the literal; a raw line break
# in a single-quoted string is a syntax error.
print("Confusion Matrix:\n", confusion_matrix(y_test, lr_test_preds))
print(classification_report(y_test, lr_test_preds))

In [None]:
# Ensure the plots/ directory exists.
# NOTE(review): nothing in this cell writes into plots_dir; the figures are only
# displayed. Confirm whether they were meant to be saved there.
plots_dir = Path("plots")
plots_dir.mkdir(exist_ok=True)

# ROC curve on the test split.
fpr, tpr, _ = roc_curve(y_test, lr_test_proba)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="LogisticRegression")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()
plt.close(fig)

# Precision-recall curve on the test split (recall on x, precision on y).
precision, recall, _ = precision_recall_curve(y_test, lr_test_proba)
fig, ax = plt.subplots()
ax.plot(recall, precision, label="LogisticRegression")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend()
plt.show()
plt.close(fig)

# Confusion matrix heatmap: sklearn's confusion_matrix puts true labels on rows
# and predicted labels on columns, hence "Actual" on y and "Predicted" on x.
cm = confusion_matrix(y_test, lr_test_preds)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()
plt.close(fig)

## Optional: XGBoost Classifier

In [None]:
# XGBoost is an optional dependency, so its import is guarded in this cell
# instead of living with the notebook's top-level imports.
try:
    from xgboost import XGBClassifier
except ImportError as e:
    # Only a missing package counts as "not available". Errors raised while
    # fitting or scoring are deliberately not caught, so they surface with
    # their real traceback instead of being reported as a missing dependency.
    print(f"XGBClassifier not available: {e}")
else:
    # NOTE(review): use_label_encoder appears to be deprecated in recent
    # xgboost releases - confirm against the version this project pins.
    xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
    xgb.fit(X_train, y_train)
    xgb_train_preds = xgb.predict(X_train)
    xgb_train_proba = xgb.predict_proba(X_train)[:, 1]
    xgb_test_preds = xgb.predict(X_test)
    xgb_test_proba = xgb.predict_proba(X_test)[:, 1]

    # Stored under xgb_-prefixed names so the logistic regression results held
    # in train_metrics / test_metrics are not overwritten by this optional cell.
    xgb_train_metrics = {
        "accuracy": accuracy_score(y_train, xgb_train_preds),
        "precision": precision_score(y_train, xgb_train_preds),
        "recall": recall_score(y_train, xgb_train_preds),
        "f1": f1_score(y_train, xgb_train_preds),
        "roc_auc": roc_auc_score(y_train, xgb_train_proba),
    }
    xgb_test_metrics = {
        "accuracy": accuracy_score(y_test, xgb_test_preds),
        "precision": precision_score(y_test, xgb_test_preds),
        "recall": recall_score(y_test, xgb_test_preds),
        "f1": f1_score(y_test, xgb_test_preds),
        "roc_auc": roc_auc_score(y_test, xgb_test_proba),
    }
    print("XGBClassifier Train Metrics:")
    for k, v in xgb_train_metrics.items():
        print(f"  {k.capitalize()}: {v:.4f}")
    print("XGBClassifier Test Metrics:")
    for k, v in xgb_test_metrics.items():
        print(f"  {k.capitalize()}: {v:.4f}")
    print(confusion_matrix(y_test, xgb_test_preds))

## Save model

In [None]:
# Persist the fitted logistic regression model under models/, creating the
# directory first if it does not exist yet.
models_dir = Path("models")
models_dir.mkdir(exist_ok=True)

model_path = models_dir / "classification_model_v1.joblib"
# Left as the last expression so the value returned by dump() is shown as the
# cell output.
dump(lr, model_path)

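This notebook trains a logistic regression model, reports and visualizes common classification metrics, and saves the fitted model to `models/classification_model_v1.joblib`. The optional XGBoost model is evaluated but not saved. Results may vary depending on your dataset version.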