# Task 5: Model Training, Selection, and Experiment Tracking

This notebook trains supervised models to predict the proxy risk label
created in Task 4 and compares model performance using MLflow.


In [None]:
import sys
from pathlib import Path
# ensure repository root is on sys.path so `src` is importable from notebooks
sys.path.insert(0, str(Path('..').resolve()))
import pandas as pd
import mlflow
from src.model_training import (
    prepare_data,
    train_and_evaluate,
    hash_dataframe
)


In [None]:
# Load the processed feature table that also carries the proxy target column.
df = pd.read_csv("../data/processed/features_with_target.csv")

# A bare `df.shape` in the middle of a cell is silently discarded (Jupyter only
# renders the cell's final expression), so print it explicitly.
print(f"Rows, columns: {df.shape}")

# Final expression: rendered as a rich table preview.
df.head()

In [None]:
# Class balance of the proxy target, expressed as proportions of all rows
# (drop `normalize=True` to see absolute counts instead).
df.loc[:, "is_high_risk"].value_counts(normalize=True)

In [None]:
# Build the train/test partitions with the project helper.
# NOTE(review): `test_size=0.2` presumably holds out 20% of rows for testing;
# confirm the split strategy and seeding inside `src.model_training.prepare_data`.
X_train, X_test, y_train, y_test = prepare_data(df, target_col="is_high_risk", test_size=0.2)



In [None]:
# Group every run from this notebook under one named MLflow experiment.
mlflow.set_experiment(experiment_name="Task_5_Model_Training")

# Fingerprint of the loaded dataframe; logged with each run below so results
# can be traced back to the data they were trained on.
data_hash = hash_dataframe(df)


In [None]:
# Train each candidate model inside its own MLflow run so that parameters,
# metrics, and the fitted estimator are tracked side by side for comparison.
# `results` collects one flat record per model for the summary table.
results = []

for model_name in ["logistic", "random_forest", "gradient_boosting"]:
    with mlflow.start_run(run_name=model_name):
        # Record which model type and which dataset fingerprint produced this run
        # (single batched call instead of one call per parameter).
        mlflow.log_params({"model_type": model_name, "data_hash": data_hash})

        # The helper returns a dict; this cell reads its "metrics" mapping and
        # its "best_estimator" object.
        output = train_and_evaluate(
            X_train, X_test, y_train, y_test, model_name
        )
        metrics = output["metrics"]

        # Log every evaluation metric in one batched call rather than looping
        # over `log_metric`.
        mlflow.log_metrics(metrics)

        # Persist the fitted estimator with the run.
        mlflow.sklearn.log_model(
            output["best_estimator"],
            name="model"
        )

        results.append({"model": model_name, **metrics})


In [None]:
# Leaderboard: one row per trained model, best ROC-AUC first.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="roc_auc", ascending=False)
)
results_df


- The champion model is selected based on ROC-AUC.
Tree-based models often outperform linear models on tabular data,
but that did not hold in this experiment (see the summary below);
logistic regression also remains valuable for interpretability.


- All evaluated models achieved strong predictive performance, with ROC-AUC values above 0.998, indicating excellent class separability. Logistic Regression outperformed more complex tree-based models across accuracy, precision, F1 score, and ROC-AUC, while maintaining full interpretability. Although Gradient Boosting achieved marginally higher recall, the trade-off in precision and transparency led to the selection of Logistic Regression as the champion model.