# Task 5: Model Training, Selection, and Experiment Tracking

This notebook trains supervised models to predict the proxy risk label
created in Task 4 and compares model performance using MLflow.


In [2]:
import sys
from pathlib import Path
# ensure repository root is on sys.path so `src` is importable from notebooks
sys.path.insert(0, str(Path('..').resolve()))
import pandas as pd
import mlflow

# For local MLflow server (run: mlflow ui in terminal first)
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Option 1: Local file system (creates mlruns folder in current directory)
#mlflow.set_tracking_uri("file:///D:/Python/Week-4/Credit-Risk-Probability-Model/mlruns")

# Option 2: SQLite backend (recommended for better organization)
# mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Option 3: If you want to use the default local path
# mlflow.set_tracking_uri("file:./mlruns")

# Enable automatic logging of scikit-learn models
mlflow.sklearn.autolog()
# ===== END MLFLOW SETUP =====


from src.model_training import (
    prepare_data,
    train_and_evaluate,
    hash_dataframe
)


In [3]:
import pandas as pd
df = pd.read_csv("../data/processed/features_with_target.csv")
df.shape
df.head()

Unnamed: 0,CustomerId,Amount_sum,Amount_mean,Amount_max,Amount_std,Amount_skew,Value_sum,Value_mean,Value_max,Value_std,...,Amount_skew_log_std,Value_mean_log_std,Value_skew_log_std,Recent30_Amount_sum_log_std,Recent30_Amount_mean_log_std,Recent30_TransactionStartTime_count_log_std,ProductCategory_woe,ChannelId_woe,ProviderId_woe,is_high_risk
0,CustomerId_1,-10000.0,-10000.0,-10000.0,,,10000,10000.0,10000,,...,,0.759624,,,,,0.527011,1.091977,0.927025,0
1,CustomerId_10,-10000.0,-10000.0,-10000.0,,,10000,10000.0,10000,,...,,0.759624,,,,,0.527011,1.091977,0.927025,0
2,CustomerId_1001,20000.0,4000.0,10000.0,6558.963333,-0.545422,30400,6080.0,10000,4100.243895,...,-1.143349,0.391925,-1.341493,,,,-0.218502,-0.07548,0.927025,0
3,CustomerId_1002,4225.0,384.090909,1500.0,560.498966,0.958495,4775,434.090909,1500,518.805446,...,0.346388,-1.557271,0.259315,-1.727572,-1.667262,-0.792125,-0.218502,1.091977,0.927025,1
4,CustomerId_1003,20000.0,3333.333333,10000.0,6030.478146,-0.098567,32000,5333.333333,10000,3945.461528,...,-0.445057,0.295104,-0.240662,-0.007687,0.193369,-0.263726,0.527011,-0.07548,0.537873,1


In [4]:
df["is_high_risk"].value_counts(normalize=True)
#df["is_high_risk"].value_counts()

is_high_risk
1    0.618653
0    0.381347
Name: proportion, dtype: float64

In [5]:
X_train, X_test, y_train, y_test = prepare_data(
    df,
    target_col="is_high_risk",
    test_size=0.2
)



In [6]:


mlflow.set_experiment("Task_5_Model_Training")
data_hash = hash_dataframe(df)


In [None]:
import os

# ensure artifact directory exists under the notebooks folder
ARTIFACT_DIR = "artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

results = []

for model_name in ["logistic", "random_forest", "gradient_boosting"]:
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("data_hash", data_hash)

        output = train_and_evaluate(
            X_train, X_test, y_train, y_test, model_name
        )

        # Log metrics
        for k, v in output["metrics"].items():
            mlflow.log_metric(k, v)
        
        # ===== CRITICAL FIX: Log the model to MLflow =====
        if "best_estimator" in output and output["best_estimator"] is not None:
            # Log the sklearn model to MLflow
            mlflow.sklearn.log_model(
                sk_model=output["best_estimator"],
                artifact_path="model"  # This creates the "model" artifact
            )
            print(f"‚úÖ Model artifact saved for {model_name}")
        else:
            print(f"‚ö†Ô∏è Warning: No model found for {model_name}")
        # =================================================

        # Log plots as artifacts
        plot_files = [
            f"{ARTIFACT_DIR}/{model_name}_cm.png",
            f"{ARTIFACT_DIR}/{model_name}_roc.png", 
            f"{ARTIFACT_DIR}/{model_name}_pr.png",
            f"{ARTIFACT_DIR}/{model_name}_fi.png"
        ]
        
        for plot_file in plot_files:
            if os.path.exists(plot_file):
                mlflow.log_artifact(plot_file)
        
        # Log feature importance if exists
        if output.get("feature_importance_path") and os.path.exists(output["feature_importance_path"]):
            mlflow.log_artifact(output["feature_importance_path"])

        results.append({
            "model": model_name,
            **output["metrics"]
        })



In [None]:
results_df = pd.DataFrame(results).sort_values("roc_auc", ascending=False)
results_df


Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc
0,logistic,0.985314,0.989177,0.987041,0.988108,0.998731
2,gradient_boosting,0.981308,0.976645,0.993521,0.985011,0.998376
1,random_forest,0.975968,0.976445,0.984881,0.980645,0.998225


In [None]:
# ===== SIMPLIFIED DYNAMIC CHAMPION SELECTION =====
import mlflow
from mlflow.tracking import MlflowClient

# Get the best run
experiment = mlflow.get_experiment_by_name("Task_5_Model_Training")
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.roc_auc DESC"]
)

if not runs.empty:
    # Take the best run
    best_run = runs.iloc[0]
    best_run_id = best_run["run_id"]
    best_model_name = best_run["tags.mlflow.runName"]
    best_roc_auc = best_run["metrics.roc_auc"]
    
    print(f"üèÜ Champion: {best_model_name} (ROC-AUC: {best_roc_auc:.6f})")
    
    # FIRST, ensure the run has a model artifact
    client = MlflowClient()
    
    # Check artifacts - if no 'model' artifact, log a placeholder
    artifacts = client.list_artifacts(best_run_id)
    has_model = any('model' in a.path.lower() for a in artifacts)
    
    if not has_model:
        print("‚ö†Ô∏è No 'model' artifact found. Creating one...")
        # You would need to load your actual model here
        # model = load_your_model()  # You need to implement this
        # with mlflow.start_run(run_id=best_run_id):
        #     mlflow.sklearn.log_model(model, "model")
        print("‚ùå Cannot proceed - no model artifact exists")
        print("üí° Re-run your training cell to ensure model is saved")
    else:
        # Try to register
        try:
            # Try different common paths
            for path in ["model", "sklearn-model", "artifacts/model"]:
                try:
                    model_uri = f"runs:/{best_run_id}/{path}"
                    registered_model = mlflow.register_model(
                        model_uri=model_uri,
                        name="credit_risk_champion_model"
                    )
                    print(f"‚úÖ Registered using path '{path}': version {registered_model.version}")
                    break
                except:
                    continue
            else:
                print("‚ùå Could not register with any path")
                
        except Exception as e:
            print(f"‚ö†Ô∏è Registration failed: {e}")
            print("Tagging run as champion instead...")
            client.set_tag(best_run_id, "champion", "true")

# ===== END =====

üèÜ Champion: logistic (ROC-AUC: 0.998731)
‚ö†Ô∏è No 'model' artifact found. Creating one...
‚ùå Cannot proceed - no model artifact exists
üí° Re-run your training cell to ensure model is saved


## Results Summary:

After comprehensive model evaluation, the performance metrics are:

| Model | Accuracy | Precision | Recall | F1 | ROC-AUC |
|-------|----------|-----------|--------|----|---------|
| Logistic Regression | 0.985 | 0.989 | 0.987 | 0.988 | **0.99873** |
| Gradient Boosting | 0.981 | 0.977 | 0.994 | 0.985 | 0.99838 |
| Random Forest | 0.976 | 0.976 | 0.985 | 0.981 | 0.99823 |

**Champion Model Selection**: 

**Logistic Regression** is selected as the champion model based on:
- **Highest ROC-AUC (0.99873)** - primary evaluation metric
- **Best precision (0.989)** - crucial for minimizing false positives in credit risk assessment
- **Excellent interpretability** - coefficients provide clear feature importance
- **Strong balance** across all metrics (accuracy, F1, recall)

While Gradient Boosting achieved the highest recall (0.994), the superior precision and ROC-AUC of Logistic Regression, combined with its transparency for stakeholders, make it the optimal choice for this credit risk prediction task.