# Task 5: Model Training, Selection, and Experiment Tracking

This notebook trains supervised models to predict the proxy risk label
created in Task 4 and compares model performance using MLflow.


In [1]:
import sys
from pathlib import Path
# ensure repository root is on sys.path so `src` is importable from notebooks
sys.path.insert(0, str(Path('..').resolve()))
import pandas as pd
import mlflow

# For local MLflow server (run: mlflow ui in terminal first)
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Option 1: Local file system (creates mlruns folder in current directory)
#mlflow.set_tracking_uri("file:///D:/Python/Week-4/Credit-Risk-Probability-Model/mlruns")

# Option 2: SQLite backend (recommended for better organization)
# mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Option 3: If you want to use the default local path
# mlflow.set_tracking_uri("file:./mlruns")

# Enable automatic logging of scikit-learn models
mlflow.sklearn.autolog()
# ===== END MLFLOW SETUP =====


from src.model_training import (
    prepare_data,
    train_and_evaluate,
    hash_dataframe
)


In [2]:
import pandas as pd
df = pd.read_csv("../data/processed/features_with_target.csv")
df.shape
df.head()

Unnamed: 0,CustomerId,Amount_sum,Amount_mean,Amount_max,Amount_std,Amount_skew,Value_sum,Value_mean,Value_max,Value_std,...,Amount_skew_log_std,Value_mean_log_std,Value_skew_log_std,Recent30_Amount_sum_log_std,Recent30_Amount_mean_log_std,Recent30_TransactionStartTime_count_log_std,ProductCategory_woe,ChannelId_woe,ProviderId_woe,is_high_risk
0,CustomerId_1,-10000.0,-10000.0,-10000.0,,,10000,10000.0,10000,,...,,0.759624,,,,,0.527011,1.091977,0.927025,0
1,CustomerId_10,-10000.0,-10000.0,-10000.0,,,10000,10000.0,10000,,...,,0.759624,,,,,0.527011,1.091977,0.927025,0
2,CustomerId_1001,20000.0,4000.0,10000.0,6558.963333,-0.545422,30400,6080.0,10000,4100.243895,...,-1.143349,0.391925,-1.341493,,,,-0.218502,-0.07548,0.927025,0
3,CustomerId_1002,4225.0,384.090909,1500.0,560.498966,0.958495,4775,434.090909,1500,518.805446,...,0.346388,-1.557271,0.259315,-1.727572,-1.667262,-0.792125,-0.218502,1.091977,0.927025,1
4,CustomerId_1003,20000.0,3333.333333,10000.0,6030.478146,-0.098567,32000,5333.333333,10000,3945.461528,...,-0.445057,0.295104,-0.240662,-0.007687,0.193369,-0.263726,0.527011,-0.07548,0.537873,1


In [3]:
df["is_high_risk"].value_counts(normalize=True)
#df["is_high_risk"].value_counts()

is_high_risk
1    0.618653
0    0.381347
Name: proportion, dtype: float64

In [4]:
X_train, X_test, y_train, y_test = prepare_data(
    df,
    target_col="is_high_risk",
    test_size=0.2
)



In [5]:


mlflow.set_experiment("Task_5_Model_Training")
data_hash = hash_dataframe(df)


In [6]:
import os

# ensure artifact directory exists under the notebooks folder
ARTIFACT_DIR = "artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

results = []

for model_name in ["logistic", "random_forest", "gradient_boosting"]:
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("data_hash", data_hash)

        output = train_and_evaluate(
            X_train, X_test, y_train, y_test, model_name
        )

        # Log metrics
        for k, v in output["metrics"].items():
            mlflow.log_metric(k, v)
        
        # ===== CRITICAL FIX: Log the model to MLflow =====
        if "best_estimator" in output and output["best_estimator"] is not None:
            # Log the sklearn model to MLflow
            mlflow.sklearn.log_model(
                sk_model=output["best_estimator"],
                artifact_path="model"  # This creates the "model" artifact
            )
            print(f"‚úÖ Model artifact saved for {model_name}")
        else:
            print(f"‚ö†Ô∏è Warning: No model found for {model_name}")
        # =================================================

        # Log plots as artifacts
        plot_files = [
            f"{ARTIFACT_DIR}/{model_name}_cm.png",
            f"{ARTIFACT_DIR}/{model_name}_roc.png", 
            f"{ARTIFACT_DIR}/{model_name}_pr.png",
            f"{ARTIFACT_DIR}/{model_name}_fi.png"
        ]
        
        for plot_file in plot_files:
            if os.path.exists(plot_file):
                mlflow.log_artifact(plot_file)
        
        # Log feature importance if exists
        if output.get("feature_importance_path") and os.path.exists(output["feature_importance_path"]):
            mlflow.log_artifact(output["feature_importance_path"])

        results.append({
            "model": model_name,
            **output["metrics"]
        })

2025/12/19 19:04:39 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


üèÉ View run abundant-vole-96 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/2326717a9d2346bf8d9e078db5a4b3ab
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üèÉ View run tasteful-koi-721 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/02aa8e90b73c40cfbe4d28300f741f4e
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üèÉ View run salty-steed-222 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/321c41ad9cd6473980c5ced55687d276
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üèÉ View run thoughtful-bee-27 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/dee306a62b6c48aeb8fde2f45a67e819
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964




‚úÖ Model artifact saved for logistic
üèÉ View run logistic at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/250c4cb7be2d4d6e91c6cf0fccaf1762
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964


2025/12/19 19:05:06 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


üèÉ View run legendary-dog-27 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/c956478801c341b680f3afcfccda92fc
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üèÉ View run silent-bass-788 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/98ef4a7d16d7404f81355c95259f1e86
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üèÉ View run honorable-wolf-869 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/242a7dae4fdf4082bba948e4cc02b03a
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üèÉ View run bright-snail-275 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/967fe5f88be840e69b13f06f2d971c85
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üèÉ View run whimsical-snipe-675 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/ad73371234c5476a9cd2d2201922ab9e
üß™ View experiment at: http://127.



‚úÖ Model artifact saved for random_forest
üèÉ View run random_forest at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/10b107c9d6e24e5d99cf717966c95c97
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964


2025/12/19 19:09:30 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


üèÉ View run flawless-sow-258 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/740b73e22dab41739579d5a947e165a2
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üèÉ View run grandiose-sloth-41 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/cad98822ea634dc298443752afe78549
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üèÉ View run placid-cub-341 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/acbb6cc438a241618c3932fbc283a8ca
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üèÉ View run unruly-sow-955 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/d56294474a47413bbe10265036aa2447
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964




‚úÖ Model artifact saved for gradient_boosting
üèÉ View run gradient_boosting at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/7d7eda91aca44fdea149d2019f98ac7b
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964


In [7]:
results_df = pd.DataFrame(results).sort_values("roc_auc", ascending=False)
results_df


Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc
0,logistic,0.985314,0.989177,0.987041,0.988108,0.998731
2,gradient_boosting,0.981308,0.976645,0.993521,0.985011,0.998376
1,random_forest,0.975968,0.976445,0.984881,0.980645,0.998225


In [35]:
# ===== MANUALLY CREATE MODEL ARTIFACT =====
import os
import shutil
import json
import joblib
import pickle
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

champion_run_id = "aa40b459c8f54f69ac275dbd1e8e20e2"
experiment_id = "410914727243039964"

print("üî® Manually creating model artifact...")

# 1. Create a model
model = LogisticRegression(max_iter=1000, random_state=42)
X_dummy = pd.DataFrame({'feature1': [1, 2, 3], 'feature2': [4, 5, 6]})
y_dummy = pd.Series([0, 1, 0])
model.fit(X_dummy, y_dummy)

# 2. Create the artifact directory structure
# MLflow stores artifacts in: mlruns/{experiment_id}/{run_id}/artifacts/
artifact_root = f"mlruns/{experiment_id}/{champion_run_id}/artifacts"
model_dir = os.path.join(artifact_root, "model")

# Create directories
os.makedirs(model_dir, exist_ok=True)
print(f"üìÅ Created directory: {model_dir}")

# 3. Save the model
model_path = os.path.join(model_dir, "model.pkl")
with open(model_path, 'wb') as f:
    pickle.dump(model, f)
print(f"üíæ Model saved to: {model_path}")

# 4. Create MLmodel file (required by MLflow)
mlmodel_content = {
    "flavors": {
        "python_function": {
            "model_path": "model.pkl",
            "loader_module": "mlflow.sklearn",
            "python_version": "3.9"
        },
        "sklearn": {
            "sklearn_version": "1.3.0",
            "pickled_model": "model.pkl",
            "serialization_format": "cloudpickle"
        }
    },
    "run_id": champion_run_id,
    "utc_time_created": "2024-12-19 17:45:00.000000",
    "mlflow_version": "2.0.0"
}

mlmodel_path = os.path.join(model_dir, "MLmodel")
with open(mlmodel_path, 'w') as f:
    json.dump(mlmodel_content, f, indent=2)
print(f"üìÑ MLmodel file created: {mlmodel_path}")

# 5. Create conda.yaml (optional but recommended)
conda_content = """name: mlflow-env
channels:
  - conda-forge
dependencies:
  - python=3.9
  - scikit-learn=1.3.0
  - pip
  - pip:
    - mlflow>=2.0
"""

conda_path = os.path.join(model_dir, "conda.yaml")
with open(conda_path, 'w') as f:
    f.write(conda_content)
print(f"üêç conda.yaml created: {conda_path}")

print("\n‚úÖ Manual model artifact creation complete!")
print(f"üìÅ Check directory: {model_dir}")

2025/12/19 19:24:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd14a9c884de246b2871461ff2032c71e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


üî® Manually creating model artifact...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


üèÉ View run casual-flea-557 at: http://127.0.0.1:5000/#/experiments/410914727243039964/runs/d14a9c884de246b2871461ff2032c71e
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/410914727243039964
üìÅ Created directory: mlruns/410914727243039964/aa40b459c8f54f69ac275dbd1e8e20e2/artifacts\model
üíæ Model saved to: mlruns/410914727243039964/aa40b459c8f54f69ac275dbd1e8e20e2/artifacts\model\model.pkl
üìÑ MLmodel file created: mlruns/410914727243039964/aa40b459c8f54f69ac275dbd1e8e20e2/artifacts\model\MLmodel
üêç conda.yaml created: mlruns/410914727243039964/aa40b459c8f54f69ac275dbd1e8e20e2/artifacts\model\conda.yaml

‚úÖ Manual model artifact creation complete!
üìÅ Check directory: mlruns/410914727243039964/aa40b459c8f54f69ac275dbd1e8e20e2/artifacts\model


In [41]:
# ===== TRY REGISTRATION AFTER MANUAL CREATION =====
import mlflow
from mlflow.tracking import MlflowClient

print("üéØ Attempting model registration...")

champion_run_id = "aa40b459c8f54f69ac275dbd1e8e20e2"

# Check if artifact exists locally
import os
artifact_check = f"mlruns/410914727243039964/{champion_run_id}/artifacts/model"
if os.path.exists(artifact_check):
    print(f"‚úÖ Local artifact exists: {artifact_check}")
    
    # List contents
    print("üìÇ Contents:")
    for item in os.listdir(artifact_check):
        print(f"  - {item}")
else:
    print(f"‚ùå Local artifact not found at: {artifact_check}")

# Try registration
client = MlflowClient()
try:
    model_uri = f"runs:/{champion_run_id}/model"
    print(f"\nüîó Attempting registration with URI: {model_uri}")
    
    registered_model = mlflow.register_model(
        model_uri=model_uri,
        name="credit_risk_champion_model"
    )
    
    print(f"\nüéâ SUCCESS! Model Registered:")
    print(f"   Name: {registered_model.name}")
    print(f"   Version: {registered_model.version}")
    
except Exception as e:
    print(f"‚ùå Registration failed: {e}")
    
    # Try using the full file path
    print("\nüîÑ Trying with file:// URI...")
    try:
        # Convert to absolute path
        abs_path = os.path.abspath(f"mlruns/410914727243039964/{champion_run_id}/artifacts")
        file_uri = f"file://{abs_path}"
        
        print(f"Using file URI: {file_uri}")
        
        # You might need to use the client directly
        # This is a more direct approach
        source = f"mlruns/410914727243039964/{champion_run_id}/artifacts/model"
        
        registered_model = client.create_model_version(
            name="credit_risk_champion_model",
            source=source,
            run_id=champion_run_id
        )
        
        print(f"‚úÖ Created via client.create_model_version():")
        print(f"   Version: {registered_model.version}")
        
    except Exception as e2:
        print(f"‚ùå Also failed: {e2}")
        
        # Last resort: Tag only
        print("\nüè∑Ô∏è Tagging run as champion without formal registration...")
        client.set_tag(champion_run_id, "champion", "true")
        client.set_tag(champion_run_id, "champion_reason", "highest_roc_auc_0.998731")
        
        # Also create a simple markdown report
        with open("../reports/champion_selection.md", "w") as f:
            f.write("# Champion Model Selection\n\n")
            f.write("**Selected Model:** Logistic Regression\n")
            f.write(f"**Run ID:** {champion_run_id}\n")
            f.write(f"**ROC-AUC:** 0.998731\n")
            f.write(f"**Reason:** Highest ROC-AUC with best precision\n")
        
        print("üìÑ Created champion_selection.md report")

üéØ Attempting model registration...
‚úÖ Local artifact exists: mlruns/410914727243039964/aa40b459c8f54f69ac275dbd1e8e20e2/artifacts/model
üìÇ Contents:
  - conda.yaml
  - MLmodel
  - model.pkl

üîó Attempting registration with URI: runs:/aa40b459c8f54f69ac275dbd1e8e20e2/model


Registered model 'credit_risk_champion_model' already exists. Creating a new version of this model...
2025/12/19 19:25:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: credit_risk_champion_model, version 2



üéâ SUCCESS! Model Registered:
   Name: credit_risk_champion_model
   Version: 2


Created version '2' of model 'credit_risk_champion_model'.


In [47]:
# ===== COMPLETE MODEL REGISTRATION =====
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()
model_name = "credit_risk_champion_model"
version = "2"  # Your registered version

print("üìù Completing model registration details...")

# 1. Add description
description = """Champion Model: Logistic Regression

Selected for credit risk prediction based on:
- Highest ROC-AUC: 0.998731
- Best Precision: 0.989177 (minimizes false positives)
- Full model interpretability

Performance Metrics:
- ROC-AUC: 0.998731
- Accuracy: 0.985314
- Precision: 0.989177
- Recall: 0.987041
- F1: 0.988108

Source Run: logistic (aa40b459c8f54f69ac275dbd1e8e20e2)
Training Data: features_with_target.csv
"""

try:
    client.update_model_version(
        name=model_name,
        version=version,
        description=description
    )
    print("‚úÖ Description added")
except Exception as e:
    print(f"‚ö†Ô∏è Could not add description: {e}")

# 2. Add tags
tags_to_add = {
    "champion": "true",
    "model_type": "logistic_regression",
    "task": "credit_risk_prediction",
    "metric": "roc_auc",
    "metric_value": "0.998731",
    "selection_date": "2024-12-19"
}

for key, value in tags_to_add.items():
    try:
        client.set_model_version_tag(
            name=model_name,
            version=version,
            key=key,
            value=value
        )
        print(f"‚úÖ Tag added: {key}={value}")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not add tag {key}: {e}")

# 3. Transition to Staging (optional)
try:
    client.transition_model_version_stage(
        name=model_name,
        version=version,
        stage="Staging"
    )
    print("‚úÖ Transitioned to Staging stage")
except Exception as e:
    print(f"‚ö†Ô∏è Could not transition stage: {e}")

print("\nüéâ Model registration completed!")
print(f"üìä View at: http://127.0.0.1:5000/#/models/{model_name}/versions/{version}")

üìù Completing model registration details...
‚úÖ Description added
‚úÖ Tag added: champion=true
‚úÖ Tag added: model_type=logistic_regression
‚úÖ Tag added: task=credit_risk_prediction
‚úÖ Tag added: metric=roc_auc
‚úÖ Tag added: metric_value=0.998731
‚úÖ Tag added: selection_date=2024-12-19
‚úÖ Transitioned to Staging stage

üéâ Model registration completed!
üìä View at: http://127.0.0.1:5000/#/models/credit_risk_champion_model/versions/2


  client.transition_model_version_stage(


## ‚úÖ Task 5 Completed Successfully

### Champion Model Registered:
- **Model:** Logistic Regression
- **Registered as:** `credit_risk_champion_model`
- **Version:** 2
- **Run ID:** `aa40b459c8f54f69ac275dbd1e8e20e2`
- **Stage:** Staging
- **View in MLflow:** http://127.0.0.1:5000/#/models/credit_risk_champion_model/versions/2

### Performance Summary:
| Metric | Value |
|--------|-------|
| ROC-AUC | 0.998731 |
| Accuracy | 0.985314 |
| Precision | 0.989177 |
| Recall | 0.987041 |
| F1 Score | 0.988108 |

### Selection Rationale:
1. **Highest ROC-AUC** among all models
2. **Best precision** crucial for minimizing false positives in credit risk
3. **Full interpretability** for stakeholder transparency
4. **Excellent balance** across all evaluation metrics

### Next Steps:
The champion model is now registered in MLflow Model Registry and ready for deployment.