# Credit Risk Model - Model Training and Evaluation

This notebook demonstrates the training and evaluation of our credit risk model using LightGBM and MLflow for experiment tracking.

## Process Overview
1. Data loading and preparation
2. Model configuration and training
3. Model evaluation using ROC AUC
4. Experiment tracking with MLflow
5. Model explainability with SHAP

## Setup and Imports

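## Prerequisites Check

Before training, verify that all required dependencies and data files are available. The verification cells (the package/data-file check and the MLflow setup check) are located near the end of this notebook; run them first when working in a fresh environment.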

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import mlflow
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Import our custom model
from src.models.lightgbm_model import LightGBMModel

# Input locations, resolved relative to the current working directory
DATA_DIR = Path('../data/processed')
TRAIN_PATH = DATA_DIR.joinpath('application_train_processed.csv')

# Single seed shared by NumPy's global RNG and every `random_state` argument below
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# All runs started from this notebook are grouped under this MLflow experiment
EXPERIMENT_NAME = "Credit_Risk_Baseline"
mlflow.set_experiment(EXPERIMENT_NAME)

## Load and Prepare Data

Load the processed training data and prepare it for model training.

In [None]:
# Read the processed training table from disk
df = pd.read_csv(TRAIN_PATH)

# TARGET is the label; every remaining column is passed to the model as a feature
y = df['TARGET']
X = df.drop(columns='TARGET')

# Hold out 20% of the rows for validation, stratified on the label so both
# partitions keep the same class proportions
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

# Relative class frequencies in the training partition
print(f"\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))

## Model Training with MLflow Tracking

Train the LightGBM model and track the experiment with MLflow.

In [None]:
# Define model hyperparameters
params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'num_leaves': 31,
    'min_child_samples': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': RANDOM_STATE
}

# Everything inside this block is recorded against a single MLflow run.
with mlflow.start_run(run_name="lightgbm_baseline"):
    # Log hyperparameters
    mlflow.log_params(params)

    # Initialize and train the model
    model = LightGBMModel(**params)
    model.fit(X_train, y_train)

    # Positive-class probabilities on the validation set
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    # Calculate and log ROC AUC
    roc_auc = roc_auc_score(y_val, y_pred_proba)
    mlflow.log_metric("roc_auc_score", roc_auc)

    # Log gain-based feature importance, one metric per feature
    feature_importance = model.get_feature_importance(importance_type='gain')
    for feature, importance in feature_importance.items():
        mlflow.log_metric(f"feature_importance_{feature}", importance)

    # SHAP Explainability Analysis
    print("Generating SHAP explanations...")

    # Create SHAP explainer on the underlying estimator held by the wrapper
    explainer = shap.TreeExplainer(model.model)

    # Explain only a sample of the validation set to keep SHAP runtime bounded
    sample_size = min(1000, len(X_val))
    X_val_sample = X_val.sample(n=sample_size, random_state=RANDOM_STATE)

    raw_shap_values = explainer.shap_values(X_val_sample)

    # Depending on the SHAP version, a binary classifier yields either one
    # (n_samples, n_features) array, a list with one such array per class, or a
    # (n_samples, n_features, n_classes) array. Reduce all of them to the
    # positive-class 2-D array so the per-sample and per-feature code below is valid.
    if isinstance(raw_shap_values, list):
        shap_values = np.asarray(raw_shap_values[-1])
    elif np.ndim(raw_shap_values) == 3:
        shap_values = np.asarray(raw_shap_values)[:, :, -1]
    else:
        shap_values = np.asarray(raw_shap_values)

    # expected_value is a scalar or one value per class; keep the positive-class one
    base_value = float(np.ravel(explainer.expected_value)[-1])

    # Generate SHAP summary bar plot (mean |SHAP| per feature)
    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_values, X_val_sample, plot_type="bar", show=False)
    plt.tight_layout()

    # Save the plot
    shap_plot_path = "shap_summary.png"
    plt.savefig(shap_plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    # Log the SHAP plot as MLflow artifact
    mlflow.log_artifact(shap_plot_path)

    # Waterfall plot for the first sampled row. The current SHAP plotting API
    # takes a single Explanation object rather than the legacy
    # (expected_value, shap_values, features) positional arguments.
    first_row_explanation = shap.Explanation(
        values=shap_values[0],
        base_values=base_value,
        data=X_val_sample.iloc[0].to_numpy(),
        feature_names=X_val_sample.columns.tolist(),
    )
    plt.figure(figsize=(10, 6))
    shap.plots.waterfall(first_row_explanation, show=False)
    plt.tight_layout()

    waterfall_plot_path = "shap_waterfall.png"
    plt.savefig(waterfall_plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    # Log the waterfall plot as well
    mlflow.log_artifact(waterfall_plot_path)

    # Log SHAP statistics
    mlflow.log_metric("shap_mean_abs_value", np.mean(np.abs(shap_values)))
    mlflow.log_metric("shap_sample_size", sample_size)

    # Log the model and register it under a fixed registry name
    mlflow.lightgbm.log_model(
        model.model,
        "model",
        registered_model_name="credit_risk_lightgbm"
    )

    print(f"ROC AUC Score: {roc_auc:.4f}")
    print(f"SHAP values calculated for {sample_size} samples")

    # Display top 10 most important features (gain-based)
    importance_df = pd.DataFrame({
        'feature': feature_importance.keys(),
        'importance': feature_importance.values()
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    display(importance_df.head(10))

    # Display SHAP feature importance (mean absolute SHAP value per feature)
    shap_importance = pd.DataFrame({
        'feature': X_val_sample.columns,
        'mean_abs_shap_value': np.mean(np.abs(shap_values), axis=0)
    }).sort_values('mean_abs_shap_value', ascending=False)

    print("\nTop 10 Features by SHAP Importance:")
    display(shap_importance.head(10))

## Model Artifacts and Results

The trained model and its metrics are now tracked in MLflow. You can access them using the MLflow UI or programmatically.

To view the MLflow UI, run:
```bash
mlflow ui
```

The logged run data includes:
- Model hyperparameters
- ROC AUC score
- Feature importance scores
- SHAP summary statistics and plots (summary bar plot and waterfall plot)
- Trained model (can be loaded for predictions)

In [None]:
# Example: Load the saved model from MLflow
# Resolve the run ID at execution time instead of hard-coding it, so the cell
# works after Restart & Run All. `last_active_run` returns the most recent run
# of this session, so run this cell directly after the training cell.
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError(
        "No MLflow run found in this session; run the training cell first."
    )
loaded_model = mlflow.lightgbm.load_model(f"runs:/{last_run.info.run_id}/model")

# Make predictions with the loaded model
# NOTE(review): assumes the logged object exposes `predict_proba` (i.e. a
# scikit-learn style LightGBM estimator) — confirm against LightGBMModel.model.
predictions = loaded_model.predict_proba(X_val)[:, 1]

# Verify the predictions are the same as before
print(f"Predictions match: {np.allclose(predictions, y_pred_proba)}")

In [None]:
import sys
from pathlib import Path

def check_package_version(package_name):
    """Report whether a package can be imported and, if so, its version.

    Args:
        package_name: Distribution name as it would be passed to
            ``pip install`` (e.g. ``'scikit-learn'``).

    Returns:
        A status line starting with ``✓`` that includes the module's
        ``__version__`` attribute (or ``'unknown'`` when it has none) if the
        import succeeds, otherwise a line starting with ``✗``.
    """
    # Distribution names whose importable module name differs; importing the
    # distribution name directly would always fail for these.
    import_names = {'scikit-learn': 'sklearn'}
    module_name = import_names.get(package_name, package_name)

    try:
        package = __import__(module_name)
    except ImportError:
        return f"✗ {package_name} is not installed"

    version = getattr(package, '__version__', 'unknown')
    return f"✓ {package_name} (version {version})"

# Distributions this notebook needs, named as they would be given to pip
required_packages = [
    'pandas', 'numpy', 'scikit-learn', 'lightgbm',
    'mlflow', 'shap', 'matplotlib',
]

# One status line per package
print("Checking required packages:")
for package in required_packages:
    print(check_package_version(package))

# Interpreter version (first token of sys.version)
print(f"\nPython version: {sys.version.split()[0]}")

# Report whether the processed training file is present
data_path = Path('../data/processed/application_train_processed.csv')
print(f"\nChecking data file:")
if data_path.exists():
    print(f"✓ {data_path} (exists)")
else:
    print(f"✗ {data_path} (does not exist)")

# A package counts as missing when its status line does not start with the check mark
missing_packages = list(filter(
    lambda name: not check_package_version(name).startswith('✓'),
    required_packages,
))

# Suggest a single install command covering everything that is missing
if missing_packages:
    print("\nTo install missing packages, run:")
    print(f"pip install {' '.join(missing_packages)}")

# Point the reader at the expected data location when the file is absent
if not data_path.exists():
    print("\nWarning: Training data file not found!")
    print("Please ensure the processed data file is available at:")
    print(data_path)

## MLflow Setup Verification

Let's verify MLflow is properly configured and we can create experiments.

In [None]:
import mlflow

# Smoke-test the MLflow configuration: tracking URI, experiment creation and
# run logging. Failures are reported with setup hints instead of raising.
print("Testing MLflow setup:")

try:
    # Get MLflow tracking URI
    tracking_uri = mlflow.get_tracking_uri()
    print(f"✓ MLflow tracking URI: {tracking_uri}")

    # Resolve (or create) the throwaway experiment by name instead of calling
    # mlflow.set_experiment, which would leave "test_experiment" active and
    # send any later runs from this kernel to the wrong experiment.
    experiment_name = "test_experiment"
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = mlflow.create_experiment(experiment_name)
    else:
        experiment_id = experiment.experiment_id
    print(f"✓ Successfully created test experiment: {experiment_name}")

    # Test run creation, scoped explicitly to the test experiment
    with mlflow.start_run(experiment_id=experiment_id, run_name="test_run"):
        mlflow.log_param("test_param", "test_value")
        print("✓ Successfully created test run and logged parameter")

    print("\nMLflow is properly configured!")

except Exception as e:
    # Deliberately broad: this cell is a diagnostic and should always finish
    # with actionable guidance rather than a traceback.
    print(f"✗ MLflow setup error: {str(e)}")
    print("\nTo configure MLflow:")
    print("1. Ensure MLflow is installed: pip install mlflow")
    print("2. Set MLFLOW_TRACKING_URI environment variable if using remote tracking server")
    print("3. Ensure you have write permissions in the local directory for MLflow files")

## SHAP (SHapley Additive exPlanations) Analysis

SHAP provides model explainability by calculating the contribution of each feature to individual predictions. The analysis includes:

1. **SHAP Summary Bar Plot**: Shows the mean absolute SHAP values for each feature, indicating their overall importance
2. **SHAP Waterfall Plot**: Demonstrates how each feature contributes to a single prediction
3. **Feature Importance Comparison**: Compares traditional feature importance with SHAP-based importance

The SHAP plots are saved as artifacts in MLflow for future reference and model documentation.