# Model Training and Selection

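This notebook loads the processed features, splits them into train and test sets, and trains two models (Logistic Regression and Random Forest), tuning the hyperparameters of each with grid search. The best parameters and test-set metrics are reported for each model, and the best-performing model is then registered in the MLflow Model Registry.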

In [1]:
import pandas as pd
import mlflow

import os
import sys
# Put the parent of the current working directory on the import path
# (presumably so the `src` package imported below resolves — confirm layout).
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

from src.train import (split_data, grid_search_logistic_regression, 
                       grid_search_random_forest, evaluate_model_full)

In [2]:
# Read the processed feature table from disk (path is relative to the notebook's working directory).
features_csv = '../data/processed/features_with_risk.csv'
df = pd.read_csv(features_csv)

In [3]:
# Partition the data into train and test sets, separating out the target column.
target_col = 'is_high_risk'
X_train, X_test, y_train, y_test = split_data(df, target_col=target_col)

# Report the partition sizes, then the normalised class balance of each partition.
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
train_distribution = y_train.value_counts(normalize=True)
print(f"Train target distribution:\n{train_distribution}")
test_distribution = y_test.value_counts(normalize=True)
print(f"Test target distribution:\n{test_distribution}")

Train shape: (76529, 36), Test shape: (19133, 36)
Train target distribution:
is_high_risk
0    0.884802
1    0.115198
Name: proportion, dtype: float64
Test target distribution:
is_high_risk
0    0.884806
1    0.115194
Name: proportion, dtype: float64


In [4]:
# Grid-search the Logistic Regression hyperparameters, then evaluate the
# resulting model on the held-out test set.
tuned_logreg = grid_search_logistic_regression(X_train, y_train)
print('Tuned Logistic Regression Results:')
logreg_metrics = evaluate_model_full(tuned_logreg, X_test, y_test)
logreg_metrics  # bare last expression so the notebook displays the returned metrics

Best params: {'C': 0.01, 'max_iter': 1000, 'solver': 'liblinear'}
Tuned Logistic Regression Results:
Accuracy: 0.8848
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC-AUC: 0.7740343758747299
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     16929
           1       0.00      0.00      0.00      2204

    accuracy                           0.88     19133
   macro avg       0.44      0.50      0.47     19133
weighted avg       0.78      0.88      0.83     19133

Accuracy: 0.8848
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC-AUC: 0.7740343758747299
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     16929
           1       0.00      0.00      0.00      2204

    accuracy                           0.88     19133
   macro avg       0.44      0.50      0.47     19133
weighted avg       0.78      0.88      0.83     19133



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


{'accuracy': 0.8848063555114201,
 'precision': 0.0,
 'recall': 0.0,
 'f1': 0.0,
 'roc_auc': 0.7740343758747299}

In [5]:
# Grid-search the Random Forest hyperparameters, then evaluate the
# resulting model on the held-out test set.
tuned_rf = grid_search_random_forest(X_train, y_train)
print('Tuned Random Forest Results:')
rf_metrics = evaluate_model_full(tuned_rf, X_test, y_test)
rf_metrics  # bare last expression so the notebook displays the returned metrics

Best params: {'max_depth': None, 'n_estimators': 100}
Tuned Random Forest Results:
Accuracy: 0.9966
Precision: 0.9842
Recall: 0.9864
F1 Score: 0.9853
ROC-AUC: 0.9997970733754158
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16929
           1       0.98      0.99      0.99      2204

    accuracy                           1.00     19133
   macro avg       0.99      0.99      0.99     19133
weighted avg       1.00      1.00      1.00     19133

Accuracy: 0.9966
Precision: 0.9842
Recall: 0.9864
F1 Score: 0.9853
ROC-AUC: 0.9997970733754158
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16929
           1       0.98      0.99      0.99      2204

    accuracy                           1.00     19133
   macro avg       0.99      0.99      0.99     19133
weighted avg       1.00      1.00      1.00     19133



{'accuracy': 0.9966027282705273,
 'precision': 0.98415572657311,
 'recall': 0.9863883847549909,
 'f1': 0.98527079084523,
 'roc_auc': 0.9997970733754158}

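The best model is Random Forest: on the test set it reaches an F1 score of 0.985 and a ROC-AUC of 0.9998, whereas the tuned Logistic Regression never predicts the high-risk class at the default threshold (precision, recall and F1 are all 0.0, despite a ROC-AUC of 0.774). Scores this close to perfect are unusual, so it is worth confirming that no feature leaks the `is_high_risk` label before relying on this model.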

In [6]:
# Register the best model (Random Forest) in the MLflow Model Registry.
#
# Test metrics come from `evaluate_model_full`, which returns a dict with the
# keys accuracy / precision / recall / f1 / roc_auc. The earlier version logged
# `tuned_rf.score(X_test, y_test)` as 'test_roc_auc', but a scikit-learn
# classifier's `.score()` is mean accuracy, so the stored value was mislabelled.
# The evaluation runs before the MLflow run is opened, the same way the tuning
# cells call it. NOTE: this prints the evaluation report again.
test_metrics = evaluate_model_full(tuned_rf, X_test, y_test)

mlflow.set_experiment('Credit Risk Modeling')
with mlflow.start_run(run_name='Best Random Forest Model') as run:
    # Store the fitted estimator as a run artifact so it can be registered below.
    mlflow.sklearn.log_model(tuned_rf, 'random_forest_model')
    mlflow.log_param('model_type', 'RandomForestClassifier')
    mlflow.log_params(tuned_rf.get_params())
    # Log every test metric under an accurate name (test_accuracy, test_roc_auc, ...).
    mlflow.log_metrics(
        {f'test_{name}': float(value) for name, value in test_metrics.items()}
    )
    # Register the artifact logged above; an existing registered model name
    # gets a new version rather than being overwritten.
    result = mlflow.register_model(
        f"runs:/{run.info.run_id}/random_forest_model",
        "CreditRiskRandomForest"
    )
    print(f"Model registered in MLflow Model Registry: {result.name} (version {result.version})")

Registered model 'CreditRiskRandomForest' already exists. Creating a new version of this model...
Registered model 'CreditRiskRandomForest' already exists. Creating a new version of this model...
Created version '4' of model 'CreditRiskRandomForest'.
Created version '4' of model 'CreditRiskRandomForest'.


Model registered in MLflow Model Registry: CreditRiskRandomForest (version 4)
