# Imports

In [1]:
import pandas as pd
import numpy as np

import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)


# Load Data

In [2]:
df = pd.read_csv(r"C:\Users\user\Desktop\Project\Bati_Bank\data\processed\loan.csv")
df.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,is_high_risk
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0,1
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0,0


In [8]:
#lists the columns with categorical data
object_train_df=df.select_dtypes(include=['object'])    
object_train_df.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'ProviderId', 'ProductId', 'ProductCategory',
       'ChannelId', 'TransactionStartTime'],
      dtype='object')

In [14]:
X = df.drop(columns=["is_high_risk"])
y = df["is_high_risk"]


# Define Target and Features

In [3]:
target = "is_high_risk"

X = df.drop(columns=[target])
y = df[target]

print(X.shape, y.shape)


(95662, 16) (95662,)


# Safety Check – Numeric Data

In [17]:
# Ensure only numeric features are used
df_model = df.select_dtypes(include=["int64", "float64"])

assert "is_high_risk" in df_model.columns, "Target column missing!"

X = df_model.drop(columns=["is_high_risk"])
y = df_model["is_high_risk"]

print("✅ Model input is numeric only")


✅ Model input is numeric only


# Train-Test Spli

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# Define Models and Hyperparameters

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

models = {
    "LogisticRegression": (
        LogisticRegression(
            max_iter=1000,
            class_weight="balanced"
        ),
        {
            "C": [0.01, 0.1, 1, 10]
        }
    ),
    "RandomForest": (
        RandomForestClassifier(random_state=42),
        {
            "n_estimators": [100, 200],
            "max_depth": [None, 10, 20]
        }
    )
}


# MLflow Experiment Setup

In [20]:
import mlflow
import mlflow.sklearn

mlflow.set_experiment("Credit_Risk_Model_Training")


<Experiment: artifact_location='file:///c:/Users/user/Desktop/Project/Bati_Bank/notebook/mlruns/2', creation_time=1765726255513, experiment_id='2', last_update_time=1765726255513, lifecycle_stage='active', name='Credit_Risk_Model_Training', tags={}>

# Train, Tune, and Track Models

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

results = []

for model_name, (model, params) in models.items():

    with mlflow.start_run(run_name=model_name):

        grid = GridSearchCV(
            estimator=model,
            param_grid=params,
            scoring="roc_auc",
            cv=3,
            n_jobs=-1
        )

        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_

        y_pred = best_model.predict(X_test)
        y_prob = best_model.predict_proba(X_test)[:, 1]

        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
            "roc_auc": roc_auc_score(y_test, y_prob)
        }

        mlflow.log_params(grid.best_params_)
        mlflow.log_metrics(metrics)

        mlflow.sklearn.log_model(
            best_model,
            artifact_path="model"
        )

        results.append({
            "Model": model_name,
            **metrics
        })




# Compare Model Results

In [24]:
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,accuracy,precision,recall,f1_score,roc_auc
0,LogisticRegression,0.218837,0.113818,0.852861,0.200834,0.518448
1,RandomForest,0.885381,0.541284,0.026794,0.05106,0.615198


# Best Model Selection

In [31]:
best_model_name = results_df.sort_values(
    "roc_auc", ascending=False
).iloc[0]["Model"]

mlflow.sklearn.log_model(
    best_model,
    artifact_path="best_model",
    registered_model_name="CreditRiskModel"
)




2025/12/14 18:58:16 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/14 18:58:16 INFO mlflow.store.db.utils: Updating database tables
2025/12/14 18:58:16 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/14 18:58:16 INFO alembic.runtime.migration: Will assume non-transactional DDL.
Successfully registered model 'CreditRiskModel'.
Created version '1' of model 'CreditRiskModel'.


<mlflow.models.model.ModelInfo at 0x18903aedac0>