In [12]:
import sys
import os

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print("Project root added:", PROJECT_ROOT)


Project root added: c:\Users\Josh\Desktop\credit-risk-model


In [8]:
import mlflow
import mlflow.sklearn


In [9]:
mlflow.set_experiment("credit-risk-modeling")


<Experiment: artifact_location='file:c:/Users/Josh/Desktop/credit-risk-model/notebooks/mlruns/1', creation_time=1765894038142, experiment_id='1', last_update_time=1765894038142, lifecycle_stage='active', name='credit-risk-modeling', tags={}>

In [16]:
import pandas as pd

df = pd.read_csv("../data/raw_data/data.csv")
print(df.shape)
df.head()


(95662, 16)


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [17]:
customer_features_df = (
    df.groupby("CustomerId")
    .agg(
        total_transaction_value=("Amount", "sum"),
        avg_transaction_value=("Amount", "mean"),
        transaction_count=("TransactionId", "count"),
        negative_transaction_count=("Amount", lambda x: (x < 0).sum()),
    )
    .reset_index()
)

customer_features_df.head()


Unnamed: 0,CustomerId,total_transaction_value,avg_transaction_value,transaction_count,negative_transaction_count
0,CustomerId_1,-10000.0,-10000.0,1,1
1,CustomerId_10,-10000.0,-10000.0,1,1
2,CustomerId_1001,20000.0,4000.0,5,2
3,CustomerId_1002,4225.0,384.090909,11,6
4,CustomerId_1003,20000.0,3333.333333,6,2


In [18]:
df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"])

snapshot_date = df["TransactionStartTime"].max()

rfm = (
    df.groupby("CustomerId")
    .agg(
        recency=("TransactionStartTime", lambda x: (snapshot_date - x.max()).days),
        frequency=("TransactionId", "count"),
        monetary=("Amount", "sum"),
    )
    .reset_index()
)

rfm.head()


Unnamed: 0,CustomerId,recency,frequency,monetary
0,CustomerId_1,83,1,-10000.0
1,CustomerId_10,83,1,-10000.0
2,CustomerId_1001,89,5,20000.0
3,CustomerId_1002,25,11,4225.0
4,CustomerId_1003,11,6,20000.0


In [19]:
customer_features_df = customer_features_df.merge(
    rfm, on="CustomerId", how="left"
)

customer_features_df.head()


Unnamed: 0,CustomerId,total_transaction_value,avg_transaction_value,transaction_count,negative_transaction_count,recency,frequency,monetary
0,CustomerId_1,-10000.0,-10000.0,1,1,83,1,-10000.0
1,CustomerId_10,-10000.0,-10000.0,1,1,83,1,-10000.0
2,CustomerId_1001,20000.0,4000.0,5,2,89,5,20000.0
3,CustomerId_1002,4225.0,384.090909,11,6,25,11,4225.0
4,CustomerId_1003,20000.0,3333.333333,6,2,11,6,20000.0


In [20]:
customer_features_df["high_credit_risk"] = (
    (customer_features_df["total_transaction_value"] < 0)
    | (customer_features_df["negative_transaction_count"] > 0)
).astype(int)

customer_features_df["high_credit_risk"].value_counts(normalize=True)


high_credit_risk
1    0.735168
0    0.264832
Name: proportion, dtype: float64

In [10]:
with mlflow.start_run(run_name="sanity_check"):
    mlflow.log_param("test_param", 1)
    mlflow.log_metric("test_metric", 0.5)


In [23]:
from src.model_training import (
    split_features_target,
    train_test_data,
    train_logistic_regression,
    evaluate_model
)

X, y = split_features_target(
    customer_features_df,
    target_col="high_credit_risk"
)

X_train, X_test, y_train, y_test = train_test_data(X, y)

print(X_train.shape, X_test.shape)


(2993, 7) (749, 7)


In [25]:
# Train model (only training data)
log_model = train_logistic_regression(X_train, y_train)


In [26]:
auc, report = evaluate_model(
    log_model,
    X_test,
    y_test
)

print("ROC-AUC:", auc)
print(report)


ROC-AUC: 1.0
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 198.0}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 551.0}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 749.0}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 749.0}}


In [28]:
from pprint import pformat

report_text = pformat(report)


In [29]:
with mlflow.start_run(run_name="logistic_regression"):
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("solver", log_model.solver)
    mlflow.log_param("penalty", log_model.penalty)
    mlflow.log_param("C", log_model.C)

    mlflow.log_metric("roc_auc", auc)

    mlflow.log_text(report_text, "classification_report.txt")

    mlflow.sklearn.log_model(log_model, "model")




In [31]:
from src.model_training import train_random_forest

rf_model = train_random_forest(X_train, y_train)

rf_auc, rf_report = evaluate_model(
    rf_model,
    X_test,
    y_test
)

print("Random Forest ROC-AUC:", rf_auc)


Random Forest ROC-AUC: 0.9999999999999999


In [32]:
from pprint import pformat

rf_report_text = pformat(rf_report)

with mlflow.start_run(run_name="random_forest"):
    mlflow.log_param("model_type", "RandomForestClassifier")
    mlflow.log_param("n_estimators", rf_model.n_estimators)
    mlflow.log_param("max_depth", rf_model.max_depth)

    mlflow.log_metric("roc_auc", rf_auc)

    mlflow.log_text(rf_report_text, "classification_report.txt")

    mlflow.sklearn.log_model(rf_model, "model")


