In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.proxy_target_engineering import ProxyTargetEngineer
from src.data_processing import ModelReadyDataProcessor
from src.model_training import split_data, train_and_track


In [5]:
import pandas as pd


# Load the raw transaction data from the repository's data folder.
df = pd.read_csv("../data/raw/data.csv")

# Configure the proxy-target engineer: which columns identify the customer,
# the transaction timestamp and the amount, how many clusters to form, and a
# fixed seed so the run is repeatable.
proxy_engineer = ProxyTargetEngineer(
    n_clusters=3,
    random_state=42,
    customer_col="CustomerId",
    date_col="TransactionStartTime",
    amount_col="Amount",
)

# Fit on the raw transactions; afterwards `rfm_df_` carries a "cluster" column
# next to the Recency / Frequency / Monetary values used below.
proxy_engineer.fit(df)

# Sanity check: average RFM profile of each cluster, displayed as the cell output.
rfm_columns = ["Recency", "Frequency", "Monetary"]
proxy_engineer.rfm_df_.groupby("cluster")[rfm_columns].mean()

Unnamed: 0_level_0,Recency,Frequency,Monetary
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,61.877279,7.720196,81720.68
1,12.726566,34.8,272574.1
2,29.0,4091.0,-104900000.0


In [6]:
# Cluster label treated as "high risk".
# NOTE(review): this value was chosen by reading the per-cluster RFM means
# displayed after fitting; confirm it still points at the intended cluster
# whenever the data or the clustering settings change.
HIGH_RISK_CLUSTER = 0

# Binary proxy target on the customer-level RFM table:
# 1 for customers in the high-risk cluster, 0 for everyone else.
proxy_engineer.rfm_df_["is_high_risk"] = (
    proxy_engineer.rfm_df_["cluster"] == HIGH_RISK_CLUSTER
).astype(int)

# Customer-level lookup of the proxy label (expected to hold one row per
# CustomerId; the merge below enforces that).
customer_target = proxy_engineer.rfm_df_[["CustomerId", "is_high_risk"]]

# Attach the label to every transaction of the customer.
# - Dropping any existing "is_high_risk" column first keeps this cell
#   idempotent: without it, a second run would merge the label again, pandas
#   would suffix the clashing columns (_x / _y) and the plain "is_high_risk"
#   column used below and in later cells would no longer exist.
# - validate="many_to_one" makes pandas raise if a CustomerId occurs more than
#   once in `customer_target`, which would otherwise silently duplicate
#   transaction rows.
df = df.drop(columns=["is_high_risk"], errors="ignore").merge(
    customer_target,
    on="CustomerId",
    how="left",
    validate="many_to_one",
)

# Sanity check: first transactions with their customer-level label.
df[["CustomerId", "is_high_risk"]].head()


Unnamed: 0,CustomerId,is_high_risk
0,CustomerId_4406,0
1,CustomerId_4406,0
2,CustomerId_4683,1
3,CustomerId_988,0
4,CustomerId_988,0


In [7]:
# Separate the proxy target from the features without touching `df`:
# work on a copy, pull the label column out of it, and keep the rest as X_raw.
X_raw = df.copy()
y = X_raw.pop("is_high_risk").astype(int)  # proxy target as integers


In [None]:
# Build the feature processor with its default settings.
processor = ModelReadyDataProcessor()

# Fit on the raw features together with the proxy target, then transform the
# same rows into the model-ready frame.
# NOTE(review): the processor is fitted with `y` on all rows before any
# train/test split happens; if any of its steps learn from the target or from
# column statistics, held-out rows influence training — confirm this is intended.
processor.fit(X_raw, y)
X_model_ready = processor.transform(X_raw)

# Persist the processed frame (the processor chooses the destination).
processor.save_processed(X_model_ready)

# Preview the first rows of the processed features.
X_model_ready.head()

✅ Saved to D:\AI mastery\credit-risk-model\data\processed\processed.csv


Unnamed: 0,Value,total_transaction_amount,average_transaction_amount,transaction_count,std_transaction_amount,transaction_hour,transaction_day,transaction_month,transaction_year,CurrencyCode_UGX,...,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_4,ProductCategory_woe,ChannelId_woe
0,-0.072291,0.170118,-0.067623,-0.311831,-0.167016,2,15,11,2018,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.15115,-0.021516
1,-0.080251,0.170118,-0.067623,-0.311831,-0.167016,2,15,11,2018,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.113799,0.054063
2,-0.076352,0.165122,-0.072568,-0.444993,-0.201209,2,15,11,2018,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.15115,-0.021516
3,0.096648,0.175567,-0.008155,-0.40402,-0.008243,3,15,11,2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.253605,-0.021516
4,-0.075183,0.175567,-0.008155,-0.40402,-0.008243,3,15,11,2018,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.113799,0.054063


In [14]:


# Candidate estimators, passed by name to train_and_track.
MODEL_NAMES = ["logistic", "random_forest", "gradient_boosting"]

# Metric used to pick the winning model; it is one of the keys of the metrics
# printed for each model during training.
SELECTION_METRIC = "roc_auc"

# 1) Split the processed features and the proxy target (test_size=0.2, fixed seed).
# NOTE(review): the label is assigned per customer while the rows are
# transactions; if split_data splits row-wise, the same customer can land in
# both train and test, which may explain the near-perfect scores — confirm.
X_train, X_test, y_train, y_test = split_data(
    X_model_ready, y, test_size=0.2, random_state=42
)

# 2) Train the candidates with grid search and track each run in MLflow.
results = train_and_track(
    X_train, X_test, y_train, y_test,
    models=MODEL_NAMES,
    search_type="grid",
)

# 3) Pick the best model from the recorded scores instead of hard-coding a
# name: the previous hard-coded choice ("random_forest") was not the top
# scorer in the recorded run.
# NOTE(review): assumes every results entry stores its scores under a
# "metrics" key next to "model" — confirm against train_and_track.
best_model_name = max(
    MODEL_NAMES,
    key=lambda name: results[name]["metrics"][SELECTION_METRIC],
)
best_model = results[best_model_name]["model"]


Training logistic...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2025/12/16 22:51:07 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/16 22:51:07 INFO mlflow.store.db.utils: Updating database tables
2025/12/16 22:51:08 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/16 22:51:08 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/16 22:51:08 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/16 22:51:08 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/16 22:51:

✅ Logged logistic to MLflow
logistic metrics: {'accuracy': 0.8966706737051168, 'precision': 0.7102803738317757, 'recall': 0.17257039055404177, 'f1_score': 0.2776762879064669, 'roc_auc': 0.9116630137034802}
Training random_forest...




✅ Logged random_forest to MLflow
random_forest metrics: {'accuracy': 0.9948256938274186, 'precision': 0.9755766621438263, 'recall': 0.9795640326975477, 'f1_score': 0.9775662814411965, 'roc_auc': 0.999655075408651}
Training gradient_boosting...




✅ Logged gradient_boosting to MLflow
gradient_boosting metrics: {'accuracy': 0.9994250770919354, 'precision': 0.9959257582616569, 'recall': 0.9990917347865577, 'f1_score': 0.9975062344139651, 'roc_auc': 0.9999382813107278}
