In [1]:
import duckdb
import pandas as pd
import pickle
import os
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Load train & test data

In [2]:
def load_table(name, db_dir="../database/ML"):
    """Read one table from its DuckDB file into a pandas DataFrame.

    Each split is stored in a file named after the table it contains
    (``<db_dir>/<name>.duckdb`` holding a table called ``<name>``).

    Parameters
    ----------
    name : str
        Table name, which is also the stem of the ``.duckdb`` file.
    db_dir : str
        Directory that contains the ``.duckdb`` files.

    Returns
    -------
    pandas.DataFrame
        Every row and column of the table.
    """
    # The context manager closes the connection even when the query raises,
    # which the previous manual connect()/close() pairs did not guarantee.
    with duckdb.connect(f"{db_dir}/{name}.duckdb") as con:
        return con.execute(f"SELECT * FROM {name}").df()


# Load TRAIN
X_train = load_table("X_train")
y_train = load_table("y_train")

# Load TEST
X_test = load_table("X_test")
y_test = load_table("y_test")

# Fail fast if features and labels are misaligned, instead of failing later
# (or silently mis-scoring) inside resampling, training or evaluation.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"
assert list(X_train.columns) == list(X_test.columns), "train / test feature columns differ"

print(X_train.shape)
print(X_test.shape)

print(X_train.head())
print(len(y_train))

(203816, 36)
(87351, 36)
   ohe__product_Checking or savings account  ohe__product_Credit card  \
0                                       0.0                       0.0   
1                                       1.0                       0.0   
2                                       1.0                       0.0   
3                                       1.0                       0.0   
4                                       0.0                       1.0   

   ohe__product_Credit card or prepaid card  \
0                                       0.0   
1                                       0.0   
2                                       0.0   
3                                       0.0   
4                                       0.0   

   ohe__product_Credit reporting or other personal consumer reports  \
0                                                1.0                  
1                                                0.0                  
2                                       

In [3]:
from collections import Counter

from catboost import CatBoostClassifier
from imblearn.over_sampling import ADASYN, SMOTE
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# NOTE: SMOTE, XGBClassifier, LGBMClassifier and Counter are not used in this
# cell; the imports are kept only in case other cells rely on them.
# RandomForestClassifier is already imported in the first cell, so the
# duplicate import and the commented-out RandomForest experiment were dropped.

# 1. Oversample the training data with ADASYN (not SMOTE) so the classes are
#    more evenly represented before fitting.
# NOTE(review): the feature columns look one-hot encoded (`ohe__...`), and
# ADASYN creates synthetic rows by interpolation, so resampled rows may carry
# non-binary values in those columns -- confirm this is acceptable.
adasyn = ADASYN(random_state=42, n_neighbors=6)
X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)

# 2. Train CatBoost on the resampled data. No class weights are passed here:
#    class imbalance is handled by the oversampling step above only.
model = CatBoostClassifier(
    iterations=1200,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    verbose=100,       # log the training loss every 100 iterations
    random_seed=42,
)

# Trailing semicolon keeps the estimator repr out of the cell output.
model.fit(X_train_balanced, y_train_balanced);

0:	learn: 1.0820391	total: 120ms	remaining: 2m 23s
100:	learn: 0.8620429	total: 5.38s	remaining: 58.6s
200:	learn: 0.8297754	total: 10.3s	remaining: 51.1s
300:	learn: 0.8031652	total: 15.3s	remaining: 45.7s
400:	learn: 0.7827077	total: 20.8s	remaining: 41.5s
500:	learn: 0.7660574	total: 25.9s	remaining: 36.2s
600:	learn: 0.7529911	total: 31.4s	remaining: 31.3s
700:	learn: 0.7421525	total: 36.1s	remaining: 25.7s
800:	learn: 0.7335460	total: 41s	remaining: 20.4s
900:	learn: 0.7253489	total: 45.8s	remaining: 15.2s
1000:	learn: 0.7186604	total: 50.8s	remaining: 10.1s
1100:	learn: 0.7127442	total: 55.8s	remaining: 5.02s
1199:	learn: 0.7079296	total: 1m	remaining: 0us


<catboost.core.CatBoostClassifier at 0x784058d97fb0>

# Validate the model

In [4]:
# Predict labels for the held-out split with the fitted model.
y_pred = model.predict(X_test)

# Precision, recall and F1 are averaged across classes, weighted by support.
weighted = {"average": "weighted"}
scores = [
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred, **weighted)),
    ("Recall:", recall_score(y_test, y_pred, **weighted)),
    ("F1 Score:", f1_score(y_test, y_pred, **weighted)),
]
for metric_label, metric_value in scores:
    print(metric_label, metric_value)

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.604434980710009
Precision: 0.718136590086368
Recall: 0.604434980710009
F1 Score: 0.6347782128345943

Confusion Matrix:
 [[39133 13051 12696]
 [ 3048  6465   344]
 [ 3791  1623  7200]]


In [6]:
import os
import joblib

# Destination of the serialized model, relative to the notebook's directory.
save_path = '../src/models/catboost.pkl'

# Create the target folder first so the dump below cannot fail on a missing directory.
model_dir, _ = os.path.split(save_path)
os.makedirs(model_dir, exist_ok=True)

# compress=3 trades a little speed for a smaller file (valid range is 0-9).
joblib.dump(model, save_path, compress=3)

print(f"Compressed model saved to {save_path}")

Compressed model saved to ../src/models/catboost.pkl
