In [9]:
# -----------------------------
# TASK 2: Model Building & Training
# -----------------------------

# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, auc
import joblib

# -----------------------------
# 1️⃣ Load E-commerce Dataset
# -----------------------------
X_ecom = pd.read_csv("../data/processed/fraud_data_task1_ready.csv")
y_ecom = X_ecom.pop('class')



In [10]:
# -----------------------------
# 2️⃣ Stratified Train-Test Split
# -----------------------------
X_train_ecom, X_test_ecom, y_train_ecom, y_test_ecom = train_test_split(
    X_ecom, y_ecom, test_size=0.2, stratify=y_ecom, random_state=42
)



In [11]:
# -----------------------------
# 3️⃣ Handle Imbalance with SMOTE (Training Only)
# -----------------------------
smote = SMOTE(random_state=42)
X_train_res_ecom, y_train_res_ecom = smote.fit_resample(X_train_ecom, y_train_ecom)
print("E-commerce Resampled Classes:", y_train_res_ecom.value_counts())



E-commerce Resampled Classes: class
0    109568
1    109568
Name: count, dtype: int64


In [12]:
# -----------------------------
# 4️⃣ Baseline Model: Logistic Regression
# -----------------------------
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_res_ecom, y_train_res_ecom)

y_pred_lr = lr.predict(X_test_ecom)
y_proba_lr = lr.predict_proba(X_test_ecom)[:,1]

print("=== Logistic Regression (E-commerce) ===")
print(confusion_matrix(y_test_ecom, y_pred_lr))
print(classification_report(y_test_ecom, y_pred_lr))

precision_lr, recall_lr, _ = precision_recall_curve(y_test_ecom, y_proba_lr)
auc_pr_lr = auc(recall_lr, precision_lr)
print("AUC-PR:", auc_pr_lr)



=== Logistic Regression (E-commerce) ===
[[26765   628]
 [ 1179  1651]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     27393
           1       0.72      0.58      0.65      2830

    accuracy                           0.94     30223
   macro avg       0.84      0.78      0.81     30223
weighted avg       0.94      0.94      0.94     30223

AUC-PR: 0.6429703878170832


In [13]:
# -----------------------------
# 5️⃣ Ensemble Model: Random Forest
# -----------------------------
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    class_weight='balanced'
)
rf.fit(X_train_res_ecom, y_train_res_ecom)

y_pred_rf = rf.predict(X_test_ecom)
y_proba_rf = rf.predict_proba(X_test_ecom)[:,1]

print("=== Random Forest (E-commerce) ===")
print(confusion_matrix(y_test_ecom, y_pred_rf))
print(classification_report(y_test_ecom, y_pred_rf))

precision_rf, recall_rf, _ = precision_recall_curve(y_test_ecom, y_proba_rf)
auc_pr_rf = auc(recall_rf, precision_rf)
print("AUC-PR:", auc_pr_rf)



=== Random Forest (E-commerce) ===
[[27271   122]
 [ 1291  1539]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     27393
           1       0.93      0.54      0.69      2830

    accuracy                           0.95     30223
   macro avg       0.94      0.77      0.83     30223
weighted avg       0.95      0.95      0.95     30223

AUC-PR: 0.6687775208523684


In [14]:
# -----------------------------
# 6️⃣ Ensemble Model: XGBoost
# -----------------------------
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=len(y_train_res_ecom[y_train_res_ecom==0])/len(y_train_res_ecom[y_train_res_ecom==1]),
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb.fit(X_train_res_ecom, y_train_res_ecom)

y_pred_xgb = xgb.predict(X_test_ecom)
y_proba_xgb = xgb.predict_proba(X_test_ecom)[:,1]

print("=== XGBoost (E-commerce) ===")
print(confusion_matrix(y_test_ecom, y_pred_xgb))
print(classification_report(y_test_ecom, y_pred_xgb))

precision_xgb, recall_xgb, _ = precision_recall_curve(y_test_ecom, y_proba_xgb)
auc_pr_xgb = auc(recall_xgb, precision_xgb)
print("AUC-PR:", auc_pr_xgb)



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== XGBoost (E-commerce) ===
[[27284   109]
 [ 1296  1534]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     27393
           1       0.93      0.54      0.69      2830

    accuracy                           0.95     30223
   macro avg       0.94      0.77      0.83     30223
weighted avg       0.95      0.95      0.95     30223

AUC-PR: 0.6891956896153338


In [15]:
# -----------------------------
# 7️⃣ Cross-Validation (Random Forest example)
# -----------------------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf, X_train_res_ecom, y_train_res_ecom, cv=skf, scoring='f1')
print("Random Forest CV F1-score mean:", np.mean(cv_scores), "std:", np.std(cv_scores))




Random Forest CV F1-score mean: 0.9038206546775183 std: 0.0017055932405337627


In [17]:
# -----------------------------
# 8️⃣ Save Best Model (Example: Random Forest)
# -----------------------------
joblib.dump(rf, "../models/rf_fraud_ecom.pkl")

['../models/rf_fraud_ecom.pkl']


9️⃣ Credit Card Dataset Workflow (Minimal Preprocessing)

In [18]:


credit_df = pd.read_csv("../data/raw/creditcard.csv")
X_cc = credit_df.drop('Class', axis=1)
y_cc = credit_df['Class']

# Stratified split
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(
    X_cc, y_cc, test_size=0.2, stratify=y_cc, random_state=42
)

# SMOTE for training
X_train_res_cc, y_train_res_cc = smote.fit_resample(X_train_cc, y_train_cc)
print("Credit Card Resampled Classes:", y_train_res_cc.value_counts())



Credit Card Resampled Classes: Class
0    227451
1    227451
Name: count, dtype: int64


In [19]:
# Logistic Regression
lr_cc = LogisticRegression(max_iter=1000, random_state=42)
lr_cc.fit(X_train_res_cc, y_train_res_cc)

y_pred_lr_cc = lr_cc.predict(X_test_cc)
y_proba_lr_cc = lr_cc.predict_proba(X_test_cc)[:,1]

print("=== Logistic Regression (Credit Card) ===")
print(confusion_matrix(y_test_cc, y_pred_lr_cc))
print(classification_report(y_test_cc, y_pred_lr_cc))

precision_lr_cc, recall_lr_cc, _ = precision_recall_curve(y_test_cc, y_proba_lr_cc)
auc_pr_lr_cc = auc(recall_lr_cc, precision_lr_cc)
print("AUC-PR:", auc_pr_lr_cc)



=== Logistic Regression (Credit Card) ===
[[56247   617]
 [   10    88]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     56864
           1       0.12      0.90      0.22        98

    accuracy                           0.99     56962
   macro avg       0.56      0.94      0.61     56962
weighted avg       1.00      0.99      0.99     56962

AUC-PR: 0.7727261838301287


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# Random Forest
rf_cc = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    class_weight='balanced'
)
rf_cc.fit(X_train_res_cc, y_train_res_cc)

y_pred_rf_cc = rf_cc.predict(X_test_cc)
y_proba_rf_cc = rf_cc.predict_proba(X_test_cc)[:,1]

print("=== Random Forest (Credit Card) ===")
print(confusion_matrix(y_test_cc, y_pred_rf_cc))
print(classification_report(y_test_cc, y_pred_rf_cc))

precision_rf_cc, recall_rf_cc, _ = precision_recall_curve(y_test_cc, y_proba_rf_cc)
auc_pr_rf_cc = auc(recall_rf_cc, precision_rf_cc)
print("AUC-PR:", auc_pr_rf_cc)

# Save model
joblib.dump(rf_cc, "../models/rf_fraud_cc.pkl")

=== Random Forest (Credit Card) ===
[[56788    76]
 [   11    87]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.53      0.89      0.67        98

    accuracy                           1.00     56962
   macro avg       0.77      0.94      0.83     56962
weighted avg       1.00      1.00      1.00     56962

AUC-PR: 0.8133514988277757


['../models/rf_fraud_cc.pkl']