In [9]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# MLflow for tracking
import mlflow
import mlflow.sklearn
import mlflow.xgboost

# Modeling
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay)

# Custom Pipeline from src
sys.path.append(os.path.abspath(os.path.join('..')))
from src.data_processing import (FeatureAggregator, TimeFeatureExtractor, 
                                  ColumnDropper, get_preprocessing_pipeline)

%matplotlib inline

In [15]:
# --- Load -------------------------------------------------------------------
# Read the labeled transactions produced by the earlier processing stage.
df = pd.read_csv('../data/processed/labeled_data.csv')

# --- Manual feature step ----------------------------------------------------
# Only the time-feature extractor is applied here. `aggregator` is created but
# never used in this cell.
# NOTE(review): the aggregate columns listed in `num_cols` below are not
# created by this cell, so they presumably already exist in the CSV or come
# from `time_ext` -- confirm, or apply `aggregator` explicitly.
aggregator = FeatureAggregator()
time_ext = TimeFeatureExtractor()
df = time_ext.transform(df)

# --- Column roles -----------------------------------------------------------
# Numeric and categorical inputs handed to the preprocessing pipeline.
num_cols = [
    'Amount', 'Value', 'Total_Amount', 'Average_Amount',
    'Transaction_Count', 'Std_Dev_Amount',
]
cat_cols = [
    'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'PricingStrategy',
]
# Columns removed from the feature frame; the target is included so it cannot
# end up among the features.
drop_cols = [
    'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
    'TransactionStartTime', 'CurrencyCode', 'CountryCode', 'is_high_risk',
]

# --- Target and feature frame -----------------------------------------------
y = df['is_high_risk']
# Drop only the columns that are actually present, so an absent one is not an error.
present_drop_cols = [col for col in drop_cols if col in df.columns]
X = df.drop(columns=present_drop_cols)

# --- Preprocessing ----------------------------------------------------------
# NOTE(review): the preprocessor is fitted on every row, i.e. before the
# train/test split done in the next cell. If it learns statistics from the
# data, test rows influence the fitted transform -- consider fitting on the
# training rows only.
preprocessor = get_preprocessing_pipeline(num_cols, cat_cols)
X_processed = preprocessor.fit_transform(X)

In [16]:
# Train/test split: 20% of the rows are held out for evaluation. The split is
# stratified on the target and uses a fixed seed so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, **split_options)

print(f"X_train shape: {X_train.shape}")

X_train shape: (76529, 52)


In [17]:
# Both runs are recorded under one MLflow experiment so they can be compared.
mlflow.set_experiment("Credit_Risk_Project")

# --- Run 1: Random Forest ---
# Untuned baseline. Its hyperparameters are logged next to the metrics so the
# run can be reproduced and compared with the tuned model.
with mlflow.start_run(run_name="Random_Forest_Baseline"):
    rf_params = {"n_estimators": 100, "max_depth": 8, "random_state": 42}
    rf = RandomForestClassifier(**rf_params)
    rf.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    y_prob = rf.predict_proba(X_test)[:, 1]
    y_pred = rf.predict(X_test)

    # Log parameters and metrics
    mlflow.log_params(rf_params)
    mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_prob))
    mlflow.log_metric("f1_score", f1_score(y_test, y_pred))

    # Log model
    mlflow.sklearn.log_model(rf, "rf_model")
    print("Random Forest logged successfully.")

# --- Run 2: XGBoost with Hyperparameter Tuning ---
with mlflow.start_run(run_name="XGBoost_Tuned"):
    param_grid = {
        'n_estimators': [50, 100],
        'learning_rate': [0.05, 0.1],
        'max_depth': [4, 6]
    }

    # Seed both the estimator and the search: the search samples 4 of the 8
    # grid combinations at random, so without a seed every execution could
    # evaluate a different subset and pick a different "best" model.
    xgb = XGBClassifier(eval_metric='logloss', random_state=42)
    search = RandomizedSearchCV(xgb, param_grid, n_iter=4, cv=3,
                                scoring='roc_auc', random_state=42)
    search.fit(X_train, y_train)

    best_xgb = search.best_estimator_
    y_prob_xgb = best_xgb.predict_proba(X_test)[:, 1]
    y_pred_xgb = best_xgb.predict(X_test)

    # Same metric set as the baseline run, plus the cross-validated score that
    # drove the model selection.
    mlflow.log_params(search.best_params_)
    mlflow.log_metric("cv_roc_auc", search.best_score_)
    mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_prob_xgb))
    mlflow.log_metric("f1_score", f1_score(y_test, y_pred_xgb))

    # Compatibility shim. The recorded execution of this cell aborted with
    # "TypeError: `_estimator_type` undefined" after the Random Forest run had
    # been logged, so the XGBoost model was never stored.
    # NOTE(review): presumably the installed scikit-learn no longer supplies
    # `_estimator_type` to the XGBoost wrapper, which needs it when saving the
    # model -- confirm the installed versions. The clean fix is to upgrade
    # xgboost to a release that supports the installed scikit-learn; on such a
    # setup this guard does nothing.
    if not hasattr(best_xgb, "_estimator_type"):
        best_xgb._estimator_type = "classifier"

    mlflow.xgboost.log_model(best_xgb, "xgb_model")
    print("Tuned XGBoost logged successfully.")

2025/12/18 23:54:41 INFO mlflow.tracking.fluent: Experiment with name 'Credit_Risk_Project' does not exist. Creating a new experiment.


Random Forest logged successfully.




TypeError: `_estimator_type` undefined.  Please use appropriate mixin to define estimator type.

In [None]:
# Confusion matrix of the tuned XGBoost model on the held-out test rows.
# Requires `best_xgb` from the tuning cell, so that cell must have finished
# successfully before this one is run.
xgb_test_pred = best_xgb.predict(X_test)
cm = confusion_matrix(y_test, xgb_test_pred)

# `plot` returns the display object, whose `ax_` is the axes it drew on.
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap='Blues')
disp.ax_.set_title("Confusion Matrix: Predicted vs Actual High Risk")
plt.show()