In [1]:
# import sys
# sys.path.append("../src/")

# import pandas as pd
# import joblib
# from modeling import split_X_y, split_train_test, scale_features, train_simple_model, evaluate_model
# from imblearn.ensemble import BalancedRandomForestClassifier
# from sklearn.metrics import classification_report

# # --- Load processed CSV (already one-hot encoded) ---
# FEATURES_PATH = "../data/processed/bank_transactions_features.csv"
# df = pd.read_csv(FEATURES_PATH)

# # --- Split X / y ---
# X = df.drop(columns=["Is_Fraud"])
# y = df["Is_Fraud"]

# # --- Save column order BEFORE training (for dashboard consistency) ---
# columns_used = X.columns.tolist()
# joblib.dump(columns_used, "../model/columns_used.pkl")

# # --- Train/Test split ---
# X_train, X_test, y_train, y_test = split_train_test(X, y, stratify=y)

# # --- Optional: Scale numeric features ---
# X_train_s, X_test_s = scale_features(X_train.fillna(0), X_test.fillna(0))

# # --- Train model for imbalanced data ---
# clf = BalancedRandomForestClassifier(
#     n_estimators=300,
#     max_depth=10,
#     random_state=42,
#     replacement=True  # allows sampling with replacement
# )
# clf.fit(X_train_s, y_train)

# # --- Evaluate ---
# y_pred = clf.predict(X_test_s)
# print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))

# # ROC AUC for imbalanced data
# if hasattr(clf, "predict_proba"):
#     y_prob = clf.predict_proba(X_test_s)[:, 1]
#     from sklearn.metrics import roc_auc_score
#     auc = roc_auc_score(y_test, y_prob)
#     print(f"ROC AUC: {auc:.4f}")

# # --- Save trained model ---
# MODEL_PATH = "../model/rf_model_imbalanced.pkl"
# joblib.dump(clf, MODEL_PATH)
# print(f"Balanced Random Forest model saved at: {MODEL_PATH}")


In [2]:
# import sys
# import os
# import pandas as pd
# import joblib

# # Add src to path
# sys.path.append("../src/")

# # Import updated helpers
# from modeling import split_X_y, split_train_test, scale_features, train_simple_model, evaluate_model

# # =====================================================
# # 1. SETUP & LOAD
# # =====================================================
# os.makedirs("../model", exist_ok=True)
# FEATURES_PATH = "../data/processed/bank_transactions_features.csv"

# print(f"Loading data from {FEATURES_PATH}...")
# df = pd.read_csv(FEATURES_PATH)

# # =====================================================
# # 2. SPLITTING
# # =====================================================
# X, y = split_X_y(df, target="Is_Fraud")

# # --- SAVE METADATA 1: Column Names ---
# # The Streamlit app needs this to know the order of features
# cols_path = "../model/columns_used.pkl"
# joblib.dump(X.columns.tolist(), cols_path)
# print(f"Saved feature column list to {cols_path}")

# # Split Train/Test
# X_train, X_test, y_train, y_test = split_train_test(X, y, stratify=y)

# # =====================================================
# # 3. PREPROCESSING (Scaling)
# # =====================================================
# # 1. Handle Missing Values first
# # X_train = X_train.fillna(0)
# # X_test = X_test.fillna(0)

# # 2. Scale Features & CAPTURE THE SCALER
# # Even if Trees don't strictly need it, using a scaler allows you 
# # to easily swap to Logistic Regression later without changing the pipeline.
# print("Scaling features...")
# scaler, X_train_scaled, X_test_scaled = scale_features(X_train, X_test, method="standard")

# # --- SAVE METADATA 2: The Scaler ---
# # CRITICAL: We save the math (mean/std) so the Streamlit app 
# # can transform the user's single input exactly like the training data.
# if scaler:
#     scaler_path = "../model/scaler.pkl"
#     joblib.dump(scaler, scaler_path)
#     print(f"Saved scaler to {scaler_path}")

# # =====================================================
# # 4. TRAINING
# # =====================================================
# print("Training BalancedRandomForestClassifier via Helper...")

# clf = train_simple_model(
#     X_train_scaled,  # Use the SCALED data
#     y_train,
#     problem="classification",
#     model_name="balanced_random_forest",
#     resample=False, # Helper handles this internal logic
#     # Specific Params
#     n_estimators=300,
#     max_depth=10,
#     min_samples_leaf=5,
#     n_jobs=-1
# )

# # =====================================================
# # 5. EVALUATION
# # =====================================================
# print("\n--- Evaluating Model ---")
# metrics = evaluate_model(clf, X_test_scaled, y_test, problem="classification")
# print(f"Metrics: {metrics}")

# # =====================================================
# # 6. SAVE MODEL
# # =====================================================
# MODEL_PATH = "../model/rf_model_imbalanced.pkl"
# joblib.dump(clf, MODEL_PATH)
# print(f"\nModel saved successfully at: {MODEL_PATH}")

In [3]:
import sys
import os
import pandas as pd
import joblib
import numpy as np

# Add src to path
sys.path.append("../src/")

# Import helpers
from modeling import split_X_y, split_train_test, scale_features, train_simple_model, evaluate_model

# =====================================================
# 1. SETUP & LOAD
# =====================================================
os.makedirs("../model", exist_ok=True)
FEATURES_PATH = "../data/processed/bank_transactions_features.csv"

print(f"Loading data from {FEATURES_PATH}...")
df = pd.read_csv(FEATURES_PATH)

# =====================================================
# 2. SPLITTING
# =====================================================
X, y = split_X_y(df, target="Is_Fraud")

# --- SAVE METADATA 1: Column Names ---
cols_path = "../model/columns_used.pkl"
joblib.dump(X.columns.tolist(), cols_path)
print(f"Saved feature column list to {cols_path}")

# Split Train/Test
X_train, X_test, y_train, y_test = split_train_test(X, y, stratify=y)

# =====================================================
# 3. PREPROCESSING
# =====================================================
# 1. Handle Missing Values (Crucial for Logistic Regression)
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# 2. Scale Features
print("Scaling features...")
scaler, X_train_scaled, X_test_scaled = scale_features(X_train, X_test, method="standard")

# --- SAVE METADATA 2: The Scaler ---
if scaler:
    scaler_path = "../model/scaler.pkl"
    joblib.dump(scaler, scaler_path)
    print(f"Saved scaler to {scaler_path}")

# =====================================================
# 4. ITERATIVE TRAINING & SELECTION
# =====================================================
print("\n--- Starting Model Competition ---")

# Define the candidates
# Note: balanced_random_forest handles sampling internally, so resample=False
# Others typically benefit from SMOTE (resample=True) in fraud cases
model_candidates = [
    {
        "name": "logistic", 
        "resample": True, 
        "params": {"C": 1.0, "solver": "liblinear"}
    },
    {
        "name": "random_forest", 
        "resample": True, 
        "params": {"n_estimators": 100, "max_depth": 10, "n_jobs": -1}
    },
    {
        "name": "gbm", 
        "resample": True, 
        "params": {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 5}
    },
    {
        "name": "balanced_random_forest", 
        "resample": False, 
        "params": {"n_estimators": 200, "max_depth": 10, "n_jobs": -1}
    }
]

results = []
trained_models = {}

for config in model_candidates:
    name = config["name"]
    print(f"\nTraining {name}...")
    
    try:
        # Train
        model = train_simple_model(
            X_train_scaled, 
            y_train,
            problem="classification",
            model_name=name,
            resample=config["resample"],
            **config["params"]
        )
        
        # Evaluate
        metrics = evaluate_model(model, X_test_scaled, y_test, problem="classification")
        
        # Store Result
        metrics["model_name"] = name
        results.append(metrics)
        trained_models[name] = model
        
    except Exception as e:
        print(f"Failed to train {name}: {e}")

# =====================================================
# 5. SELECT BEST MODEL
# =====================================================
print("\n--- Model Leaderboard ---")
results_df = pd.DataFrame(results)

# Sort by ROC_AUC (Best metric for fraud) or F1
# We use ROC_AUC because it measures ranking ability regardless of threshold
results_df = results_df.sort_values(by="roc_auc", ascending=False)

print(results_df[["model_name", "roc_auc", "f1", "precision", "recall"]])

# Get the winner
best_row = results_df.iloc[0]
best_model_name = best_row["model_name"]
best_model = trained_models[best_model_name]

print(f"\nüèÜ WINNER: {best_model_name}")
print(f"   ROC AUC: {best_row['roc_auc']:.4f}")
print(f"   F1 Score: {best_row['f1']:.4f}")

# =====================================================
# 6. SAVE BEST MODEL
# =====================================================
# We save it as 'rf_model_imbalanced.pkl' regardless of the type 
# so the dashboard code doesn't break, OR you can rename it 'best_model.pkl'
# and update your dashboard path. Here I keep the name consistent.
MODEL_PATH = "../model/rf_model_imbalanced.pkl" 

joblib.dump(best_model, MODEL_PATH)
print(f"\n‚úÖ Best model saved successfully at: {MODEL_PATH}")

Loading data from ../data/processed/bank_transactions_features.csv...


2025-12-15 17:22:19,457 - INFO - Split data: X.shape=(200000, 52), y.shape=(200000,)


Saved feature column list to ../model/columns_used.pkl


2025-12-15 17:22:19,690 - INFO - Train-test split: X_train=(160000, 52), X_test=(40000, 52)
2025-12-15 17:22:19,848 - INFO - Scaled 6 columns using standard scaler.
2025-12-15 17:22:19,920 - INFO - Applying SMOTE to training data...


Scaling features...
Saved scaler to ../model/scaler.pkl

--- Starting Model Competition ---

Training logistic...


2025-12-15 17:22:25,406 - INFO - SMOTE applied: New X_train=(303860, 52)
2025-12-15 17:22:31,235 - INFO - Trained logistic model for classification
2025-12-15 17:22:32,286 - INFO - Evaluation results: {'accuracy': 0.94955, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'roc_auc': np.float64(0.4956374933011667)}
2025-12-15 17:22:32,352 - INFO - Applying SMOTE to training data...



Training random_forest...


2025-12-15 17:22:32,974 - INFO - SMOTE applied: New X_train=(303860, 52)
2025-12-15 17:22:41,901 - INFO - Trained random_forest model for classification
2025-12-15 17:22:42,562 - INFO - Top 5 feature importances:
Device_Type_Mobile       0.069270
Account_Type_Business    0.068810
Device_Type_POS          0.068158
Device_Type_ATM          0.066329
Account_Type_Savings     0.065377
dtype: float64
2025-12-15 17:22:44,115 - INFO - Evaluation results: {'accuracy': 0.94955, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'roc_auc': np.float64(0.4972294267604409)}
2025-12-15 17:22:44,139 - INFO - Applying SMOTE to training data...



Training gbm...


2025-12-15 17:22:46,133 - INFO - SMOTE applied: New X_train=(303860, 52)
2025-12-15 17:25:18,276 - INFO - Trained gbm model for classification
2025-12-15 17:25:18,665 - INFO - Top 5 feature importances:
Device_Type_ATM            0.094176
Account_Type_Business      0.091490
Device_Type_Desktop        0.086781
Transaction_Type_Credit    0.085960
Account_Type_Savings       0.077619
dtype: float64
2025-12-15 17:25:19,528 - INFO - Evaluation results: {'accuracy': 0.949375, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'roc_auc': np.float64(0.49894665429908136)}



Training balanced_random_forest...


2025-12-15 17:25:24,544 - INFO - Trained balanced_random_forest model for classification
2025-12-15 17:25:24,696 - INFO - Top 5 feature importances:
Transaction_Amount    0.112253
Account_Balance       0.100593
Customer_Age          0.062968
Transaction_Hour      0.062464
Age                   0.060242
dtype: float64
2025-12-15 17:25:25,090 - INFO - Evaluation results: {'accuracy': 0.520775, 'precision': 0.050620971545354505, 'recall': 0.4786917740336967, 'f1': 0.09155964172314109, 'roc_auc': np.float64(0.5005184110213596)}



--- Model Leaderboard ---
               model_name   roc_auc       f1  precision    recall
3  balanced_random_forest  0.500518  0.09156   0.050621  0.478692
2                     gbm  0.498947  0.00000   0.000000  0.000000
1           random_forest  0.497229  0.00000   0.000000  0.000000
0                logistic  0.495637  0.00000   0.000000  0.000000

üèÜ WINNER: balanced_random_forest
   ROC AUC: 0.5005
   F1 Score: 0.0916

‚úÖ Best model saved successfully at: ../model/rf_model_imbalanced.pkl
