# In [None]:
# ===============================
# Modeling
# ===============================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, average_precision_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

# Load the preprocessed fraud dataset.
fraud = pd.read_csv("../data/processed/fraud_processed.csv")

# Columns kept out of the feature matrix: the label itself plus the
# remaining listed columns (presumably raw timestamps / identifiers --
# confirm against the preprocessing step).
non_feature_cols = [
    'class',
    'signup_time',
    'purchase_time',
    'ip_address',
    'device_id',
    'user_id',
]

# Target vector and feature matrix.
y = fraud['class']
X = fraud.drop(columns=non_feature_cols)

# One-hot encode the features, dropping the first level of each encoded
# column (drop_first=True).
X = pd.get_dummies(X, drop_first=True)

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Standardise features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics are used.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample with SMOTE. Only the (scaled) training split is resampled; the
# test split is left as-is.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

def _report_metrics(title, y_true, preds, probs):
    """Print the evaluation summary for one fitted classifier.

    Args:
        title: Heading printed before the metrics (printed verbatim, so a
            leading newline can be used to separate reports).
        y_true: Ground-truth labels of the evaluation split.
        preds: Hard class predictions for the same rows.
        probs: Predicted scores for the positive class, used for the
            precision-recall based metric.
    """
    print(title)
    print("F1:", f1_score(y_true, preds))
    print("AUC-PR:", average_precision_score(y_true, probs))
    print("Confusion Matrix:\n", confusion_matrix(y_true, preds))


# NOTE(review): both models are trained on the SMOTE-resampled training data,
# so class_weight='balanced' is likely redundant here -- confirm before
# removing it.

# Logistic Regression
lr = LogisticRegression(class_weight='balanced', max_iter=1000)
lr.fit(X_train_res, y_train_res)

# Evaluate on the untouched (scaled, not resampled) test split.
lr_preds = lr.predict(X_test_scaled)
lr_probs = lr.predict_proba(X_test_scaled)[:, 1]  # column 1 = second class

_report_metrics("Logistic Regression", y_test, lr_preds, lr_probs)

# Random Forest
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    class_weight='balanced',
)

rf.fit(X_train_res, y_train_res)
rf_preds = rf.predict(X_test_scaled)
rf_probs = rf.predict_proba(X_test_scaled)[:, 1]  # column 1 = second class

# The leading newline keeps a blank line between the two reports.
_report_metrics("\nRandom Forest", y_test, rf_preds, rf_probs)
