In [None]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from joblib import load
import xgboost as xgb

# Input locations, given relative to the directory the notebook is run from.
# DATA_PATH is the CSV read below; ARTIFACTS_DIR is the folder from which
# later cells load 'encoders.json' and the saved models.
DATA_PATH = '../Real_fraud_dataset.csv'
ARTIFACTS_DIR = '../models/artifacts'

# Fail fast with a readable message when the dataset file is absent.
# NOTE(review): `assert` statements are skipped when Python runs with -O.
assert os.path.exists(DATA_PATH), f"Missing dataset at {DATA_PATH}"

# Read the whole CSV into memory; `raw` is later handed to build_features,
# which works on a copy and leaves this frame unchanged.
raw = pd.read_csv(DATA_PATH)
# Bare last expression: the notebook displays the first rows as a preview.
raw.head()


In [None]:
import json

def build_features(df: pd.DataFrame, encoders: dict):
    """Turn raw transaction rows into a feature matrix and a label vector.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide 'is_international', 'is_high_risk_country',
        'previous_transactions', 'fraudulent', 'amount',
        'avg_transaction_amount' and 'timestamp'. 'device_type' and
        'category' are read once per level listed in ``encoders``.
    encoders : dict
        Needs the keys 'device_types' and 'categories' (levels to one-hot
        encode) and 'feature_order' (names and order of the output columns).

    Returns
    -------
    tuple
        ``(X, y)`` as numpy arrays. ``X`` holds the columns named in
        ``encoders['feature_order']`` (columns that were never built are
        filled with 0); ``y`` is the integer 'fraudulent' label.
    """
    work = df.copy()

    def _numeric(name, default):
        # Values that cannot be parsed as numbers, and missing values,
        # are replaced by `default`.
        return pd.to_numeric(work[name], errors='coerce').fillna(default)

    # Flag/count columns (and the label itself) are stored as integers.
    for int_col in ('is_international', 'is_high_risk_country',
                    'previous_transactions', 'fraudulent'):
        work[int_col] = _numeric(int_col, 0).astype(int)

    # Monetary columns stay floating point.
    for float_col in ('amount', 'avg_transaction_amount'):
        work[float_col] = _numeric(float_col, 0.0)

    # The small constant keeps the division defined when the average is 0.
    denominator = work['avg_transaction_amount'] + 1e-6
    work['amount_to_avg_ratio'] = work['amount'] / denominator

    # Unparseable timestamps become NaT and therefore hour 0.
    parsed = pd.to_datetime(work['timestamp'], errors='coerce', utc=True)
    work['hour_of_day'] = parsed.dt.hour.fillna(0).astype(int)

    # One indicator column per known level; a missing device type is
    # treated as 'web' and a missing category as 'unknown'.
    for level in encoders['device_types']:
        matches = work['device_type'].fillna('web') == level
        work[f'device_type__{level}'] = matches.astype(int)
    for level in encoders['categories']:
        matches = work['category'].fillna('unknown') == level
        work[f'category__{level}'] = matches.astype(int)

    features = work.reindex(columns=encoders['feature_order'], fill_value=0)
    labels = work['fraudulent'].fillna(0).astype(int)
    return features.values, labels.values

# Load the encoder metadata from the artifacts folder; build_features reads
# its 'device_types', 'categories' and 'feature_order' entries.
with open(os.path.join(ARTIFACTS_DIR, 'encoders.json')) as f:
    enc = json.load(f)

# Build the feature matrix X and label vector y for the whole dataset.
X, y = build_features(raw, enc)
# Sanity check: X has one row per label and one column per 'feature_order' entry.
print(X.shape, y.shape)



In [None]:
from joblib import load
import xgboost as xgb

# Decision thresholds, named so they are easy to find and tune.
XGB_THRESHOLD = 0.5   # minimum XGBoost fraud probability for a positive prediction
IF_PERCENTILE = 98    # rows at/above this percentile of anomaly score are flagged

# Restore both trained models from the artifacts directory.
xgb_model = xgb.XGBClassifier()
xgb_model.load_model(os.path.join(ARTIFACTS_DIR, 'xgboost_model.json'))
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
if_model = load(os.path.join(ARTIFACTS_DIR, 'isolation_forest.joblib'))

# Supervised branch: probability of the positive class, then a hard cut-off.
xgb_prob = xgb_model.predict_proba(X)[:, 1]
xgb_pred = (xgb_prob >= XGB_THRESHOLD).astype(int)

# Unsupervised branch: score_samples is negated so that larger values mean
# "more anomalous".
if_scores = -if_model.score_samples(X)
# The cut-off is a percentile of the scores of the data being evaluated, so
# roughly the top (100 - IF_PERCENTILE)% of these rows are always flagged.
if_thresh = float(np.percentile(if_scores, IF_PERCENTILE))
if_flag = (if_scores >= if_thresh).astype(int)

# Ensemble decision: a row is flagged when either branch flags it.
final = ((xgb_pred == 1) | (if_flag == 1)).astype(int)

# Threshold metrics describe the combined decision `final`.
acc = accuracy_score(y, final)
prec = precision_score(y, final, zero_division=0)
rec = recall_score(y, final, zero_division=0)
f1 = f1_score(y, final, zero_division=0)

# ROC AUC is computed from the XGBoost probabilities only, not from `final`.
# It is undefined when y holds a single class; in that case report NaN, but
# say why instead of hiding every possible error behind a blanket except.
try:
    auc = roc_auc_score(y, xgb_prob)
except ValueError as err:
    print(f'ROC AUC could not be computed: {err}')
    auc = float('nan')

print({
    'accuracy': acc,
    'precision': prec,
    'recall': rec,
    'f1': f1,
    'roc_auc': auc,
})


In [None]:
# ROC curve of the XGBoost probabilities against the true labels.
fpr, tpr, thr = roc_curve(y, xgb_prob)

fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(fpr, tpr, label=f'ROC AUC={auc:.3f}')
# Dashed reference diagonal where TPR equals FPR.
ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('XGBoost ROC Curve')
ax.legend()
plt.show()
