In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Read the raw transactions and immediately keep a fixed-seed random subset
# of 300,000 rows, so repeated runs operate on the same sample.
df = pd.read_csv("onlinefraud.csv").sample(n=300000, random_state=42)

# Feature Engineering
# All derived columns are built from the raw columns before `step` and the
# account-name columns are dropped further down.
df["hour_of_day"] = df["step"] % 24

# The +1 in each denominator avoids division by zero when a balance is 0.
df["amount_ratio_orig"] = df["amount"] / (df["oldbalanceOrg"] + 1)
df["amount_ratio_dest"] = df["amount"] / (df["oldbalanceDest"] + 1)

# Balance change (old - new) on the originator and destination side.
df["balance_diff_orig"] = df["oldbalanceOrg"] - df["newbalanceOrig"]
# Fix: this used to subtract oldbalanceDest from itself, which made the
# column identically zero and therefore useless as a feature.
df["balance_diff_dest"] = df["oldbalanceDest"] - df["newbalanceDest"]

# 1 when the recorded balances are exactly consistent with the amount moved.
# NOTE(review): these are exact float comparisons; any rounding noise in the
# balances yields 0 -- confirm that strict equality is intended.
df["error_check_1"] = (df["oldbalanceOrg"] - df["amount"] == df["newbalanceOrig"]).astype(int)
df["error_check_2"] = (df["oldbalanceDest"] + df["amount"] == df["newbalanceDest"]).astype(int)

# One-Hot Encode 'type' (first category dropped to avoid a redundant column)
df = pd.get_dummies(df, columns=["type"], drop_first=True)

# Drop unnecessary columns: the account names, and `step`, which is now
# represented by `hour_of_day`.
df.drop(["nameOrig", "nameDest", "step"], axis=1, inplace=True)

# Split data
X = df.drop("isFraud", axis=1)
y = df["isFraud"]

# Split BEFORE scaling so the scaler's mean/variance are learned from the
# training rows only; fitting it on the full matrix leaks test-set statistics
# into training. `stratify=y` keeps the fraud ratio the same in both parts,
# which matters because the positive class is very rare.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later code that still expects the fully scaled feature matrix;
# it is now scaled with the training-set statistics.
X_scaled = scaler.transform(X)

# NOTE(review): the fitted scaler is not persisted alongside the models below;
# anything that loads the pickled models needs the same scaling -- confirm how
# inference code obtains it.

# Random Forest: 100 trees, fixed seed, trained on all available cores.
# `fit` returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Persist the fitted forest to disk.
joblib.dump(rf_model, "rf_model_nn.pkl")

# Train XGBoost
# `use_label_encoder` was dropped from the call: current XGBoost no longer
# accepts it and only emits
#   Parameters: { "use_label_encoder" } are not used.
# `eval_metric="logloss"` is kept as before.
xgb_model = XGBClassifier(eval_metric="logloss")
xgb_model.fit(X_train, y_train)

# Persist the fitted booster wrapper to disk.
joblib.dump(xgb_model, "xgb_model_nn.pkl")

# Soft-voting ensemble combining the Random Forest and XGBoost estimators.
# Soft voting uses the members' predicted class probabilities rather than
# their hard labels.
ensemble = VotingClassifier(
    estimators=[("rf", rf_model), ("xgb", xgb_model)],
    voting="soft",
    n_jobs=-1,
).fit(X_train, y_train)

# Persist the fitted ensemble to disk.
joblib.dump(ensemble, "ensemble_model_nn.pkl")

# Accuracy Check
# The ensemble's hard predictions are computed once and reused for both the
# accuracy figure and the classification report (previously predicted twice).
ensemble_pred = ensemble.predict(X_test)

rf_acc = accuracy_score(y_test, rf_model.predict(X_test))
xgb_acc = accuracy_score(y_test, xgb_model.predict(X_test))
ensemble_acc = accuracy_score(y_test, ensemble_pred)
# AUC-ROC is computed from the predicted probability of the positive class.
ensemble_roc_auc = roc_auc_score(y_test, ensemble.predict_proba(X_test)[:, 1])

print(f"Random Forest Accuracy: {rf_acc:.4f}")
print(f"XGBoost Accuracy: {xgb_acc:.4f}")
print(f"Ensemble Model Accuracy: {ensemble_acc:.4f}")
print(f"Ensemble Model AUC-ROC: {ensemble_roc_auc:.4f}")

# Classification Report for detailed fraud detection performance
# With so few fraud rows, accuracy is dominated by the majority class; the
# per-class precision/recall below is the more informative view.
print("\nClassification Report for Ensemble Model:")
print(classification_report(y_test, ensemble_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Random Forest Accuracy: 0.9967
XGBoost Accuracy: 0.9967
Ensemble Model Accuracy: 0.9967
Ensemble Model AUC-ROC: 0.9978

Classification Report for Ensemble Model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       597
           1       1.00      0.33      0.50         3

    accuracy                           1.00       600
   macro avg       1.00      0.67      0.75       600
weighted avg       1.00      1.00      1.00       600

