In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/72.0 MB 4.7 MB/s eta 0:00:16
   - -------------------------------------- 2.1/72.0 MB 5.7 MB/s eta 0:00:13
   - -------------------------------------- 3.4/72.0 MB 5.5 MB/s eta 0:00:13
   -- ------------------------------------- 4.2/72.0 MB 5.1 MB/s eta 0:00:14
   -- ------------------------------------- 4.7/72.0 MB 4.6 MB/s eta 0:00:15
   --- ------------------------------------ 5.5/72.0 MB 4.3 MB/s eta 0:00:16
   --- ------------------------------------ 6.0/72.0 MB 4.2 MB/s eta 0:00:16
   --- ------------------------------------ 6.8/72.0 MB 4.1 MB/s eta 0:00:17
   ---- -------------------

In [12]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, average_precision_score,
    precision_recall_curve, accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# -------------------------------
# 1. Load data & define feature set
# -------------------------------
# Columns fed to the models; the list order fixes the column order of X.
feature_cols = [
    'amount', 'LogAmount', 'Hour', 'IsNight', 'DailyTxnCount',
    'OrigTxnCount', 'DestTxnCount', 'TxnCountWindow',
    'RuleHighValue', 'RuleRapidFire', 'Type_encoded',
]

df = pd.read_csv("C:\\Users\\naman\\PaySafe UPI Fraud Detection\\Data\\cleaned_Data.csv")

# Quick sanity check of what was loaded: first rows, then the column index.
print(df.head())
print(df.columns)

# Independent copy of the feature columns, so later edits to X leave df untouched.
X = df.loc[:, feature_cols].copy()

# -------------------------------
# 2. Check for target column
# -------------------------------
# Whether a label column exists decides which mode the rest of the cell runs in:
# with labels -> supervised + hybrid evaluation, without -> anomaly detection only.
supervised = 'isFraud' in df.columns
y = df['isFraud'].astype(int) if supervised else None

if supervised:
    print("✅ Found 'isFraud' column → proceeding with supervised + hybrid evaluation.")
else:
    print("⚠️ No 'isFraud' column → proceeding with unsupervised anomaly detection only.")

# -------------------------------
# 3. Train-test split
# -------------------------------
# 80/20 hold-out with a fixed seed. When labels exist they are split together
# with the features and also used to stratify the split.
if not supervised:
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

# -------------------------------
# 4. Scale numeric features
# -------------------------------
# Only these columns are standardised; the remaining feature columns
# (IsNight, RuleHighValue, RuleRapidFire, Type_encoded) pass through unchanged.
numeric_cols = ['amount','LogAmount','Hour','DailyTxnCount','OrigTxnCount','DestTxnCount','TxnCountWindow']

# train_test_split returns frames derived from X. Take explicit copies so the
# column assignments below write to independent frames and do not trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training split only, then apply the same statistics
# to the test split so no test information leaks into the scaling.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# -------------------------------
# 5. Train models
# -------------------------------
# The Isolation Forest is fitted in both modes and only ever sees the features.
iso_params = dict(
    contamination=0.01,
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
iso = IsolationForest(**iso_params).fit(X_train)

# XGBoost is trained only when labels are available
if supervised:
    # Class-imbalance weight: ratio of negatives to positives (assuming 0/1
    # labels), floored at 1.0. The inner max() keeps the division defined
    # when the training split contains no positives.
    pos = y_train.sum()
    neg = len(y_train) - pos
    scale_pos_weight = max(1.0, neg / max(1, pos))

    xgb_params = dict(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=200,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        eval_metric="auc",
        scale_pos_weight=scale_pos_weight,
        n_jobs=-1,
        random_state=42,
    )
    xgb = XGBClassifier(**xgb_params)
    xgb.fit(X_train, y_train)

# -------------------------------
# 6. Get scores
# -------------------------------
# Negate decision_function so that a larger value means "more anomalous"
# (scikit-learn reports lower values for more abnormal samples).
iso_score_raw = -iso.decision_function(X_test)

# Min-max rescale to roughly [0, 1]; the 1e-9 keeps the division defined
# when every raw score is identical.
# NOTE(review): the bounds are taken from this test batch, so a transaction's
# iso_score depends on the other rows scored with it — confirm this is
# intended before reusing the scoring outside this notebook.
iso_min = iso_score_raw.min()
iso_max = iso_score_raw.max()
iso_span = iso_max - iso_min + 1e-9
iso_score = (iso_score_raw - iso_min) / iso_span

# Rule component: each rule flag contributes half, result clipped to [0, 1].
high_value_part = 0.5 * X_test['RuleHighValue'].values
rapid_fire_part = 0.5 * X_test['RuleRapidFire'].values
rule_boost = np.clip(high_value_part + rapid_fire_part, 0, 1)

# Weighted blend: without labels the anomaly score carries most of the weight;
# with labels the XGBoost fraud probability does.
if not supervised:
    hybrid_score = 0.7*iso_score + 0.3*rule_boost
else:
    xgb_prob_test = xgb.predict_proba(X_test)[:, 1]
    hybrid_score = 0.6*xgb_prob_test + 0.3*iso_score + 0.1*rule_boost

# -------------------------------
# 7. Threshold & predictions
# -------------------------------
# A transaction is flagged (1) when its hybrid score reaches the cut-off, else 0.
alert_threshold = 0.8
alert_mask = hybrid_score >= alert_threshold
y_pred_alert = alert_mask.astype(int)

# -------------------------------
# 8. Evaluation
# -------------------------------
# Every artefact (metrics, per-transaction scores, fitted models) is written
# to this one folder. Create it up front so a missing directory cannot abort
# the run after training has already finished.
MODELS_DIR = Path("C:/Users/naman/PaySafe UPI Fraud Detection/Models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

if supervised:
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred_alert))

    # zero_division=0 reports the same value (0) as the default but without
    # an UndefinedMetricWarning when no transaction crosses the threshold.
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_alert, digits=4, zero_division=0))

    # Metrics for dashboard: thresholded metrics use the alert flags,
    # ranking metrics (ROC/PR AUC) use the continuous scores.
    metrics_summary = {
        "Accuracy": accuracy_score(y_test, y_pred_alert),
        "Precision": precision_score(y_test, y_pred_alert, zero_division=0),
        "Recall": recall_score(y_test, y_pred_alert, zero_division=0),
        "F1": f1_score(y_test, y_pred_alert, zero_division=0),
        "ROC_AUC_XGB": roc_auc_score(y_test, xgb_prob_test),
        "ROC_AUC_Hybrid": roc_auc_score(y_test, hybrid_score),
        "PR_AUC_Hybrid": average_precision_score(y_test, hybrid_score)
    }
    print("\nMetrics Summary:", metrics_summary)

    # Save metrics for dashboard (single-row CSV, one column per metric)
    pd.DataFrame([metrics_summary]).to_csv(MODELS_DIR / "metrics_summary.csv", index=False)

    # Save transaction-level scores, keyed by the row index of the test split
    eval_df = pd.DataFrame({
        "Index": X_test.index,
        "HybridScore": hybrid_score,
        "IsFraud": y_test.values,
        "Alert": y_pred_alert
    })
    eval_df.to_csv(MODELS_DIR / "hybrid_eval_test.csv", index=False)

else:
    print("\nUnsupervised mode → only anomaly scores available.")
    # Use the TransactionID column when the data has one; otherwise fall back
    # to the row index (the key the supervised export uses) instead of
    # failing with a KeyError after the models have been trained.
    if "TransactionID" in df.columns:
        id_column = {"TransactionID": df.loc[X_test.index, "TransactionID"].values}
    else:
        id_column = {"Index": X_test.index}
    eval_df = pd.DataFrame({
        **id_column,
        "HybridScore": hybrid_score,
        "Alert": y_pred_alert
    })
    eval_df.to_csv(MODELS_DIR / "hybrid_eval_unsupervised.csv", index=False)

# -------------------------------
# 9. Save models
# -------------------------------
# The scaler is persisted next to the models because both were fitted on
# scaled features; XGBoost exists only in supervised mode.
joblib.dump(iso, MODELS_DIR / "isolation_forest.joblib")
joblib.dump(scaler, MODELS_DIR / "scaler.joblib")
if supervised:
    joblib.dump(xgb, MODELS_DIR / "xgb_model.joblib")

print("\n✅ Models and evaluation saved for dashboard integration.")

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  ...  IsNight  \
0  M1979787155             0.0             0.0        0  ...        1   
1  M2044282225             0.0             0.0        0  ...        1   
2   C553264065             0.0             0.0        1  ...        1   
3    C38997010         21182.0             0.0        1  ...        1   
4  M1230701703             0.0             0.0        0  ...        1   

   LogAmount  DailyTxnCount  OrigTxnCount  DestTxnCount  StepWindow  \
0   9.194276              1            