In [2]:
# -------------------------------------------------------------------
# Import core data-science libraries
# -------------------------------------------------------------------
import pandas as pd      # pandas → tabular data manipulation (DataFrame)
import numpy as np       # NumPy  → fast numerical operations, arrays


In [None]:
# -------------------------------------------------------------------
# STEP 2 – Read the four raw CSV exports that are cleaned/merged later
#   kag   ← Kaggle-hosted fraud dataset (~10 k rows)
#   farr  ← Farrugia et al. 2020 labelled set (~4k rows)
#   forta ← Forta/Etherscan phishing–hack addresses (all illicit)
#   bq    ← Recent Legitimate sample with BigQuery-derived features
# -------------------------------------------------------------------
kag, farr, forta, bq = [
    pd.read_csv(csv_name)
    for csv_name in (
        "Kaggle_dataset.csv",
        "Farrugia_dataset.csv",
        "Forta_dataset.csv",
        "BigQuery_crypto_ethereum.csv",
    )
]

In [None]:
import pandas as pd
from pathlib import Path

def clean_dataset(path: str | Path, source_name: str, illicit_flag: int | None = None):
    """
    Clean and standardise a raw Ethereum-fraud CSV.

    Parameters
    ----------
    path : str | Path
        Location of the CSV file to load.
    source_name : str
        Short identifier used in log / summary rows (e.g. "Kaggle").
    illicit_flag : {0, 1} | None, optional
        • If None   → keep / fix existing FLAG column in the file.  
        • If 0 or 1 → force-create a constant FLAG column (useful for
          Forta=1, BigQuery=0).

    Returns
    -------
    df_clean : pandas.DataFrame
        Sanitised table with a lowercase `Address` column and integer FLAG.
    summary  : dict
        One-line dataset summary (rows, columns, class counts, duplicates).
    """

    # ─────────────────────────────────────────────────────────────
    # Load CSV into DataFrame
    # ─────────────────────────────────────────────────────────────
    df = pd.read_csv(path)

    # Tidy column headers: strip whitespace and drop junk index cols
    df.columns = df.columns.str.strip()
    junk = [c for c in df.columns
            if c.lower().startswith('unnamed') or c.lower() == 'index']
    df.drop(columns=junk, inplace=True, errors='ignore')

    # Ensure a single canonical 'Address' column in lower-case hex
    addr_col = next((c for c in df.columns
                     if c.lower() in {'address', 'addr'}), None)
    if addr_col is None:
        raise ValueError(f"No address column detected in {source_name}")
    if addr_col != 'Address':
        df.rename(columns={addr_col: 'Address'}, inplace=True)
    df['Address'] = df['Address'].str.lower()

    # Guarantee numeric FLAG column (0 = Legitimate, 1 = illicit)
    if 'FLAG' in df.columns:
        df['FLAG'] = df['FLAG'].fillna(0).astype(int)
    else:
        # For sources with only one class (e.g., Forta = illicit-only)
        df['FLAG'] = 0 if illicit_flag is None else illicit_flag

    # Remove duplicate addresses and log how many we dropped
    before = len(df)
    df = df.drop_duplicates('Address')
    dups_removed = before - len(df)

    # ────────────────────────────
    # Build a tiny summary for logging / sanity checks
    # ────────────────────────────
    summary = dict(
        source            = source_name,
        rows              = len(df),
        cols              = len(df.columns),
        illicit_cnt       = int(df['FLAG'].sum()),
        Legitimate_cnt        = int((df['FLAG'] == 0).sum()),
        duplicates_removed= dups_removed
    )

    return df, summary


In [None]:
# -------------------------------------------------------------------
# STEP 3 – Run clean_dataset() over every raw file and gather stats
# -------------------------------------------------------------------
# Dataset nick-name → CSV path.  (Edit here if your filenames differ.)
files = dict(
    Kaggle   = Path('Kaggle_dataset.csv'),
    Farrugia = Path('Farrugia_dataset.csv'),
    Forta    = Path('Forta_dataset.csv'),
    BigQuery = Path('BigQuery_crypto_ethereum.csv'),
)

# Sources that carry a single known class get a constant label:
#   Forta    → all illicit     (FLAG = 1)
#   BigQuery → all Legitimate  (FLAG = 0)
# Kaggle & Farrugia are absent from this table, so .get() yields None and
# clean_dataset() falls back to the labels found in the file.
single_class_flags = {'Forta': 1, 'BigQuery': 0}

cleaned, summaries = {}, []   # cleaned frames by name / one summary dict each

for name, path in files.items():
    df, s = clean_dataset(path, name, illicit_flag=single_class_flags.get(name))
    cleaned[name] = df      # keep the cleaned DataFrame
    summaries.append(s)     # keep its one-line summary

# Summary table: rows, columns, class counts, duplicates removed
pd.DataFrame(summaries)


In [None]:
import matplotlib.pyplot as plt

# ─────────────────────────────────────────────────────────────
# Step 1 – Sum the per-source class counts from the summary rows
#          (an address present in several sources is counted once
#          per source here)
# ─────────────────────────────────────────────────────────────
total_illicit     = sum([row['illicit_cnt']    for row in summaries])
total_Legitimate  = sum([row['Legitimate_cnt'] for row in summaries])

# ─────────────────────────────────────────────────────────────
# Step 2 – Describe the two slices, then draw the pie
# ─────────────────────────────────────────────────────────────
sizes   = [total_Legitimate, total_illicit]
labels  = ['Legitimate Accounts', 'Illicit Accounts']
colors  = ['#66b3ff', '#ff6666']   # blue = legitimate, red = illicit
explode = (0, 0.1)                 # pull the illicit slice outwards

pie_options = dict(explode=explode, labels=labels, colors=colors,
                   autopct='%1.1f%%', shadow=True, startangle=140)

plt.figure(figsize=(6, 6))
plt.pie(sizes, **pie_options)
plt.title('Class Distribution in Unified Ethereum Dataset')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.tight_layout()
plt.show()


In [None]:
# -------------------------------------------------------------------
# Write every cleaned DataFrame back to disk as clean_<Source>.csv
# -------------------------------------------------------------------
#   - Later notebooks can load these instead of re-cleaning raw CSVs
#   - Acts as a frozen copy for reproducibility / sharing
#   - The row index is not written (index=False)
for name in cleaned:
    df = cleaned[name]
    df.to_csv(f'clean_{name}.csv', index=False)


In [None]:
# -------------------------------------------------------------------
# Label registry: one row per Address carrying its final FLAG
# -------------------------------------------------------------------

# Only the key and the label are needed from each cleaned source:
#     Kaggle / Farrugia → mixed labels, Forta → 1's, BigQuery → 0's.
label_frames = [
    cleaned[source][['Address', 'FLAG']]
    for source in ('Kaggle', 'Farrugia', 'Forta', 'BigQuery')
]

# Stack the four label lists, then collapse to one row per Address.
#     - The same address can appear in several sources, possibly with
#       conflicting labels.
#     - max() resolves conflicts in favour of 1 ("illicit") over
#       0 ("Legitimate") — the conservative choice for a security use-case.
stacked_labels = pd.concat(label_frames, ignore_index=True)
registry = stacked_labels.groupby('Address', as_index=False)['FLAG'].max()

# Sanity checks: number of unique addresses and the class balance
print("Registry size:", len(registry))
print(registry['FLAG'].value_counts())


In [None]:
# -------------------------------------------------------------------
# Build the “feature matrix” that holds engineered numeric features
# for every address.  We start with the two sources that already
# contain rich per-address statistics, then merge in extra metrics
# from BigQuery.
# -------------------------------------------------------------------

# 5-A  Stack Kaggle and Farrugia feature tables (they share column names)
#      - Drop FLAG because labels live in the registry now.
#      - Each source is de-duplicated on its own, but an address can occur
#        in BOTH sources.  Without the drop_duplicates below, such an
#        address would have two feature rows and the later
#        registry ⟕ tx_feats join would emit it twice.
#        keep='first' → the Kaggle row wins because it is stacked first.
tx_feats = (
    pd.concat(
        [
            cleaned['Kaggle'  ].drop(columns=['FLAG']),
            cleaned['Farrugia'].drop(columns=['FLAG'])
        ],
        ignore_index=True
    )
    .drop_duplicates('Address', keep='first')
)

# 5-B  Outer-join the additional *recent* numeric metrics that were
#      pre-computed via BigQuery (outbound tx count, ETH sent, avg gas).
#      outer → keep rows even if some addresses are missing these new cols
tx_feats = tx_feats.merge(
    cleaned['BigQuery'][['Address', 'tx_count_out', 'eth_sent', 'avg_gwei']],
    on='Address', how='outer'
)

# Both join inputs are unique on Address, so the result must be too.
assert tx_feats['Address'].is_unique, "duplicate Address rows in tx_feats"

# 5-C  Note: Forta addresses are not part of tx_feats at all (that source
#      is not stacked or joined above).  They only pick up NaN feature
#      values when the label registry is left-joined onto this table, and
#      those NaNs are filled afterwards.
print("Feature matrix shape:", tx_feats.shape)   # e.g. (145 k, 49 features)


In [None]:
# -------------------------------------------------------------------
# Join labels onto features → the “master” modelling table
# -------------------------------------------------------------------

# Left join from the registry so every labelled address is kept, with:
#   - Address   (key)
#   - FLAG      (target label 0 / 1)
#   - the engineered feature columns from tx_feats (NaN where absent)
master = pd.merge(registry, tx_feats, on='Address', how='left')

# ── Missing values ─────────────────────────────────────────────────
# Numeric columns: fill gaps with that column's median.
num_cols    = master.select_dtypes('number').columns
col_medians = master[num_cols].median()
master[num_cols] = master[num_cols].fillna(col_medians)

# Text columns other than the key: fill gaps with the literal
# 'unknown' so the One-Hot-Encoder sees a stable category.
cat_cols = master.select_dtypes('object').columns.difference(['Address'])
master[cat_cols] = master[cat_cols].fillna('unknown')

print("Master dataframe ready:", master.shape)   # e.g. (146 440, 52)


In [None]:
# -------------------------------------------------------------------
# Persist the fully-cleaned, feature-complete dataset for reuse
# -------------------------------------------------------------------
#   - File: master_dataset_v1.csv (written to the working directory,
#     without the row index)
#   - Rows: intended to be one per unique Ethereum address
#     (NOTE(review): uniqueness is not checked in this cell)
#   - Columns: Address, FLAG, and all engineered features
# Later notebooks can skip the whole cleaning / merging pipeline and
# load this single CSV instead.
master.to_csv('master_dataset_v1.csv', index=False)


In [None]:
# -------------------------------------------------------------------
# STEP 7 – Scale numeric features & one-hot encode categorical ones
#           The fitted ColumnTransformer is saved to preprocess_v1.joblib
# -------------------------------------------------------------------
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# 7-A  Target vector (y) and raw predictors (X_raw = everything except
#      the label and the address key)
y     = master['FLAG'].values
X_raw = master.drop(columns=['FLAG', 'Address'])

# 7-B  Split the predictor columns by dtype; each group gets its own
#      transformer below
num_cols = list(X_raw.select_dtypes(include=[np.number]).columns)
cat_cols = list(X_raw.select_dtypes(include=['object', 'category']).columns)
print(f"Numeric: {len(num_cols)}  |  Categorical: {len(cat_cols)}")

# 7-C  Per-type transformers
#      numeric     → rescale into 0-1
#      categorical → one-hot; categories unseen at fit time are ignored
#                    at transform time; output kept sparse
numeric_pipeline     = MinMaxScaler()
categorical_pipeline = OneHotEncoder(handle_unknown='ignore', sparse_output=True)

# Combined transformer: 'num' handles num_cols, 'cat' handles cat_cols,
# and any column in neither list is dropped (none expected).
column_plan = [
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols),
]
preprocess = ColumnTransformer(
    transformers=column_plan,
    remainder='drop',
    sparse_threshold=0.30,
)

# 7-D  Fit on ALL rows so every downstream model shares identical
#      scaling / encoding parameters.
#      (For strict train-only fitting, fit on X_train later instead.)
preprocess.fit(X_raw)

# 7-E  Sanity check: transform every row, report shape and sparsity
X_ready   = preprocess.transform(X_raw)
is_sparse = hasattr(X_ready, 'nnz')
print("Shape after transform:", X_ready.shape,
      "| Sparse matrix?:", is_sparse)

# 7-F  Save the fitted transformer so later cells / notebooks can load it
joblib.dump(preprocess, 'preprocess_v1.joblib')


In [None]:
# ---------------------------------------------------------------------------
# STEP 9  – Hold-out split + SMOTE balancing of the training rows
# STEP 10 – Fit the baseline models (RF, XGB, MLP) and log their metrics
# ---------------------------------------------------------------------------
import gc
import warnings

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (roc_auc_score, average_precision_score,
                             classification_report, confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

# 9-A — Load the fitted scaler + one-hot transformer from disk
preprocess = joblib.load('preprocess_v1.joblib')

# ------------------------------------------------------------------
# Hold-out strategy
#   - True  (default) : time-aware — train on Kaggle+Farrugia (≤2021),
#                       test on every other address (Forta+BigQuery, 2022-24)
#   - False           : random stratified 80/20 split as a sanity check
# ------------------------------------------------------------------
use_time_split = True

if not use_time_split:
    # ---------------- random stratified 80/20 split ----------------
    X_raw = master.drop(columns=['FLAG', 'Address'])
    y     = master['FLAG'].values
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_raw, y, test_size=0.20, stratify=y, random_state=42)
else:
    # ---------------- time-aware split ----------------
    historic_addresses = pd.concat(
        [cleaned['Kaggle'], cleaned['Farrugia']])['Address']
    train_mask = master['Address'].isin(historic_addresses)

    train_rows = master.loc[train_mask]
    test_rows  = master.loc[~train_mask]

    X_train_raw = train_rows.drop(columns=['FLAG', 'Address'])
    y_train     = train_rows['FLAG'].values
    X_test_raw  = test_rows.drop(columns=['FLAG', 'Address'])
    y_test      = test_rows['FLAG'].values

# 9-B — Scale / encode both partitions with the loaded transformer
X_train = preprocess.transform(X_train_raw)
X_test  = preprocess.transform(X_test_raw)

# 9-C — Oversample the training partition only; the test partition keeps
#       its natural class skew
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

print(f"After preprocessing + SMOTE:"
      f"\n  Train: {X_train.shape}  | illicit% = {y_train.mean():.3f}"
      f"\n  Test : {X_test.shape}   | illicit% = {y_test.mean():.3f}")

# ------------------------------------------------------------------
# STEP 10 – Fit each baseline model, score it on the untouched test set
# ------------------------------------------------------------------
# Negative/positive ratio of the (already SMOTE-resampled) training labels.
neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

models = {
    "RF": RandomForestClassifier(
            n_estimators=400, class_weight='balanced',
            n_jobs=-1, random_state=42),

    "XGB": XGBClassifier(
            n_estimators=600, max_depth=6, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            scale_pos_weight=neg_pos_ratio,
            eval_metric='auc', n_jobs=-1, random_state=42),

    "MLP": MLPClassifier(
            hidden_layer_sizes=(256, 128), max_iter=30, random_state=42)
}

results = []

for name, clf in models.items():
    print(f"\n────────  {name}  ────────")
    clf.fit(X_train, y_train)

    prob  = clf.predict_proba(X_test)[:, 1]           # P(illicit)
    preds = (prob >= 0.5).astype(int)                 # hard labels at 0.5

    roc   = roc_auc_score(y_test, prob)               # threshold-free
    prauc = average_precision_score(y_test, prob)     # PR-AUC
    cm    = confusion_matrix(y_test, preds)           # rows = truth, cols = pred

    print(f"ROC-AUC: {roc:.3f} | PR-AUC: {prauc:.3f}")
    print(classification_report(y_test, preds, digits=3))

    results.append({
        'model': name, 'roc': roc, 'pr': prauc,
        'TP': cm[1, 1], 'FP': cm[0, 1],
        'FN': cm[1, 0], 'TN': cm[0, 0],
    })

    # Keep the XGB scores and matching labels for later curve plots
    if name == "XGB":
        np.save("prob_baseline.npy", prob)
        np.save("y_test_baseline.npy", y_test)

    gc.collect()   # release memory before the next model

# Metric table for the dissertation tables
pd.DataFrame(results).to_csv('baseline_metrics.csv', index=False)
print("\nSaved baseline_metrics.csv with results.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the metric table written by the baseline training cell
df = pd.read_csv("baseline_metrics.csv")

# Fix the display order of the models
model_order = ['RF', 'XGB', 'MLP']
df = df.set_index('model').loc[model_order].reset_index()

# Two bars per model: ROC-AUC on the left, PR-AUC on the right
bar_width = 0.35
x = range(len(df))
left_positions  = [pos - bar_width/2 for pos in x]
right_positions = [pos + bar_width/2 for pos in x]

plt.figure(figsize=(8, 5))
for positions, metric, legend_name, bar_color in (
    (left_positions,  'roc', 'ROC-AUC', '#4e79a7'),
    (right_positions, 'pr',  'PR-AUC',  '#f28e2c'),
):
    plt.bar(positions, df[metric], width=bar_width,
            label=legend_name, color=bar_color)

# Axis labels and cosmetics
plt.xticks(ticks=x, labels=df['model'])
plt.ylim(0, 1.05)
plt.ylabel('Score')
plt.title('Initial Baseline Model Performance (Time-Aware Split)')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [None]:
# ─────────────────────────────────────────────────────────────
# Standalone cell: class balance of the training labels after SMOTE
# (reads y_train as left behind by the most recent training cell)
# ─────────────────────────────────────────────────────────────
import numpy as np
import matplotlib.pyplot as plt

# Count each class in the resampled label vector
Legitimate_count  = (y_train == 0).sum()
illicit_count     = (y_train == 1).sum()

# Slice description
sizes   = [Legitimate_count, illicit_count]
labels  = ['Legitimate Accounts (SMOTE)', 'Illicit Accounts (SMOTE)']
colors  = ['#66b3ff', '#ff6666']
explode = (0, 0.1)

pie_options = dict(explode=explode, labels=labels, colors=colors,
                   autopct='%1.1f%%', shadow=True, startangle=140)

plt.figure(figsize=(6, 6))
plt.pie(sizes, **pie_options)
plt.title('Class Distribution in SMOTE-Balanced Training Set')
plt.axis('equal')
plt.tight_layout()
plt.show()


In [None]:
# ---------------------------------------------------------------------------
# BigQuery enrichment script
# Purpose: Fetch recent (2023-present) transaction statistics for the list of
#          Forta + Legitimate BigQuery addresses and save them as a new CSV.
# Notes:
#   - Runs in 10 000-address batches to stay within query-parameter limits.
#   - Prints bytes-scanned + cost estimate for each batch (assumes $5 / TB).
#   - Requires the GCP project ‘diss-464115’ to have billing enabled.
# ---------------------------------------------------------------------------
from google.cloud import bigquery
import pandas as pd, math, time

# GCP project that *runs* the query (billed here, not in public dataset)
PROJECT_ID = "diss-464115"
client     = bigquery.Client(project=PROJECT_ID)

# ────────────────────────────────────────────────────────────────
# Load the address list exported earlier (one 0x… per line, lower-case)
#   - blank lines are skipped, surrounding whitespace is stripped
#   - dict.fromkeys() removes repeated addresses while keeping first-seen
#     order; a repeat that falls into a different batch would otherwise
#     come back as a second result row for the same address
# ────────────────────────────────────────────────────────────────
with open("/content/addr_list.txt") as fh:
    ADDR_LIST = list(dict.fromkeys(
        ln.strip().lower() for ln in fh if ln.strip()
    ))

print("Total addresses:", len(ADDR_LIST))

# ────────────────────────────────────────────────────────────────
# Parameterised SQL template (limits to 2023-01-01 → present)
#   - @addr_list is bound per batch as an ARRAY<STRING> query parameter.
#   - Only transactions whose from_address is in the batch are read, so
#     every statistic describes OUTBOUND activity.
#   - One output row per address that has at least one matching
#     transaction; addresses with none produce no row.
#   - The IN UNNEST match is a plain string comparison against the
#     lower-cased list — presumably the public table stores lower-case
#     hex addresses; verify.
# ────────────────────────────────────────────────────────────────
QUERY = """
DECLARE addrs ARRAY<STRING>;
SET addrs = @addr_list;

WITH tx AS (
  SELECT
    from_address   AS address,
    value,
    gas_price,
    block_timestamp
  FROM `bigquery-public-data.crypto_ethereum.transactions`
  WHERE from_address IN UNNEST(addrs)
        AND block_timestamp >= '2023-01-01'          -- prune partitions
)
SELECT
  address                       AS Address,
  COUNT(*)                      AS tx_count_out,    -- # outbound tx
  SUM(value)/1e18               AS eth_sent,        -- total ETH sent
  AVG(gas_price)/1e9            AS avg_gwei,        -- mean gas price (GWei)
  MIN(block_timestamp)          AS first_seen,
  MAX(block_timestamp)          AS last_seen
FROM tx
GROUP BY address;
"""

def query_batch(addr_chunk, idx, price_per_tb=5.0):
    """
    Run one BigQuery job for <=10 000 addresses, return result DataFrame.
    Also print bytes-scanned and an estimated dollar cost.

    Parameters
    ----------
    addr_chunk : list[str]
        Addresses bound to the @addr_list array parameter of QUERY.
    idx : int
        Batch number, used only in the printed diagnostic line.
    price_per_tb : float, optional
        Price in dollars per 1e12 scanned bytes used for the estimate
        (default 5.0, i.e. the original "$5 / TB" assumption).

    Returns
    -------
    pandas.DataFrame
        The rows produced by QUERY for this batch.
    """
    cfg = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("addr_list", "STRING", addr_chunk)
        ]
    )
    job    = client.query(QUERY, job_config=cfg)
    result = job.result()              # waits for job completion

    # Cost diagnostics.  The statistic is read from the job object (its
    # documented home) rather than from the row iterator, and a missing
    # value is treated as 0 so the diagnostic line can never abort a batch.
    bytes_scanned = job.total_bytes_processed or 0
    mb   = bytes_scanned / 1e6
    cost = bytes_scanned / 1e12 * price_per_tb
    print(f"  • Batch {idx}  scanned {mb:,.1f} MB  (~${cost:,.2f})")

    return result.to_dataframe()

# ────────────────────────────────────────────────────────────────
# Loop over address list in 10 000-addr batches
# ────────────────────────────────────────────────────────────────
batch_size = 10_000
frames     = []
n_batches  = math.ceil(len(ADDR_LIST) / batch_size)   # computed once, not per batch

for batch_no, i in enumerate(range(0, len(ADDR_LIST), batch_size), start=1):
    chunk = ADDR_LIST[i:i+batch_size]
    print(f"Batch {batch_no}/{n_batches} …")
    frames.append(query_batch(chunk, batch_no))

# Concatenate all batch DataFrames and persist to CSV
print("finished all batches, assembling DataFrame …")
if frames:
    enriched_df = pd.concat(frames, ignore_index=True)
else:
    # An empty address list runs no batches, and pd.concat([]) raises.
    # Write a header-only file with the columns QUERY selects, so the
    # step that reads forta_bq_features.csv still finds the expected schema.
    enriched_df = pd.DataFrame(
        columns=['Address', 'tx_count_out', 'eth_sent', 'avg_gwei',
                 'first_seen', 'last_seen'])
enriched_df.to_csv("forta_bq_features.csv", index=False)
print("Saved forta_bq_features.csv  rows:", len(enriched_df))


In [None]:
import pandas as pd, numpy as np

# -------------------------------------------------------------------
# Merge freshly-downloaded BigQuery stats into `master`
#   - Replaces the median-imputed placeholders for Forta & recent-Legitimate
# -------------------------------------------------------------------

# Load enrichment CSV and normalise address casing
bq_new = pd.read_csv("forta_bq_features.csv")
bq_new['Address'] = bq_new['Address'].str.lower()

# One enrichment row per address.  A repeated address would otherwise
# multiply the matching `master` rows in the left-merge below.
bq_new = bq_new.drop_duplicates('Address')

# Define the numeric columns we expect from enrichment
num_cols = ['tx_count_out', 'eth_sent', 'avg_gwei',
            'first_seen', 'last_seen']

# first_seen / last_seen come back from the CSV as timestamp strings.
# Left as text they are picked up as categorical columns and one-hot
# encoded value by value.  Convert them to seconds since the Unix epoch
# (UTC) so they really are numeric; unparseable values become NaN.
epoch = pd.Timestamp('1970-01-01', tz='UTC')
for ts_col in ('first_seen', 'last_seen'):
    if ts_col in bq_new.columns:
        parsed = pd.to_datetime(bq_new[ts_col], utc=True, errors='coerce')
        bq_new[ts_col] = (parsed - epoch).dt.total_seconds()

# Clean up any previous *_new temp columns from earlier merges
master = master.drop(columns=[c for c in master.columns if c.endswith('_new')])

# Left-merge: keep all rows in master, pull in new numeric fields.
# Columns that already exist in master arrive with a `_new` suffix;
# columns that do not yet exist arrive under their own name.
master = master.merge(bq_new, on="Address", how="left",
                      suffixes=('', '_new'))

# For each numeric field, overwrite placeholder (old) values with
# the real numbers when they exist in *_new; then drop the *_new col.
for col in num_cols:
    new_col = f"{col}_new"
    if new_col in master.columns:            # only present if the column pre-existed
        master[col] = master[new_col].combine_first(master[col])
        master = master.drop(columns=new_col)

# Addresses without any enrichment row still have NaN in columns that are
# new to master (the timestamps on the first run).  Fill them with the
# column median — the same policy used when master was first assembled —
# so no NaN reaches the scaler / SMOTE.
present_cols = [c for c in num_cols if c in master.columns]
master[present_cols] = master[present_cols].fillna(master[present_cols].median())

print("Master rows:", len(master),
      "| tx_count_out missing:", master['tx_count_out'].isna().sum())


In [None]:
# -------------------------------------------------------------------
# Fit a fresh ColumnTransformer on `master` now that the BigQuery
# enrichment columns are merged in; saved as preprocess_v3.joblib
# -------------------------------------------------------------------
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib

# Predictors = every column except the label and the address key
X_raw = master.drop(columns=['FLAG', 'Address'])

# Route columns by dtype: numbers are scaled, object (text) columns encoded
num_cols = list(X_raw.select_dtypes('number').columns)
cat_cols = list(X_raw.select_dtypes('object').columns)

# numeric → Min-Max scaling ; categorical → sparse one-hot that ignores
# categories not seen during fit
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
preprocess = ColumnTransformer(transformers=[
    ('num', MinMaxScaler(), num_cols),
    ('cat', one_hot, cat_cols),
])

# Fitted on ALL rows so every downstream model shares the same scaling
# ranges and category vocabularies.
preprocess.fit(X_raw)

# Save for the training / inference cells
joblib.dump(preprocess, 'preprocess_v3.joblib')
print("Saved preprocess_v3.joblib")


In [None]:
# ──────────────────────────────────────────────────────────────
# STEP 9 & 10 (v3)  – Training / Evaluation with enriched features
# ──────────────────────────────────────────────────────────────
import joblib, numpy as np, pandas as pd, gc, warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (roc_auc_score, average_precision_score,
                             classification_report, confusion_matrix)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Load the latest pre-processing pipeline (includes BigQuery metrics)
preprocess = joblib.load('preprocess_v3.joblib')

# Build a *time-aware* split
#    -  Historical addresses (≤2021) → Kaggle + Farrugia → always train
#    -  Recent addresses (2022-24)   → Forta + BigQuery
#       – sample 30 % into train to give model some exposure
#       – remaining 70 % used as unseen hold-out test
train_core_mask = master['Address'].isin(
    pd.concat([cleaned['Kaggle'], cleaned['Farrugia']])['Address']
)

# An address listed by Forta/BigQuery AND by Kaggle/Farrugia belongs to the
# "always train" core.  Excluding it here keeps it out of the 70 % hold-out
# (where it would leak a training row into the test set) and out of the
# 30 % sample (where it would be duplicated in train).
recent_mask = master['Address'].isin(
    pd.concat([cleaned['Forta'], cleaned['BigQuery']])['Address']
) & ~train_core_mask
recent_df = master[recent_mask]

train_extra = recent_df.sample(frac=0.30, random_state=42)  # 30 % recent → train
test_df     = recent_df.drop(train_extra.index)             # 70 % recent → test

train_df = pd.concat([master[train_core_mask], train_extra], ignore_index=True)

# Split predictors / labels
X_train_raw = train_df.drop(columns=['FLAG', 'Address'])
y_train     = train_df['FLAG'].values
X_test_raw  = test_df.drop(columns=['FLAG', 'Address'])
y_test      = test_df['FLAG'].values

# Apply scaler+OHE, then balance training rows with SMOTE
# (the test rows are transformed but never resampled)
X_train = preprocess.transform(X_train_raw)
X_test  = preprocess.transform(X_test_raw)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

print(f"Train: {X_train.shape} | illicit% {y_train.mean():.3f}")
print(f"Test : {X_test.shape}  | illicit% {y_test.mean():.3f}")

# Fit three baseline models and report ROC-AUC / PR-AUC
models = {
    "RF":  RandomForestClassifier(
              n_estimators=400, class_weight='balanced',
              n_jobs=-1, random_state=42),

    "XGB": XGBClassifier(
              n_estimators=600, max_depth=6, learning_rate=0.05,
              subsample=0.8, colsample_bytree=0.8,
              # ratio is taken from the SMOTE-resampled labels above
              scale_pos_weight=(y_train==0).sum()/(y_train==1).sum(),
              eval_metric='auc', n_jobs=-1, random_state=42),

    "MLP": MLPClassifier(
              hidden_layer_sizes=(256,128), max_iter=40, random_state=42)
}

results = []
for name, clf in models.items():
    print(f"\n──────── {name} ────────")
    clf.fit(X_train, y_train)

    prob  = clf.predict_proba(X_test)[:, 1]        # class-1 probability
    preds = (prob >= 0.5).astype(int)              # hard threshold 0.5

    roc  = roc_auc_score(y_test, prob)             # threshold-free ROC-AUC
    pr   = average_precision_score(y_test, prob)   # PR-AUC – better for skew
    print(f"ROC-AUC = {roc:.3f} | PR-AUC = {pr:.3f}")
    print(classification_report(y_test, preds, digits=3))

    cm = confusion_matrix(y_test, preds)           # rows = truth, cols = prediction
    results.append(dict(model=name, roc=roc, pr=pr,
                        TP=cm[1,1], FP=cm[0,1],
                        FN=cm[1,0], TN=cm[0,0]))

    # Save predictions and true labels for XGB
    if name == "XGB":
        np.save("prob_enriched.npy", prob)
        np.save("y_test_enriched.npy", y_test)

    gc.collect()   # free memory before next model

# Persist metric table for Chapter 4 figures / tables
pd.DataFrame(results).to_csv('baseline_metrics_v3.csv', index=False)
print("\nSaved baseline_metrics_v3.csv")


In [None]:
# ───────────────────────────────────────────────
# Read back the XGB scores / labels saved by both training runs
# ───────────────────────────────────────────────
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve
import matplotlib.pyplot as plt

prob_baseline   = np.load("prob_baseline.npy")
y_test_baseline = np.load("y_test_baseline.npy")
prob_enriched   = np.load("prob_enriched.npy")
y_test_enriched = np.load("y_test_enriched.npy")

# ───────────────────────────────────────────────
# Curve coordinates for each run (thresholds discarded)
# ───────────────────────────────────────────────
fpr_base, tpr_base, _ = roc_curve(y_test_baseline, prob_baseline)
fpr_enr,  tpr_enr,  _ = roc_curve(y_test_enriched,  prob_enriched)

prec_base, rec_base, _ = precision_recall_curve(y_test_baseline, prob_baseline)
prec_enr,  rec_enr,  _ = precision_recall_curve(y_test_enriched,  prob_enriched)

# ───────────────────────────────────────────────
# Figure: ROC on the left panel, PR on the right panel
# ───────────────────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
roc_ax, pr_ax = axes

# Shared line styling so both panels use the same legend entries
enriched_style = dict(label='BigQuery Enriched', color='#1b9e77', linewidth=2)
baseline_style = dict(label='Initial Baseline', color='grey', linestyle='--')

# ROC panel (plus the chance diagonal)
roc_ax.plot(fpr_enr, tpr_enr, **enriched_style)
roc_ax.plot(fpr_base, tpr_base, **baseline_style)
roc_ax.plot([0, 1], [0, 1], 'k--', lw=1, alpha=0.5)
roc_ax.set_title("ROC Curve – XGBoost")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")

# PR panel
pr_ax.plot(rec_enr, prec_enr, **enriched_style)
pr_ax.plot(rec_base, prec_base, **baseline_style)
pr_ax.set_title("Precision-Recall Curve – XGBoost")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")

# Legend and light grid on both panels
for panel in (roc_ax, pr_ax):
    panel.legend()
    panel.grid(alpha=0.3)

plt.suptitle("XGBoost ROC and PR Curves: BigQuery vs Baseline", fontsize=14)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
# ---------------------------------------------------------------------------
# Build the transaction edge list for graph-based feature engineering
# ---------------------------------------------------------------------------
# Relies on PROJECT_ID and `client` having been created by the BigQuery
# enrichment cell earlier in the notebook; neither is defined here.

#     Upload the Address column of `master` to BigQuery as the helper table
#     tmp.master_addresses.  if_exists='replace' overwrites the table on
#     every run; nothing in this cell deletes it afterwards.
#     - Lets the edge query below filter with IN (SELECT ...) sub-queries
#       instead of passing a huge Python list as a parameter.
addr_df = master[['Address']]
addr_df.to_gbq(
    'tmp.master_addresses',          # dataset.table in your GCP project
    project_id=PROJECT_ID,           # e.g. "diss-464115"
    if_exists='replace'              # overwrite in case table already exists
)

#     Query the public Ethereum transactions table for edges *only* between
#     addresses that appear in our master list, limited to 2023-01-01 onward
#     to keep the scan cost small.
#     - One output row per matching transaction: there is no DISTINCT or
#       GROUP BY, so a busy address pair appears many times.
#     - The WHERE clause compares the raw from_address / to_address with the
#       lower-cased master addresses — presumably the public table already
#       stores lower-case hex; verify.
GQUERY = """
WITH addrs AS (
  SELECT LOWER(Address) AS addr
  FROM   `tmp.master_addresses`
)
SELECT
  LOWER(from_address) AS src,      -- source node
  LOWER(to_address)   AS dst       -- destination node
FROM `bigquery-public-data.crypto_ethereum.transactions`
WHERE block_timestamp >= '2023-01-01'            -- 2-year window
  AND from_address IN (SELECT addr FROM addrs)   -- both endpoints in master
  AND to_address   IN (SELECT addr FROM addrs)
"""

# Run the query, materialise result into a local DataFrame,
# and persist as CSV (columns src, dst) for later NetworkX processing.
edge_df = client.query(GQUERY).result().to_dataframe()
edge_df.to_csv('edge_list.csv', index=False)


In [None]:
import networkx as nx, pandas as pd

# -------------------------------------------------------------------
# STEP 8-B – Construct the transaction graph and compute core
#            network-centrality metrics for every address node.
# -------------------------------------------------------------------

#  Load the <src , dst> edge list extracted via BigQuery
edges = pd.read_csv('edge_list.csv')

#  Build a directed graph G where:
#       - nodes = addresses that appear as src or dst in the edge list
#       - edges = one directed edge per distinct (src, dst) pair; repeated
#                 transfers between the same pair collapse into a single
#                 edge because a plain DiGraph is used.
G = nx.from_pandas_edgelist(
        edges,
        source='src',
        target='dst',
        create_using=nx.DiGraph()
     )

print("Graph:", G.number_of_nodes(), "nodes –", G.number_of_edges(), "edges")

#  Betweenness is approximated from a sample of pivot nodes for speed.
#  Sampling more pivots than there are nodes raises an error, so small
#  graphs fall back to the exact computation (k=None).
BETWEENNESS_SAMPLE = 400
k_pivots = BETWEENNESS_SAMPLE if G.number_of_nodes() > BETWEENNESS_SAMPLE else None

#  Compute node-level features
#     - out/in-degree  : number of distinct counterparties sent to / received from
#     - PageRank       : global importance / centrality
#     - Betweenness    : bridge-likeness (sampled when the graph is large)
deg_out  = dict(G.out_degree())                                   # distinct recipients
deg_in   = dict(G.in_degree())                                    # distinct senders
pagerank = nx.pagerank(G, alpha=0.85)                             # damping factor 0.85
between  = nx.betweenness_centrality(G, k=k_pivots, seed=42)      # sampled approx.

#  Assemble into a DataFrame keyed by Address.
#  Every column is looked up per node, so the rows stay aligned with the
#  Address column regardless of the iteration order of the metric dicts.
nodes = list(G.nodes())
graph_feat = pd.DataFrame({
    'Address'    : nodes,
    'out_deg'    : [deg_out[n] for n in nodes],
    'in_deg'     : [deg_in[n] for n in nodes],
    'pagerank'   : [pagerank[n] for n in nodes],
    'betweenness': [between[n] for n in nodes],
})

#  Persist to CSV – will be merged into master and normalised later
graph_feat.to_csv('graph_features.csv', index=False)


In [None]:
# -------------------------------------------------------------------
# Merge graph-centrality metrics into `master` and rebuild the
# pre-processing pipeline (now v4) that includes these new features.
# -------------------------------------------------------------------
import pandas as pd

GRAPH_COLS = ['out_deg', 'in_deg', 'pagerank', 'betweenness']

#  Join the graph feature table to the master dataset on Address.
#  Any graph columns left over from a previous run of this cell are dropped
#  first, so re-running does not create out_deg_x / out_deg_y duplicates.
#  validate='many_to_one' fails fast if graph_features.csv ever contains a
#  repeated Address, which would otherwise silently multiply master rows.
gfeat = pd.read_csv('graph_features.csv')
master = (
    master
      .drop(columns=GRAPH_COLS, errors='ignore')
      .merge(gfeat[['Address'] + GRAPH_COLS],
             on='Address', how='left', validate='many_to_one')
)

#     Any address that had no outgoing / incoming edge in the 2023-24
#     sub-graph will have NaN in the new columns — set those to zero.
master[GRAPH_COLS] = master[GRAPH_COLS].fillna(0)

# -------------------------------------------------------------------
# Re-fit ColumnTransformer  ->  preprocess_v4.joblib
#   - Adds the 4 graph metrics to the numeric pipeline
# -------------------------------------------------------------------
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib, numpy as np

# Split predictors from ID / target
X_raw = master.drop(columns=['FLAG', 'Address'])

# Detect column types
cat_cols = X_raw.select_dtypes('object').columns.tolist()   # categorical
num_cols = X_raw.select_dtypes('number').columns.tolist()   # numeric (incl. 4 graph feats)

# Build & fit the transformer.
# NOTE(review): the transformer is fitted on ALL rows of `master`, i.e. also
# on rows that later end up in the held-out test split, so the min/max ranges
# and the one-hot vocabulary are learned from test data as well.
preprocess = ColumnTransformer([
    ('num', MinMaxScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), cat_cols)
])
preprocess.fit(X_raw)

# Persist as v4 (graph-enhanced) — subsequent modelling code will load this
joblib.dump(preprocess, 'preprocess_v4.joblib')
print("Saved preprocess_v4.joblib  (includes graph features)")


In [None]:
# -------------------------------------------------------------------
# Reload the fitted pre-processing pipeline (v4) from disk.
# The file is written by the cell that re-fits the ColumnTransformer
# (min-max scaling of numeric columns, one-hot encoding of categorical
# columns, with the four graph-centrality columns among the numeric ones).
# `joblib` is not imported in this cell; it must already be in the kernel
# namespace from an earlier cell.
# -------------------------------------------------------------------
preprocess = joblib.load('preprocess_v4.joblib')


In [None]:
# ---------------------------------------------------------------------------
# STEP 9 & 10  (graph-enhanced baseline)
#   - Uses preprocess_v4.joblib  – includes both transactional & graph features
#   - Time-aware split:   train = Kaggle+Farrugia + 30 % of recent
#                         test  = remaining 70 % recent (Forta+BigQuery)
#   - Balances training data with SMOTE
#   - Trains three baseline models (RF, XGB, MLP) and logs metrics
# ---------------------------------------------------------------------------
import joblib, gc, warnings, numpy as np, pandas as pd
from sklearn.metrics import (roc_auc_score, average_precision_score,
                             classification_report, confusion_matrix)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
warnings.filterwarnings('ignore')   # silences ALL warnings for the rest of the session

# Load the pre-fitted transformer that handles scaling + OHE + graph cols
preprocess = joblib.load('preprocess_v4.joblib')

# ---------  Build a time-aware train / test split  -------------------
#      - Historical core (Kaggle + Farrugia) → always in training
#      - Recent addresses (Forta + BigQuery) → 30 % into training, 70 % held out
core_addrs   = pd.concat([cleaned['Kaggle'], cleaned['Farrugia']])['Address']
recent_addrs = pd.concat([cleaned['Forta'], cleaned['BigQuery']])['Address']

train_core_mask = master['Address'].isin(core_addrs)

# An address that appears in both a historical and a recent source is treated
# as "core" only. Without this exclusion the same row could be part of the
# training core AND be sampled into the held-out test set (train/test leak).
recent_mask = master['Address'].isin(recent_addrs) & ~train_core_mask
recent_df   = master[recent_mask]

train_extra = recent_df.sample(frac=0.30, random_state=42)   # 30 % recent
test_df     = recent_df.drop(train_extra.index)              # 70 % recent

train_df = pd.concat([master[train_core_mask], train_extra], ignore_index=True)

# Safety net: report any address that still ends up on both sides
# (possible only if `master` holds several rows for one address).
shared_addrs = set(train_df['Address']) & set(test_df['Address'])
if shared_addrs:
    print(f"WARNING: {len(shared_addrs)} addresses occur in both train and test")

# Split predictors / labels
X_train = preprocess.transform(train_df.drop(columns=['FLAG', 'Address']))
y_train = train_df['FLAG'].values

X_test  = preprocess.transform(test_df.drop(columns=['FLAG', 'Address']))
y_test  = test_df['FLAG'].values

# Balance the minority class in TRAIN set only with SMOTE
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

print(f"Train: {X_train.shape}  illicit% {y_train.mean():.3f}")
print(f"Test : {X_test.shape}   illicit% {y_test.mean():.3f}")

# Define three baseline learners.
# y_train has already been resampled by SMOTE above, so the class ratio
# passed to scale_pos_weight is computed on the balanced training labels.
models = {
    "RF" : RandomForestClassifier(
              n_estimators=400, class_weight='balanced',
              n_jobs=-1, random_state=42),

    "XGB": XGBClassifier(
              n_estimators=600, max_depth=6, learning_rate=0.05,
              subsample=0.8, colsample_bytree=0.8,
              scale_pos_weight = (y_train==0).sum() / (y_train==1).sum(),
              eval_metric='auc', n_jobs=-1, random_state=42),

    "MLP": MLPClassifier(
              hidden_layer_sizes=(256,128),
              max_iter=40, random_state=42)
}

# Train -> evaluate -> collect metrics for each model
results = []
for name, clf in models.items():
    print(f"\n──────── {name} ────────")
    clf.fit(X_train, y_train)

    prob  = clf.predict_proba(X_test)[:, 1]         # P(illicit)
    preds = (prob >= 0.5).astype(int)               # threshold @ 0.5

    roc  = roc_auc_score(y_test, prob)              # ROC-AUC
    pr   = average_precision_score(y_test, prob)    # PR-AUC (class-imbalance)
    print(f"ROC-AUC = {roc:.3f} | PR-AUC = {pr:.3f}")
    print(classification_report(y_test, preds, digits=3))

    # labels=[0, 1] pins the matrix to 2x2 even when a model predicts a
    # single class, so the cm[row, col] lookups below cannot go out of range.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])
    results.append(dict(model=name, roc=roc, pr=pr,
                        TP=cm[1,1], FP=cm[0,1],
                        FN=cm[1,0], TN=cm[0,0]))

    # SAVE each trained model so visual-pack can reload them
    # (clf_rf.joblib, clf_xgb.joblib, clf_mlp.joblib)
    joblib.dump(clf, f'clf_{name.lower()}.joblib')

    gc.collect()    # release unreferenced Python objects before the next model

# Save metrics table for Chapter 4 results section and test split
pd.DataFrame(results).to_csv("baseline_metrics_v4.csv", index=False)
test_df.to_csv("test_split_v4.csv", index=False)

print("\nSaved baseline_metrics_v4.csv, test_split_v4.csv, and model joblib files.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# ─────────────────────────────────────────────────────────────
# Load the metric tables of both phases and tag each with its phase
# ─────────────────────────────────────────────────────────────
v3 = pd.read_csv("baseline_metrics_v3.csv").assign(phase='BigQuery Only')      # BigQuery enriched
v4 = pd.read_csv("baseline_metrics_v4.csv").assign(phase='Graph + BigQuery')   # Graph-enhanced

# Models shown on the x-axis, in display order
x_labels = ['RF', 'XGB', 'MLP']

# Combined long-format table, restricted to the three models above
df = pd.concat([v3, v4], ignore_index=True)
df = df[df['model'].isin(x_labels)]

# ─────────────────────────────────────────────────────────────
# Plot: grouped bar charts for ROC-AUC and PR-AUC by model + phase
# ─────────────────────────────────────────────────────────────
bar_width = 0.35
x = np.arange(len(x_labels))

# Per-phase scores, looked up in x_labels order
v3_by_model = v3.set_index('model')
v4_by_model = v4.set_index('model')
roc_v3 = v3_by_model.loc[x_labels, 'roc'].values
roc_v4 = v4_by_model.loc[x_labels, 'roc'].values
pr_v3  = v3_by_model.loc[x_labels, 'pr'].values
pr_v4  = v4_by_model.loc[x_labels, 'pr'].values

fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# (axis, y-label, title, scores before, scores after) for each panel
panels = [
    (axes[0], "ROC-AUC", "ROC-AUC Comparison", roc_v3, roc_v4),
    (axes[1], "PR-AUC",  "PR-AUC Comparison",  pr_v3,  pr_v4),
]

for ax, y_label, panel_title, scores_v3, scores_v4 in panels:
    ax.bar(x - bar_width/2, scores_v3, bar_width, label='BigQuery Only', color='#1b9e77')
    ax.bar(x + bar_width/2, scores_v4, bar_width, label='Graph + BigQuery', color='#d95f02')
    ax.set_xticks(x)
    ax.set_xticklabels(x_labels)
    ax.set_ylabel(y_label)
    ax.set_ylim(0.4, 1.05)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(axis='y', linestyle='--', alpha=0.6)

fig.suptitle("Model Performance: BigQuery vs Graph-Augmented Features", fontsize=14)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
# ================================================================
# STEP 1: Rebuild clean master dataset
# ================================================================
import pandas as pd

# Cleaned per-source tables
kag   = pd.read_csv("clean_Kaggle.csv")
farr  = pd.read_csv("clean_Farrugia.csv")
forta = pd.read_csv("clean_Forta.csv")
bq    = pd.read_csv("clean_BigQuery.csv")

# Label registry: one row per Address; when sources disagree the larger
# FLAG value wins (max over all sources).
label_frames = [source[["Address", "FLAG"]] for source in (kag, farr, forta, bq)]
registry = pd.concat(label_frames, ignore_index=True)
registry = registry.groupby("Address", as_index=False)["FLAG"].max()

# Transactional feature matrix: Kaggle and Farrugia rows stacked,
# then outer-joined with the BigQuery features on Address.
stacked_feats = pd.concat(
    [kag.drop(columns=["FLAG"]), farr.drop(columns=["FLAG"])],
    ignore_index=True,
)
bq_feats = bq.drop(columns=["FLAG"])
tx_feats = stacked_feats.merge(bq_feats, on="Address", how="outer")

# Attach features to every labelled address (addresses without features keep NaN)
master = registry.merge(tx_feats, on="Address", how="left")

# Numeric gaps -> column median
num_cols = master.select_dtypes("number").columns
col_medians = master[num_cols].median()
master[num_cols] = master[num_cols].fillna(col_medians)

# Categorical gaps -> "unknown" (Address is an identifier, not a feature)
cat_cols = master.select_dtypes("object").columns.difference(["Address"])
master[cat_cols] = master[cat_cols].fillna("unknown")

# Save for reproducibility
master.to_csv("master_dataset_node2vec_base.csv", index=False)
print("master_dataset_node2vec_base.csv saved:", master.shape)


In [None]:
# ================================================================
# STEP 2: Node2Vec embedding generation
# ================================================================
import pandas as pd
import networkx as nx
from node2vec import Node2Vec

# Directed transaction graph from the saved edge list
edges = pd.read_csv("edge_list.csv")
G = nx.from_pandas_edgelist(
    edges,
    source="src",
    target="dst",
    create_using=nx.DiGraph(),
)

print("Graph nodes:", G.number_of_nodes(), "| edges:", G.number_of_edges())

# Random walks (64-dim target space, 50 walks of length 10 per node),
# then skip-gram training on those walks.
node2vec = Node2Vec(
    G,
    dimensions=64,
    walk_length=10,
    num_walks=50,
    workers=2,
    seed=42,
)
model = node2vec.fit(window=5, min_count=1, batch_words=4)

# One row per node: the address followed by its 64 embedding values
cols = ["Address"] + [f"n2v_{i}" for i in range(64)]
embeddings = [
    [node, *model.wv.get_vector(node).tolist()]
    for node in G.nodes()
]
embed_df = pd.DataFrame(embeddings, columns=cols)

# Save embeddings
embed_df.to_csv("node2vec_embeddings.csv", index=False)
print("node2vec_embeddings.csv saved:", embed_df.shape)


In [None]:
# ================================================================
# STEP 3: Merge Node2Vec embeddings
# ================================================================
import pandas as pd

# Base master table (labels + transactional features)
master = pd.read_csv("master_dataset_node2vec_base.csv")
print("Master loaded:", master.shape)

# Embedding table, with the join key normalised to lowercase
embed = pd.read_csv("node2vec_embeddings.csv")
embed = embed.assign(Address=embed["Address"].str.lower())

# Left join keeps every master row; addresses without an embedding get NaN
master = master.merge(embed, how="left", on="Address")

# Addresses absent from the graph receive an all-zero embedding
embed_cols = [name for name in master.columns if name.startswith("n2v_")]
master[embed_cols] = master[embed_cols].fillna(0)

# Save
master.to_csv("master_dataset_node2vec.csv", index=False)
print("master_dataset_node2vec.csv saved:", master.shape)


In [None]:
# ================================================================
# STEP 4: Strict Address-Based Split
# ================================================================
import numpy as np

# Master table with Node2Vec columns
master = pd.read_csv("master_dataset_node2vec.csv")

# The split is drawn over distinct addresses, not over rows
unique_addrs = master["Address"].unique()
print("Total unique addresses:", len(unique_addrs))

# 60 % of the addresses go to train (seeded draw), the remainder to test
rng = np.random.RandomState(42)
n_train_addrs = int(0.6*len(unique_addrs))
train_addrs = rng.choice(unique_addrs, size=n_train_addrs, replace=False)
test_addrs  = np.setdiff1d(unique_addrs, train_addrs)

# No address may sit on both sides of the split
assert set(train_addrs).isdisjoint(test_addrs)

# Row-level splits derived from the address-level assignment
in_train = master["Address"].isin(train_addrs)
in_test  = master["Address"].isin(test_addrs)
train_df = master.loc[in_train].reset_index(drop=True)
test_df  = master.loc[in_test].reset_index(drop=True)

# Save for reproducibility
train_df.to_csv("train_node2vec.csv", index=False)
test_df.to_csv("test_node2vec.csv", index=False)

print("Train/Test split saved.")
print("Train rows:", len(train_df), "| Test rows:", len(test_df))


In [None]:
# ================================================================
# STEP 5: Preprocessing + Model Training (Leak-Free)
# ================================================================
import pandas as pd, numpy as np, gc, joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Address-disjoint splits written by the split step
train_df = pd.read_csv("train_node2vec.csv")
test_df  = pd.read_csv("test_node2vec.csv")

# Separate target / identifier from the predictors
drop_cols = ["FLAG", "Address"]
y_train, y_test = train_df["FLAG"].values, test_df["FLAG"].values
X_train_raw = train_df.drop(columns=drop_cols)
X_test_raw  = test_df.drop(columns=drop_cols)

# Column roles are decided on the training frame only
cat_cols = X_train_raw.select_dtypes("object").columns.tolist()
num_cols = X_train_raw.select_dtypes("number").columns.tolist()

# Remaining numeric gaps become 0 in both splits
for frame in (X_train_raw, X_test_raw):
    frame[num_cols] = frame[num_cols].fillna(0)

# Scaler ranges and one-hot vocabulary are learned from the training rows only
preprocess = ColumnTransformer([
    ("num", MinMaxScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols)
]).fit(X_train_raw)
joblib.dump(preprocess, "preprocess_node2vec.joblib")
print("Fitted preprocess on train only.")


def transform_dense(X):
    """Apply the fitted `preprocess` to X and return a dense array with NaN/±inf replaced by 0."""
    dense = preprocess.transform(X)
    dense = dense.toarray() if hasattr(dense, "toarray") else dense
    return np.nan_to_num(dense, nan=0.0, posinf=0.0, neginf=0.0)


X_train = transform_dense(X_train_raw)
X_test  = transform_dense(X_test_raw)

# Oversample the minority class in the training split only
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)
print("Train balanced:", X_train.shape, "| % illicit:", y_train.mean())

# Class ratio of the (already resampled) training labels, used by XGBoost
n_negative = (y_train==0).sum()
n_positive = (y_train==1).sum()

models = {
    "RF": RandomForestClassifier(
        n_estimators=400, class_weight="balanced",
        n_jobs=-1, random_state=42),

    "XGB": XGBClassifier(
        n_estimators=600, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        scale_pos_weight=n_negative/n_positive,
        eval_metric="auc", n_jobs=-1, random_state=42),

    "MLP": MLPClassifier(
        hidden_layer_sizes=(256,128), max_iter=40, random_state=42)
}

# Fit each model, score it on the held-out split, persist model + metrics row
results = []

for name, clf in models.items():
    print(f"\n=== Training {name} ===")
    clf.fit(X_train, y_train)

    prob  = clf.predict_proba(X_test)[:,1]     # P(illicit)
    preds = (prob >= 0.5).astype(int)          # hard labels at the 0.5 threshold

    roc = roc_auc_score(y_test, prob)
    pr  = average_precision_score(y_test, prob)

    print(f"ROC-AUC: {roc:.3f} | PR-AUC: {pr:.3f}")
    print(classification_report(y_test, preds, digits=3))

    cm = confusion_matrix(y_test, preds)
    results.append({
        "model": name, "roc": roc, "pr": pr,
        "TP": cm[1,1], "FP": cm[0,1],
        "FN": cm[1,0], "TN": cm[0,0],
    })

    joblib.dump(clf, f"clf_node2vec_{name.lower()}.joblib")
    gc.collect()

# Save metrics
pd.DataFrame(results).to_csv("node2vec_metrics.csv", index=False)
print("Saved node2vec_metrics.csv.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import numpy as np
import joblib

# Reload test data
test_df = pd.read_csv("test_node2vec.csv")
y_test = test_df["FLAG"].values

# Reload the transformer that was fitted on the training split
preprocess = joblib.load("preprocess_node2vec.joblib")
X_test_raw = test_df.drop(columns=["FLAG", "Address"])

# Mirror the training-time preparation: numeric gaps are filled with 0
# before the transformer is applied, so the reloaded models see inputs
# prepared the same way as when they were fitted.
numeric_cols = X_test_raw.select_dtypes("number").columns
X_test_raw[numeric_cols] = X_test_raw[numeric_cols].fillna(0)


def transform_dense(X):
    """Apply `preprocess` to X; return a dense array with NaN/±inf replaced by 0 (same as at training time)."""
    Xp = preprocess.transform(X)
    if hasattr(Xp, "toarray"):
        Xp = Xp.toarray()
    return np.nan_to_num(Xp, nan=0.0, posinf=0.0, neginf=0.0)


X_test = transform_dense(X_test_raw)

# Reload models
models = {
    "RF": joblib.load("clf_node2vec_rf.joblib"),
    "XGB": joblib.load("clf_node2vec_xgb.joblib"),
    "MLP": joblib.load("clf_node2vec_mlp.joblib")
}

# Precision / recall / F1 of the illicit class (label "1") per model
precision_scores = []
recall_scores = []
f1_scores = []

for name, model in models.items():
    preds = model.predict(X_test)
    report = classification_report(y_test, preds, output_dict=True)
    illicit = report["1"]
    precision_scores.append(illicit["precision"])
    recall_scores.append(illicit["recall"])
    f1_scores.append(illicit["f1-score"])

# Stacked bar plot: the three scores are stacked on top of each other,
# so the total bar height is their sum (maximum 3), not a single metric.
labels = list(models.keys())
x = np.arange(len(labels))
bar_width = 0.6

plt.figure(figsize=(8, 6))
plt.bar(x, precision_scores, width=bar_width, label='Precision', color='#1f77b4')
plt.bar(x, recall_scores, width=bar_width, bottom=precision_scores, label='Recall', color='#ff7f0e')
bottoms = np.array(precision_scores) + np.array(recall_scores)
plt.bar(x, f1_scores, width=bar_width, bottom=bottoms, label='F1-Score', color='#2ca02c')

plt.xticks(x, labels)
plt.ylim(0, 2.5)
plt.ylabel("Score (stacked)")
plt.title("Precision, Recall, and F1 per Model (Illicit Class)")
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve

# Uses `models`, `X_test` and `y_test` that are already in the kernel namespace.
# Score the test set once per model; both panels reuse these probabilities.
illicit_prob = {
    label: estimator.predict_proba(X_test)[:, 1]
    for label, estimator in models.items()
}

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

# Panel 1: ROC
for label, scores in illicit_prob.items():
    fpr, tpr, _ = roc_curve(y_test, scores)
    ax_roc.plot(fpr, tpr, label=label)
ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.4)   # chance diagonal
ax_roc.set_title("ROC Curves – Hybrid Feature Models")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.legend()
ax_roc.grid(alpha=0.3)

# Panel 2: Precision-Recall
for label, scores in illicit_prob.items():
    prec, rec, _ = precision_recall_curve(y_test, scores)
    ax_pr.plot(rec, prec, label=label)
ax_pr.set_title("Precision-Recall Curves – Hybrid Feature Models")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.legend()
ax_pr.grid(alpha=0.3)

fig.suptitle("Hybrid Features (Transactional + Node2Vec): Model Comparison", fontsize=14)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
# ================================================================
# Node2Vec Embeddings Only (RF, XGB, MLP)
# ================================================================
import pandas as pd, numpy as np, joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE

# Address-disjoint splits
train_df = pd.read_csv("train_node2vec.csv")
test_df  = pd.read_csv("test_node2vec.csv")

# Only the n2v_* columns are used as predictors in this experiment
embed_cols = [col for col in train_df.columns if col.startswith("n2v_")]
X_train_raw = train_df[embed_cols]
X_test_raw  = test_df[embed_cols]
y_train = train_df["FLAG"].values
y_test  = test_df["FLAG"].values

# Min-max ranges are learned on the training rows and applied to both splits
scaler  = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test  = scaler.transform(X_test_raw)

# Oversample the minority class in the training split only
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Class ratio of the resampled training labels, used by XGBoost
n_negative = (y_train==0).sum()
n_positive = (y_train==1).sum()

models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=400, class_weight="balanced", n_jobs=-1, random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=600, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        scale_pos_weight=n_negative/n_positive,
        eval_metric="auc", n_jobs=-1, random_state=42),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(256,128), max_iter=40, random_state=42),
}

# Fit, score on the held-out split, persist each model and its metrics row
results = []
for name, clf in models.items():
    print(f"\n=== {name} (Embeddings Only) ===")
    clf.fit(X_train, y_train)

    prob  = clf.predict_proba(X_test)[:,1]     # P(illicit)
    preds = (prob >= 0.5).astype(int)          # hard labels at the 0.5 threshold

    roc = roc_auc_score(y_test, prob)
    pr  = average_precision_score(y_test, prob)
    print(f"ROC-AUC: {roc:.3f} | PR-AUC: {pr:.3f}")
    print(classification_report(y_test, preds))

    cm = confusion_matrix(y_test, preds)
    results.append({
        "model": name, "roc": roc, "pr": pr,
        "TP": cm[1,1], "FP": cm[0,1],
        "FN": cm[1,0], "TN": cm[0,0],
    })

    joblib.dump(clf, f"clf_embeddings_{name.lower()}.joblib")

# Save metrics
pd.DataFrame(results).to_csv("embeddings_only_metrics.csv", index=False)
print("\nResults saved to embeddings_only_metrics.csv")


In [None]:
# ================================================================
# Evaluate Saved Models (Node2Vec Embeddings Only)
# ================================================================
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, joblib
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.preprocessing import MinMaxScaler

# Load test set and isolate embeddings
test_df = pd.read_csv("test_node2vec.csv")
embed_cols = [c for c in test_df.columns if c.startswith("n2v_")]
X_test_raw, y_test = test_df[embed_cols], test_df["FLAG"].values

# Scale features.
# The saved models were trained on embeddings scaled with min/max ranges
# learned from the TRAINING split, so the scaler is rebuilt from the training
# embeddings here. Fitting it on the test set would feed the models
# differently scaled inputs and would use test statistics for preprocessing.
train_embed = pd.read_csv("train_node2vec.csv", usecols=embed_cols)
scaler = MinMaxScaler().fit(train_embed[embed_cols])
X_test = scaler.transform(X_test_raw)

# Reload trained models
models = {
    "RandomForest": joblib.load("clf_embeddings_randomforest.joblib"),
    "XGBoost": joblib.load("clf_embeddings_xgboost.joblib"),
    "MLP": joblib.load("clf_embeddings_mlp.joblib")
}

# Confusion matrices (rows = true class, columns = predicted class).
# labels=[0, 1] keeps every matrix 2x2 so it always matches the tick labels.
plt.figure(figsize=(15, 4))
labels = ["Legitimate", "Illicit"]
for i, (name, model) in enumerate(models.items(), start=1):
    preds = model.predict(X_test)
    cm = confusion_matrix(y_test, preds, labels=[0, 1])
    plt.subplot(1, 3, i)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Reds",
                xticklabels=labels, yticklabels=labels)
    plt.title(f"{name}")
    plt.xlabel("Predicted")
    plt.ylabel("True")
plt.suptitle("Confusion Matrices – Embeddings Only", fontsize=14)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# Precision–Recall curves
plt.figure(figsize=(7.5, 6))
for name, model in models.items():
    prob = model.predict_proba(X_test)[:, 1]
    prec, rec, _ = precision_recall_curve(y_test, prob)
    plt.plot(rec, prec, label=name)
plt.title("PR Curves – Embeddings Only")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# ================================================================
# Randomized Labels Test (RF, XGB, MLP)
# ================================================================
# Sanity check: the models are re-trained on training labels that have been
# randomly permuted, then scored against the true test labels.
# Uses X_train / y_train / X_test / y_test that are already in the kernel
# namespace from the preceding experiment cells.
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np

# Shuffle labels with a dedicated, seeded generator so that the permutation
# (and therefore every number printed below) is identical on each run.
# permutation() returns a shuffled copy; y_train itself is left untouched.
shuffle_rng = np.random.RandomState(42)
y_train_random = shuffle_rng.permutation(y_train)

# Define models (same hyper-parameters as the real experiments).
# Permuting does not change the class counts, so the ratio for
# scale_pos_weight is taken from the labels that are actually fitted.
models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=400, class_weight="balanced", n_jobs=-1, random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=600, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        scale_pos_weight=(y_train_random==0).sum()/(y_train_random==1).sum(),
        eval_metric="auc", n_jobs=-1, random_state=42),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(256,128), max_iter=40, random_state=42)
}

# Train + evaluate
for name, clf in models.items():
    print(f"\n=== {name} (Randomized Labels) ===")
    clf.fit(X_train, y_train_random)
    prob  = clf.predict_proba(X_test)[:,1]
    preds = (prob >= 0.5).astype(int)

    roc, pr = roc_auc_score(y_test, prob), average_precision_score(y_test, prob)
    print(f"ROC-AUC: {roc:.3f} | PR-AUC: {pr:.3f}\n")
    print(classification_report(y_test, preds, digits=3))


In [None]:
# ================================================================
# ROC Curves – Randomized Labels
# ================================================================
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Uses `models`, `X_test` and `y_test` that are already in the kernel namespace.
fig, ax = plt.subplots(figsize=(7, 6))

# One ROC curve per fitted model
for name, clf in models.items():
    prob = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, prob)
    ax.plot(fpr, tpr, label=name)

# Chance diagonal for reference
ax.plot([0, 1], [0, 1], '--', color='gray', label="Random Guess")

# Labels, legend, grid
ax.set_title("ROC Curves – Randomized Labels")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()


In [None]:
# ================================================================
# Count Unique Addresses per Source
# ================================================================
import pandas as pd

# Cleaned tables of the three sources that act as "clients"
kaggle_df = pd.read_csv("clean_Kaggle.csv")
farr_df   = pd.read_csv("clean_Farrugia.csv")
forta_df  = pd.read_csv("clean_Forta.csv")

# Distinct lowercase addresses per source
kaggle_addrs, farr_addrs, forta_addrs = (
    set(frame["Address"].str.lower())
    for frame in (kaggle_df, farr_df, forta_df)
)

# Report the size of each address set
print("Unique addresses per client:")
for label, addr_set in (("Kaggle:", kaggle_addrs),
                        ("Farrugia:", farr_addrs),
                        ("Forta:", forta_addrs)):
    print(label, len(addr_set))


In [None]:
# ================================================================
# Edge Counts per Client
# ================================================================
# Edge list with both endpoints normalised to lowercase
edges = pd.read_csv("edge_list.csv")
edges = edges.assign(
    src=edges["src"].str.lower(),
    dst=edges["dst"].str.lower(),
)

# A client's sub-graph consists of the edges whose SOURCE node belongs to
# that client's address set (the destination may be any address).
edges_kaggle, edges_farr, edges_forta = (
    edges[edges["src"].isin(client_addrs)]
    for client_addrs in (kaggle_addrs, farr_addrs, forta_addrs)
)

# Report the number of edge rows per client
print("Edges per client:")
for label, client_edges in (("Kaggle:", edges_kaggle),
                            ("Farrugia:", edges_farr),
                            ("Forta:", edges_forta)):
    print(label, len(client_edges))


In [None]:
# ================================================================
# Train Node2Vec Embeddings per Client
# ================================================================
from node2vec import Node2Vec
import networkx as nx

def train_node2vec(edge_df, name, dimensions=64, walk_length=10,
                   num_walks=30, workers=2, seed=42):
    """
    Train Node2Vec on one client's edge list and save the embeddings.

    Parameters
    ----------
    edge_df : pandas.DataFrame
        Edge list with `src` and `dst` columns.
    name : str
        Client identifier; used as column prefix (`{name}_emb_{i}`) and in
        the output file name (`embed_{name}.csv`).
    dimensions, walk_length, num_walks, workers : int, optional
        Passed through to `Node2Vec`. Defaults match the values this
        notebook has always used for the per-client runs.
    seed : int | None, optional
        Seed handed to `Node2Vec` for the random walks (same value as the
        global Node2Vec run in this notebook). Pass None for unseeded walks.
        NOTE(review): with workers > 1 bit-identical results are presumably
        still not guaranteed — confirm if exact reproducibility matters.

    Returns
    -------
    pandas.DataFrame
        One row per graph node: `Address` plus `dimensions` embedding
        columns. Empty (but with the full set of columns) when `edge_df`
        has no rows.
    """
    print(f"\n=== Training Node2Vec for {name} ===")

    cols = ["Address"] + [f"{name}_emb_{i}" for i in range(dimensions)]

    if edge_df.empty:
        # No edges -> no nodes -> nothing to walk over. Emit an empty table
        # with the expected schema so the downstream outer-join still works.
        print(f"No edges for {name}; writing an empty embedding table.")
        embed_df = pd.DataFrame(columns=cols).astype(
            {col: "float64" for col in cols[1:]}
        )
    else:
        # Build directed graph
        G = nx.from_pandas_edgelist(edge_df, "src", "dst", create_using=nx.DiGraph())

        # Generate the random walks, then fit the skip-gram model on them
        walker = Node2Vec(G, dimensions=dimensions, walk_length=walk_length,
                          num_walks=num_walks, workers=workers, seed=seed)
        model = walker.fit()

        # Collect embeddings; vectors are looked up by the node's string form
        embeddings = [[node] + model.wv.get_vector(str(node)).tolist()
                      for node in G.nodes()]
        embed_df = pd.DataFrame(embeddings, columns=cols)

    # Save to CSV
    embed_df.to_csv(f"embed_{name}.csv", index=False)
    print(f"Saved embed_{name}.csv: shape {embed_df.shape}")

    return embed_df

# One Node2Vec run per client, executed in the order Kaggle -> Farrugia -> Forta
embed_kaggle, embed_farr, embed_forta = (
    train_node2vec(client_edges, client_name)
    for client_edges, client_name in (
        (edges_kaggle, "kaggle"),
        (edges_farr, "farrugia"),
        (edges_forta, "forta"),
    )
)


In [None]:
# ================================================================
# Merge Per-Client Embeddings into Federated Matrix
# ================================================================
# Outer joins keep every address that occurs in at least one client graph
merged = (
    embed_kaggle
      .merge(embed_farr, how="outer", on="Address")
      .merge(embed_forta, how="outer", on="Address")
)

# An address unknown to a client gets an all-zero vector for that client
embed_cols = [name for name in merged.columns if "_emb_" in name]
merged[embed_cols] = merged[embed_cols].fillna(0)

# Save federated embeddings
print("Final merged embedding shape:", merged.shape)
merged.to_csv("federated_node2vec_embeddings.csv", index=False)
print("Saved federated_node2vec_embeddings.csv")

In [None]:
# ================================================================
# Add Federated Embeddings to Master Transactional Dataset
# ================================================================
import pandas as pd

# Base master table and the merged per-client embedding table
master = pd.read_csv("master_dataset_node2vec_base.csv")
fed    = pd.read_csv("federated_node2vec_embeddings.csv")
fed    = fed.assign(Address=fed["Address"].str.lower())   # normalise join key

# Left join keeps every master row; addresses without embeddings get NaN
master = master.merge(fed, how="left", on="Address")

# Missing embedding values become 0
embed_cols = [name for name in master.columns if "_emb_" in name]
master[embed_cols] = master[embed_cols].fillna(0)

# Save updated master
print("Merged federated embeddings.")
print("New master shape:", master.shape)
master.to_csv("master_dataset_federated_node2vec.csv", index=False)
print("Saved master_dataset_federated_node2vec.csv")


In [None]:
# ================================================================
# Add Federated Embeddings to Master Transactional Dataset
# ================================================================
# NOTE(review): this cell performs the same load -> merge -> fill -> save
# sequence as the cell directly before it. It starts from the CSV files
# again, so running it a second time just rewrites the same output file;
# the cell looks redundant and is a candidate for removal.
import pandas as pd

# Reload both inputs from disk
master = pd.read_csv("master_dataset_node2vec_base.csv")
fed    = pd.read_csv("federated_node2vec_embeddings.csv")

# Lowercase the join key, then attach the embeddings to every master row
fed["Address"] = fed["Address"].str.lower()
master = pd.merge(master, fed, on="Address", how="left")

# Rows without an embedding receive zeros in all *_emb_* columns
embed_cols = [column for column in master.columns if "_emb_" in column]
master[embed_cols] = master[embed_cols].fillna(0)

# Save updated master
print("Merged federated embeddings.")
print("New master shape:", master.shape)
master.to_csv("master_dataset_federated_node2vec.csv", index=False)
print("Saved master_dataset_federated_node2vec.csv")


In [None]:
# ================================================================
# Preprocess Master Dataset (Scaling + One-Hot Encoding)
# ================================================================
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import numpy as np

# Feature frames for each split (label and address identifier removed).
# NOTE(review): `train_df` / `test_df` are not created in this cell; they must
# already exist in the kernel from an earlier cell — confirm the run order.
X_train_raw = train_df.drop(columns=["FLAG", "Address"])
X_test_raw  = test_df.drop(columns=["FLAG", "Address"])

# Split numeric/categorical (column groups are taken from the training split)
cat_cols = X_train_raw.select_dtypes("object").columns.tolist()
num_cols = X_train_raw.select_dtypes("number").columns.tolist()

# Fill NaNs in BOTH splits before scaling, so a missing value is treated the
# same way at fit time and at transform time.
X_train_raw[num_cols] = X_train_raw[num_cols].fillna(0)
X_test_raw[num_cols]  = X_test_raw[num_cols].fillna(0)

# Define transformer
pre = ColumnTransformer([
    ("num", MinMaxScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols)
])
# Fit on the training split only: fitting on the full dataset would let the
# test rows influence the scaling ranges and the one-hot vocabulary.
pre.fit(X_train_raw)

# Transform helper
def transform(X):
    """Apply the fitted preprocessor `pre` and return a dense array without NaN/inf."""
    Xt = pre.transform(X)
    if hasattr(Xt, "toarray"):
        Xt = Xt.toarray()
    return np.nan_to_num(Xt)

# Apply to train/test
Xt_train = transform(X_train_raw)
Xt_test  = transform(X_test_raw)
y_train, y_test = train_df["FLAG"].values, test_df["FLAG"].values

print("Preprocessing complete.")


In [None]:
# ================================================================
# Balance Training Set with SMOTE
# ================================================================
from imblearn.over_sampling import SMOTE

# Resample the training matrix with a seeded SMOTE sampler; the test split is untouched.
smote_sampler = SMOTE(random_state=42)
Xt_train, y_train = smote_sampler.fit_resample(Xt_train, y_train)
print("SMOTE balancing done. Train shape:", Xt_train.shape)

In [None]:
# ================================================================
# EXPERIMENT: Federated Node2Vec Embeddings + Transactional Features (RF, XGB, MLP)
# ================================================================
import pandas as pd
import numpy as np
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE

# Load the train/test splits used for this experiment
train_df = pd.read_csv("train_node2vec.csv")  # If you split with federated features, use the federated train/test
test_df  = pd.read_csv("test_node2vec.csv")

# Or, if you have separate splits, use:
# train_df = pd.read_csv("train_federated.csv")
# test_df = pd.read_csv("test_federated.csv")

# Separate the label from the features; "Address" is dropped alongside "FLAG"
drop_cols = ["FLAG", "Address"]
X_train_raw, X_test_raw = (frame.drop(columns=drop_cols) for frame in (train_df, test_df))
y_train, y_test = (frame["FLAG"].values for frame in (train_df, test_df))

# Column groups come from the training split; numeric NaNs become 0 in both splits
num_cols = X_train_raw.select_dtypes("number").columns.tolist()
cat_cols = X_train_raw.select_dtypes("object").columns.tolist()
for features in (X_train_raw, X_test_raw):
    features[num_cols] = features[num_cols].fillna(0)

# Min-max scale numerics and one-hot encode categoricals; fitted on train only
preprocess = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
    ]
)
preprocess.fit(X_train_raw)

def transform(X):
    """Apply the fitted `preprocess` pipeline and return a dense array without NaN/inf."""
    encoded = preprocess.transform(X)
    dense = encoded.toarray() if hasattr(encoded, "toarray") else encoded
    return np.nan_to_num(dense)

Xt_train, Xt_test = transform(X_train_raw), transform(X_test_raw)

# SMOTE balancing on train only
Xt_train, y_train = SMOTE(random_state=42).fit_resample(Xt_train, y_train)

# Negative/positive ratio of the (already resampled) training labels, passed to XGBoost
neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

# Candidate classifiers, keyed by the name used in logs and output files
models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=400, class_weight="balanced", n_jobs=-1, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=600, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        scale_pos_weight=neg_pos_ratio,
        eval_metric="auc", n_jobs=-1, random_state=42,
    ),
    "MLP": MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=40, random_state=42),
}

# Fit each model, score it on the held-out split, and persist the fitted estimator
results = []
for name, clf in models.items():
    print(f"\n=== {name} (Federated Node2Vec + Transactional) ===")
    clf.fit(Xt_train, y_train)

    # Positive-class probability, thresholded at 0.5 for the hard predictions
    prob = clf.predict_proba(Xt_test)[:, 1]
    preds = (prob >= 0.5).astype(int)

    roc, pr = roc_auc_score(y_test, prob), average_precision_score(y_test, prob)
    print(f"ROC-AUC: {roc:.3f} | PR-AUC: {pr:.3f}")
    print(classification_report(y_test, preds))

    # Binary confusion matrix flattens row-wise to (TN, FP, FN, TP)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    results.append({"model": name, "roc": roc, "pr": pr,
                    "TP": tp, "FP": fp, "FN": fn, "TN": tn})

    joblib.dump(clf, f"clf_federated_{name.lower()}.joblib")

pd.DataFrame(results).to_csv("federated_metrics.csv", index=False)
print("\n Results saved to federated_metrics.csv")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load all metrics CSVs (ensure these files were generated in previous steps)
# and tag each table with the feature representation it was produced from.
df_txn       = pd.read_csv("baseline_metrics.csv").assign(type="Transactional")    # Transactional only
df_n2v       = pd.read_csv("embeddings_only_metrics.csv").assign(type="Node2Vec")  # Node2Vec only
df_hybrid    = pd.read_csv("node2vec_metrics.csv").assign(type="Hybrid")           # Hybrid
df_federated = pd.read_csv("federated_metrics.csv").assign(type="Federated")       # Federated

# Stack the four tables and shorten the long model names for the axis labels
combined = pd.concat([df_txn, df_n2v, df_hybrid, df_federated], ignore_index=True)
short_names = {"RandomForest": "RF", "XGBoost": "XGB"}
combined["model"] = combined["model"].replace(short_names)

# One (model x type) table per metric, ready for grouped bars
roc_data = combined.pivot(index="model", columns="type", values="roc")
pr_data  = combined.pivot(index="model", columns="type", values="pr")

# Plot: clustered bars for ROC-AUC (left) and PR-AUC (right) per model
fig, axs = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

bar_width = 0.2
models = roc_data.index.tolist()
types  = ["Transactional", "Node2Vec", "Hybrid", "Federated"]
colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728"]
x_base = np.arange(len(models))

for panel, pivoted, metric_label in zip(axs, (roc_data, pr_data), ("ROC-AUC", "PR-AUC")):
    # One bar per representation, shifted sideways within each model group
    for slot, (rep, colour) in enumerate(zip(types, colors)):
        panel.bar(x_base + slot * bar_width,
                  pivoted[rep].values,
                  width=bar_width, label=rep, color=colour)
    # Centre the tick under the four-bar group
    panel.set_xticks(x_base + bar_width * 1.5)
    panel.set_xticklabels(models)
    panel.set_ylim(0.0, 1.05)
    panel.set_ylabel(metric_label)
    panel.set_title(f"{metric_label} Comparison by Model Type")
    panel.legend()

plt.suptitle("Model Performance: Federated Node2Vec vs Other Representations", fontsize=15)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
# ================================================================
# STEP 3: Stratified K-Fold Cross-Validation (RF, XGB only)
# ================================================================
import pandas as pd, numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from imblearn.over_sampling import SMOTE

# Load federated Node2Vec dataset
df = pd.read_csv("master_dataset_federated_node2vec.csv")

drop_cols = ["FLAG", "Address"]
X_raw = df.drop(columns=drop_cols)
y = df["FLAG"].values

cat_cols = X_raw.select_dtypes("object").columns.tolist()
num_cols = X_raw.select_dtypes("number").columns.tolist()
X_raw[num_cols] = X_raw[num_cols].fillna(0)

# K-Fold (single source of truth for the number of folds)
N_SPLITS = 5
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# To store results per model: one list entry per fold, in fold order
metrics = {
    "RandomForest": {"roc": [], "pr": []},
    "XGBoost": {"roc": [], "pr": []}
}

for fold, (train_idx, test_idx) in enumerate(kf.split(X_raw, y), start=1):
    print(f"\n=== Fold {fold} ===")
    X_train, y_train = X_raw.iloc[train_idx], y[train_idx]
    X_test,  y_test  = X_raw.iloc[test_idx],  y[test_idx]

    # Preprocess per fold: the scaler/encoder only ever sees this fold's training rows
    pre = ColumnTransformer([
        ("num", MinMaxScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols)
    ])
    pre.fit(X_train)

    def transform(X):
        """Apply this fold's fitted `pre` and return a dense array without NaN/inf."""
        Xt = pre.transform(X)
        if hasattr(Xt, "toarray"):
            Xt = Xt.toarray()
        return np.nan_to_num(Xt)

    Xt_train = transform(X_train)
    Xt_test  = transform(X_test)

    # SMOTE on the training part of the fold only
    Xt_train, y_train = SMOTE(random_state=42).fit_resample(Xt_train, y_train)

    # Define models (fresh estimators for every fold)
    models = {
        "RandomForest": RandomForestClassifier(n_estimators=400, class_weight="balanced", n_jobs=-1, random_state=42),
        "XGBoost": XGBClassifier(n_estimators=600, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8,
                                 scale_pos_weight=(y_train==0).sum()/(y_train==1).sum(),
                                 eval_metric="auc", n_jobs=-1, random_state=42)
    }

    for name, clf in models.items():
        clf.fit(Xt_train, y_train)
        prob = clf.predict_proba(Xt_test)[:,1]
        roc  = roc_auc_score(y_test, prob)
        pr   = average_precision_score(y_test, prob)
        metrics[name]["roc"].append(roc)
        metrics[name]["pr"].append(pr)
        print(f"{name}: ROC-AUC: {roc:.3f} | PR-AUC: {pr:.3f}")

# Show summary for each model
print("\n=== CV Summary ===")
for name in metrics:
    print(f"{name}: Mean ROC-AUC: {np.mean(metrics[name]['roc']):.3f} | Mean PR-AUC: {np.mean(metrics[name]['pr']):.3f}")

# Long-format results table: one row per (model, fold). Row counts are derived
# from the collected scores instead of hard-coded fold/model counts, so the
# table stays consistent if N_SPLITS or the model set changes.
cv_results = pd.DataFrame({
    "Model":   [name for name in metrics for _ in metrics[name]["roc"]],
    "Fold":    [i for name in metrics for i in range(1, len(metrics[name]["roc"]) + 1)],
    "ROC-AUC": [score for name in metrics for score in metrics[name]["roc"]],
    "PR-AUC":  [score for name in metrics for score in metrics[name]["pr"]],
})

# Save to file (the table was previously built but never written)
cv_results.to_csv("cv_results.csv", index=False)
print("Saved: cv_results.csv")


In [None]:
# ================================================================
# STEP 3+4: Stratified K-Fold Cross-Validation (Selected Folds) + Plot
# ================================================================
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from imblearn.over_sampling import SMOTE

# Load federated Node2Vec dataset and separate label / identifier from the features
df = pd.read_csv("master_dataset_federated_node2vec.csv")
drop_cols = ["FLAG", "Address"]
X_raw = df.drop(columns=drop_cols)
y = df["FLAG"].values

# Column groups; numeric NaNs are replaced by 0 up front
cat_cols = X_raw.select_dtypes("object").columns.tolist()
num_cols = X_raw.select_dtypes("number").columns.tolist()
X_raw[num_cols] = X_raw[num_cols].fillna(0)

# Only these folds (1-based) of the 15-way stratified split are evaluated
target_folds = [1, 5, 10, 12]
kf = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)

# Column-oriented result store: one entry per (model, evaluated fold)
metrics = {"Model": [], "Fold": [], "ROC-AUC": [], "PR-AUC": []}

for fold, (train_idx, test_idx) in enumerate(kf.split(X_raw, y), start=1):
    # Folds outside the selection are skipped without any training
    if fold not in target_folds:
        continue

    print(f"\n=== Fold {fold} ===")
    X_train, X_test = X_raw.iloc[train_idx], X_raw.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Preprocessor fitted on this fold's training rows only
    pre = ColumnTransformer([
        ("num", MinMaxScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
    ])
    pre.fit(X_train)

    def transform(X):
        """Apply this fold's fitted `pre` and return a dense array without NaN/inf."""
        encoded = pre.transform(X)
        dense = encoded.toarray() if hasattr(encoded, "toarray") else encoded
        return np.nan_to_num(dense)

    Xt_train, Xt_test = transform(X_train), transform(X_test)
    Xt_train, y_train = SMOTE(random_state=42).fit_resample(Xt_train, y_train)

    # Negative/positive ratio of the resampled training labels, passed to XGBoost
    neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
    models = {
        "RandomForest": RandomForestClassifier(
            n_estimators=400, class_weight="balanced", n_jobs=-1, random_state=42
        ),
        "XGBoost": XGBClassifier(
            n_estimators=600, max_depth=6, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            scale_pos_weight=neg_pos_ratio,
            eval_metric="auc", n_jobs=-1, random_state=42,
        ),
    }

    for name, clf in models.items():
        clf.fit(Xt_train, y_train)
        prob = clf.predict_proba(Xt_test)[:, 1]
        roc, pr = roc_auc_score(y_test, prob), average_precision_score(y_test, prob)
        print(f"{name}: ROC-AUC: {roc:.3f} | PR-AUC: {pr:.3f}")
        for column, value in (("Model", name), ("Fold", fold), ("ROC-AUC", roc), ("PR-AUC", pr)):
            metrics[column].append(value)

# Convert to DataFrame and save
cv_df = pd.DataFrame(metrics)
cv_df.to_csv("cv_results_selected_folds.csv", index=False)
print("Saved: cv_results_selected_folds.csv")

# --------------------------------------------------
# Plot the results
# --------------------------------------------------
# Two side-by-side panels: ROC-AUC on the left, PR-AUC on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for panel, metric in zip(axes, ("ROC-AUC", "PR-AUC")):
    # Box per model, with the individual fold scores overlaid as black points
    sns.boxplot(data=cv_df, x="Model", y=metric, palette="Set2", ax=panel)
    sns.stripplot(data=cv_df, x="Model", y=metric, color='black', size=6, jitter=True, ax=panel)
    panel.set_title(f"Stratified CV (Folds 1, 5, 10, 12) – {metric}")
    panel.grid(True)

fig.tight_layout()
fig.savefig("cv_performance_summary.png", dpi=300)
print("Saved: cv_performance_summary.png")
plt.show()


In [None]:
# ================================================================
# Robustness Evaluation (Federated Node2Vec + Transactional Features)
# ================================================================
import numpy as np, pandas as pd, joblib, warnings
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Silence all library warnings for the remainder of the session
warnings.filterwarnings('ignore')

# Load the train/test splits
train_df = pd.read_csv("train_node2vec.csv")
test_df  = pd.read_csv("test_node2vec.csv")

# Column groups are inferred from train and test stacked together
drop_cols = ["FLAG", "Address"]
X_raw = pd.concat([train_df, test_df]).drop(columns=drop_cols)
cat_cols = X_raw.select_dtypes("object").columns.tolist()
num_cols = X_raw.select_dtypes("number").columns.tolist()

# Numeric NaNs become 0 in both splits (done on the source frames themselves)
for split_df in (train_df, test_df):
    split_df[num_cols] = split_df[num_cols].fillna(0)

# Min-max scaling + one-hot encoding, fitted on the training features only
train_features = train_df.drop(columns=drop_cols)
test_features  = test_df.drop(columns=drop_cols)
preprocess = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
    ]
)
preprocess.fit(train_features)

# Encoded matrices and label vectors
X_train_full = preprocess.transform(train_features)
y_train_full = train_df["FLAG"].values
X_test       = preprocess.transform(test_features)
y_test       = test_df["FLAG"].values

# Scoring helper
def score_model(model_name, clf, xt=X_test, yt=y_test):
    """
    Score a fitted classifier and return one result row.

    `xt` / `yt` default to the test matrix and labels that exist when this
    function is defined. Hard predictions threshold the positive-class
    probability at 0.5.
    """
    positive_prob = clf.predict_proba(xt)[:, 1]
    hard_preds = (positive_prob >= 0.5).astype(int)
    return {
        "model": model_name,
        "roc_auc": roc_auc_score(yt, positive_prob),
        "pr_auc": average_precision_score(yt, positive_prob),
        "mcc": matthews_corrcoef(yt, hard_preds),
        "kappa": cohen_kappa_score(yt, hard_preds),
    }

# Model builders
def get_xgb(y_train):
    """Build an unfitted XGBoost classifier weighted by the negative/positive ratio of `y_train`."""
    n_negative = (y_train == 0).sum()
    n_positive = (y_train == 1).sum()
    return XGBClassifier(
        n_estimators=600, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        scale_pos_weight=n_negative / n_positive,
        eval_metric='auc', n_jobs=-1, random_state=42,
    )

def get_rf():
    """Build an unfitted, class-weighted random forest with a fixed seed."""
    return RandomForestClassifier(
        n_estimators=400,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42,
    )

# Run experiments
results = []

# Fractions of training labels to flip in the label-noise runs
NOISE_LEVELS = [0.05, 0.10, 0.20, 0.35, 0.40, 0.45, 0.50]

# Dedicated seeded generator: the flipped indices were previously drawn from
# the unseeded global NumPy RNG, so every re-run corrupted different labels
# and produced different metrics even though all models are seeded.
noise_rng = np.random.default_rng(42)

# 1. Baseline (clean labels, SMOTE-balanced training set)
print("\n--- Training Baseline models (XGB & RF) ---")
X_train_bal, y_train_bal = SMOTE(random_state=42).fit_resample(X_train_full, y_train_full)
for name, clf_factory in [("Baseline_XGB", lambda: get_xgb(y_train_bal)),
                          ("Baseline_RF", get_rf)]:
    print(f"  Training {name} ...")
    clf = clf_factory()
    clf.fit(X_train_bal, y_train_bal)
    results.append(score_model(name, clf))

# 2. Label Noise: flip a fraction of the training labels, rebalance, retrain,
#    and always score against the untouched test labels.
for noise_pct in NOISE_LEVELS:
    pct_label = int(round(noise_pct * 100))
    print(f"\n--- Label Noise: Flipping {pct_label}% of labels ---")
    y_noisy = y_train_full.copy()
    n_flip = int(len(y_noisy) * noise_pct)
    flip_idx = noise_rng.choice(len(y_noisy), size=n_flip, replace=False)
    y_noisy[flip_idx] = 1 - y_noisy[flip_idx]   # binary labels: 0 <-> 1
    X_bal, y_bal = SMOTE(random_state=42).fit_resample(X_train_full, y_noisy)

    # Factories are invoked inside this iteration, so they see the current y_bal
    for name, clf_factory in [(f"LabelNoise{pct_label}%_XGB", lambda: get_xgb(y_bal)),
                              (f"LabelNoise{pct_label}%_RF", get_rf)]:
        print(f"  Training {name} ...")
        clf = clf_factory()
        clf.fit(X_bal, y_bal)
        results.append(score_model(name, clf))

# Save and display
robust_df = pd.DataFrame(results)
robust_df.to_csv("robustness_metrics.csv", index=False)
print("\n Robustness evaluation complete. Results saved to robustness_metrics.csv")
robust_df


In [None]:
# ================================================================
#  robustness test
#  Federated Node2Vec + transactional features
#  Models: Random-Forest  &  XGBoost
# ================================================================
import numpy as np, pandas as pd, time, warnings
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import (roc_auc_score, average_precision_score,
                             matthews_corrcoef, cohen_kappa_score)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
warnings.filterwarnings("ignore")

# Wall-clock start of the cell
t0 = time.time()

# ---------- CONFIG ------------------------------------------------
CSV_TRAIN = "train_node2vec.csv"
CSV_TEST  = "test_node2vec.csv"
# Fractions of encoded feature columns to remove in the feature-drop runs
DROP_PCTS = [0.50, 0.60, 0.70, 0.80, 0.85, 0.90, 0.95, 0.99]
RNG       = np.random.default_rng(42)
SMOTE_KW  = {"random_state": 42}

# ---------- LOAD --------------------------------------------------
print(" Loading data …")
train_df = pd.read_csv(CSV_TRAIN)
test_df  = pd.read_csv(CSV_TEST)

# Labels, then feature frames without the label and address columns
y_train, y_test = train_df["FLAG"].values, test_df["FLAG"].values
X_train_raw = train_df.drop(columns=["FLAG", "Address"])
X_test_raw  = test_df.drop(columns=["FLAG", "Address"])

# ---------- PRE-PROCESS ------------------------------------------
print(" Building pre-processor …")
num_cols = X_train_raw.select_dtypes("number").columns.tolist()
cat_cols = X_train_raw.select_dtypes("object").columns.tolist()

# Numeric NaNs become 0 in both splits before scaling
for raw_features in (X_train_raw, X_test_raw):
    raw_features[num_cols] = raw_features[num_cols].fillna(0)

# Min-max scaling + one-hot encoding, fitted on the training features only
pre = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
    ]
)
pre.fit(X_train_raw)

def transform(df):
    """Apply the fitted `pre` and return a dense array without NaN/inf."""
    encoded = pre.transform(df)
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    return np.nan_to_num(encoded)

X_train_full = transform(X_train_raw)
X_test_full  = transform(X_test_raw)
n_features   = X_train_full.shape[1]
print(f" Encoded feature space: {n_features:,d} columns")

# ---------- MODEL BUILDERS ---------------------------------------
def build_rf():
    """Return an unfitted, class-weighted random forest with a fixed seed."""
    return RandomForestClassifier(
        n_estimators=400, class_weight="balanced", n_jobs=-1, random_state=42
    )

def build_xgb(y):
    """Return an unfitted XGBoost classifier weighted by the negative/positive ratio of `y`."""
    neg_pos_ratio = (y == 0).sum() / (y == 1).sum()
    return XGBClassifier(
        n_estimators=600,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=neg_pos_ratio,
        eval_metric="auc",
        n_jobs=-1,
        random_state=42,
        verbosity=0,
    )

# ---------- METRIC HELPER ----------------------------------------
def evaluate(tag, clf, Xte, yte):
    """
    Score a fitted classifier on (Xte, yte) and return one result row.

    Ranking metrics use the positive-class probability; MCC and kappa use
    hard predictions thresholded at 0.5.
    """
    positive_prob = clf.predict_proba(Xte)[:, 1]
    hard_pred = (positive_prob >= 0.5).astype(int)
    return {
        "tag": tag,
        "roc_auc": roc_auc_score(yte, positive_prob),
        "pr_auc": average_precision_score(yte, positive_prob),
        "mcc": matthews_corrcoef(yte, hard_pred),
        "kappa": cohen_kappa_score(yte, hard_pred),
    }

# ---------- RUN EXPERIMENT ---------------------------------------
records = []

## A. Baseline -----------------------------------------------------
# All encoded columns, SMOTE-balanced training set
print("\n Training baselines (RF & XGB) …")
X_bal, y_bal = SMOTE(**SMOTE_KW).fit_resample(X_train_full, y_train)

rf_base  = build_rf();  rf_base.fit(X_bal, y_bal)
xgb_base = build_xgb(y_bal); xgb_base.fit(X_bal, y_bal)

records.append(evaluate("Baseline_RF",  rf_base,  X_test_full, y_test))
records.append(evaluate("Baseline_XGB", xgb_base, X_test_full, y_test))

## B. Nested feature-drop loop ------------------------------------
# One fixed random column order: each higher drop level keeps a prefix of the
# same permutation, so the kept sets are nested subsets of one another.
print("\n Creating a single random column order for nested masks …")
perm_idx = RNG.permutation(n_features)     # fixed ordering

for pct in DROP_PCTS:                      # fraction of columns to drop
    keep_frac = 1.0 - pct                  # fraction of columns to keep
    # Floor with a small epsilon: 1.0 - 0.9 is 0.0999…98 in floating point, so
    # a bare int() would keep 9 of 100 columns instead of 10. Always keep at
    # least one column so SMOTE and the models never receive an empty matrix.
    k = max(1, int(n_features * keep_frac + 1e-9))
    keep_idx = perm_idx[:k]                # nested subset
    pct_label = int(round(pct * 100))      # integer percent used in tags/logs

    print(f"\n Feature-drop {pct_label} %  "
          f"(keeping {k}/{n_features} columns ≈ {keep_frac:0.0%})")

    # Same column subset for train and test
    Xtr = X_train_full[:, keep_idx]
    Xte = X_test_full[:,  keep_idx]
    X_bal, y_bal = SMOTE(**SMOTE_KW).fit_resample(Xtr, y_train)

    rf  = build_rf();         rf.fit(X_bal, y_bal)
    xgb = build_xgb(y_bal);   xgb.fit(X_bal, y_bal)

    records.append(evaluate(f"FeatDrop{pct_label}%_RF",  rf,  Xte, y_test))
    records.append(evaluate(f"FeatDrop{pct_label}%_XGB", xgb, Xte, y_test))

# ---------- RESULTS ----------------------------------------------
raw_df = pd.DataFrame(records)

pd.set_option("display.float_format", lambda x: f"{x:0.5f}")
print("\n================  Robustness Results  ================")
print(raw_df.loc[:, ["tag", "roc_auc", "pr_auc", "mcc", "kappa"]]
          .to_string(index=False))
print("======================================================")

raw_df.to_csv("fed_feature_drop_quick.csv", index=False)
print(f"\n Results saved to fed_feature_drop_quick.csv")


In [None]:
# ================================================================
# Robustness Plots: Label Noise & Feature Dropout
# ================================================================
import pandas as pd, matplotlib.pyplot as plt

# Load the result tables written by the two robustness experiments
label_noise_df  = pd.read_csv("robustness_metrics.csv")
feature_drop_df = pd.read_csv("fed_feature_drop_quick.csv")

# Parse the percentage out of each run name; rows whose name does not match
# the pattern get NaN and are removed.
label_noise_df["noise_pct"] = (
    label_noise_df["model"].str.extract(r"LabelNoise(\d+)%", expand=False).astype(float)
)
label_noise_df = label_noise_df[label_noise_df["noise_pct"].notna()]
feature_drop_df["drop_pct"] = (
    feature_drop_df["tag"].str.extract(r"FeatDrop(\d+)%", expand=False).astype(float)
)
feature_drop_df = feature_drop_df[feature_drop_df["drop_pct"].notna()]

# Setup subplots: label noise on the left, feature dropout on the right
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# (axes, table, run-name column, x column, title, x label) for each panel
panel_specs = [
    (ax[0], label_noise_df,  "model", "noise_pct", "Label Noise vs ROC/PR-AUC",     "Label Noise (%)"),
    (ax[1], feature_drop_df, "tag",   "drop_pct",  "Feature Dropout vs ROC/PR-AUC", "Feature Drop (%)"),
]

for panel, table, name_col, x_col, title, x_label in panel_specs:
    # Two curves (ROC-AUC, PR-AUC) per model family
    for model in ["XGB", "RF"]:
        subset = table[table[name_col].str.contains(model)]
        panel.plot(subset[x_col], subset["roc_auc"], marker='o', label=f"{model} ROC-AUC")
        panel.plot(subset[x_col], subset["pr_auc"], marker='s', label=f"{model} PR-AUC")
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel("Score")
    panel.set_ylim(0, 1.05)
    panel.legend()
    panel.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# ================================================================
# SHAP Explainability – Random Forest on Node2Vec + Tx Features
# ================================================================
import pandas as pd, numpy as np, shap, matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# 1. Load train/test
train_df, test_df = pd.read_csv("train_node2vec.csv"), pd.read_csv("test_node2vec.csv")
drop_cols = ["FLAG","Address"]
X_train_raw, y_train = train_df.drop(columns=drop_cols), train_df["FLAG"].astype(int).values
X_test_raw,  y_test  = test_df.drop(columns=drop_cols),  test_df["FLAG"].astype(int).values

# 2. Preprocess (scale numeric, one-hot categorical)
num_cols = X_train_raw.select_dtypes("number").columns.tolist()
cat_cols = X_train_raw.select_dtypes("object").columns.tolist()
# Numeric NaNs become 0 before scaling, matching the training/evaluation cells
X_train_raw[num_cols] = X_train_raw[num_cols].fillna(0)
X_test_raw[num_cols]  = X_test_raw[num_cols].fillna(0)
pre = ColumnTransformer([
    ("num", MinMaxScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
]).fit(X_train_raw)

def transform(X):
    """Apply the fitted `pre` and return a dense array without NaN/inf.

    NaN cleanup is applied whether or not the encoder output was sparse
    (it used to be skipped on the sparse branch).
    """
    Xt = pre.transform(X)
    if hasattr(Xt, "toarray"):
        Xt = Xt.toarray()
    return np.nan_to_num(Xt)

Xt_train, Xt_test = transform(X_train_raw), transform(X_test_raw)
# Column names in encoder output order: numeric block first, then one-hot columns
feature_names = list(num_cols) + (list(pre.named_transformers_["cat"].get_feature_names_out(cat_cols)) if cat_cols else [])
print("Xt_train:", Xt_train.shape, "| Xt_test:", Xt_test.shape)

# 3. Train Random Forest
rf = RandomForestClassifier(n_estimators=400, class_weight="balanced", n_jobs=-1, random_state=42)
rf.fit(Xt_train, y_train)

# 4. Global importances
fi = pd.Series(rf.feature_importances_, index=feature_names).sort_values(ascending=False)
print("\nTop 20 Features:\n", fi.head(20))

# 5. SHAP values
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(Xt_test)

# Handle SHAP output formats — always select the POSITIVE class (index 1) so the
# attributions agree with `explainer.expected_value[1]` used in the waterfalls.
if isinstance(shap_values, list):
    # Older SHAP: one (n_samples, n_features) array per class. Taking element 0
    # here would explain class 0 against the class-1 base value.
    sv = shap_values[1]
elif len(shap_values.shape) == 3 and shap_values.shape[2] == 2:
    # Newer SHAP: a single (n_samples, n_features, n_classes) array
    sv = shap_values[..., 1]
else:
    sv = shap_values
print("SHAP values shape:", sv.shape)

# 6. SHAP summary plot (global)
shap.summary_plot(sv, Xt_test, feature_names=feature_names, max_display=20, show=False)
plt.tight_layout()
plt.savefig("shap_summary_plot.png")
plt.close()
print("Saved shap_summary_plot.png")

# 7. Predictions + error analysis
prob = rf.predict_proba(Xt_test)[:,1]
preds = (prob >= 0.5).astype(int)
mis_idx = np.where(preds != y_test)[0]
tp_idx  = np.where((preds == 1) & (y_test == 1))[0]

# 8. SHAP local case studies (first misclassified row, first true positive)
case_studies = [
    (mis_idx, "SHAP Waterfall – Misclassified Example",  "shap_waterfall_misclassified.png"),
    (tp_idx,  "SHAP Waterfall – True Positive Example",  "shap_waterfall_truepositive.png"),
]
for candidates, title, out_png in case_studies:
    if len(candidates) == 0:
        continue
    idx = candidates[0]
    # show=False keeps the figure open: with the default show=True the plot is
    # displayed and closed inside the call, so the title/savefig below would
    # act on a fresh empty figure and write a blank PNG.
    shap.plots._waterfall.waterfall_legacy(explainer.expected_value[1], sv[idx],
                                           feature_names=feature_names, show=False)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_png)
    plt.close()

print("SHAP summary + case study plots saved.")


In [None]:
# ================================================================
# SHAP Explainability – XGBoost on Node2Vec + Tx Features
# ================================================================
import pandas as pd, numpy as np, shap, matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier

# 1. Load train/test
train_df, test_df = pd.read_csv("train_node2vec.csv"), pd.read_csv("test_node2vec.csv")
drop_cols = ["FLAG","Address"]
X_train_raw, y_train = train_df.drop(columns=drop_cols), train_df["FLAG"].astype(int).values
X_test_raw,  y_test  = test_df.drop(columns=drop_cols),  test_df["FLAG"].astype(int).values

# 2. Preprocess
num_cols = X_train_raw.select_dtypes("number").columns.tolist()
cat_cols = X_train_raw.select_dtypes("object").columns.tolist()
# Numeric NaNs become 0 before scaling, matching the training/evaluation cells
X_train_raw[num_cols] = X_train_raw[num_cols].fillna(0)
X_test_raw[num_cols]  = X_test_raw[num_cols].fillna(0)
pre = ColumnTransformer([
    ("num", MinMaxScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
]).fit(X_train_raw)

def transform(X):
    """Apply the fitted `pre` and return a dense array without NaN/inf.

    NaN cleanup is applied whether or not the encoder output was sparse
    (it used to be skipped on the sparse branch).
    """
    Xt = pre.transform(X)
    if hasattr(Xt, "toarray"):
        Xt = Xt.toarray()
    return np.nan_to_num(Xt)

Xt_train, Xt_test = transform(X_train_raw), transform(X_test_raw)
# Column names in encoder output order: numeric block first, then one-hot columns
feature_names = list(num_cols) + (list(pre.named_transformers_["cat"].get_feature_names_out(cat_cols)) if cat_cols else [])
print("Xt_train:", Xt_train.shape, "| Xt_test:", Xt_test.shape)

# 3. Train XGBoost (positive class weighted by the negative/positive label ratio)
xgb = XGBClassifier(
    n_estimators=600, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    scale_pos_weight=(y_train==0).sum()/(y_train==1).sum(),
    eval_metric="auc", n_jobs=-1, random_state=42, use_label_encoder=False
)
xgb.fit(Xt_train, y_train)

# 4. Global importances
fi = pd.Series(xgb.feature_importances_, index=feature_names).sort_values(ascending=False)
print("\nTop 20 Features:\n", fi.head(20))

# 5. SHAP values
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(Xt_test)

# Normalise the SHAP output to one (n_samples, n_features) array for the
# positive class, whichever container format this SHAP version returns.
if isinstance(shap_values, list):
    sv = shap_values[1]          # per-class list: index 1 is the positive class
elif len(shap_values.shape)==3 and shap_values.shape[2]==2:
    sv = shap_values[...,1]      # (n_samples, n_features, n_classes)
else:
    sv = shap_values
print("SHAP values shape:", sv.shape)

# 6. SHAP summary plot
shap.summary_plot(sv, Xt_test, feature_names=feature_names, max_display=20, show=False)
plt.tight_layout(); plt.savefig("shap_summary_plot_xgb.png"); plt.close()
print("Saved shap_summary_plot_xgb.png")

# 7. Predictions + error analysis
prob = xgb.predict_proba(Xt_test)[:,1]
preds = (prob >= 0.5).astype(int)
mis_idx = np.where(preds != y_test)[0]
tp_idx  = np.where((preds==1) & (y_test==1))[0]

# 8. SHAP local case studies (first misclassified row, first true positive)
case_studies = [
    (mis_idx, "SHAP Waterfall – Misclassified (XGB)",  "shap_waterfall_misclassified_xgb.png"),
    (tp_idx,  "SHAP Waterfall – True Positive (XGB)",  "shap_waterfall_truepositive_xgb.png"),
]
for candidates, title, out_png in case_studies:
    if len(candidates) == 0:
        continue
    idx = candidates[0]
    # show=False keeps the figure open: with the default show=True the plot is
    # displayed and closed inside the call, so the title/savefig below would
    # act on a fresh empty figure and write a blank PNG.
    shap.plots._waterfall.waterfall_legacy(explainer.expected_value, sv[idx],
                                           feature_names=feature_names, show=False)
    plt.title(title)
    plt.tight_layout(); plt.savefig(out_png); plt.close()

print("SHAP summary + case study plots saved (XGBoost).")


In [None]:
# ================================================================
# SHAP Explainability – MLP on Node2Vec + Tx Features
# ================================================================
import pandas as pd, numpy as np, shap, matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.neural_network import MLPClassifier

# 1. Load train/test
train_df, test_df = pd.read_csv("train_node2vec.csv"), pd.read_csv("test_node2vec.csv")
drop_cols = ["FLAG","Address"]
X_train_raw, y_train = train_df.drop(columns=drop_cols), train_df["FLAG"].astype(int).values
X_test_raw,  y_test  = test_df.drop(columns=drop_cols),  test_df["FLAG"].astype(int).values

# 2. Preprocess
num_cols = X_train_raw.select_dtypes("number").columns.tolist()
cat_cols = X_train_raw.select_dtypes("object").columns.tolist()
pre = ColumnTransformer([
    ("num", MinMaxScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
]).fit(X_train_raw)

def transform(X):
    """Run X through the fitted preprocessor `pre` and return a dense, NaN-free array.

    Parameters
    ----------
    X : object accepted by ``pre.transform``
        Raw feature table to preprocess.

    Returns
    -------
    numpy.ndarray
        The transformed matrix, densified when the preprocessor returns an
        object exposing ``toarray()``, and passed through ``np.nan_to_num``.
    """
    Xt = pre.transform(X)
    dense = Xt.toarray() if hasattr(Xt, "toarray") else Xt
    # Sanitise on both paths: a result that arrives via toarray() can carry
    # NaNs just as a dense one can, and they must not reach the estimator.
    return np.nan_to_num(dense)

# Dense model inputs for both splits.
Xt_train = transform(X_train_raw)
Xt_test  = transform(X_test_raw)

# Output column labels: numeric names first, then the one-hot names.
# The "cat" transformer is only queried when there are categorical columns.
feature_names = list(num_cols)
if cat_cols:
    feature_names.extend(pre.named_transformers_["cat"].get_feature_names_out(cat_cols))
print("Xt_train:", Xt_train.shape, "| Xt_test:", Xt_test.shape)

# 3. Train MLP
mlp = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    max_iter=40,
    random_state=42,
)
mlp.fit(Xt_train, y_train)
print(f"MLP train acc: {mlp.score(Xt_train,y_train):.3f}, test acc: {mlp.score(Xt_test,y_test):.3f}")

# 4. SHAP (KernelExplainer with sampled background + test set)
# X_bg is the background set handed to the explainer; X_exp is the subset of test
# rows explained for the global summary plot. Both are capped at 100 rows.
X_bg  = shap.sample(Xt_train, 100) if Xt_train.shape[0]>100 else Xt_train
X_exp = shap.sample(Xt_test, 100)  if Xt_test.shape[0]>100  else Xt_test
explainer = shap.KernelExplainer(mlp.predict_proba, X_bg)
shap_values = explainer.shap_values(X_exp, nsamples=100)


def _positive_class_shap(values):
    """Return the (n_rows, n_features) SHAP matrix for class 1.

    Handles the two layouts this cell may receive from ``shap_values``: a list
    with one array per class, or a single array whose last axis has length 2.
    Anything else is returned unchanged.
    """
    if isinstance(values, list):
        # One array per class: take class 1 so the values match expected_value[1].
        return np.asarray(values[1] if len(values) > 1 else values[0])
    if len(values.shape) == 3 and values.shape[2] == 2:
        return values[..., 1]
    return values


# Pick correct SHAP array (positive class)
sv = _positive_class_shap(shap_values)
print("SHAP values shape:", sv.shape)

# 5. SHAP summary plot
shap.summary_plot(sv, X_exp, feature_names=feature_names, max_display=20, show=False)
plt.tight_layout(); plt.savefig("shap_summary_plot_mlp.png"); plt.close()
print("Saved shap_summary_plot_mlp.png")

# 6. Predictions + error analysis
# Hard labels come from thresholding the positive-class probability at 0.5.
prob  = mlp.predict_proba(Xt_test)[:, 1]
preds = (prob >= 0.5).astype(int)
mis_idx = np.where(preds != y_test)[0]                 # positions in Xt_test: prediction != label
tp_idx  = np.where((preds == 1) & (y_test == 1))[0]    # positions in Xt_test: predicted 1, label 1

# 7. SHAP local case studies
# mis_idx / tp_idx are positions in Xt_test, whereas `sv` only covers the sampled
# rows in X_exp, so sv[idx] would describe an unrelated row. Each case study
# therefore gets SHAP values computed for the selected test row itself.
# show=False keeps the figure open so the title and savefig apply to the actual
# plot (waterfall_legacy otherwise calls plt.show() on its own).
case_studies = [
    (mis_idx, "SHAP Waterfall – Misclassified (MLP)", "shap_waterfall_misclassified_mlp.png"),
    (tp_idx,  "SHAP Waterfall – True Positive (MLP)", "shap_waterfall_truepositive_mlp.png"),
]
for candidates, title, out_path in case_studies:
    if len(candidates) == 0:
        continue
    idx = int(candidates[0])
    row_sv = _positive_class_shap(explainer.shap_values(Xt_test[[idx]], nsamples=100))[0]
    shap.plots._waterfall.waterfall_legacy(
        explainer.expected_value[1], row_sv, feature_names=feature_names, show=False
    )
    plt.title(title)
    plt.tight_layout(); plt.savefig(out_path); plt.close()

print("SHAP summary + case study plots saved (MLP).")
