In [None]:
import sys
import os
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import platform
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from dotenv import load_dotenv

# Pull variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# The data and results paths below are all built from PROJECT_ROOT, so stop
# here with a clear message instead of failing later inside os.path.join(None, ...).
project_root = os.environ.get("PROJECT_ROOT")
if not project_root:
    raise RuntimeError(
        "PROJECT_ROOT is not set. Export it or add it to a .env file before running this notebook."
    )

# Make the project-local graphCL modules importable. The path is relative to
# the notebook's working directory. Guarded so that re-running the cell does
# not keep appending the same entry to sys.path.
_GRAPHCL_DIR = '../models/graphCL'
if _GRAPHCL_DIR not in sys.path:
    sys.path.append(_GRAPHCL_DIR)

from models import GCL_Encoder, LogisticRegression
from augmentations import TemporalAugmentor

# Prefer the GPU whenever CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f"Running on: {DEVICE}")

: 

In [None]:
# --- GRAPH CONTRACTION SCRIPT ---
# Turns Wallet nodes (Tx -> Addr -> Tx) into direct edges (Tx -> Tx).
#
# Reads TxAddr_edgelist.csv, AddrTx_edgelist.csv and txs_features.csv from
# ELLIPTIC_PP_DIR, joins the two edge lists on the shared address, drops edges
# whose destination time step is earlier than the source time step, and writes
# the surviving (txId_src, txId_dst) pairs to elliptic_pp_contracted_edgelist.csv.
# Duplicate pairs are kept on purpose (one row per connecting address).

ELLIPTIC_PP_DIR = os.path.join(project_root, "elliptic++_bitcoin_dataset")

print(f"Data Directory: {ELLIPTIC_PP_DIR}")

# Tx -> Addr
tx_addr_path = os.path.join(ELLIPTIC_PP_DIR, "TxAddr_edgelist.csv")
print(f"Loading {tx_addr_path}...")
df_tx_addr = pd.read_csv(tx_addr_path)
# Positional rename: assumes the file has exactly two columns (tx, address).
df_tx_addr.columns = ["txId_src", "address"]

# Addr -> Tx
addr_tx_path = os.path.join(ELLIPTIC_PP_DIR, "AddrTx_edgelist.csv")
print(f"Loading {addr_tx_path}...")
df_addr_tx = pd.read_csv(addr_tx_path)
# Positional rename: assumes the file has exactly two columns (address, tx).
df_addr_tx.columns = ["address", "txId_dst"]

# Tx_A -> Address_X -> Tx_B  ==>  Tx_A -> Tx_B
print("Performing Graph Contraction (Tx->Addr + Addr->Tx = Tx->Tx)...")
df_contracted = pd.merge(df_tx_addr, df_addr_tx, on="address", how="inner")

print(f"  - Tx->Addr edges: {len(df_tx_addr):,}")
print(f"  - Addr->Tx edges: {len(df_addr_tx):,}")
print(f"  - Contracted Tx->Tx edges: {len(df_contracted):,}")

features_path = os.path.join(ELLIPTIC_PP_DIR, "txs_features.csv")
print(f"Loading timestamps from {features_path}...")
df_features = pd.read_csv(features_path, usecols=["txId", "Time step"])
time_map = df_features.set_index("txId")["Time step"].to_dict()

print("Mapping timestamps to edges...")
df_contracted["src_time"] = df_contracted["txId_src"].map(time_map)
df_contracted["dst_time"] = df_contracted["txId_dst"].map(time_map)

# Edges touching a transaction without a known time step cannot be ordered.
df_contracted = df_contracted.dropna(subset=["src_time", "dst_time"])

num_same_time = (df_contracted["dst_time"] == df_contracted["src_time"]).sum()
num_forward = (df_contracted["dst_time"] > df_contracted["src_time"]).sum()
num_backward = (df_contracted["dst_time"] < df_contracted["src_time"]).sum()


def _pct(count, total):
    """Return count as a percentage of total, or 0.0 when total is zero."""
    return 100 * count / total if total else 0.0


# Guarded so an empty join reports 0% instead of raising ZeroDivisionError.
num_contracted = len(df_contracted)

print("\nContracted Graph Temporal Analysis:")
print(f"  Same-timestep edges: {num_same_time:,} ({_pct(num_same_time, num_contracted):.1f}%)")
print(f"  Forward-causal edges: {num_forward:,} ({_pct(num_forward, num_contracted):.1f}%) <--- THIS IS WHAT WE WANTED")
print(f"  Backward-invalid edges: {num_backward:,} ({_pct(num_backward, num_contracted):.3f}%)")


# Remove backwards edges
valid_edges = df_contracted[df_contracted["dst_time"] >= df_contracted["src_time"]]

output_file = os.path.join(ELLIPTIC_PP_DIR, "elliptic_pp_contracted_edgelist.csv")
print(f"\nSaving {len(valid_edges):,} valid causal edges to {output_file}...")
valid_edges[["txId_src", "txId_dst"]].to_csv(output_file, index=False)
print("Done.")

In [None]:
# Build the PyG `Data` object for Elliptic++ from the feature table, the class
# table and the contracted Tx->Tx edge list written by the contraction step.

# Time steps strictly below this value are used for training, the rest for testing.
TRAIN_TEST_SPLIT_STEP = 35

ELLIPTIC_PP_DIR = os.path.join(project_root, "elliptic++_bitcoin_dataset")
features_path = os.path.join(ELLIPTIC_PP_DIR, "txs_features.csv")
classes_path = os.path.join(ELLIPTIC_PP_DIR, "txs_classes.csv")
edges_path = os.path.join(ELLIPTIC_PP_DIR, "elliptic_pp_contracted_edgelist.csv")

print("Loading features...")
df_features = pd.read_csv(features_path)
df_features = df_features.rename(columns={"Time step": "timeStep"})
df_features = df_features.set_index("txId")

# Every column except the time step is treated as a node feature.
feature_cols = [c for c in df_features.columns if c != "timeStep"]
x = torch.tensor(df_features[feature_cols].values, dtype=torch.float)

# Non-fatal diagnostic: NaN inputs would propagate through any model fed with x.
num_nan_features = int(torch.isnan(x).sum())
if num_nan_features:
    print(f"  WARNING: {num_nan_features:,} NaN feature values found in x (left untouched).")

time_vals = df_features["timeStep"].values
time = torch.tensor(time_vals, dtype=torch.long)

print(f"Loading classes from {classes_path}...")
df_classes = pd.read_csv(classes_path).set_index("txId")
# Align labels to the feature row order; transactions without a label row become NaN.
df_classes = df_classes.reindex(df_features.index)

# 1=Illicit, 2=Licit, 3=Unknown -> 1, 0, -1
class_map = {1: 1, 2: 0, 3: -1}
# Anything not covered by class_map (including missing rows) is treated as unknown (-1).
y = torch.tensor(df_classes["class"].map(class_map).fillna(-1).values, dtype=torch.long)

# txId -> contiguous node index, following the feature table's row order.
tx_ids = list(df_features.index)
node_mapping = {int(txid): idx for idx, txid in enumerate(tx_ids)}

print(f"Loading edges from {edges_path}...")
df_edges = pd.read_csv(edges_path)

# Keep only edges whose two endpoints both have a feature row.
valid_src = df_edges["txId_src"].isin(node_mapping.keys())
valid_dst = df_edges["txId_dst"].isin(node_mapping.keys())
df_edges = df_edges[valid_src & valid_dst]

print("Deduplicating edges (calculating weights)...")
# The weight of a Tx->Tx edge is the number of duplicate rows for that pair.
df_edges_grouped = df_edges.groupby(["txId_src", "txId_dst"]).size().reset_index(name="weight")

print(f"  - Original edges: {len(df_edges):,}")
print(f"  - Deduplicated edges: {len(df_edges_grouped):,}")

src_idx = df_edges_grouped["txId_src"].map(node_mapping).values
dst_idx = df_edges_grouped["txId_dst"].map(node_mapping).values
# Stack into one (2, num_edges) array first: building a tensor from a Python
# list of ndarrays goes through a slow element-wise path and triggers a warning.
edge_index = torch.as_tensor(np.stack([src_idx, dst_idx]), dtype=torch.long)
edge_attr = torch.tensor(df_edges_grouped["weight"].values, dtype=torch.float).view(-1, 1)

# Temporal split: earlier time steps train, later ones test.
train_mask = time < TRAIN_TEST_SPLIT_STEP
test_mask = time >= TRAIN_TEST_SPLIT_STEP

data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, time=time, train_mask=train_mask, test_mask=test_mask)
# Bookkeeping attributes for translating between node indices and transaction ids.
data.tx_id_to_node = node_mapping
data.node_to_tx_id = {v: k for k, v in node_mapping.items()}
data.feature_columns = feature_cols

print("Elliptic++ Data Object Created (Hardened):")
print(data)
print("Num nodes:", data.num_nodes)
print("Num edges:", data.num_edges)
print("Features shape:", data.x.shape)
print("Class distribution:", torch.unique(data.y, return_counts=True))

data = data.to(DEVICE)
print(f"Data ready: {data.num_nodes} nodes, {data.num_edges} edges.")

In [None]:
# Pre-training setup: hyperparameters, run bookkeeping, the neighbour-sampling
# loader, and the encoder / optimizer / augmentor objects.
import json
from datetime import datetime

BATCH_SIZE = 4096
HIDDEN_CHANNELS = 256
OUT_CHANNELS = 128
LR_PRETRAIN = 0.0005
EPOCHS_PRETRAIN = 300
# Defined once and reused below so the saved config cannot drift from the
# values actually passed to the loader and the augmentor.
NUM_NEIGHBORS = [25, 15]
INTRA_STEP_DROP_PROB = 0.5
INTER_STEP_DROP_PROB = 0.0
SEED = 42

# Best-effort reproducibility: seeds the torch and numpy global RNGs.
# Some GPU kernels may still be non-deterministic.
torch.manual_seed(SEED)
np.random.seed(SEED)

RESULTS_DIR = os.path.join(project_root, "results", "graphCL")
os.makedirs(RESULTS_DIR, exist_ok=True)
# Timestamp-based id used as a suffix on every artifact written by this run.
RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")
print(f"Results will be saved to: {RESULTS_DIR}")
print(f"Run ID: {RUN_ID}")

config = {
    "run_id": RUN_ID,
    "batch_size": BATCH_SIZE,
    "hidden_channels": HIDDEN_CHANNELS,
    "out_channels": OUT_CHANNELS,
    "lr_pretrain": LR_PRETRAIN,
    "epochs_pretrain": EPOCHS_PRETRAIN,
    "num_neighbors": list(NUM_NEIGHBORS),
    # Recorded for provenance; keep in sync with the temperature given to contrastive_loss.
    "temperature": 0.1,
    "intra_step_drop_prob": INTRA_STEP_DROP_PROB,
    "inter_step_drop_prob": INTER_STEP_DROP_PROB,
    "seed": SEED,
    "num_nodes": data.num_nodes,
    "num_edges": data.num_edges,
    "device": str(DEVICE),
}
with open(os.path.join(RESULTS_DIR, f"config_{RUN_ID}.json"), "w") as f:
    json.dump(config, f, indent=2)

# NOTE: PyG applies device moves in place, so `data.cpu()` moves `data` itself
# to the CPU and returns the same object. After this statement `data` is no
# longer on DEVICE; later code must move tensors explicitly.
train_loader = NeighborLoader(
    data.cpu(),
    num_neighbors=list(NUM_NEIGHBORS),
    batch_size=BATCH_SIZE,
    input_nodes=data.train_mask,
    shuffle=True,
    num_workers=4
)

encoder = GCL_Encoder(data.num_features, HIDDEN_CHANNELS, OUT_CHANNELS).to(DEVICE)
optimizer = torch.optim.Adam(encoder.parameters(), lr=LR_PRETRAIN)
augmentor = TemporalAugmentor(
    intra_step_drop_prob=INTRA_STEP_DROP_PROB,
    inter_step_drop_prob=INTER_STEP_DROP_PROB,
)

def contrastive_loss(z1, z2, temperature=0.1):
    """Cross-entropy contrastive loss between two aligned sets of embeddings.

    Row ``i`` of ``z1`` is treated as the positive match of row ``i`` of ``z2``;
    every other row of ``z2`` acts as a negative for it.

    Args:
        z1: 2-D tensor of embeddings, one row per item.
        z2: 2-D tensor with the same number of rows as ``z1``.
        temperature: Divisor applied to the cosine similarities; smaller
            values sharpen the softmax.

    Returns:
        Scalar tensor: mean cross-entropy of the similarity matrix against
        the diagonal targets.
    """
    z1 = F.normalize(z1, dim=1)
    z2 = F.normalize(z2, dim=1)
    sim_matrix = torch.mm(z1, z2.t()) / temperature
    # Create the targets on the same device as the logits instead of relying
    # on a module-level DEVICE, so the function works wherever its inputs live.
    labels = torch.arange(z1.size(0), device=sim_matrix.device)
    return F.cross_entropy(sim_matrix, labels)

# Unsupervised pre-training: for each sampled subgraph, build two augmented
# views, encode both, and minimise the contrastive loss between them.
print("Starting Unsupervised Pre-training...")
encoder.train()
loss_history = []

# Use the temperature recorded in the run config so that the saved config
# always describes the loss that was actually optimised.
pretrain_temperature = config["temperature"]

for epoch in range(1, EPOCHS_PRETRAIN + 1):
    total_loss = 0
    steps = 0

    for batch in train_loader:
        batch = batch.to(DEVICE)

        view1 = augmentor.get_view(batch, mode='temporal_edge')
        view2 = augmentor.get_view(batch, mode='feature')

        # The encoder returns a pair; only the second element is used for the loss.
        # NOTE(review): z1/z2 appear to cover every node of the sampled subgraph,
        # not only the seed nodes of the batch. Confirm that contrasting all of
        # them is intended.
        _, z1 = encoder(view1.x, view1.edge_index, view1.edge_attr)
        _, z2 = encoder(view2.x, view2.edge_index, view2.edge_attr)

        loss = contrastive_loss(z1, z2, temperature=pretrain_temperature)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        steps += 1

    # An empty loader would otherwise surface as a bare ZeroDivisionError.
    if steps == 0:
        raise RuntimeError("train_loader produced no batches; check data.train_mask")

    avg_loss = total_loss / steps
    loss_history.append(avg_loss)
    print(f"Epoch {epoch:03d} | Contrastive Loss: {avg_loss:.4f}")

    # Periodic checkpoint so a long run can be inspected or resumed.
    if epoch % 50 == 0:
        checkpoint_path = os.path.join(RESULTS_DIR, f"encoder_checkpoint_epoch{epoch}_{RUN_ID}.pt")
        torch.save({
            'epoch': epoch,
            'model_state_dict': encoder.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss_history': loss_history,
        }, checkpoint_path)
        print(f"Checkpoint saved: {checkpoint_path}")

print("Pre-training complete.")

# Final artifact: same content as a checkpoint plus the run configuration.
final_model_path = os.path.join(RESULTS_DIR, f"encoder_final_{RUN_ID}.pt")
torch.save({
    'epoch': EPOCHS_PRETRAIN,
    'model_state_dict': encoder.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss_history': loss_history,
    'config': config,
}, final_model_path)
print(f"Final encoder saved: {final_model_path}")

loss_df = pd.DataFrame({'epoch': range(1, len(loss_history) + 1), 'loss': loss_history})
loss_df.to_csv(os.path.join(RESULTS_DIR, f"loss_history_{RUN_ID}.csv"), index=False)
print("Loss history saved.")

In [None]:
# Linear-probe hyperparameters (used by the classifier training further down).
LR_EVAL = 0.001
EPOCHS_EVAL = 100

# Encode the full graph once with the frozen encoder, sanity-check the result,
# L2-normalise it and persist it to disk.
print("Generating node embeddings for the whole graph...")
encoder.eval()

with torch.no_grad():
    try:
        # Move the inputs explicitly: `data` may have been left on the CPU
        # (PyG's Data.cpu()/to() move the object in place), and a device
        # mismatch must not be mistaken for an out-of-memory condition.
        embeddings, _ = encoder(
            data.x.to(DEVICE),
            data.edge_index.to(DEVICE),
            data.edge_attr.to(DEVICE),
        )
    except RuntimeError as e:
        # Only a genuine out-of-memory error justifies the CPU fallback;
        # anything else is a real bug and is re-raised unchanged.
        if "out of memory" not in str(e).lower():
            raise
        print(f"GPU OOM: {e}")
        print("Switching encoder to CPU for inference...")
        encoder_cpu = encoder.cpu()
        try:
            embeddings, _ = encoder_cpu(data.x.cpu(), data.edge_index.cpu(), data.edge_attr.cpu())
        finally:
            # nn.Module.cpu() moves the module in place, so always put the
            # encoder back on DEVICE, even if CPU inference fails.
            encoder.to(DEVICE)
        embeddings = embeddings.to(DEVICE)

has_nan = torch.isnan(embeddings).any().item()
has_inf = torch.isinf(embeddings).any().item()

print("Embedding Statistics:")
print(f"  Shape: {embeddings.shape}")
print(f"  Mean: {embeddings.mean().item():.4f}")
print(f"  Std: {embeddings.std().item():.4f}")
print(f"  Min: {embeddings.min().item():.4f}")
print(f"  Max: {embeddings.max().item():.4f}")
print(f"  Has NaN: {has_nan}")
print(f"  Has Inf: {has_inf}")

if has_nan or has_inf:
    print("Sanitizing embeddings (replacing NaN/Inf with 0)...")
    embeddings = torch.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)

# Unit-length rows for the linear classifier trained on these embeddings.
embeddings = F.normalize(embeddings, dim=1)

embeddings_path = os.path.join(RESULTS_DIR, f"embeddings_{RUN_ID}.pt")
torch.save(embeddings.cpu(), embeddings_path)
print(f"Embeddings saved: {embeddings_path}")

# Prepare the linear-probe dataset: frozen embeddings as inputs, labelled
# nodes only, split by the temporal train/test masks stored on `data`.
X = embeddings.detach()
y = data.y.to(DEVICE)

# Label -1 marks unknown transactions; they are excluded from both splits.
labeled_mask = (y != -1)
train_mask = data.train_mask.to(DEVICE) & labeled_mask
test_mask = data.test_mask.to(DEVICE) & labeled_mask

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

# minlength=2 guarantees an entry for both classes even when one of them is
# absent from the training split (otherwise indexing [1] raises IndexError).
train_class_counts = torch.bincount(y_train, minlength=2)
print("Training Class Distribution:")
print(f"  Class 0 (Licit): {train_class_counts[0].item():,}")
print(f"  Class 1 (Illicit): {train_class_counts[1].item():,}")

# Inverse-frequency weights normalised to sum to 1. Counts are clamped to at
# least 1 so an empty class cannot produce inf/NaN weights.
class_weights = 1.0 / train_class_counts.float().clamp(min=1)
class_weights = class_weights / class_weights.sum()
print(f"  Class Weights: {class_weights.tolist()}")

classifier = LogisticRegression(X.shape[1], 2).to(DEVICE)
optimizer_eval = torch.optim.Adam(classifier.parameters(), lr=LR_EVAL)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(DEVICE))

# Full-batch training of the linear probe. Every 20th epoch the classifier is
# scored on the held-out split and the result appended to eval_history.
print(f"Training Linear Classifier on {len(y_train):,} nodes...")

eval_history = []
best_f1 = 0
for epoch in range(1, EPOCHS_EVAL + 1):
    # One optimisation step over the whole labelled training split.
    classifier.train()
    optimizer_eval.zero_grad()
    train_logits = classifier(X_train)
    loss = criterion(train_logits, y_train)
    loss.backward()
    optimizer_eval.step()

    # Nothing more to do on epochs that are not multiples of 20.
    if epoch % 20 != 0:
        continue

    classifier.eval()
    with torch.no_grad():
        test_logits = classifier(X_test)
        val_preds = test_logits.argmax(dim=1)
        val_f1 = f1_score(y_test.cpu().numpy(), val_preds.cpu().numpy(), pos_label=1)
        eval_history.append({'epoch': epoch, 'loss': loss.item(), 'val_f1': val_f1})
        # Track the highest F1 observed so far.
        best_f1 = max(best_f1, val_f1)
    print(f"  Epoch {epoch:3d} | Loss: {loss.item():.4f} | Val F1: {val_f1:.4f}")

# Final evaluation of the linear probe on the labelled test split.
classifier.eval()
with torch.no_grad():
    out_test = classifier(X_test)
    probs = F.softmax(out_test, dim=1)
    preds = probs.argmax(dim=1)

    y_true = y_test.cpu().numpy()
    y_pred = preds.cpu().numpy()
    # Column 1 of the softmax output is the probability of the illicit class.
    y_prob = probs[:, 1].cpu().numpy()

    # zero_division=0 on every metric: a class that is never predicted
    # scores 0 without emitting a warning.
    f1 = f1_score(y_true, y_pred, average='binary', pos_label=1, zero_division=0)
    prec = precision_score(y_true, y_pred, average='binary', pos_label=1, zero_division=0)
    rec = recall_score(y_true, y_pred, average='binary', pos_label=1, zero_division=0)
    acc = (y_true == y_pred).sum() / len(y_true)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)

    # Fix the label order so the matrix is always 2x2 (rows = true class,
    # columns = predicted class), even if a class is missing from both
    # y_true and y_pred. Without it the cm[0,1]/cm[1,1] indexing can fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Per-class totals and hit rates; an empty class reports nan instead of
    # dividing by zero.
    licit_total = cm[0].sum()
    illicit_total = cm[1].sum()
    licit_rate = cm[0,0] / licit_total if licit_total else float('nan')
    illicit_rate = cm[1,1] / illicit_total if illicit_total else float('nan')

    print("Evaluation")
    print(f"  Accuracy:       {acc:.4f}")
    print(f"  Precision (1):  {prec:.4f}")
    print(f"  Recall (1):     {rec:.4f}")
    print(f"  F1 Score (1):   {f1:.4f}")
    print(f"  F1 Macro:       {f1_macro:.4f}")
    print("\nConfusion Matrix:")
    print(f"  [[TN={cm[0,0]:5d}, FP={cm[0,1]:5d}]")
    print(f"   [FN={cm[1,0]:5d}, TP={cm[1,1]:5d}]]")
    print("\nPer-Class Breakdown:")
    print(f"  Licit (0):   {cm[0,0]:,} correct / {licit_total:,} total = {licit_rate:.2%}")
    print(f"  Illicit (1): {cm[1,1]:,} correct / {illicit_total:,} total = {illicit_rate:.2%}")

# Persist the probe weights, the metric summary and the per-node predictions,
# then list the artifacts produced by this run.
classifier_path = os.path.join(RESULTS_DIR, f"classifier_{RUN_ID}.pt")
torch.save(classifier.state_dict(), classifier_path)
print(f"\nClassifier saved: {classifier_path}")

# Everything is converted to plain Python types so it serialises to JSON.
train_distribution = {
    "licit": int(train_class_counts[0].item()),
    "illicit": int(train_class_counts[1].item()),
}
results = {
    "run_id": RUN_ID,
    "accuracy": float(acc),
    "precision": float(prec),
    "recall": float(rec),
    "f1_score": float(f1),
    "f1_macro": float(f1_macro),
    "confusion_matrix": cm.tolist(),
    "train_samples": int(len(y_train)),
    "test_samples": int(len(y_test)),
    "class_distribution_train": train_distribution,
    "eval_history": eval_history,
}

results_path = os.path.join(RESULTS_DIR, f"results_{RUN_ID}.json")
with open(results_path, "w") as results_file:
    json.dump(results, results_file, indent=2)
print(f"Results saved: {results_path}")

# One row per labelled test node: true label, predicted label, P(illicit).
predictions_df = pd.DataFrame({
    'y_true': y_true,
    'y_pred': y_pred,
    'prob_illicit': y_prob,
})
predictions_path = os.path.join(RESULTS_DIR, f"predictions_{RUN_ID}.csv")
predictions_df.to_csv(predictions_path, index=False)
print(f"Predictions saved: {predictions_path}")

print(f"ALL FILES SAVED TO: {RESULTS_DIR}")
for artifact_name in (
    f"config_{RUN_ID}.json",
    f"encoder_final_{RUN_ID}.pt",
    f"loss_history_{RUN_ID}.csv",
    f"embeddings_{RUN_ID}.pt",
    f"classifier_{RUN_ID}.pt",
    f"results_{RUN_ID}.json",
    f"predictions_{RUN_ID}.csv",
):
    print(f"  - {artifact_name}")