Reproduce the random forest baseline (and a GCN comparison) on the Elliptic transaction dataset

In [8]:
import pandas as pd

# Column layout of the header-less features file: column 0 holds the transaction
# id and column 1 the timestep; the remaining columns are the node features.
TX_ID_COL = 0
TIMESTEP_COL = 1
# Timesteps <= LAST_TRAIN_TIMESTEP form the train split, later ones the test split.
# Kept in one place so node and edge splits can never drift apart.
LAST_TRAIN_TIMESTEP = 34

txs_classes = pd.read_csv('../data/elliptic_txs_classes.csv')
txs_edges = pd.read_csv('../data/elliptic_txs_edgelist.csv')
txs_features = pd.read_csv('../data/elliptic_txs_features.csv', header=None)


# join features with classes using tx id (1st column of txs_features)
txs_data = txs_features.merge(txs_classes, left_on=TX_ID_COL, right_on='txId', how='left')
# A left merge only grows when the class table repeats a txId; catch that early
# because every later step assumes one row per transaction.
assert len(txs_data) == len(txs_features), "class table contains duplicate txId rows"

# convert class labels to integers (raw '1' -> 0, raw '2' -> 1, 'unknown' -> -1)
label_mapping = {'1': 0, '2': 1, 'unknown': -1}
mapped_class = txs_data['class'].map(label_mapping)
if mapped_class.isna().any():
    # Fail with the offending values instead of an opaque NaN-to-int cast error.
    unexpected_labels = txs_data.loc[mapped_class.isna(), 'class'].unique().tolist()
    raise ValueError(f"Unexpected or missing class labels: {unexpected_labels}")
txs_data['class'] = mapped_class.astype(int)

# split data into train and test according to timestep (2nd column of txs_features)
train_data_all = txs_data[txs_data[TIMESTEP_COL] <= LAST_TRAIN_TIMESTEP]
test_data_all = txs_data[txs_data[TIMESTEP_COL] > LAST_TRAIN_TIMESTEP]

# separate datasets with labels (0 or 1) from those without labels (class == -1)
train_data_labeled = train_data_all[train_data_all['class'].isin([0, 1])]
test_data_labeled = test_data_all[test_data_all['class'].isin([0, 1])]

# process edges like data: attach the timestep of the source transaction (txId1)
# and split into train and test with the same threshold as the nodes
txs_edges = (
    txs_edges
    .merge(txs_features[[TX_ID_COL, TIMESTEP_COL]], left_on='txId1', right_on=TX_ID_COL, how='left')
    .rename(columns={TIMESTEP_COL: 'timestep'})
    .drop(columns=[TX_ID_COL])
)
train_edges_all = txs_edges[txs_edges['timestep'] <= LAST_TRAIN_TIMESTEP]
test_edges_all = txs_edges[txs_edges['timestep'] > LAST_TRAIN_TIMESTEP]

# keep only edges whose two endpoints are both labeled transactions of the split
train_edges_labeled = train_edges_all[
    train_edges_all['txId1'].isin(train_data_labeled['txId'])
    & train_edges_all['txId2'].isin(train_data_labeled['txId'])
]
test_edges_labeled = test_edges_all[
    test_edges_all['txId1'].isin(test_data_labeled['txId'])
    & test_edges_all['txId2'].isin(test_data_labeled['txId'])
]

# print sizes of datasets
print(f"Train data all: {train_data_all.shape}, Train data labeled: {train_data_labeled.shape}")
print(f"Test data all: {test_data_all.shape}, Test data labeled: {test_data_labeled.shape}")
print(f"Train edges all: {train_edges_all.shape}, Train edges labeled: {train_edges_labeled.shape}")
print(f"Test edges all: {test_edges_all.shape}, Test edges labeled: {test_edges_labeled.shape}")

Train data all: (136265, 169), Train data labeled: (29894, 169)
Test data all: (67504, 169), Test data labeled: (16670, 169)
Train edges all: (156843, 3), Train edges labeled: (22898, 3)
Test edges all: (77512, 3), Test edges labeled: (13726, 3)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Label encoding used below: illicit = 0, licit = 1.
# Random-forest baseline (50 trees, 50 candidate features per split) fitted on the
# labeled training rows and scored on the labeled test rows.
feature_cols = slice(2, -2)  # drops the two leading and the two trailing columns
clf = RandomForestClassifier(n_estimators=50, max_features=50, random_state=42)
clf.fit(train_data_labeled.iloc[:, feature_cols], train_data_labeled['class'])
test_preds = clf.predict(test_data_labeled.iloc[:, feature_cols])

# Per-class precision / recall / F1 together with accuracy, macro and weighted averages.
report = classification_report(
    test_data_labeled['class'],
    test_preds,
    target_names=['illicit', 'licit'],
)
print("Random Forest Classifier Report on Labeled Test Data:", report, sep="\n")

Random Forest Classifier Report on Labeled Test Data:
              precision    recall  f1-score   support

     illicit       0.89      0.72      0.80      1083
       licit       0.98      0.99      0.99     15587

    accuracy                           0.98     16670
   macro avg       0.94      0.86      0.89     16670
weighted avg       0.98      0.98      0.98     16670



In [9]:
import dgl
import torch

# create DGL graphs for train and test data
def create_dgl_graph(data, edges):
    """Build a DGL graph from a node table and an edge table.

    Node ``i`` of the graph corresponds to row ``i`` of ``data``.

    Args:
        data: DataFrame with a ``txId`` column, a ``class`` column and the node
            features in columns ``2:-2`` (positional).
        edges: DataFrame with ``txId1`` (source) and ``txId2`` (destination)
            columns holding transaction ids.

    Returns:
        A ``dgl`` graph with ``ndata['feat']`` (float32) and ``ndata['label']``
        (int64) set from ``data``.

    Raises:
        ValueError: if any edge endpoint is not a ``txId`` present in ``data``.
    """
    node_ids = data['txId'].tolist()
    id_to_idx = {node_id: idx for idx, node_id in enumerate(node_ids)}

    src = edges['txId1'].map(id_to_idx)
    dst = edges['txId2'].map(id_to_idx)

    # Ids absent from `data` map to NaN, which would silently turn the index
    # lists into floats; stop here with an actionable message instead.
    unmapped = src.isna() | dst.isna()
    if unmapped.any():
        raise ValueError(
            f"{int(unmapped.sum())} edge(s) reference a txId that is not present in `data`"
        )

    # astype also normalises the dtype of an empty edge table before conversion.
    src_idx = torch.tensor(src.astype('int64').to_numpy(), dtype=torch.int64)
    dst_idx = torch.tensor(dst.astype('int64').to_numpy(), dtype=torch.int64)

    g = dgl.graph((src_idx, dst_idx), num_nodes=len(node_ids))
    features = torch.tensor(data.iloc[:, 2:-2].values, dtype=torch.float32)
    labels = torch.tensor(data['class'].values, dtype=torch.long)

    g.ndata['feat'] = features
    g.ndata['label'] = labels

    return g

# One graph per (split, labeled-only / all-nodes) combination.
train_labeled_graph, test_labeled_graph = (
    create_dgl_graph(train_data_labeled, train_edges_labeled),
    create_dgl_graph(test_data_labeled, test_edges_labeled),
)
train_all_graph, test_all_graph = (
    create_dgl_graph(train_data_all, train_edges_all),
    create_dgl_graph(test_data_all, test_edges_all),
)

# Summaries of the four graphs, one per line.
print(
    f"Train labeled graph: {train_labeled_graph}",
    f"Test labeled graph: {test_labeled_graph}",
    f"Train all graph: {train_all_graph}",
    f"Test all graph: {test_all_graph}",
    sep="\n",
)

Train labeled graph: Graph(num_nodes=29894, num_edges=22898,
      ndata_schemes={'feat': Scheme(shape=(165,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})
Test labeled graph: Graph(num_nodes=16670, num_edges=13726,
      ndata_schemes={'feat': Scheme(shape=(165,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})
Train all graph: Graph(num_nodes=136265, num_edges=156843,
      ndata_schemes={'feat': Scheme(shape=(165,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})
Test all graph: Graph(num_nodes=67504, num_edges=77512,
      ndata_schemes={'feat': Scheme(shape=(165,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})


In [44]:
import torch.nn as nn
from dgl.nn import GraphConv

# Configuration and inputs for training a 2-layer GCN.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Loss weights indexed by class id: class 0 -> 0.7, class 1 -> 0.3.
class_weights = torch.tensor([0.7, 0.3], dtype=torch.float32, device=device)
embedding_dim = 100
num_epochs = 800

# True: use the subgraphs of labeled nodes only; False: use every node.
labeled_only = True

_base_train_graph, _base_test_graph = (
    (train_labeled_graph, test_labeled_graph)
    if labeled_only
    else (train_all_graph, test_all_graph)
)

# Make each graph bidirected (keeping node data), then add self-loops.
train_graph = dgl.add_self_loop(dgl.to_bidirected(_base_train_graph, copy_ndata=True))
test_graph = dgl.add_self_loop(dgl.to_bidirected(_base_test_graph, copy_ndata=True))

# Node tensors and "has a label" masks (label -1 marks unlabeled nodes).
train_features, train_labels = train_graph.ndata['feat'], train_graph.ndata['label']
train_mask = (train_labels >= 0)
test_features, test_labels = test_graph.ndata['feat'], test_graph.ndata['label']
test_mask = (test_labels >= 0)

# Move graphs, features and labels to the selected device.
train_graph, test_graph = train_graph.to(device), test_graph.to(device)
train_features, train_labels = train_features.to(device), train_labels.to(device)
test_features, test_labels = test_features.to(device), test_labels.to(device)

# Seed torch before the model is built.
torch.manual_seed(42)

class GCN(nn.Module):
    """Two stacked graph-convolution layers with a ReLU in between.

    The output of the second layer is returned as-is (no activation), one row
    of ``num_classes`` scores per node.
    """

    def __init__(self, in_feats, hidden_size, num_classes):
        """Create the two layers: in_feats -> hidden_size -> num_classes."""
        super().__init__()
        self.conv1 = GraphConv(in_feats, hidden_size)
        self.conv2 = GraphConv(hidden_size, num_classes)

    def forward(self, g, feat):
        """Run both layers over graph ``g`` starting from node features ``feat``."""
        hidden = torch.relu(self.conv1(g, feat))
        return self.conv2(g, hidden)

def _illicit_prf(pred, labels, mask):
    """Precision, recall and F1 of the illicit class (label 0) over masked nodes.

    Args:
        pred: 1-D tensor of predicted class ids.
        labels: 1-D tensor of true class ids, same length as ``pred``.
        mask: boolean tensor selecting the nodes to score.

    Returns:
        ``(precision, recall, f1)`` as Python floats. A metric whose denominator
        is zero (e.g. the model predicted no illicit node) is reported as 0.0
        instead of raising ``ZeroDivisionError``.
    """
    mask = mask.to(pred.device)
    pred_illicit = pred[mask] == 0
    true_illicit = labels[mask] == 0
    true_pos = (pred_illicit & true_illicit).sum().item()
    n_pred = pred_illicit.sum().item()
    n_true = true_illicit.sum().item()
    precision = true_pos / n_pred if n_pred else 0.0
    recall = true_pos / n_true if n_true else 0.0
    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum else 0.0
    return precision, recall, f1


model = GCN(train_features.shape[1], embedding_dim, 2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)

# Class-weighted cross-entropy; nodes labeled -1 do not contribute to the loss.
criterion = nn.CrossEntropyLoss(weight=class_weights,ignore_index=-1)

for epoch in range(1, num_epochs + 1):
    # Full-batch step: one forward/backward pass over the whole training graph.
    model.train()
    logits = model(train_graph, train_features)
    loss = criterion(logits, train_labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Log on the first epoch and then every 100 epochs.
    if epoch == 1 or epoch % 100 == 0:
        # Train metrics reuse the logits computed before this epoch's update.
        train_pred = logits.detach().argmax(dim=1)
        train_precision, train_recall, train_f1 = _illicit_prf(train_pred, train_labels, train_mask)
        with torch.no_grad():
            model.eval()
            test_logits = model(test_graph, test_features)
            test_pred = test_logits.argmax(dim=1)
            test_loss = criterion(test_logits, test_labels).item()
            # report F1-score, precision, recall on illicit node only
            test_precision, test_recall, test_f1 = _illicit_prf(test_pred, test_labels, test_mask)
        print(f"Epoch {epoch:03d}: Loss {loss.item():.4f}, Train F1 {train_f1:.4f}, Test Loss {test_loss:.4f}, Test F1 {test_f1:.4f}")
# Imported here so this step does not depend on another cell having imported it.
from sklearn.metrics import classification_report

# Final evaluation of the trained model on the test graph.
model.eval()
with torch.no_grad():
    test_logits = model(test_graph, test_features)
test_preds = test_logits.argmax(dim=1).cpu().numpy()
test_true = test_labels.cpu().numpy()
# Only nodes with a real label (>= 0) are scored; convert the mask once.
test_mask_np = test_mask.cpu().numpy()

print("GCN Classification Report on Labeled Test Graph:")
print(classification_report(test_true[test_mask_np], test_preds[test_mask_np], target_names=['illicit', 'licit']))


Epoch 001: Loss 0.6558, Train F1 0.1194, Test Loss 0.7959, Test F1 0.1635
Epoch 100: Loss 0.1586, Train F1 0.8169, Test Loss 0.2861, Test F1 0.4778
Epoch 200: Loss 0.1239, Train F1 0.8556, Test Loss 0.2756, Test F1 0.5328
Epoch 300: Loss 0.1059, Train F1 0.8783, Test Loss 0.2819, Test F1 0.5584
Epoch 400: Loss 0.0929, Train F1 0.8931, Test Loss 0.2973, Test F1 0.5891
Epoch 500: Loss 0.0828, Train F1 0.9032, Test Loss 0.3145, Test F1 0.6106
Epoch 600: Loss 0.0752, Train F1 0.9115, Test Loss 0.3327, Test F1 0.6204
Epoch 700: Loss 0.0691, Train F1 0.9177, Test Loss 0.3466, Test F1 0.6226
Epoch 800: Loss 0.0642, Train F1 0.9242, Test Loss 0.3591, Test F1 0.6269
GCN Classification Report on Labeled Test Graph:
              precision    recall  f1-score   support

     illicit       0.77      0.53      0.63      1083
       licit       0.97      0.99      0.98     15587

    accuracy                           0.96     16670
   macro avg       0.87      0.76      0.80     16670
weighted avg 