In [1]:
pip install mongopy

Note: you may need to restart the kernel to use updated packages.


In [52]:
pip install torch_geometric

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pymongo import MongoClient
import pandas as pd


client = MongoClient("mongodb://localhost:27017/")
db = client["solana_tx_db"]
collection = db["transactions"]


In [6]:
cursor = collection.find().limit(100)
df = pd.DataFrame(list(cursor))


if "block_time" in df.columns:
    df["block_time"] = pd.to_datetime(df["block_time"])

print(df.head())


                        _id  \
0  687f2c646e21c4e4792e1119   
1  687f2c646e21c4e4792e111a   
2  687f2c646e21c4e4792e111b   
3  687f2c646e21c4e4792e111c   
4  687f2c646e21c4e4792e111d   

                                           signature       slot   blockTime  \
0  4eJrP55btsYsWLGYSU6URG2y14uQiFDfRjL4TpezeWPUng...  354935068  1753164885   
1  45mKgq7Yt1wifqGWbMwmkirAEkcNFUWfNaUEvCdUED5xTp...  354935068  1753164885   
2  4kP2Q5FqWHbpDHxsPSEKzSecGCG35eNHHuD2jHcvwjigS6...  354935068  1753164885   
3  gueBD215eV125kqLQ85eJjdHJ3ghTtFypvi5yJNfRxh4MS...  354935068  1753164885   
4  2ZEsXfeWghSwZ7RVktH9zTLWqjaLQZYKXBcdAx3jCVCV6T...  354935068  1753164885   

    status     type       fee     value  \
0  success  unknown  0.000005  0.000005   
1  success  unknown  0.000005  0.000005   
2  success  unknown  0.000005  0.000005   
3  success  unknown  0.000005  0.000005   
4  success  unknown  0.000005  0.000005   

                                        programs  \
0  [Vote1111111111111111111

In [None]:
"""This is to load data into dataframe so that it can be used by the graphs"""
from pymongo import MongoClient
import pandas as pd

def load_transactions(mongo_uri="mongodb://localhost:27017/",
                      db_name="solana_tx_db", collection_name="transactions"):
    client = MongoClient(mongo_uri)
    db = client[db_name]
    collection = db[collection_name]

    cursor = collection.find({})
    docs = list(cursor)
    df = pd.DataFrame(docs)

    # Ensure numeric types
    df["fee"] = pd.to_numeric(df["fee"], errors="coerce").fillna(0)
    df["value"] = pd.to_numeric(df["value"], errors="coerce").fillna(0)
    df["instructionCount"] = pd.to_numeric(df["instructionCount"], errors="coerce").fillna(0)

    return df

#Inorder to derive block level statistics we need to aggregate the data present in each of the blocks.
def build_block_stats(df):
    # Group by slot
    block_stats = df.groupby("slot").agg(
        tx_count=("signature", "count"),
        success_count=("status", lambda x: (x == "success").sum()),
        fail_count=("status", lambda x: (x != "success").sum()),
        avg_fee=("fee", "mean"),
        median_fee=("fee", "median"),
        fee_variance=("fee", "var"),
        avg_value=("value", "mean"),
        median_value=("value", "median"),
        instruction_sum=("instructionCount", "sum"),
        block_time=("blockTime", "first")
    ).reset_index()

    # Computing block time delta i.e gap between blocks
    block_stats = block_stats.sort_values("slot").reset_index(drop=True)
    block_stats["block_time_delta"] = block_stats["block_time"].diff().fillna(0)
    block_stats["block_utilization"] = block_stats["tx_count"] / block_stats["tx_count"].max()

    return block_stats

#Building transactions level GNN using already present individual transactions.
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler

def build_transaction_graph(df):
    features = df[["fee", "value", "instructionCount"]].fillna(0)
    scaler = StandardScaler()
    x = torch.tensor(scaler.fit_transform(features), dtype=torch.float)

    #Edges used to connect tx in same block (slot)
    slot_groups = df.groupby("slot").groups
    src, dst = [], []
    for txs in slot_groups.values():
        txs = list(txs)
        for i in range(len(txs)-1):
            src.append(txs[i]); dst.append(txs[i+1])

    edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long)
    y = torch.tensor((df["status"] == "success").astype(int).values, dtype=torch.long)

    return Data(x=x, edge_index=edge_index, y=y)

#Building block level GNN using already coomputed aggregated block stats
def build_block_graph(block_stats):
    features = block_stats[[
        "tx_count", "success_count", "fail_count",
        "avg_fee", "median_fee", "fee_variance",
        "block_time_delta", "block_utilization"
    ]].fillna(0)

    scaler = StandardScaler()
    x = torch.tensor(scaler.fit_transform(features), dtype=torch.float)
    src = list(range(len(block_stats)-1))
    dst = list(range(1, len(block_stats)))
    edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long)

    # Labels: congestion = utilization > 0.8
    y = torch.tensor((block_stats["block_utilization"] > 0.8).astype(int).values, dtype=torch.long)

    return Data(x=x, edge_index=edge_index, y=y)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool


# ----- Block Graph Encoder -----
class BlockEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(BlockEncoder, self).__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)
    
    def forward(self, x, edge_index, batch):
        h = F.relu(self.conv1(x, edge_index))
        h = F.relu(self.conv2(h, edge_index))   # <-- added ReLU for stability
        return global_mean_pool(h, batch)       # pooled block-level embedding


# ----- Transaction Graph Encoder -----
class TxEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(TxEncoder, self).__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)
    
    def forward(self, x, edge_index, batch):
        h = F.relu(self.conv1(x, edge_index))
        h = F.relu(self.conv2(h, edge_index))   # <-- added ReLU
        return global_mean_pool(h, batch)       # pooled tx-level embedding


# ----- Hybrid Congestion Prediction Model -----
class HybridCongestionModel(nn.Module):
    def __init__(self, block_in, tx_in, hidden=64, out_dim=2):
        super(HybridCongestionModel, self).__init__()
        # Encoders
        self.block_encoder = BlockEncoder(block_in, hidden, hidden)
        self.tx_encoder = TxEncoder(tx_in, hidden, hidden)
        
        # Fusion + classification
        self.fc1 = nn.Linear(hidden*2, hidden)
        self.dropout = nn.Dropout(0.3)            # regularization
        self.fc2 = nn.Linear(hidden, out_dim)     # binary: congested / not congested
    
    def forward(self, block_data, tx_data):
        block_emb = self.block_encoder(block_data.x, block_data.edge_index, block_data.batch)
        tx_emb = self.tx_encoder(tx_data.x, tx_data.edge_index, tx_data.batch)
        
        # Fuse embeddings
        h = torch.cat([block_emb, tx_emb], dim=1)  # [batch_size, hidden*2]
        h = F.relu(self.fc1(h))
        h = self.dropout(h)
        return self.fc2(h)   # logits (use CrossEntropyLoss)

import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split


# ----- 1. Dataset Splitting -----
def split_dataset(block_graphs, tx_graphs, labels, test_size=0.2):
    # Ensure block_graphs[i] matches tx_graphs[i] and labels[i]
    idx_train, idx_test = train_test_split(range(len(labels)), test_size=test_size, random_state=42, stratify=labels)

    train_blocks = [block_graphs[i] for i in idx_train]
    test_blocks  = [block_graphs[i] for i in idx_test]
    train_txs    = [tx_graphs[i] for i in idx_train]
    test_txs     = [tx_graphs[i] for i in idx_test]
    y_train      = labels[idx_train]
    y_test       = labels[idx_test]

    return train_blocks, test_blocks, train_txs, test_txs, y_train, y_test


# ----- 2. Custom Dataloader Wrapper -----
class HybridDataset(torch.utils.data.Dataset):
    def __init__(self, block_graphs, tx_graphs, labels):
        self.block_graphs = block_graphs
        self.tx_graphs = tx_graphs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.block_graphs[idx], self.tx_graphs[idx], self.labels[idx]


# ----- 3. Collate Function for Hybrid Graphs -----
def collate_fn(batch):
    block_batch, tx_batch, labels = zip(*batch)
    return (
        torch_geometric.data.Batch.from_data_list(block_batch),
        torch_geometric.data.Batch.from_data_list(tx_batch),
        torch.tensor(labels, dtype=torch.long)
    )


# ----- 4. Training Function -----
def train(model, loader, optimizer, criterion, device):
    model.train()
    total_loss, correct = 0, 0
    for block_data, tx_data, labels in loader:
        block_data, tx_data, labels = block_data.to(device), tx_data.to(device), labels.to(device)

        optimizer.zero_grad()
        out = model(block_data, tx_data)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        preds = out.argmax(dim=1)
        correct += (preds == labels).sum().item()

    return total_loss / len(loader.dataset), correct / len(loader.dataset)


# ----- 5. Evaluation Function -----
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, correct = 0, 0
    with torch.no_grad():
        for block_data, tx_data, labels in loader:
            block_data, tx_data, labels = block_data.to(device), tx_data.to(device), labels.to(device)

            out = model(block_data, tx_data)
            loss = criterion(out, labels)

            total_loss += loss.item() * labels.size(0)
            preds = out.argmax(dim=1)
            correct += (preds == labels).sum().item()

    return total_loss / len(loader.dataset), correct / len(loader.dataset)


# ----- 6. Main Training Loop -----
def run_training(block_graphs, tx_graphs, labels, model, epochs=20, batch_size=32, lr=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Split dataset
    train_blocks, test_blocks, train_txs, test_txs, y_train, y_test = split_dataset(block_graphs, tx_graphs, labels)

    # Create datasets & loaders
    train_dataset = HybridDataset(train_blocks, train_txs, y_train)
    test_dataset  = HybridDataset(test_blocks, test_txs, y_test)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    test_loader  = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # Optimizer & Loss
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Training Loop
    for epoch in range(1, epochs + 1):
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
        test_loss, test_acc = evaluate(model, test_loader, criterion, device)

        print(f"Epoch {epoch:02d}: "
              f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}, "
              f"Test Loss={test_loss:.4f}, Test Acc={test_acc:.4f}")

    return model



In [142]:
"""This is to load data into dataframe so that it can be used by the graphs"""
from pymongo import MongoClient
import pandas as pd

def load_transactions(mongo_uri="mongodb://localhost:27017/",
                      db_name="solana_tx_db", collection_name="transactions"):
    client = MongoClient(mongo_uri)
    db = client[db_name]
    collection = db[collection_name]

    cursor = collection.find({})
    docs = list(cursor)
    df = pd.DataFrame(docs)

    # Ensure numeric types
    df["fee"] = pd.to_numeric(df["fee"], errors="coerce").fillna(0)
    df["value"] = pd.to_numeric(df["value"], errors="coerce").fillna(0)
    df["instructionCount"] = pd.to_numeric(df["instructionCount"], errors="coerce").fillna(0)

    return df


In [144]:
#Inorder to derive block level statistics we need to aggregate the data present in each of the blocks.
def build_block_stats(df):
    #Grouping blocks by slot
    block_stats = df.groupby("slot").agg(
        tx_count=("signature", "count"),
        success_count=("status", lambda x: (x == "success").sum()),
        fail_count=("status", lambda x: (x != "success").sum()),
        avg_fee=("fee", "mean"),
        median_fee=("fee", "median"),
        fee_variance=("fee", "var"),
        avg_value=("value", "mean"),
        median_value=("value", "median"),
        instruction_sum=("instructionCount", "sum"),
        block_time=("blockTime", "first")
    ).reset_index()

    #Computing block time delta i.e gap between blocks
    block_stats = block_stats.sort_values("slot").reset_index(drop=True)
    block_stats["block_time_delta"] = block_stats["block_time"].diff().fillna(0)
    block_stats["block_utilization"] = block_stats["tx_count"] / block_stats["tx_count"].max()

    return block_stats


In [146]:
from torch_geometric.data import Data
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler

def build_block_graphs(block_stats, window=16, util_thresh=0.85):
    """
    Returns:
      graphs: list[Data] — each has `window` block-nodes, chain edges within the window
      labels: torch.LongTensor — label for the *target* block at the end of each window
      target_slots: list[int] — slot numbers aligned to graphs/labels (for tx pairing)
    """
    bs = block_stats.sort_values("slot").reset_index(drop=True).copy()

    feat_cols = [
        "tx_count", "success_count", "fail_count",
        "avg_fee", "median_fee", "fee_variance",
        "block_time_delta", "block_utilization"
    ]
    X = bs[feat_cols].fillna(0).to_numpy()
    scaler = StandardScaler().fit(X)
    Xs = scaler.transform(X)

    graphs, labels, target_slots = [], [], []

    for i in range(window - 1, len(bs)):
        #window of blocks ending at i
        x_win = torch.tensor(Xs[i - window + 1:i + 1], dtype=torch.float)
        n = x_win.size(0)

        if n > 1:
            src = list(range(n - 1))
            dst = list(range(1, n))
            edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long)
        else:
            edge_index = torch.empty((2, 0), dtype=torch.long)

        # label from the *target* block (end of window)
        y = int(bs.loc[i, "block_utilization"] > util_thresh)

        # validation
        if edge_index.numel() > 0:
            assert edge_index.max().item() < n, \
                f"[BlockGraph] max edge {edge_index.max().item()} >= num_nodes {n}"

        data = Data(x=x_win, edge_index=edge_index)
        data.slot = int(bs.loc[i, "slot"])  
        graphs.append(data)
        labels.append(y)
        target_slots.append(data.slot)

    return graphs, torch.tensor(labels, dtype=torch.long), target_slots


In [148]:
def build_tx_graphs(df, target_slots):
    """
    For each slot in `target_slots`, build a tx graph *for that slot only*,
    with local 0..n-1 node ids and sequential bidirectional edges.
    """
    df = df.reset_index(drop=True).copy()

    feat_cols = ["fee", "value", "instructionCount"]
    X = df[feat_cols].fillna(0).to_numpy()
    scaler = StandardScaler().fit(X)
    Xs = scaler.transform(X)

    #Mapping slots to row indices (global)
    slot2idxs = df.groupby("slot").indices

    graphs = []
    for s in target_slots:
        idxs = list(slot2idxs.get(s, []))  # all tx rows for this slot (global indices)
        n = len(idxs)

        if n == 0:
            # make a minimal empty-edges graph so batching still works
            x = torch.zeros((1, len(feat_cols)), dtype=torch.float)
            edge_index = torch.empty((2, 0), dtype=torch.long)
        else:
            # features *for this slot*, stacked in the slot's order
            x = torch.tensor(Xs[idxs], dtype=torch.float)

            # IMPORTANT: edges must use LOCAL ids 0..n-1, never the global row ids
            if n > 1:
                src = list(range(n - 1))
                dst = list(range(1, n))
                edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long)
            else:
                edge_index = torch.empty((2, 0), dtype=torch.long)

        if edge_index.numel() > 0:
            assert edge_index.max().item() < x.size(0), \
                f"[TxGraph] max edge {edge_index.max().item()} >= num_nodes {x.size(0)} for slot {s}"

        data = Data(x=x, edge_index=edge_index)
        data.slot = int(s)
        graphs.append(data)

    return graphs


In [150]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool


# ----- Block Graph Encoder -----
class BlockEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(BlockEncoder, self).__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)
    
    def forward(self, x, edge_index, batch):
        h = F.relu(self.conv1(x, edge_index))
        h = F.relu(self.conv2(h, edge_index))   # <-- added ReLU for stability
        return global_mean_pool(h, batch)       # pooled block-level embedding


# ----- Transaction Graph Encoder -----
class TxEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(TxEncoder, self).__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)
    
    def forward(self, x, edge_index, batch):
        h = F.relu(self.conv1(x, edge_index))
        h = F.relu(self.conv2(h, edge_index))   # <-- added ReLU
        return global_mean_pool(h, batch)       # pooled tx-level embedding


# ----- Hybrid Congestion Prediction Model -----
class HybridCongestionModel(nn.Module):
    def __init__(self, block_in, tx_in, hidden=64, out_dim=2):
        super(HybridCongestionModel, self).__init__()
        # Encoders
        self.block_encoder = BlockEncoder(block_in, hidden, hidden)
        self.tx_encoder = TxEncoder(tx_in, hidden, hidden)
        
        # Fusion + classification
        self.fc1 = nn.Linear(hidden*2, hidden)
        self.dropout = nn.Dropout(0.3)            # regularization
        self.fc2 = nn.Linear(hidden, out_dim)     # binary: congested / not congested
    
    def forward(self, block_data, tx_data):
        block_emb = self.block_encoder(block_data.x, block_data.edge_index, block_data.batch)
        tx_emb = self.tx_encoder(tx_data.x, tx_data.edge_index, tx_data.batch)
        
        # Fuse embeddings
        h = torch.cat([block_emb, tx_emb], dim=1)  
        h = F.relu(self.fc1(h))
        h = self.dropout(h)
        return self.fc2(h)   


In [152]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data, Batch


def split_dataset(block_graphs, tx_graphs, labels, test_size=0.2):
    # Ensure block_graphs[i] matches tx_graphs[i] and labels[i]
    idx_train, idx_test = train_test_split(range(len(labels)), test_size=test_size, random_state=42, stratify=None)

    train_blocks = [block_graphs[i] for i in idx_train]
    test_blocks  = [block_graphs[i] for i in idx_test]
    train_txs    = [tx_graphs[i] for i in idx_train]
    test_txs     = [tx_graphs[i] for i in idx_test]
    y_train      = labels[idx_train]
    y_test       = labels[idx_test]

    return train_blocks, test_blocks, train_txs, test_txs, y_train, y_test


# ----- 2. Custom Dataloader Wrapper -----
class HybridDataset(torch.utils.data.Dataset):
    def __init__(self, block_graphs, tx_graphs, labels):
        self.block_graphs = block_graphs
        self.tx_graphs = tx_graphs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.block_graphs[idx], self.tx_graphs[idx], self.labels[idx]


# ----- 3. Collate Function for Hybrid Graphs -----
def collate_fn(batch):
    block_batch, tx_batch, labels = zip(*batch)
    return (
        Batch.from_data_list(block_batch),
        Batch.from_data_list(tx_batch),
        torch.tensor(labels, dtype=torch.long)
    )


# ----- 4. Training Function -----
def train(model, loader, optimizer, criterion, device):
    model.train()
    total_loss, correct = 0, 0
    for block_data, tx_data, labels in loader:
        block_data, tx_data, labels = block_data.to(device), tx_data.to(device), labels.to(device)

        optimizer.zero_grad()
        out = model(block_data, tx_data)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        preds = out.argmax(dim=1)
        correct += (preds == labels).sum().item()

    return total_loss / len(loader.dataset), correct / len(loader.dataset)


# ----- 5. Evaluation Function -----
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, correct = 0, 0
    with torch.no_grad():
        for block_data, tx_data, labels in loader:
            block_data, tx_data, labels = block_data.to(device), tx_data.to(device), labels.to(device)

            out = model(block_data, tx_data)
            loss = criterion(out, labels)

            total_loss += loss.item() * labels.size(0)
            preds = out.argmax(dim=1)
            correct += (preds == labels).sum().item()

    return total_loss / len(loader.dataset), correct / len(loader.dataset)


# ----- 6. Main Training Loop -----
def run_training(block_graphs, tx_graphs, labels, model, epochs=20, batch_size=32, lr=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Split dataset
    train_blocks, test_blocks, train_txs, test_txs, y_train, y_test = split_dataset(block_graphs, tx_graphs, labels)

    # Create datasets & loaders
    train_dataset = HybridDataset(train_blocks, train_txs, y_train)
    test_dataset  = HybridDataset(test_blocks, test_txs, y_test)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    test_loader  = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # Optimizer & Loss
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Training Loop
    for epoch in range(1, epochs + 1):
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
        test_loss, test_acc = evaluate(model, test_loader, criterion, device)

        print(f"Epoch {epoch:02d}: "
              f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}, "
              f"Test Loss={test_loss:.4f}, Test Acc={test_acc:.4f}")

    return model


In [156]:
def main():
    # 1) load + aggregate
    df = load_transactions()
    block_stats = build_block_stats(df)

    # 2) build graphs
    block_graphs, labels, target_slots = build_block_graphs(block_stats, window=16, util_thresh=0.85)
    tx_graphs = build_tx_graphs(df, target_slots)

    import numpy as np
    print("n samples (graphs):", len(block_graphs))
    print(labels)
    unique, counts = np.unique(labels.numpy() if hasattr(labels, "numpy") else labels, return_counts=True)
    print("Label distribution:", dict(zip(unique, counts)))
    
    # If using train/test split function:
    train_blocks, test_blocks, train_txs, test_txs, y_train, y_test = split_dataset(block_graphs, tx_graphs, labels)
    print("train size:", len(train_blocks), "test size:", len(test_blocks))
    print("train label counts:", np.unique(y_train, return_counts=True))
    print("test label counts:", np.unique(y_test, return_counts=True))


    # 3) quick sanity prints (won’t crash on empty edges)
    for i, (b, t) in enumerate(zip(block_graphs[:5], tx_graphs[:5])): 
        be = (b.edge_index.max().item() if b.edge_index.numel() else -1)
        te = (t.edge_index.max().item() if t.edge_index.numel() else -1)
        print(f"[{i}] slot={b.slot}  block_nodes={b.x.size(0)} max_e={be} | "
              f"tx_nodes={t.x.size(0)} max_e={te}")

    # 4) train
    model = HybridCongestionModel(block_in=8, tx_in=3, hidden=64, out_dim=2)
    run_training(block_graphs, tx_graphs, labels, model, epochs=20, batch_size=16, lr=1e-3)

if __name__ == "__main__":
    main()
    


n samples (graphs): 81
tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0])
Label distribution: {0: 71, 1: 10}
train size: 64 test size: 17
train label counts: (array([0, 1]), array([56,  8]))
test label counts: (array([0, 1]), array([15,  2]))
[0] slot=354935319  block_nodes=16 max_e=15 | tx_nodes=1436 max_e=1435
[1] slot=354935334  block_nodes=16 max_e=15 | tx_nodes=1427 max_e=1426
[2] slot=354935349  block_nodes=16 max_e=15 | tx_nodes=1396 max_e=1395
[3] slot=354935365  block_nodes=16 max_e=15 | tx_nodes=1682 max_e=1681
[4] slot=365730564  block_nodes=16 max_e=15 | tx_nodes=1278 max_e=1277
Epoch 01: Train Loss=0.7325, Train Acc=0.1250, Test Loss=0.7067, Test Acc=0.2353
Epoch 02: Train Loss=0.6889, Train Acc=0.5312, Test Loss=0.6631, Test Acc=0.8824
Epoch 03: Train

In [28]:
from torch_geometric.data import Data
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler

def build_block_graphs(block_stats, window=16, util_thresh=0.8):
    """
    Returns:
      graphs: list[Data] — each has `window` block-nodes, chain edges within the window
      labels: torch.LongTensor — label for the *target* block at the end of each window
      target_slots: list[int] — slot numbers aligned to graphs/labels (for tx pairing)
    """
    bs = block_stats.sort_values("slot").reset_index(drop=True).copy()

    feat_cols = [
        "tx_count", "success_count", "fail_count",
        "avg_fee", "median_fee", "fee_variance",
        "block_time_delta", "block_utilization"
    ]
    X = bs[feat_cols].fillna(0).to_numpy()
    scaler = StandardScaler().fit(X)
    Xs = scaler.transform(X)

    graphs, labels, target_slots = [], [], []

    for i in range(window - 1, len(bs)):
        # window of blocks ending at i
        x_win = torch.tensor(Xs[i - window + 1:i + 1], dtype=torch.float)  # [n, F]
        n = x_win.size(0)

        if n > 1:
            src = list(range(n - 1))
            dst = list(range(1, n))
            edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long)
        else:
            edge_index = torch.empty((2, 0), dtype=torch.long)

        # label from the *target* block (end of window)
        y = int(bs.loc[i, "block_utilization"] > util_thresh)

        # validation
        if edge_index.numel() > 0:
            assert edge_index.max().item() < n, \
                f"[BlockGraph] max edge {edge_index.max().item()} >= num_nodes {n}"

        data = Data(x=x_win, edge_index=edge_index)
        data.slot = int(bs.loc[i, "slot"])  # keep slot to align with tx graphs later
        graphs.append(data)
        labels.append(y)
        target_slots.append(data.slot)

    return graphs, torch.tensor(labels, dtype=torch.long), target_slots


In [30]:
def build_tx_graphs(df, target_slots):
    """
    For each slot in `target_slots`, build a tx graph *for that slot only*,
    with local 0..n-1 node ids and sequential bidirectional edges.
    """
    df = df.reset_index(drop=True).copy()

    feat_cols = ["fee", "value", "instructionCount"]
    X = df[feat_cols].fillna(0).to_numpy()
    scaler = StandardScaler().fit(X)
    Xs = scaler.transform(X)

    # Map slot -> row indices (global)
    slot2idxs = df.groupby("slot").indices

    graphs = []
    for s in target_slots:
        idxs = list(slot2idxs.get(s, []))  # all tx rows for this slot (global indices)
        n = len(idxs)

        if n == 0:
            # make a minimal empty-edges graph so batching still works
            x = torch.zeros((1, len(feat_cols)), dtype=torch.float)
            edge_index = torch.empty((2, 0), dtype=torch.long)
        else:
            # features *for this slot*, stacked in the slot's order
            x = torch.tensor(Xs[idxs], dtype=torch.float)

            # IMPORTANT: edges must use LOCAL ids 0..n-1, never the global row ids
            if n > 1:
                src = list(range(n - 1))
                dst = list(range(1, n))
                edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long)
            else:
                edge_index = torch.empty((2, 0), dtype=torch.long)

        if edge_index.numel() > 0:
            assert edge_index.max().item() < x.size(0), \
                f"[TxGraph] max edge {edge_index.max().item()} >= num_nodes {x.size(0)} for slot {s}"

        data = Data(x=x, edge_index=edge_index)
        data.slot = int(s)
        graphs.append(data)

    return graphs


In [32]:
def main():
    # 1) load + aggregate
    df = load_transactions()
    block_stats = build_block_stats(df)

    # 2) build graphs
    block_graphs, labels, target_slots = build_block_graphs(block_stats, window=16, util_thresh=0.8)
    tx_graphs = build_tx_graphs(df, target_slots)

    # 3) quick sanity prints (won’t crash on empty edges)
    for i, (b, t) in enumerate(zip(block_graphs[:5], tx_graphs[:5])):  # first few
        be = (b.edge_index.max().item() if b.edge_index.numel() else -1)
        te = (t.edge_index.max().item() if t.edge_index.numel() else -1)
        print(f"[{i}] slot={b.slot}  block_nodes={b.x.size(0)} max_e={be} | "
              f"tx_nodes={t.x.size(0)} max_e={te}")

    # 4) train
    model = HybridCongestionModel(block_in=8, tx_in=3, hidden=64, out_dim=2)
    run_training(block_graphs, tx_graphs, labels, model, epochs=20, batch_size=16, lr=1e-3)

if __name__ == "__main__":
    main()


[0] slot=354935319  block_nodes=16 max_e=15 | tx_nodes=1436 max_e=1435
[1] slot=354935334  block_nodes=16 max_e=15 | tx_nodes=1427 max_e=1426
[2] slot=354935349  block_nodes=16 max_e=15 | tx_nodes=1396 max_e=1395
[3] slot=354935365  block_nodes=16 max_e=15 | tx_nodes=1682 max_e=1681


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [14]:
import pandas as pd
from pymongo import MongoClient

def load_and_aggregate(mongo_uri="mongodb://localhost:27017", db_name="solana_tx_db", collection_name="transactions"):
    client = MongoClient(mongo_uri)
    db = client[db_name]
    collection = db[collection_name]
    df = pd.DataFrame(list(collection.find({})))
    
    block_stats = df.groupby("slot").agg(
        block_time=("blockTime", "first"),
        tx_count=("signature", "count"),
        success_count=("status", lambda x: (x == "success").sum()),
        fail_count=("status", lambda x: (x != "success").sum()),
        avg_fee=("fee", "mean"),
        median_fee=("fee", "median"),
        fee_variance=("fee", "var"),
        programs=("programs", lambda x: x.explode().value_counts().to_dict())
    ).reset_index()
    

    block_stats["block_time_delta"] = block_stats["block_time"].diff()
    MAX_TX_PER_BLOCK = 5000
    block_stats["block_utilization"] = block_stats["tx_count"] / MAX_TX_PER_BLOCK
    
    return block_stats


In [24]:
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler

def build_graph(block_stats):
    features = block_stats[[
        "tx_count", "success_count", "fail_count",
        "avg_fee", "median_fee", "fee_variance",
        "block_time_delta", "block_utilization"
    ]].fillna(0)
    
    scaler = StandardScaler()
    x = torch.tensor(scaler.fit_transform(features), dtype=torch.float)
    

    src = list(range(len(block_stats)-1))
    dst = list(range(1, len(block_stats)))
    edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long) 
    
    # Target label is to predict congestion (binary classification)
    # Define congestion = utilization > 0.8
    y = torch.tensor((block_stats["block_utilization"] > 0.8).astype(int).values, dtype=torch.long)
    
    return Data(x=x, edge_index=edge_index, y=y)


In [26]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class BlockCongestionGCN(nn.Module):
    def __init__(self, input_dim, hidden_dim=32, output_dim=2):
        super(BlockCongestionGCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return self.fc(x)


In [28]:
import torch
from torch_geometric.loader import DataLoader

def train_gnn(data, epochs=50, lr=0.01):
    model = BlockCongestionGCN(input_dim=data.num_node_features)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    criterion = nn.CrossEntropyLoss()
    
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            pred = out.argmax(dim=1)
            acc = (pred == data.y).sum().item() / data.num_nodes
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}, Acc: {acc:.4f}")
    return model


In [30]:
#load mongodb data, build graph and training gnn model
block_stats = load_and_aggregate()
graph_data = build_graph(block_stats)
model = train_gnn(graph_data, epochs=100)
model.eval()
with torch.no_grad():
    out = model(graph_data)
    predictions = out.argmax(dim=1).numpy()

block_stats["predicted_congestion"] = predictions
print(block_stats[["slot", "block_utilization", "predicted_congestion"]].head(20))


Epoch 0, Loss: 0.7340, Acc: 0.3684
Epoch 10, Loss: 0.0087, Acc: 1.0000
Epoch 20, Loss: 0.0000, Acc: 1.0000
Epoch 30, Loss: 0.0000, Acc: 1.0000
Epoch 40, Loss: 0.0000, Acc: 1.0000
Epoch 50, Loss: 0.0000, Acc: 1.0000
Epoch 60, Loss: 0.0000, Acc: 1.0000
Epoch 70, Loss: 0.0000, Acc: 1.0000
Epoch 80, Loss: 0.0000, Acc: 1.0000
Epoch 90, Loss: 0.0000, Acc: 1.0000
         slot  block_utilization  predicted_congestion
0   354935068             0.3392                     0
1   354935084             0.3352                     0
2   354935100             0.3162                     0
3   354935116             0.3332                     0
4   354935132             0.3494                     0
5   354935148             0.3244                     0
6   354935164             0.3268                     0
7   354935183             0.2964                     0
8   354935199             0.2948                     0
9   354935215             0.3172                     0
10  354935232             0.2852    