In [7]:
import pandas as pd

balanced_df = pd.read_csv("./ml_datasets/balanced_df.csv")
balanced_df = balanced_df.astype('int64')
balanced_df.dtypes


Unnamed: 0              int64
probe_id                int64
path_index              int64
path_amount             int64
path_id                 int64
hop_index               int64
scid                    int64
fee                     int64
attempted               int64
is_final_hop            int64
pubkey_map_id           int64
path_failure            int64
hop_failure             int64
hop_duration_seconds    int64
dtype: object

In [17]:
from collections import defaultdict
import numpy as np
import torch

# Initialize a defaultdict of defaultdicts
probes = defaultdict(lambda: defaultdict(list))

# Group the DataFrame by 'probe_id' and 'path_index'
grouped = balanced_df.groupby(['probe_id', 'path_index'])

# Iterate over the groups
for (probe_id, path_index), group in grouped:
    # Create a NumPy array for this group and convert it to a PyTorch tensor
    hop_tensor = torch.tensor(group[['path_amount', 'path_id', 'hop_index', 'scid', 'fee', 'attempted', 'is_final_hop', 'path_failure', 'hop_failure', 'hop_duration_seconds', 'pubkey_map_id']].values)
    # Add the tensor to the list for this 'probe_id' and 'path_index'
    probes[probe_id][path_index].append(hop_tensor)


In [45]:
from torch_geometric.data import Data

data_list = []

# Iterate over the probes
for probe_id in probes:
    for path_index in probes[probe_id]:
        # Get the hop tensor for this probe path
        hop_tensor = probes[probe_id][path_index][0]
        
        # Create edge_index, edge_attr, and node_attr
        edge_index = []
        edge_attr = []
        node_attr = []
        for i in range(len(hop_tensor)):
            src = hop_tensor[i][10]    # pubkey_map_id at hop_index i
            scid = hop_tensor[i][3]    # scid at hop_index i
            path_amount = hop_tensor[i][0]   # path_amount at hop_index i
            fee = hop_tensor[i][4]   # fee at hop_index i
            hop_duration = hop_tensor[i][9]   # hop_duration_seconds at hop_index i
            is_final_hop = hop_tensor[i][6]   # is_final_hop at hop_index i
            hop_failure = hop_tensor[i][8]   # hop_failure at hop_index i
            if i < len(hop_tensor) - 1:
                dst = hop_tensor[i+1][10]  # pubkey_map_id at hop_index i+1
                edge_index.append((src, dst))
                edge_attr.append((scid, path_amount, fee, hop_duration))
            node_attr.append((is_final_hop, hop_failure))
        
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_attr = torch.tensor(edge_attr, dtype=torch.float)
        node_attr = torch.tensor(node_attr, dtype=torch.float)
        
        # Get the target variable for this probe path
        path_failure = hop_tensor[0][7].clone().detach()  # path_failure at hop_index 0
        
        # Create a Data instance for this probe path
        data = Data(x=node_attr, edge_index=edge_index, edge_attr=edge_attr, y=path_failure)
        
        # Add the Data instance to the list
        data_list.append(data)


In [64]:
# Select a sample Data instance from the list
sample_data = data_list[213593]  # Adjust the index to select a different sample

# Print the details of the sample
print("Sample Data Instance:")
print("Node Attributes (x):")
print(sample_data.x)
print("\nEdge Indices (edge_index):")
print(sample_data.edge_index)
print("\nEdge Attributes (edge_attr):")
print(sample_data.edge_attr)
print("\nTarget Value (y):")
print(sample_data.y)


Sample Data Instance:
Node Attributes (x):
tensor([[0., 0.]])

Edge Indices (edge_index):
tensor([], dtype=torch.int64)

Edge Attributes (edge_attr):
tensor([])

Target Value (y):
tensor(0)


In [22]:
import pickle

with open('data_list.pkl', 'wb') as f:
    pickle.dump(data_list, f)


In [62]:
print(len(sub_dataset))


1000


In [53]:
from torch_geometric.data import DataLoader
from torch.utils.data import Dataset

class ProbesDataset(Dataset):
    def __init__(self, data_list):
        self.data_list = data_list

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        return self.data_list[idx]

# Assuming data_list is a list of Data instances, one for each probe
dataset = ProbesDataset(data_list)
dataloader = DataLoader(dataset, batch_size=32)


In [57]:
sub_dataset = dataset[:1000]


In [58]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import pytorch_lightning as pl

class GNNModel(torch.nn.Module):
    def __init__(self, num_node_features):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return x


In [59]:
import pytorch_lightning as pl
from torch_geometric.data import DataLoader
import torch.nn.functional as F

class LightningGNN(pl.LightningModule):
    def __init__(self, data_list, model):
        super().__init__()
        self.model = model
        self.dataset = ProbesDataset(data_list)

    def forward(self, data):
        return self.model(data)

    def training_step(self, batch, batch_idx):
        data = batch  # 'batch' is already a combined object with .x, .edge_index, etc.
        targets = data.y.view(-1, 1)
        predictions = self(data).view(-1, 1)
        loss = F.binary_cross_entropy_with_logits(predictions, targets)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.01)

    def train_dataloader(self):
        return DataLoader(self.dataset, batch_size=32, shuffle=True)


In [60]:
from pytorch_lightning import Trainer
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Assuming num_node_features is the number of node features
num_node_features = 2  # replace with the actual number of node features

model = GNNModel(num_node_features)
module = LightningGNN(sub_dataset, model)

trainer = Trainer(max_epochs=10)
trainer.fit(module)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [11]:
import pickle
with open('probes.pkl', 'wb') as f:
    pickle.dump(probes, f)

# with open('probes.pkl', 'rb') as f:
#     probes = pickle.load(f)


PicklingError: Can't pickle <function <lambda> at 0x7ff632606670>: attribute lookup <lambda> on __main__ failed