In [1]:
import pandas as pd

balanced_df = pd.read_csv("./ml_datasets/balanced_df.csv", index_col=0)
balanced_df.head()


Unnamed: 0,probe_id,path_index,path_amount,path_id,hop_index,scid,fee,attempted,is_final_hop,pubkey_map_id,path_failure,hop_failure,hop_duration_seconds
0,11098,0,50000000,38871,1,874790142906597376,50000000,True,True,0,1,True,2
1,11098,0,50000000,38871,0,880197540956340224,1500,True,False,1,1,False,2
2,11099,0,50000000,38872,0,888120621832536064,1400,True,False,2,1,False,1
3,11099,0,50000000,38872,1,880061201685413888,0,True,False,3,1,True,1
4,11099,0,50000000,38872,2,869064985909592064,50000000,False,True,0,1,False,1


In [2]:
# Convert all columns to integers
for col in balanced_df.columns:
    balanced_df[col] = balanced_df[col].astype(int)

# Sort by 'probe_id', 'path_index', and 'hop_index'
balanced_df.sort_values(by=['probe_id', 'path_index', 'hop_index'], inplace=True)
# Drop rows where 'attempted' is False
balanced_df = balanced_df[balanced_df['attempted'] == True]
# Drop the 'attempted' column
balanced_df = balanced_df.drop(columns=['attempted'])
balanced_df.head()


Unnamed: 0,probe_id,path_index,path_amount,path_id,hop_index,scid,fee,is_final_hop,pubkey_map_id,path_failure,hop_failure,hop_duration_seconds
1,11098,0,50000000,38871,0,880197540956340224,1500,0,1,1,0,2
0,11098,0,50000000,38871,1,874790142906597376,50000000,1,0,1,1,2
2,11099,0,50000000,38872,0,888120621832536064,1400,0,2,1,0,1
3,11099,0,50000000,38872,1,880061201685413888,0,0,3,1,1,1
8,11100,0,50000000,38873,0,880197540956340224,1500,0,1,1,0,1


In [3]:
from collections import defaultdict
import numpy as np
import torch

# Initialize a defaultdict of defaultdicts
probes = defaultdict(lambda: defaultdict(list))

# Group the DataFrame by 'probe_id' and 'path_index'
grouped = balanced_df.groupby(['probe_id', 'path_index'])
grouped.head()


Unnamed: 0,probe_id,path_index,path_amount,path_id,hop_index,scid,fee,is_final_hop,pubkey_map_id,path_failure,hop_failure,hop_duration_seconds
1,11098,0,50000000,38871,0,880197540956340224,1500,0,1,1,0,2
0,11098,0,50000000,38871,1,874790142906597376,50000000,1,0,1,1,2
2,11099,0,50000000,38872,0,888120621832536064,1400,0,2,1,0,1
3,11099,0,50000000,38872,1,880061201685413888,0,0,3,1,1,1
8,11100,0,50000000,38873,0,880197540956340224,1500,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6233052,348699,3,247500000,1281148,0,898527499299323904,3477,0,1,0,0,52
6233049,348699,3,247500000,1281148,1,873127681192361984,10000,0,107,0,0,52
6233051,348699,3,247500000,1281148,2,879730248572141568,222751,0,10,0,0,52
6233050,348699,3,247500000,1281148,3,898981597697277952,1247,0,2660,0,0,52


In [4]:
# Iterate over the groups
for (probe_id, path_index), group in grouped:
    # Create a NumPy array for this group and convert it to a PyTorch tensor
    hop_tensor = torch.tensor(group[['path_amount', 'path_id', 'hop_index', 'scid', 'fee', 'is_final_hop', 'path_failure', 'hop_failure', 'hop_duration_seconds', 'pubkey_map_id']].values)
    # Add the tensor to the list for this 'probe_id' and 'path_index'
    probes[probe_id][path_index].append(hop_tensor)


In [5]:
from torch_geometric.data import Data

data_list = []

# Iterate over the probes
for probe_id in probes:
    for path_index in probes[probe_id]:
        # Get the hop tensor for this probe path
        hop_tensor = probes[probe_id][path_index][0]
        
        # Create edge_index, edge_attr, and node_attr
        edge_index = []
        edge_attr = []
        node_attr = []
        for i in range(len(hop_tensor)):
            if i == 0:  # Special case for hop_index 0
                src = 1000000  # Special pubkey_map_id for my node
                dst = hop_tensor[i][9]  # pubkey_map_id at hop_index i
            else:
                src = hop_tensor[i-1][9]  # pubkey_map_id at hop_index i-1
                dst = hop_tensor[i][9]  # pubkey_map_id at hop_index i

            scid = hop_tensor[i][3]    # scid at hop_index i
            path_amount = hop_tensor[i][0]   # path_amount at hop_index i
            fee = hop_tensor[i][4]   # fee at hop_index i
            hop_duration = hop_tensor[i][8]   # hop_duration_seconds at hop_index i
            is_final_hop = hop_tensor[i][5]   # is_final_hop at hop_index i
            hop_failure = hop_tensor[i][7]   # hop_failure at hop_index i

            edge_index.append((src, dst))
            edge_attr.append((scid, path_amount, fee, hop_duration))
            node_attr.append((is_final_hop, hop_failure))
        
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_attr = torch.tensor(edge_attr, dtype=torch.float)
        node_attr = torch.tensor(node_attr, dtype=torch.float)
        
        # Get the target variable for this probe path
        path_failure = hop_tensor[0][6].clone().detach()  # path_failure at hop_index 0
        
        # Create a Data instance for this probe path
        data = Data(x=node_attr, edge_index=edge_index, edge_attr=edge_attr, y=path_failure)
        
        # Add the Data instance to the list
        data_list.append(data)


In [66]:
for key in probes:
    print(probes[key])
    break


defaultdict(<class 'list'>, {0: [tensor([[          50000000,              38871,                  0,
         880197540956340224,               1500,                  1,
                          0,                  1,                  0,
                          2,                  1],
        [          50000000,              38871,                  1,
         874790142906597376,           50000000,                  1,
                          1,                  1,                  1,
                          2,                  0]])]})


In [64]:
# Select a sample Data instance from the list
sample_data = data_list[213593]  # Adjust the index to select a different sample

# Print the details of the sample
print("Sample Data Instance:")
print("Node Attributes (x):")
print(sample_data.x)
print("\nEdge Indices (edge_index):")
print(sample_data.edge_index)
print("\nEdge Attributes (edge_attr):")
print(sample_data.edge_attr)
print("\nTarget Value (y):")
print(sample_data.y)


Sample Data Instance:
Node Attributes (x):
tensor([[0., 0.]])

Edge Indices (edge_index):
tensor([], dtype=torch.int64)

Edge Attributes (edge_attr):
tensor([])

Target Value (y):
tensor(0)


In [22]:
import pickle

with open('data_list.pkl', 'wb') as f:
    pickle.dump(data_list, f)


In [62]:
print(len(sub_dataset))


1000


In [6]:
from torch_geometric.data import DataLoader
from torch.utils.data import Dataset

class ProbesDataset(Dataset):
    def __init__(self, data_list):
        self.data_list = data_list

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        return self.data_list[idx]

# Assuming data_list is a list of Data instances, one for each probe
dataset = ProbesDataset(data_list)
dataloader = DataLoader(dataset, batch_size=32)




In [57]:
sub_dataset = dataset[:1000]


In [17]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.optim import Adam

class GNNModel(torch.nn.Module):
    def __init__(self, num_node_features):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

model = GNNModel(num_node_features=2)
optimizer = Adam(model.parameters(), lr=0.01)


In [10]:
class LightningGNN(pl.LightningModule):
    def __init__(self, train_data_list, val_data_list, model):
        super().__init__()
        self.model = model
        self.train_dataset = ProbesDataset(train_data_list)
        self.val_dataset = ProbesDataset(val_data_list)

    def forward(self, data):
        return self.model(data)

    def training_step(self, batch, batch_idx):
        data = batch
        targets = data.y.view(-1, 1)
        predictions = self(data).view(-1, 1)
        loss = F.binary_cross_entropy_with_logits(predictions, targets)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        data = batch
        targets = data.y.view(-1, 1)
        predictions = self(data).view(-1, 1)
        loss = F.binary_cross_entropy_with_logits(predictions, targets)
        self.log('val_loss', loss)
        return loss

    def on_validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('avg_val_loss', avg_loss)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.01)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=32, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=32)


In [15]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [32]:
from torch.utils.data import random_split
from torch_geometric.loader import DataLoader

# Split the dataset
train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


In [33]:
# Create a mapping from original node IDs to new indices
unique_node_ids = set()
for data in train_loader:
    unique_node_ids.update(data.edge_index.view(-1).tolist())
node_id_to_index = {node_id: idx for idx, node_id in enumerate(unique_node_ids)}

# Ensure that 1000000 is included in the dictionary
if 1000000 not in node_id_to_index:
    node_id_to_index[1000000] = max(node_id_to_index.values()) + 1

# Remap edge indices
for data in train_loader:
    data.edge_index = torch.tensor([[node_id_to_index[src], node_id_to_index[dst]] for src, dst in data.edge_index.t()])

KeyError: tensor(1000000)

In [30]:
def train(epoch):
    model.train()
    total_loss = 0
    for data in train_loader:
        optimizer.zero_grad()
        out = model(data)
        loss = F.binary_cross_entropy_with_logits(out, data.y.view(-1, 1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch: {epoch}, Loss: {total_loss / len(train_loader)}')

for epoch in range(10):
    train(epoch)

def validate():
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data in val_loader:
            out = model(data)
            loss = F.binary_cross_entropy_with_logits(out, data.y.view(-1, 1))
            total_loss += loss.item()
    print(f'Validation Loss: {total_loss / len(val_loader)}')

validate()


RuntimeError: index 928 is out of bounds for dimension 0 with size 148

In [None]:
trainer.test(test_dataloaders=DataLoader(test_dataset, batch_size=32))


In [11]:
import pickle
with open('probes.pkl', 'wb') as f:
    pickle.dump(probes, f)

# with open('probes.pkl', 'rb') as f:
#     probes = pickle.load(f)


PicklingError: Can't pickle <function <lambda> at 0x7ff632606670>: attribute lookup <lambda> on __main__ failed