In [1]:
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import networkx as nx
import plotly.graph_objects as go
import pandas as pd
from sqlalchemy import create_engine
import os
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os


In [2]:
# Get the database parameters from environment variables
load_dotenv()
db_params = {
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST'),
    'port': os.getenv('DB_PORT'),
}

# Create the connection string
conn_str = f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}"

# Create the database engine
engine = create_engine(conn_str)


In [3]:
df_path = pd.read_sql("SELECT * FROM path", engine)
df_path.head()


Unnamed: 0,id,probe_id,path_index,path_amount,failure,duration_seconds
0,38871,11098,0,50000000,True,2
1,38872,11099,0,50000000,True,1
2,38873,11100,0,50000000,True,1
3,38874,11101,0,50000000,True,1
4,38875,11102,0,50000000,True,1


In [4]:
df_hop = pd.read_sql("SELECT * FROM hop", engine)


In [5]:
def bytes_list_to_hex(bytes_list):
    pubkey_bytes = b''.join(bytes_list)

    # Convert byte sequence to a large integer
    pubkey_int = pubkey_bytes.hex()
    
    return pubkey_int


In [6]:
df_hop['pubkey'] = df_hop['pubkey'].apply(bytes_list_to_hex)


In [7]:
# Create a DataFrame with unique public keys from df_hop
df_unique_pubkeys = pd.DataFrame(df_hop['pubkey'].unique(), columns=['pubkey'])

# Insert your public key at the first position
my_pubkey = 'MyPublicKey'
df_unique_pubkeys = pd.concat([pd.DataFrame([my_pubkey], columns=['pubkey']), df_unique_pubkeys], ignore_index=True)
df_unique_pubkeys.dtypes


pubkey    object
dtype: object

In [8]:
# Add a new column 'pubkey_index' to df_unique_pubkeys
df_unique_pubkeys['pubkey_index'] = df_unique_pubkeys.index

# Create a mapping from pubkey to pubkey_index
pubkey_to_index = df_unique_pubkeys.set_index('pubkey')['pubkey_index'].to_dict()

# Replace the 'pubkey' column in df_hop with 'pubkey_index'
df_hop['pubkey'] = df_hop['pubkey'].map(pubkey_to_index)

df_hop.head()


Unnamed: 0,id,path_id,hop_index,scid,fee,pubkey,failure,attempted,is_final_hop
0,250113,38871,1,8.747901e+17,50000000,1,True,True,True
1,250114,38871,0,8.801975e+17,1500,2,False,True,False
2,250116,38872,0,8.881206e+17,1400,3,False,True,False
3,250115,38872,1,8.800612e+17,0,4,True,True,False
4,250117,38872,2,8.69065e+17,50000000,1,False,False,True


In [9]:
# Create a DataFrame with unique scid from df_hop
df_unique_scids = pd.DataFrame(df_hop['scid'].unique(), columns=['scid'])

# Add a new column 'scid_index' to df_unique_scids
df_unique_scids['scid_index'] = df_unique_scids.index

# Create a mapping from scid to scid_index
scid_to_index = df_unique_scids.set_index('scid')['scid_index'].to_dict()

# Replace the 'scid' column in df_hop with 'scid_index'
df_hop['scid'] = df_hop['scid'].map(scid_to_index)


In [10]:
import pickle

with open('scid_dict.pkl', 'wb') as f:
    pickle.dump(scid_to_index, f)


In [11]:
df_hop.head()


Unnamed: 0,id,path_id,hop_index,scid,fee,pubkey,failure,attempted,is_final_hop
0,250113,38871,1,0,50000000,1,True,True,True
1,250114,38871,0,1,1500,2,False,True,False
2,250116,38872,0,2,1400,3,False,True,False
3,250115,38872,1,3,0,4,True,True,False
4,250117,38872,2,4,50000000,1,False,False,True


In [12]:
df_hop_attempted = df_hop[df_hop['attempted'] == True]
df_hop_sorted = df_hop_attempted.sort_values(by=['path_id', 'hop_index'])


In [13]:
# Create a new column 'source_pubkey' that contains the pubkey of the source node
df_hop_sorted['source_pubkey'] = df_hop_sorted['pubkey'].shift(1)
df_hop_sorted.loc[df_hop_sorted['hop_index'] == 0, 'source_pubkey'] = 0  # for hop_index=0, source_pubkey=0
df_hop_sorted['source_pubkey'] = df_hop_sorted['source_pubkey'].astype(int)


In [14]:
df_hop_sorted.head()


Unnamed: 0,id,path_id,hop_index,scid,fee,pubkey,failure,attempted,is_final_hop,source_pubkey
1,250114,38871,0,1,1500,2,False,True,False,0
0,250113,38871,1,0,50000000,1,True,True,True,2
2,250116,38872,0,2,1400,3,False,True,False,0
3,250115,38872,1,3,0,4,True,True,False,3
8,250121,38873,0,1,1500,2,False,True,False,0


In [15]:
# Group by 'path_id' and create a list of tuples for each path
df_hop_sorted['hops'] = list(zip(df_hop_sorted['hop_index'], df_hop_sorted['pubkey'], df_hop_sorted['source_pubkey'], df_hop_sorted['is_final_hop'], df_hop_sorted['scid'], df_hop_sorted['failure']))
df_paths = df_hop_sorted.groupby('path_id')['hops'].apply(list).reset_index()

# Create 'path_failure' column
df_paths['path_failure'] = df_hop_sorted.groupby('path_id')['failure'].any().values

df_paths.head()


Unnamed: 0,path_id,hops,path_failure
0,38871,"[(0, 2, 0, False, 1, False), (1, 1, 2, True, 0...",True
1,38872,"[(0, 3, 0, False, 2, False), (1, 4, 3, False, ...",True
2,38873,"[(0, 2, 0, False, 1, False), (1, 6, 2, False, ...",True
3,38874,"[(0, 3, 0, False, 2, False), (1, 7, 3, False, ...",True
4,38875,"[(0, 3, 0, False, 2, False), (1, 8, 3, False, ...",True


In [16]:
import torch

# Set 'path_id' as the index
df_paths.set_index('path_id', inplace=True)

# Convert 'hops' to tensor
df_paths['hops'] = df_paths['hops'].apply(lambda x: torch.tensor(x, dtype=torch.float32))

# Convert 'path_failure' to tensor
df_paths['path_failure'] = df_paths['path_failure'].apply(lambda x: torch.tensor([x], dtype=torch.float32))

df_paths.head()


Unnamed: 0_level_0,hops,path_failure
path_id,Unnamed: 1_level_1,Unnamed: 2_level_1
38871,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(1.)]
38872,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)]
38873,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(1.)]
38874,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)]
38875,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)]


In [17]:
# Merge 'df_paths' with 'df_path' on 'path_id'
df_paths = df_paths.merge(df_path[['id', 'path_amount', 'duration_seconds']], left_index=True, right_on='id')

# Drop the 'id' column
df_paths.drop(columns='id', inplace=True)

# Convert 'path_amount' and 'duration_seconds' to tensor
df_paths['path_amount'] = df_paths['path_amount'].apply(lambda x: torch.tensor([x], dtype=torch.float32))
df_paths['duration_seconds'] = df_paths['duration_seconds'].apply(lambda x: torch.tensor([x], dtype=torch.float32))

df_paths.head()


Unnamed: 0,hops,path_failure,path_amount,duration_seconds
0,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(1.)],[tensor(50000000.)],[tensor(2.)]
1,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)],[tensor(50000000.)],[tensor(1.)]
2,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(1.)],[tensor(50000000.)],[tensor(1.)]
3,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)],[tensor(50000000.)],[tensor(1.)]
4,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)],[tensor(50000000.)],[tensor(1.)]


In [21]:
# Convert tensors to numpy arrays
df_paths['hops'] = df_paths['hops'].apply(lambda x: x)
df_paths['path_failure'] = df_paths['path_failure'].apply(lambda x: x)
df_paths['path_amount'] = df_paths['path_amount'].apply(lambda x: x[0])
df_paths['duration_seconds'] = df_paths['duration_seconds'].apply(lambda x: x[0])

# Save DataFrame as CSV
df_paths.to_csv('df_paths.csv', index=False)


In [None]:
df_paths.head()


In [19]:
df_paths[df_paths['path_failure'] == 0].head()


Unnamed: 0,hops,path_failure,path_amount_x,duration_seconds_x,path_amount_y,duration_seconds_y
26,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(0.)],[tensor(3.2627e+08)],[tensor(12.)],50000000,2
27,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(0.)],[tensor(1.2499e+08)],[tensor(10.)],50000000,2
28,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(0.)],[tensor(49999900.)],[tensor(8.)],50000000,4
35,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(0.)],[tensor(23483124.)],[tensor(10.)],50000000,3
36,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(0.)],[tensor(49999900.)],[tensor(8.)],50000000,1


In [20]:
import pickle

with open('df_paths.pkl', 'wb') as f:
    pickle.dump(df_paths, f)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load your data
data = df_hop_sorted

df = pd.DataFrame(data)

# Convert boolean to int
df['is_final_hop'] = df['is_final_hop'].astype(int)

# Split data into features and target
X = df[['source_pubkey', 'pubkey', 'hop_index', 'is_final_hop']]
y = df['failure']

# Convert labels to binary (0 for success, 1 for failure)
y = (y > 0).astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)

# Create a Dataset
class PathDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# Create DataLoaders
train_dataset = PathDataset(X_train, y_train)
test_dataset = PathDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Define the model
class PathClassifier(nn.Module):
    def __init__(self):
        super(PathClassifier, self).__init__()
        self.fc1 = nn.Linear(4, 10)  # 4 input features
        self.fc2 = nn.Linear(10, 10)
        self.fc3 = nn.Linear(10, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))  # Sigmoid for binary classification
        return x


In [None]:

model = PathClassifier()

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):  # Number of epochs
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Evaluate the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = (outputs.squeeze() > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy: {100 * correct / total}%')


In [32]:
# df_hop_sorted = df_hop_sorted.rename(columns={'pubkey': 'destination_pubkey'})

import torch 

# Create edges
edges = df_hop_sorted[['source_pubkey', 'destination_pubkey']].values.T
edge_index = torch.tensor(edges, dtype=torch.long)

# Create nodes
nodes = pd.concat([df_hop_sorted['source_pubkey'], df_hop_sorted['destination_pubkey']]).unique()
x = torch.tensor(nodes, dtype=torch.long)

# Create target
df_hop_sorted['path_failure'] = df_hop_sorted.groupby('path_id')['failure'].transform('any')
y = torch.tensor(df_hop_sorted['path_failure'].values, dtype=torch.float)


In [33]:
import torch
from torch_geometric.nn import GCNConv
import pytorch_lightning as pl

class GCN(pl.LightningModule):
    def __init__(self, num_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, 128)
        self.conv2 = GCNConv(128, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = torch.dropout(x, p=0.5, training=self.training)

        x = self.conv2(x, edge_index)

        return torch.sigmoid(x)

    def training_step(self, data, batch_idx):
        y_hat = self.forward(data)
        loss = torch.nn.functional.binary_cross_entropy(y_hat, data.y)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.01)


In [34]:
from torch_geometric.data import DataLoader
from torch_geometric.datasets import random_split

num_nodes = len(x)

# Create a boolean tensor of shape [num_nodes]
mask = torch.rand(num_nodes) < 0.8

# Training mask: 80% of the data
train_mask = mask

# Validation mask: 10% of the data
val_mask = ~mask & (torch.rand(num_nodes) < 0.5)

# Test mask: remaining data
test_mask = ~(train_mask | val_mask)


ImportError: cannot import name 'random_split' from 'torch_geometric.datasets' (/home/paperspace/.local/lib/python3.9/site-packages/torch_geometric/datasets/__init__.py)

In [None]:
model = GCN(num_features=x.shape[1], num_classes=2)
trainer = pl.Trainer(max_epochs=100)
trainer.fit(model, train_loader, val_loader)


In [None]:
trainer.test(test_dataloaders=test_loader)
