In [1]:
import os
from urllib.parse import quote

import dash
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import plotly.graph_objects as go
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
from dotenv import load_dotenv
from sqlalchemy import create_engine


## Connecting to our Probing Database

We'll be using a dataset of lightning network "probes": fake payments used to discern the current relative balances in a channel to inform path selection.

The dataset comes from our own lightning network prober and was supplemented by MutinyWallet, a popular bitcoin and lightning wallet that also uses the Lightning Development Kit for their probing.

We load both our own probes and Mutiny's into a Neon Postgres database so they are easy to work with from Python toolchains.

This section covers the initial data manipulation and exploration we did to structure the data for the ML models.

In [None]:
# Read the connection settings from the environment (load_dotenv() also picks up
# a local .env file) so that no credentials are stored in the notebook itself.
load_dotenv()
db_params = {
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST'),
    'port': os.getenv('DB_PORT'),
}

# Fail early with a readable message: an unset variable would otherwise be
# formatted into the URL below as the literal text "None".
missing_params = [name for name, value in db_params.items() if value is None]
if missing_params:
    raise RuntimeError(f"Database settings missing from the environment: {missing_params}")

# Create the connection string. User name and password are percent-encoded
# because characters such as '@', ':' or '/' would otherwise be read as URL
# delimiters and corrupt the host/port part.
conn_str = (
    f"postgresql://{quote(db_params['user'], safe='')}:{quote(db_params['password'], safe='')}"
    f"@{db_params['host']}:{db_params['port']}/{db_params['dbname']}"
)

# Create the database engine
engine = create_engine(conn_str)


### Paths

Whenever a "Probe" payment is attempted, LDK gives it up to 5 candidate Paths. A Path through the lightning network may or may not succeed, but it is guaranteed to be topologically connected, and normally the bitcoin wallet simply tries the Paths one after another until one works. We keep only the Paths that were actually attempted, use them as our training set, and train classifier models that predict whether a path will succeed before it is attempted.

In [3]:
# Pull the whole `path` table into memory and preview the first rows.
df_path = pd.read_sql(sql="SELECT * FROM path", con=engine)
df_path.head()


Unnamed: 0,id,probe_id,path_index,path_amount,failure,duration_seconds
0,38871,11098,0,50000000,True,2
1,38872,11099,0,50000000,True,1
2,38873,11100,0,50000000,True,1
3,38874,11101,0,50000000,True,1
4,38875,11102,0,50000000,True,1


## Hops

Hops are the channel edges through which the payment passes along the Path for the payment. Hops originate at our paying node, and terminate at the receiver.

If the payment failed along the path prior to the final hop, we know that the hop on which it failed has insufficient liquidity to route the payment, and can binary search off that amount until we find the amount the channel is capable of paying.

In [4]:
# Pull the whole `hop` table into memory.
# NOTE(review): later previews render `scid` as a float (8.747901e+17); if the
# column really arrives as float64, 64-bit channel ids may lose their low bits
# and distinct channels could collide - confirm the dtype against the schema.
df_hop = pd.read_sql(sql="SELECT * FROM hop", con=engine)


#### Handling pubkeys

Nodes are identified by secp256k1 compressed public keys of 33 bytes, so we'll convert those to hex temporarily. Then we'll map the nodes to index numbers so they're smaller and more manageable when we do our model training.

In [5]:
def bytes_list_to_hex(bytes_list):
    """Return the lowercase hex encoding of a public key.

    Args:
        bytes_list: Either an iterable of bytes-like chunks, which are
            concatenated in order, or a single bytes-like object
            (``bytes``, ``bytearray`` or ``memoryview``) holding the whole key.

    Returns:
        str: Hex string with two characters per input byte.
    """
    if isinstance(bytes_list, (bytes, bytearray, memoryview)):
        # A raw bytes value iterates as ints, which b''.join() rejects, so it
        # is encoded directly instead of being joined.
        return bytes(bytes_list).hex()

    return b''.join(bytes_list).hex()


In [6]:
# Replace every raw pubkey value with its hex string, element by element.
# Not re-runnable as-is: a second pass would feed hex strings back into the converter.
df_hop['pubkey'] = df_hop['pubkey'].map(bytes_list_to_hex)


In [7]:
# Placeholder for our own node's public key. It is listed first so that it
# receives row position 0 once the keys are numbered.
my_pubkey = 'MyPublicKey'

# One row per distinct key: ours, followed by every key seen in df_hop in
# order of first appearance.
df_unique_pubkeys = pd.DataFrame({'pubkey': [my_pubkey, *df_hop['pubkey'].unique()]})
df_unique_pubkeys.dtypes


pubkey    object
dtype: object

In [8]:
# Number the keys by their row position in df_unique_pubkeys.
df_unique_pubkeys['pubkey_index'] = df_unique_pubkeys.index

# Lookup table: hex pubkey -> small integer index.
pubkey_to_index = dict(zip(df_unique_pubkeys['pubkey'], df_unique_pubkeys['pubkey_index']))

# Swap the hex strings in df_hop for their indices. The column keeps the name
# 'pubkey' but holds integers from here on, so this cell must not be run twice.
df_hop['pubkey'] = df_hop['pubkey'].map(pubkey_to_index)

df_hop.head()


Unnamed: 0,id,path_id,hop_index,scid,fee,pubkey,failure,attempted,is_final_hop
0,250113,38871,1,8.747901e+17,50000000,1,True,True,True
1,250114,38871,0,8.801975e+17,1500,2,False,True,False
2,250116,38872,0,8.881206e+17,1400,3,False,True,False
3,250115,38872,1,8.800612e+17,0,4,True,True,False
4,250117,38872,2,8.69065e+17,50000000,1,False,False,True


#### Short Channel IDs

Now we'll do the same index mapping with short channel IDs, which are used to uniquely identify channels, but read as extremely large integers. SCIDs are not ordinal data so this mapping does not lose information.

In [9]:
# One row per distinct short channel id, in order of first appearance.
# NOTE(review): the preview above shows scid as a float (8.747901e+17); if two
# channel ids differ only in their low bits they may already be equal here and
# would then share one index - confirm the column dtype.
df_unique_scids = pd.DataFrame({'scid': df_hop['scid'].unique()})

# Number the channels by row position.
df_unique_scids['scid_index'] = df_unique_scids.index

# Lookup table: original scid -> small integer index.
scid_to_index = df_unique_scids.set_index('scid')['scid_index'].to_dict()

# Swap the raw scids in df_hop for their indices (the column name stays 'scid').
df_hop['scid'] = df_hop['scid'].map(scid_to_index)


In [None]:
import pickle

# Persist the scid -> index lookup so the indices can be mapped back to real
# channel ids later on.
with open('scid_dict.pkl', 'wb') as scid_file:
    scid_file.write(pickle.dumps(scid_to_index))


In [11]:
# Preview the hops now that both pubkey and scid hold integer indices.
df_hop.head(n=5)


Unnamed: 0,id,path_id,hop_index,scid,fee,pubkey,failure,attempted,is_final_hop
0,250113,38871,1,0,50000000,1,True,True,True
1,250114,38871,0,1,1500,2,False,True,False
2,250116,38872,0,2,1400,3,False,True,False
3,250115,38872,1,3,0,4,True,True,False
4,250117,38872,2,4,50000000,1,False,False,True


In [12]:
# Keep only the hops flagged as attempted, then order them path by path and,
# within a path, by hop position.
df_hop_attempted = df_hop.loc[df_hop['attempted'] == True]
df_hop_sorted = df_hop_attempted.sort_values(['path_id', 'hop_index'])


#### Transforming Hop Data

The pubkey stored on a hop is its destination node. The source node is not stored, but it can be inferred from the previous hop on the same path, which is what we do next.

In [13]:
# Create a new column 'source_pubkey' that contains the pubkey of the source node.
# A hop's source is the destination ('pubkey') of the previous hop on the SAME path,
# so the shift is done inside each path_id group. A frame-wide shift would hand the
# last node of one path to the next path whenever that path's first row is not
# hop_index 0.
previous_hop_pubkey = df_hop_sorted.groupby('path_id')['pubkey'].shift(1)

# The first hop of a path leaves our own node, which was placed at index 0.
previous_hop_pubkey[df_hop_sorted['hop_index'] == 0] = 0

# Any other row without a predecessor (a path whose hop 0 is absent) also falls
# back to index 0 so the integer cast cannot fail on NaN.
# NOTE(review): confirm that attributing such rows to our node is acceptable.
df_hop_sorted['source_pubkey'] = previous_hop_pubkey.fillna(0).astype(int)


In [14]:
# Preview the ordered hops together with the inferred source node.
df_hop_sorted.head(n=5)


Unnamed: 0,id,path_id,hop_index,scid,fee,pubkey,failure,attempted,is_final_hop,source_pubkey
1,250114,38871,0,1,1500,2,False,True,False,0
0,250113,38871,1,0,50000000,1,True,True,True,2
2,250116,38872,0,2,1400,3,False,True,False,0
3,250115,38872,1,3,0,4,True,True,False,3
8,250121,38873,0,1,1500,2,False,True,False,0


## Path Failures

We recreate the path by grouping the hops together on path_id

In [15]:
# Pack each hop row into one tuple:
# (hop_index, pubkey, source_pubkey, is_final_hop, scid, failure).
# NOTE(review): the per-hop 'failure' flag is part of the tuple while
# 'path_failure' below is derived from that same flag - if 'hops' is later fed
# to a model as input, this leaks the label; confirm it is dropped downstream.
df_hop_sorted['hops'] = list(
    df_hop_sorted[
        ['hop_index', 'pubkey', 'source_pubkey', 'is_final_hop', 'scid', 'failure']
    ].itertuples(index=False, name=None)
)

# One row per path, holding the list of its hop tuples in hop order.
df_paths = df_hop_sorted.groupby('path_id')['hops'].apply(list).reset_index()

# A path counts as failed when any of its hops failed. Both groupbys sort by
# path_id, so the values line up with df_paths row for row.
df_paths['path_failure'] = df_hop_sorted.groupby('path_id')['failure'].any().to_numpy()

df_paths.head()


Unnamed: 0,path_id,hops,path_failure
0,38871,"[(0, 2, 0, False, 1, False), (1, 1, 2, True, 0...",True
1,38872,"[(0, 3, 0, False, 2, False), (1, 4, 3, False, ...",True
2,38873,"[(0, 2, 0, False, 1, False), (1, 6, 2, False, ...",True
3,38874,"[(0, 3, 0, False, 2, False), (1, 7, 3, False, ...",True
4,38875,"[(0, 3, 0, False, 2, False), (1, 8, 3, False, ...",True


## Torch Tensors

We'll need to format these as torch tensors for the deep learning models later.

In [16]:
import torch


def _as_float_tensor(values):
    """Wrap `values` in a float32 torch tensor."""
    return torch.tensor(values, dtype=torch.float32)


# Use path_id as the row label instead of a regular column.
df_paths = df_paths.set_index('path_id')

# Each list of hop tuples becomes one 2-D float tensor (booleans turn into 0./1.).
df_paths['hops'] = df_paths['hops'].apply(_as_float_tensor)

# The label becomes a one-element float tensor per path.
df_paths['path_failure'] = df_paths['path_failure'].apply(lambda failed: _as_float_tensor([failed]))

df_paths.head()


Unnamed: 0_level_0,hops,path_failure
path_id,Unnamed: 1_level_1,Unnamed: 2_level_1
38871,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(1.)]
38872,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)]
38873,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(1.)]
38874,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)]
38875,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)]


## Combine recreated paths with Paths for additional data

We'll take our rebuilt paths and merge them with the `path_amount` and `duration_seconds` columns from the path table.

In [17]:
# Attach amount and duration from the path table by matching df_paths' index
# (path_id) against df_path.id. The helper 'id' column is dropped right after.
# Note that the merged frame takes its row labels from df_path, so path_id is
# no longer available afterwards.
path_details = df_path[['id', 'path_amount', 'duration_seconds']]
df_paths = (
    df_paths
    .merge(path_details, left_index=True, right_on='id')
    .drop(columns='id')
)

# Wrap both numeric columns in one-element float32 tensors.
# NOTE(review): float32 holds integers exactly only up to 2**24, so large
# amounts may be rounded - confirm this is acceptable for the models.
for numeric_column in ('path_amount', 'duration_seconds'):
    df_paths[numeric_column] = df_paths[numeric_column].apply(
        lambda value: torch.tensor([value], dtype=torch.float32)
    )

df_paths.head()


Unnamed: 0,hops,path_failure,path_amount,duration_seconds
0,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(1.)],[tensor(50000000.)],[tensor(2.)]
1,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)],[tensor(50000000.)],[tensor(1.)]
2,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",[tensor(1.)],[tensor(50000000.)],[tensor(1.)]
3,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)],[tensor(50000000.)],[tensor(1.)]
4,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",[tensor(1.)],[tensor(50000000.)],[tensor(1.)]


Now we'll save it to a csv to avoid reprocessing

In [21]:
# Unwrap the one-element tensors into 0-dim tensors. reshape(-1) accepts both the
# one-element and the already-unwrapped form, so re-running this cell is safe.
for scalar_column in ('path_amount', 'duration_seconds'):
    df_paths[scalar_column] = df_paths[scalar_column].apply(lambda t: t.reshape(-1)[0])

# Build the export frame from plain Python values. Writing the tensors themselves
# would store their repr ("tensor([[0., 2., ...]])") in the CSV, which cannot be
# loaded back without parsing that text. Nested lists and floats can be restored
# with ast.literal_eval / float.
# NOTE(review): this changes the text written to df_paths.csv - update any reader
# that parses the old "tensor(...)" strings.
df_paths_export = df_paths.assign(
    hops=df_paths['hops'].apply(lambda t: t.tolist()),
    path_failure=df_paths['path_failure'].apply(lambda t: t.tolist()),
    path_amount=df_paths['path_amount'].apply(lambda t: t.item()),
    duration_seconds=df_paths['duration_seconds'].apply(lambda t: t.item()),
)

# Save DataFrame as CSV
df_paths_export.to_csv('df_paths.csv', index=False)
