# Cleaning Part 2: Flattening the Paths and building the data splits

In this section we flatten the paths into columns with each hop feature. This is necessary preprocessing for our skikit classifiers.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load your data
data = pd.read_csv('df_paths.csv')


Here's the indexed breakdown of each element in the 'hops' tuples:

0. df_hop_sorted['hop_index']: This is the index of the hop in the path. It starts from 0 and increments by 1 for each subsequent hop in the path.

1. df_hop_sorted['pubkey']: This is the public key of the node that the hop is going to. It uniquely identifies a node in the network.

2. df_hop_sorted['source_pubkey']: This is the public key of the node where the hop is coming from. It also uniquely identifies a node in the network.

3. df_hop_sorted['is_final_hop']: This is a boolean value indicating whether the hop is the final hop in the path. If it's True, this means the hop is going to the final destination node.

4. df_hop_sorted['scid']: This is the short channel ID (scid) of the channel used for the hop. It uniquely identifies a channel in the network.

5. df_hop_sorted['failure']: This is a boolean value indicating whether the hop failed. If it's True, this means the payment failed at this hop.

So, each tuple in the 'hops' list represents a hop in the path, and the elements of the tuple provide information about the hop. The index of each element in the tuple is as listed above.

In [2]:
data.head()



Unnamed: 0,hops,path_failure,path_amount,duration_seconds
0,[[0. 2. 0. 0. 1. 0.]\n [1. 1. 2. 1. 0. 1.]],1.0,tensor(50000000.),tensor(2.)
1,[[0. 3. 0. 0. 2. 0.]\n [1. 4. 3. 0. 3. 1.]],1.0,tensor(50000000.),tensor(1.)
2,[[0. 2. 0. 0. 1. 0.]\n [1. 6. 2. 0. 6. 1.]],1.0,tensor(50000000.),tensor(1.)
3,[[0. 3. 0. 0. 2. 0.]\n [1. 7. 3. 0. 8. 1.]],1.0,tensor(50000000.),tensor(1.)
4,[[ 0. 3. 0. 0. 2. 0.]\n [ 1. 8. 3. 0. ...,1.0,tensor(50000000.),tensor(1.)


In [3]:
data.dtypes


hops                 object
path_failure        float64
path_amount          object
duration_seconds     object
dtype: object

Here we'll map the hops into 2 dimensional arrays, just required formatting from reading back from the csv

In [4]:
import re
import numpy as np

def str_to_2dlist(s):
    s = re.sub(r'\s+', ', ', s.replace('\n', '').replace('[ ', '[').replace(' ]', ']'))  # Remove newline characters and extra spaces
    s = re.sub(r'\[,', '[', s)  # Remove leading commas in each sub-list
    return eval(s)

# Convert 'hops' column to 2D numpy arrays
data['hops'] = data['hops'].apply(lambda x: np.array(str_to_2dlist(x)))

data.head()


Unnamed: 0,hops,path_failure,path_amount,duration_seconds
0,"[[0.0, 2.0, 0.0, 0.0, 1.0, 0.0], [1.0, 1.0, 2....",1.0,tensor(50000000.),tensor(2.)
1,"[[0.0, 3.0, 0.0, 0.0, 2.0, 0.0], [1.0, 4.0, 3....",1.0,tensor(50000000.),tensor(1.)
2,"[[0.0, 2.0, 0.0, 0.0, 1.0, 0.0], [1.0, 6.0, 2....",1.0,tensor(50000000.),tensor(1.)
3,"[[0.0, 3.0, 0.0, 0.0, 2.0, 0.0], [1.0, 7.0, 3....",1.0,tensor(50000000.),tensor(1.)
4,"[[0.0, 3.0, 0.0, 0.0, 2.0, 0.0], [1.0, 8.0, 3....",1.0,tensor(50000000.),tensor(1.)


## Dropping the Hop Failure

Hop failure is recorded for all the attempted payments, but it's not information we'll have available to us when we're going to predict whether a path will succeed or not.

We could potentially use it if we were to make a model predicting whetehr a specific hop will fial, but for this project we're just looking for whether the PATH will fail or not, so we can just drop it .

In [18]:
data['hops'] = data['hops'].apply(lambda x: [i[:-1] for i in x])
data.head()


KeyError: 'hops'

We'll have to reformat the columns we saved as tensors into numpy arrays and ints.


In [6]:
import ast

def str_to_float(s):
    # Remove the 'tensor' part from the string
    s = s.replace('tensor(', '').replace(')', '')
    # Convert the string to a float
    return float(s)

data['path_amount'] = data['path_amount'].apply(str_to_float)
data['duration_seconds'] = data['duration_seconds'].apply(str_to_float)
data.head()


Unnamed: 0,hops,path_failure,path_amount,duration_seconds
0,"[[0.0, 2.0, 0.0, 0.0, 1.0], [1.0, 1.0, 2.0, 1....",1.0,50000000.0,2.0
1,"[[0.0, 3.0, 0.0, 0.0, 2.0], [1.0, 4.0, 3.0, 0....",1.0,50000000.0,1.0
2,"[[0.0, 2.0, 0.0, 0.0, 1.0], [1.0, 6.0, 2.0, 0....",1.0,50000000.0,1.0
3,"[[0.0, 3.0, 0.0, 0.0, 2.0], [1.0, 7.0, 3.0, 0....",1.0,50000000.0,1.0
4,"[[0.0, 3.0, 0.0, 0.0, 2.0], [1.0, 8.0, 3.0, 0....",1.0,50000000.0,1.0


Now we save the formatted data to not have to go through all that over and over again

In [7]:
data.to_csv('./cleaned_data/hold.csv')


In [9]:
data = pd.read_csv('./cleaned_data/hold.csv')


## Flattening Hops into Columns

The scikit learn machine learning models we'll be using initially don't take tensors and can't handle array data. So we'll have to flatten the hops into columns.

We'll do this by finding the max number of hops, creating columns for features of hops up to that final hop, and saving them.

In [7]:
from tqdm import tqdm

# Determine the maximum hop index across all rows
max_hop_index = max(data['hops'].apply(lambda x: max(sublist[0] for sublist in x)))

# Create separate columns for each attribute of each hop
for i in tqdm(range(int(max_hop_index) + 1)):
    data[f'hop_{i}_destination_pubkey'] = data['hops'].apply(lambda x: [sublist[1] for sublist in x if sublist[0] == i][0] if [sublist[1] for sublist in x if sublist[0] == i] else None)
    data[f'hop_{i}_source_pubkey'] = data['hops'].apply(lambda x: [sublist[2] for sublist in x if sublist[0] == i][0] if [sublist[2] for sublist in x if sublist[0] == i] else None)
    data[f'hop_{i}_is_final_hop'] = data['hops'].apply(lambda x: [sublist[3] for sublist in x if sublist[0] == i][0] if [sublist[3] for sublist in x if sublist[0] == i] else None)
    data[f'hop_{i}_scid'] = data['hops'].apply(lambda x: [sublist[4] for sublist in x if sublist[0] == i][0] if [sublist[4] for sublist in x if sublist[0] == i] else None)

# Drop the original 'hops' column
data = data.drop('hops', axis=1)
data = data.drop('duration_seconds', axis=1)
data.head()


100%|██████████| 13/13 [01:26<00:00,  6.64s/it]


Unnamed: 0,path_failure,path_amount,hop_0_destination_pubkey,hop_0_source_pubkey,hop_0_is_final_hop,hop_0_scid,hop_1_destination_pubkey,hop_1_source_pubkey,hop_1_is_final_hop,hop_1_scid,...,hop_10_is_final_hop,hop_10_scid,hop_11_destination_pubkey,hop_11_source_pubkey,hop_11_is_final_hop,hop_11_scid,hop_12_destination_pubkey,hop_12_source_pubkey,hop_12_is_final_hop,hop_12_scid
0,1.0,50000000.0,2.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,...,,,,,,,,,,
1,1.0,50000000.0,3.0,0.0,0.0,2.0,4.0,3.0,0.0,3.0,...,,,,,,,,,,
2,1.0,50000000.0,2.0,0.0,0.0,1.0,6.0,2.0,0.0,6.0,...,,,,,,,,,,
3,1.0,50000000.0,3.0,0.0,0.0,2.0,7.0,3.0,0.0,8.0,...,,,,,,,,,,
4,1.0,50000000.0,3.0,0.0,0.0,2.0,8.0,3.0,0.0,10.0,...,,,,,,,,,,


Now we'll fill all the shorter paths with < max_hops with -1s , the classifier will learn to ignore these but we have to do something because the classifier models don't take nulls or NAs.

In [10]:
# Fill NaN values with -1
data = data.fillna(-1)

# Convert all columns to integers
data = data.astype(int)

data.head()


Unnamed: 0,path_failure,path_amount,hop_0_destination_pubkey,hop_0_source_pubkey,hop_0_is_final_hop,hop_0_scid,hop_1_destination_pubkey,hop_1_source_pubkey,hop_1_is_final_hop,hop_1_scid,...,hop_10_is_final_hop,hop_10_scid,hop_11_destination_pubkey,hop_11_source_pubkey,hop_11_is_final_hop,hop_11_scid,hop_12_destination_pubkey,hop_12_source_pubkey,hop_12_is_final_hop,hop_12_scid
0,1,50000000,2,0,0,1,1,2,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1,50000000,3,0,0,2,4,3,0,3,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,1,50000000,2,0,0,1,6,2,0,6,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,1,50000000,3,0,0,2,7,3,0,8,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,1,50000000,3,0,0,2,8,3,0,10,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


now we'll save our data so we don't have to run through everything over and over again.

In [13]:
data.to_csv('./cleaned_data/final.csv')


## Train, Test, Validation Splits

Here we're going to split everything into train, test, and validation data which we'll use for all the models.

This is because if we don't train the models across the same splits, we can't fairly compare them.

We'll apply SMOTE to the train dataset to ensure it's evenly balanced to improve the classifier training processes, but not apply it to the validation or test data because those should only include real data.

In [15]:
import logging
import sys
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import os
import pickle

# Create a logger
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Split data into features and target
logging.info('Splitting data into features and target')
target = data['path_failure']
features = data.drop('path_failure', axis=1)
features = features.fillna(-1)

# Split data into train+validation and test sets
logging.info('Splitting data into train+validation and test sets')
features_train_val, features_test, target_train_val, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Split train+validation set into separate training and validation sets
logging.info('Splitting train+validation set into separate training and validation sets')
features_train, features_val, target_train, target_val = train_test_split(features_train_val, target_train_val, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2

# Now apply SMOTE only to the training set
logging.info('Applying SMOTE to the training set')
smote = SMOTE(random_state=42)
features_train, target_train = smote.fit_resample(features_train, target_train)

logging.info('Data preparation completed')


In [16]:
# Check the balance of the 'path_failure' variable in the training set
print(target_train.value_counts())


1    581886
0    581886
Name: path_failure, dtype: int64


now we'll save the data splits for use with our models

In [17]:

# Create a new directory to save the data
os.makedirs('cleaned_data', exist_ok=True)

# Save the data
features_train.to_csv('cleaned_data/features_train.csv')
target_train.to_csv('cleaned_data/target_train.csv')
features_test.to_csv('cleaned_data/features_test.csv')
target_test.to_csv('cleaned_data/target_test.csv')
features_val.to_csv('cleaned_data/features_val.csv')
target_val.to_csv('cleaned_data/target_val.csv')
