In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load your data
data = pd.read_csv('df_paths.csv')


Here's the indexed breakdown of each element in the 'hops' tuples:

0. df_hop_sorted['hop_index']: This is the index of the hop in the path. It starts from 0 and increments by 1 for each subsequent hop in the path.

1. df_hop_sorted['pubkey']: This is the public key of the node that the hop is going to. It uniquely identifies a node in the network.

2. df_hop_sorted['source_pubkey']: This is the public key of the node where the hop is coming from. It also uniquely identifies a node in the network.

3. df_hop_sorted['is_final_hop']: This is a boolean value indicating whether the hop is the final hop in the path. If it's True, this means the hop is going to the final destination node.

4. df_hop_sorted['scid']: This is the short channel ID (scid) of the channel used for the hop. It uniquely identifies a channel in the network.

5. df_hop_sorted['failure']: This is a boolean value indicating whether the hop failed. If it's True, this means the payment failed at this hop.

So, each tuple in the 'hops' list represents a hop in the path, and the elements of the tuple provide information about the hop. The index of each element in the tuple is as listed above.

In [44]:
data.head()



Unnamed: 0,hops,path_failure,path_amount,duration_seconds
0,[[0. 2. 0. 0. 1. 0.]\n [1. 1. 2. 1. 0. 1.]],1.0,tensor(50000000.),tensor(2.)
1,[[0. 3. 0. 0. 2. 0.]\n [1. 4. 3. 0. 3. 1.]],1.0,tensor(50000000.),tensor(1.)
2,[[0. 2. 0. 0. 1. 0.]\n [1. 6. 2. 0. 6. 1.]],1.0,tensor(50000000.),tensor(1.)
3,[[0. 3. 0. 0. 2. 0.]\n [1. 7. 3. 0. 8. 1.]],1.0,tensor(50000000.),tensor(1.)
4,[[ 0. 3. 0. 0. 2. 0.]\n [ 1. 8. 3. 0. ...,1.0,tensor(50000000.),tensor(1.)


In [45]:
data.dtypes


hops                 object
path_failure        float64
path_amount          object
duration_seconds     object
dtype: object

In [46]:
import re
import numpy as np

def str_to_2dlist(s):
    s = re.sub(r'\s+', ', ', s.replace('\n', '').replace('[ ', '[').replace(' ]', ']'))  # Remove newline characters and extra spaces
    s = re.sub(r'\[,', '[', s)  # Remove leading commas in each sub-list
    return eval(s)

# Convert 'hops' column to 2D numpy arrays
data['hops'] = data['hops'].apply(lambda x: np.array(str_to_2dlist(x)))

data.head()


Unnamed: 0,hops,path_failure,path_amount,duration_seconds
0,"[[0.0, 2.0, 0.0, 0.0, 1.0, 0.0], [1.0, 1.0, 2....",1.0,tensor(50000000.),tensor(2.)
1,"[[0.0, 3.0, 0.0, 0.0, 2.0, 0.0], [1.0, 4.0, 3....",1.0,tensor(50000000.),tensor(1.)
2,"[[0.0, 2.0, 0.0, 0.0, 1.0, 0.0], [1.0, 6.0, 2....",1.0,tensor(50000000.),tensor(1.)
3,"[[0.0, 3.0, 0.0, 0.0, 2.0, 0.0], [1.0, 7.0, 3....",1.0,tensor(50000000.),tensor(1.)
4,"[[0.0, 3.0, 0.0, 0.0, 2.0, 0.0], [1.0, 8.0, 3....",1.0,tensor(50000000.),tensor(1.)


In [37]:
data['hops'] = data['hops'].apply(lambda x: [i[:-1] for i in x])
data.head()


Unnamed: 0,hops,path_failure,path_amount,duration_seconds
0,"[[0.0, 2.0, 0.0, 0.0, 1.0], [1.0, 1.0, 2.0, 1....",1.0,tensor(50000000.),tensor(2.)
1,"[[0.0, 3.0, 0.0, 0.0, 2.0], [1.0, 4.0, 3.0, 0....",1.0,tensor(50000000.),tensor(1.)
2,"[[0.0, 2.0, 0.0, 0.0, 1.0], [1.0, 6.0, 2.0, 0....",1.0,tensor(50000000.),tensor(1.)
3,"[[0.0, 3.0, 0.0, 0.0, 2.0], [1.0, 7.0, 3.0, 0....",1.0,tensor(50000000.),tensor(1.)
4,"[[0.0, 3.0, 0.0, 0.0, 2.0], [1.0, 8.0, 3.0, 0....",1.0,tensor(50000000.),tensor(1.)


In [47]:
import ast

def str_to_float(s):
    # Remove the 'tensor' part from the string
    s = s.replace('tensor(', '').replace(')', '')
    # Convert the string to a float
    return float(s)

data['path_amount'] = data['path_amount'].apply(str_to_float)
data['duration_seconds'] = data['duration_seconds'].apply(str_to_float)
data.head()


Unnamed: 0,hops,path_failure,path_amount,duration_seconds
0,"[[0.0, 2.0, 0.0, 0.0, 1.0, 0.0], [1.0, 1.0, 2....",1.0,50000000.0,2.0
1,"[[0.0, 3.0, 0.0, 0.0, 2.0, 0.0], [1.0, 4.0, 3....",1.0,50000000.0,1.0
2,"[[0.0, 2.0, 0.0, 0.0, 1.0, 0.0], [1.0, 6.0, 2....",1.0,50000000.0,1.0
3,"[[0.0, 3.0, 0.0, 0.0, 2.0, 0.0], [1.0, 7.0, 3....",1.0,50000000.0,1.0
4,"[[0.0, 3.0, 0.0, 0.0, 2.0, 0.0], [1.0, 8.0, 3....",1.0,50000000.0,1.0


In [48]:
# Flatten the 'hops' column into a 1D list
data['hops'] = data['hops'].apply(lambda x: [item for sublist in x for item in sublist])

# Create separate columns for each element in the 'hops' list
for i in range(data['hops'].apply(len).max()):
    data[f'hops_{i}'] = data['hops'].apply(lambda x: x[i] if i < len(x) else np.nan)

# Drop the original 'hops' column
data = data.drop('hops', axis=1)


In [49]:
data = data.drop('duration_seconds', axis=1)


In [50]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import os
import pickle

# Split data into features and target
target = data['path_failure']
features = data.drop('path_failure', axis=1)
features = features.fillna(0)

# Split data into train and test sets
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
features_train, target_train = smote.fit_resample(features_train, target_train)

# Split train set into train and validation sets
features_train, features_val, target_train, target_val = train_test_split(features_train, target_train, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2


In [29]:
# Check the balance of the 'path_failure' variable in the training set
print(target_train.value_counts())


1.0    582031
0.0    581841
Name: path_failure, dtype: int64


In [51]:

# Create a new directory to save the data
os.makedirs('cleaned_data', exist_ok=True)

# Save the data
pickle.dump(features_train, open('cleaned_data/features_train.pkl', 'wb'))
pickle.dump(target_train, open('cleaned_data/target_train.pkl', 'wb'))
pickle.dump(features_test, open('cleaned_data/features_test.pkl', 'wb'))
pickle.dump(target_test, open('cleaned_data/target_test.pkl', 'wb'))
pickle.dump(features_val, open('cleaned_data/features_val.pkl', 'wb'))
pickle.dump(target_val, open('cleaned_data/target_val.pkl', 'wb'))


In [52]:
from xgboost import XGBClassifier

# Initialize the model
xgb = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss')

# Define evaluation set (using validation set here)
eval_set = [(features_val, target_val)]

# Fit the model and print logs
xgb.fit(features_train, target_train, eval_set=eval_set, early_stopping_rounds=30, verbose=True)

# Predict on the test set
predictions_xgb = xgb.predict(features_test)


[0]	validation_0-logloss:0.44138
[1]	validation_0-logloss:0.30257
[2]	validation_0-logloss:0.21370
[3]	validation_0-logloss:0.15470
[4]	validation_0-logloss:0.11385
[5]	validation_0-logloss:0.08326
[6]	validation_0-logloss:0.06123
[7]	validation_0-logloss:0.04510
[8]	validation_0-logloss:0.03345
[9]	validation_0-logloss:0.02512
[10]	validation_0-logloss:0.01883
[11]	validation_0-logloss:0.01426
[12]	validation_0-logloss:0.01096
[13]	validation_0-logloss:0.00855
[14]	validation_0-logloss:0.00659
[15]	validation_0-logloss:0.00519
[16]	validation_0-logloss:0.00416
[17]	validation_0-logloss:0.00330
[18]	validation_0-logloss:0.00269
[19]	validation_0-logloss:0.00210
[20]	validation_0-logloss:0.00172
[21]	validation_0-logloss:0.00143
[22]	validation_0-logloss:0.00112
[23]	validation_0-logloss:0.00094
[24]	validation_0-logloss:0.00076
[25]	validation_0-logloss:0.00064
[26]	validation_0-logloss:0.00053
[27]	validation_0-logloss:0.00045
[28]	validation_0-logloss:0.00038
[29]	validation_0-loglos

In [53]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Print the predictions
print("Predictions:", predictions_xgb)

# Calculate and print the accuracy
accuracy = accuracy_score(target_test, predictions_xgb)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Print the classification report
report = classification_report(target_test, predictions_xgb)
print("Classification Report:\n", report)

# Print the confusion matrix
conf_mat = confusion_matrix(target_test, predictions_xgb)
print("Confusion Matrix:\n", conf_mat)


Predictions: [0 0 0 ... 0 0 0]
Accuracy: 100.00%


Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    194329
         1.0       1.00      1.00      1.00     54704

    accuracy                           1.00    249033
   macro avg       1.00      1.00      1.00    249033
weighted avg       1.00      1.00      1.00    249033

Confusion Matrix:
 [[194329      0]
 [     1  54703]]


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
rf.fit(features_train, target_train)

# Predict on the test set
predictions_rf = rf.predict(features_test)


In [None]:
# Convert columns to PyTorch tensors
data['hops'] = data['hops'].apply(lambda x: torch.tensor(x, dtype=torch.float32))


In [22]:
data['path_failure'] = data['path_failure'].apply(lambda x: torch.tensor(x, dtype=torch.float32))


In [26]:
import ast

def str_to_tensor(s):
    # Remove the 'tensor' part from the string
    s = s.replace('tensor(', '').replace(')', '')
    # Convert the string to a float
    s = float(s)
    # Convert the float to a PyTorch tensor
    return torch.tensor(s, dtype=torch.float32)

data['path_amount'] = data['path_amount'].apply(str_to_tensor)
data['path_amount'] = data['path_amount'].apply(lambda x: torch.tensor(x, dtype=torch.float32))


  data['path_amount'] = data['path_amount'].apply(lambda x: torch.tensor(x, dtype=torch.float32))


In [27]:
data['duration_seconds'] = data['duration_seconds'].apply(str_to_tensor)
data['duration_seconds'] = data['duration_seconds'].apply(lambda x: torch.tensor(x, dtype=torch.float32))


  data['duration_seconds'] = data['duration_seconds'].apply(lambda x: torch.tensor(x, dtype=torch.float32))


In [29]:
data = data.drop(['duration_seconds'], axis=1)
data.head()


Unnamed: 0,hops,path_failure,path_amount
0,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",tensor(1.),tensor(50000000.)
1,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",tensor(1.),tensor(50000000.)
2,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",tensor(1.),tensor(50000000.)
3,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",tensor(1.),tensor(50000000.)
4,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",tensor(1.),tensor(50000000.)


In [31]:
import pandas as pd
pubkey_map = pd.read_pickle('pubkey_dict.pkl')
list(pubkey_map.items())[:5]


[('026ec3e3438308519a75ca4496822a6c1e229174fbcaadeeb174704c377112c331', 0),
 ('03aefa43fbb4009b21a4129d05953974b7dbabbbfb511921410080860fca8ee1f0', 1),
 ('027100442c3b79f606f80f322d98d499eefcb060599efc5d4ecb00209c2cb54190', 2),
 ('02a98e8c590a1b5602049d6b21d8f4c8861970aa310762f42eae1b2be88372e924', 3),
 ('033d8656219478701227199cbd6f670335c8d408a92ae88b962c49d4dc0e83e025', 4)]

In [32]:
scid_map = pd.read_pickle('scid_dict.pkl')
list(scid_map.items())[:5]


[(8.747901429065974e+17, 0),
 (8.801975409563402e+17, 1),
 (8.88120621832536e+17, 2),
 (8.800612016854139e+17, 3),
 (8.69064985909592e+17, 4)]

In [33]:
# Split data into features and target
features = data[['hops', 'path_amount']]
target = data['path_failure']


In [34]:
# Split data into features and target
features = data[['hops', 'path_amount']]
target = data['path_failure']

from sklearn.model_selection import train_test_split
import os
import pickle

# Split data into train and test sets
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Split train set into train and validation sets
features_train, features_val, target_train, target_val = train_test_split(features_train, target_train, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2


In [None]:

# Create a new directory to save the data
os.makedirs('cleaned_data', exist_ok=True)

# Save the data
pickle.dump(features_train, open('cleaned_data/features_train.pkl', 'wb'))
pickle.dump(target_train, open('cleaned_data/target_train.pkl', 'wb'))
pickle.dump(features_test, open('cleaned_data/features_test.pkl', 'wb'))
pickle.dump(target_test, open('cleaned_data/target_test.pkl', 'wb'))
pickle.dump(features_val, open('cleaned_data/features_val.pkl', 'wb'))
pickle.dump(target_val, open('cleaned_data/target_val.pkl', 'wb'))


In [36]:
data.head()


Unnamed: 0,hops,path_failure,path_amount
0,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",tensor(1.),tensor(50000000.)
1,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",tensor(1.),tensor(50000000.)
2,"[[tensor(0.), tensor(2.), tensor(0.), tensor(0...",tensor(1.),tensor(50000000.)
3,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",tensor(1.),tensor(50000000.)
4,"[[tensor(0.), tensor(3.), tensor(0.), tensor(0...",tensor(1.),tensor(50000000.)


In [35]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
rf.fit(features_train, target_train)

# Predict on the test set
predictions_rf = rf.predict(features_test)


ValueError: only one element tensors can be converted to Python scalars

In [None]:
from xgboost import XGBClassifier

# Initialize the model
xgb = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss')

# Fit the model
xgb.fit(features_train, target_train)

# Predict on the test set
predictions_xgb = xgb.predict(features_test)


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Initialize the model
ada = AdaBoostClassifier(n_estimators=100, random_state=42)

# Fit the model
ada.fit(features_train, target_train)

# Predict on the test set
predictions_ada = ada.predict(features_test)


In [None]:
import pytorch_lightning as pl

class MLP(pl.LightningModule):
    def __init__(self, input_dim, output_dim):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim),
        )

    def forward(self, x):
        return self.layers(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.02)


In [None]:
import pytorch_lightning as pl

class LSTMNet(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0, c0 = self.init_hidden(x.size(0))
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

    def init_hidden(self, batch_size):
        return (torch.zeros(1, batch_size, self.hidden_dim).to(self.device),
                torch.zeros(1, batch_size, self.hidden_dim).to(self.device))

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.02)
