In [1]:
import pandas as pd

data = pd.read_csv('./cleaned_data/final.csv', index_col=0)
data.head()


Unnamed: 0,path_failure,path_amount,hop_0_destination_pubkey,hop_0_source_pubkey,hop_0_is_final_hop,hop_0_scid,hop_1_destination_pubkey,hop_1_source_pubkey,hop_1_is_final_hop,hop_1_scid,...,hop_10_is_final_hop,hop_10_scid,hop_11_destination_pubkey,hop_11_source_pubkey,hop_11_is_final_hop,hop_11_scid,hop_12_destination_pubkey,hop_12_source_pubkey,hop_12_is_final_hop,hop_12_scid
0,1,50000000,2,0,0,1,1,2,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1,50000000,3,0,0,2,4,3,0,3,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,1,50000000,2,0,0,1,6,2,0,6,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,1,50000000,3,0,0,2,7,3,0,8,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,1,50000000,3,0,0,2,8,3,0,10,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [2]:
# Define the number of hops
num_hops = 13

# Initialize an empty list to store the new data
new_data = []

# Iterate over each row in the data
for _, row in data.iterrows():
    # Initialize an empty list to store the hops for this path
    path = []
    # Iterate over each hop
    for i in range(num_hops):
        # Get the properties for this hop
        hop = [
            row[f'hop_{i}_destination_pubkey'],
            row[f'hop_{i}_source_pubkey'],
            row[f'hop_{i}_is_final_hop'],
            row[f'hop_{i}_scid']
        ]
        # Add the hop to the path
        path.append(hop)
    # Add the path, path_failure, and path_amount to the new data
    new_data.append([row['path_failure'], row['path_amount'], path])

# Create a new DataFrame with the new data
new_df = pd.DataFrame(new_data, columns=['path_failure', 'path_amount', 'path'])

# Set the index of the new DataFrame to be the same as the original data
new_df.index = data.index


In [3]:
new_df.head()


Unnamed: 0,path_failure,path_amount,path
0,1,50000000,"[[2, 0, 0, 1], [1, 2, 1, 0], [-1, -1, -1, -1],..."
1,1,50000000,"[[3, 0, 0, 2], [4, 3, 0, 3], [-1, -1, -1, -1],..."
2,1,50000000,"[[2, 0, 0, 1], [6, 2, 0, 6], [-1, -1, -1, -1],..."
3,1,50000000,"[[3, 0, 0, 2], [7, 3, 0, 8], [-1, -1, -1, -1],..."
4,1,50000000,"[[3, 0, 0, 2], [8, 3, 0, 10], [-1, -1, -1, -1]..."


In [4]:
new_df.to_csv('cleaned_data/array_final.csv')


In [5]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import os

# Split data into features and target
target = data['path_failure']
features = data.drop('path_failure', axis=1)
features = features.fillna(-1)

# Split data into train+validation and test sets
features_train_val, features_test, target_train_val, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Split train+validation set into separate training and validation sets
features_train, features_val, target_train, target_val = train_test_split(features_train_val, target_train_val, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2

# Now apply SMOTE only to the training set
smote = SMOTE(random_state=42)
features_train, target_train = smote.fit_resample(features_train, target_train)


In [7]:

# Create a new directory to save the data
os.makedirs('cleaned_data', exist_ok=True)

# Save the data
features_train.to_csv('cleaned_data/array_features_train.csv')
target_train.to_csv('cleaned_data/array_target_train.csv')
features_test.to_csv('cleaned_data/array_features_test.csv')
target_test.to_csv('cleaned_data/array_target_test.csv')
features_val.to_csv('cleaned_data/array_features_val.csv')
target_val.to_csv('cleaned_data/array_target_val.csv')


In [9]:
import torch
from torch import nn
import pytorch_lightning as pl

class ArrayRNN(pl.LightningModule):
    def __init__(self, hop_input_dim, hidden_dim, output_dim):
        super(ArrayRNN, self).__init__()
        self.rnn = nn.GRU(hop_input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.loss = nn.BCEWithLogitsLoss()
        self.val_losses = []

    def forward(self, x):
        # x is of shape (batch_size, seq_len, hop_input_dim)
        print(x.shape)  # Print the shape of the input
        output, _ = self.rnn(x)
        # Take the output from the final time step
        final_output = output[:, -1, :]
        x = self.fc(final_output)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        # Create a mask to ignore the padding values in the loss computation
        mask = (x != -1).float()
        loss = self.loss(logits * mask, y * mask)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss(logits, y)
        self.log('val_loss', loss)
        self.val_losses.append(loss)

    def on_validation_epoch_end(self):
        avg_loss = torch.stack(self.val_losses).mean()
        self.log('avg_val_loss', avg_loss)
        self.val_losses = []  # reset for the next epoch

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)


In [11]:
from torch.utils.data import DataLoader, TensorDataset

# Convert data to PyTorch tensors
features_train_tensor = torch.tensor(features_train.values, dtype=torch.float32)
target_train_tensor = torch.tensor(target_train.values, dtype=torch.float32)

# Create a DataLoader
train_data = TensorDataset(features_train_tensor, target_train_tensor)
train_loader = DataLoader(train_data, batch_size=32)

# Convert validation data to PyTorch tensors
features_val_tensor = torch.tensor(features_val.values, dtype=torch.float32)
target_val_tensor = torch.tensor(target_val.values, dtype=torch.float32)

# Create a DataLoader for the validation data
val_data = TensorDataset(features_val_tensor, target_val_tensor)
val_loader = DataLoader(val_data, batch_size=32)

# Train the model
model = ArrayRNN(features_train.shape[1], hidden_dim=50, output_dim=1)
trainer = pl.Trainer(max_epochs=10)
trainer.fit(model, train_loader, val_loader)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/paperspace/ml-final-project/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type              | Params
-------------------------------------------
0 | rnn  | GRU               | 15.8 K
1 | fc   | Linear            | 51    
2 | loss | BCEWithLogitsLoss | 0     
-------------------------------------------
15.8 K    Trainable params
0         Non-trainable params
15.8 K    Total params
0.063     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/paperspace/.local/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


IndexError: too many indices for tensor of dimension 2

In [12]:
new_df.head()


Unnamed: 0,path_failure,path_amount,path
0,1,50000000,"[[2, 0, 0, 1], [1, 2, 1, 0], [-1, -1, -1, -1],..."
1,1,50000000,"[[3, 0, 0, 2], [4, 3, 0, 3], [-1, -1, -1, -1],..."
2,1,50000000,"[[2, 0, 0, 1], [6, 2, 0, 6], [-1, -1, -1, -1],..."
3,1,50000000,"[[3, 0, 0, 2], [7, 3, 0, 8], [-1, -1, -1, -1],..."
4,1,50000000,"[[3, 0, 0, 2], [8, 3, 0, 10], [-1, -1, -1, -1]..."
