# Setup

In [None]:
!pip install torch
!pip install torch-geometric
!pip install captum # causes dependency issue with numpy as numpy requires a version <2; in colab simply hit restart runtime to use the older version without error
!pip install pandas
!pip install networkx
!pip install matplotlib
!pip install tqdm
!pip install numpy

In [None]:
from itertools import pairwise, product
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric as pyg
from tqdm.auto import tqdm
import pandas as pd
import random
import matplotlib.pyplot as plt

In [None]:
def set_seed(seed):
    """Seed the torch, NumPy and stdlib ``random`` generators with ``seed``.

    Returns ``seed`` unchanged.
    """
    # Same order as before: torch first, then NumPy, then the stdlib generator.
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)
    return seed
set_seed(0)

In [None]:
# Shared, module-level log: one record per training run, holding all values of that run.
experiments = []

def add_experiment(param):
    """Append one experiment record to the shared ``experiments`` list."""
    experiments.append(param)

def get_dataframe():
    """Return a new DataFrame built from every record stored so far."""
    return pd.DataFrame(experiments)

def clear_experiments():
    """Drop every stored record while keeping the same list object."""
    del experiments[:]

def save_dataframe(path):
    """Write the currently stored records to ``path`` as CSV."""
    get_dataframe().to_csv(path)

In [None]:
# This cell must change depending on runtime environment; it is currently configured for Colab.
# Mount Google Drive at /content/drive so results can be written to it.
from google.colab import drive
drive.mount("/content/drive")
# Output directory (note the trailing slash): later cells build file paths by
# appending a file name directly to this string.
folder_path = "/content/drive/MyDrive/FinDSExperiments/GINE_Size_Applicability_by_NumHops/"

# Architecture

In [None]:
class ShortestPathGNN(torch.nn.Module):
    """Linear encoder, a stack of GINEConv layers, and a linear decoder.

    Every node carries a single scalar input feature and receives a single
    scalar prediction; every edge carries a single scalar attribute.

    Args:
        hidden_channels: Width of the node embedding used between encoder and decoder.
        num_layers: Number of GINEConv message-passing layers.
    """

    def __init__(self, hidden_channels: int, num_layers: int):
        super().__init__()
        # One scalar in, one scalar out per node.
        in_channels, out_channels = 1, 1
        self.encoder = nn.Linear(in_channels, hidden_channels)

        # Modules are created in the order encoder -> conv layers -> decoder so
        # that seeded parameter initialisation stays reproducible.
        convs = []
        for _ in range(num_layers):
            mlp = nn.Sequential(
                nn.Linear(hidden_channels, hidden_channels),
                nn.ReLU(),
                nn.Linear(hidden_channels, hidden_channels),
            )
            convs.append(pyg.nn.GINEConv(nn=mlp, edge_dim=1))
        self.layers = nn.ModuleList(convs)

        self.decoder = nn.Linear(hidden_channels, out_channels)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor, edge_attr: torch.Tensor):
        """Encode node features, apply every conv layer in turn, then decode."""
        hidden = self.encoder(x)
        # The conv outputs are chained directly; the only non-linearity is the
        # ReLU inside each layer's MLP.
        for conv in self.layers:
            hidden = conv(hidden, edge_index, edge_attr)
        return self.decoder(hidden)

# Generate Data

In [None]:
from collections.abc import KeysView
# Number of message-passing layers; passed as num_layers when the networks are built.
K = 4
# Sentinel standing in for "unreached": the initial feature of every non-source node and
# the fallback distance / hop count for nodes missing from a shortest-path result.
large_number = 10000

def convert_networkx_to_pyg_shortest_path(graph: nx.Graph,large_number:int):
    """Turn a networkx graph into a PyG ``Data`` object for shortest-path learning.

    The graph is modified in place: each edge receives a random ``edge_attr``
    weight. Node 0 is the source. The returned object carries:

    * ``x``         - initial node feature: 0 for the source, ``large_number`` elsewhere.
    * ``y``         - weighted shortest-path length from the source.
    * ``edge_attr`` - the edge weights, with a trailing dimension of size 1.
    * ``num_hops``  - unweighted (hop-count) distance from the source.

    Nodes absent from a shortest-path result get ``large_number`` instead.
    """
    # One weight per edge: 1.0 plus Gaussian noise scaled by 0.1, drawn in edge order.
    edge_weights = {}
    for edge in graph.edges():
        edge_weights[edge] = 1.0 + 0.1*np.random.randn()
    nx.set_edge_attributes(graph, values=edge_weights, name='edge_attr')

    data = pyg.utils.convert.from_networkx(graph)
    node_ids = range(data.num_nodes)

    initial_features = [0] + [large_number] * (data.num_nodes - 1)
    data.x = torch.Tensor(initial_features).unsqueeze(1)

    weighted_lengths = nx.shortest_path_length(graph, source=0, weight="edge_attr")
    data.y = torch.Tensor([weighted_lengths.get(node, large_number) for node in node_ids])

    data.edge_attr = data.edge_attr.unsqueeze(1)

    hop_lengths = nx.shortest_path_length(graph, source=0)
    data.num_hops = torch.tensor([hop_lengths.get(node, large_number) for node in node_ids])
    return data

def get_connected_ER_graph(num_nodes: int, p: float):
    """Sample Erdos-Renyi graphs G(num_nodes, p) until a connected one is drawn."""
    graph = nx.erdos_renyi_graph(num_nodes, p)
    while not nx.is_connected(graph):
        # Rejection sampling: throw away the disconnected draw and try again.
        graph = nx.erdos_renyi_graph(num_nodes, p)
    return graph

def create_pyg_dataset(num_nodes: int, num_samples: int, large_number:int, seed: int = 0):
    """Build a list of ``num_samples`` connected random graphs as PyG data objects.

    Args:
        num_nodes: Number of nodes in every generated graph.
        num_samples: Number of graphs to generate.
        large_number: Sentinel forwarded to ``convert_networkx_to_pyg_shortest_path``.
        seed: Seed applied to all random generators before sampling (default 0).

    Returns:
        A list with one converted graph per sample.

    Note:
        The generators are re-seeded on every call, so two calls with the same
        ``num_nodes``, ``num_samples`` and ``seed`` return identical graphs.
        Pass a different ``seed`` when two datasets of the same size must be
        independent (for example a training and a testing set).
    """
    set_seed(seed)
    return [
        convert_networkx_to_pyg_shortest_path(get_connected_ER_graph(num_nodes, p=0.1), large_number)
        for _ in range(num_samples)
    ]

### Custom experimental setup code:

In [None]:
def _geometric_node_sizes(start, limit, ratio):
    """Return int-truncated values start, start*ratio, ... that stay below ``limit``."""
    sizes = []
    current = start
    while current < limit:
        sizes.append(int(current))
        current *= ratio
    return sizes

# Graph sizes used for training: 20 nodes upwards, growing by 20% per step, below 1000.
training_node_sizes = _geometric_node_sizes(20.0, 1000, 1.2)

# Graph sizes used for testing: 5 nodes upwards, growing by 30% per step, below 2720.
testing_node_sizes = _geometric_node_sizes(5.0, 2720, 1.3)

In [None]:
set_seed(0)

# Generate testing data: one dataset of num_test_samples graphs per testing node size.
# create_pyg_dataset re-seeds the generators itself at the start of every call.
num_test_samples = 100
testing_data_by_sizes = {
    node_size: create_pyg_dataset(num_nodes=node_size, num_samples=num_test_samples,large_number=large_number)
    for node_size in testing_node_sizes
}

In [None]:
set_seed(0)

# Generate training data: one dataset of num_train_samples graphs per training node size.
# create_pyg_dataset re-seeds the generators itself at the start of every call.
num_train_samples = 100
training_data_by_sizes = {
    node_size: create_pyg_dataset(num_nodes=node_size, num_samples=num_train_samples,large_number=large_number)
    for node_size in training_node_sizes
}

In [None]:
# Largest hop distance from the source found in any training graph (never below 0).
per_graph_max_hops = [
    graph.num_hops.max().item()
    for graphs in training_data_by_sizes.values()
    for graph in graphs
    if graph.num_hops.numel() > 0
]
hops_max_in_train_graphs = max([0, *per_graph_max_hops])

print(hops_max_in_train_graphs)

# Training/Testing

### Custom experimental setup code:

In [None]:
# Registries filled by runTraining: one model and one optimizer per hop count (the dict key).
networks = {}
optimizers = {}

In [None]:
# Training basic setup
# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# reduction='none' keeps one squared error per element, so callers can mask
# or aggregate the per-node losses themselves.
loss_function = torch.nn.MSELoss(reduction='none')

def temp_test(network, test_loader):
    """Return the mean per-node loss of ``network`` over every batch in ``test_loader``.

    Args:
        network: Model called as ``network(x, edge_index, edge_attr)``.
        test_loader: Iterable of batches exposing ``x``, ``edge_index``,
            ``edge_attr``, ``y`` and ``to(device)``.

    Returns:
        The loss averaged over all nodes seen, or ``nan`` when the loader
        yields no nodes at all.
    """
    network.eval()
    total_loss = 0.
    total_nodes = 0
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            pred = network(batch.x, batch.edge_index, batch.edge_attr)
            per_node_loss = loss_function(pred.flatten(), batch.y)
            # Accumulate the sum and the count (not per-batch means) so that
            # batches of different sizes are weighted by their node count.
            total_loss += per_node_loss.sum().item()
            total_nodes += per_node_loss.numel()
    if total_nodes == 0:
        return float("nan")
    return total_loss / total_nodes

def train_one_epoch(number, network, optimizer, train_loader, alpha):
    """Run one optimisation pass that only fits nodes exactly ``number`` hops from the source.

    For every batch the squared errors of the selected nodes are summed, an L1
    penalty over all parameters (weighted by ``alpha``) is added, and one
    optimiser step is taken. Batches without any selected node are skipped.

    Returns:
        The accumulated loss (penalty included) divided by the number of
        selected nodes, or ``None`` when no batch contained a selected node.
    """
    network.train()
    running_loss = 0
    nodes_seen = 0

    for batch in train_loader:
        network.zero_grad()
        batch = batch.to(device)
        pred = network(batch.x, batch.edge_index, batch.edge_attr)

        hop_mask = (batch.num_hops == number)
        selected = hop_mask.sum().item()
        if selected == 0:
            # Nothing to fit in this batch for this hop count.
            continue

        squared_errors = loss_function(pred.flatten()[hop_mask], batch.y[hop_mask])
        loss = squared_errors.sum()

        # L1 regularisation over every parameter of the network.
        l1_norm = sum(p.abs().sum() for p in network.parameters())
        loss = loss + alpha * l1_norm

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        nodes_seen += selected

    if nodes_seen == 0:
        print(f"No training data for this batch in network {number}")
        return None

    return running_loss / nodes_seen

def runTraining(hidden_channels_choice, seed, alpha, num_epochs=1000, lr=0.0003, eval_node_size=14):
    """Train one ShortestPathGNN per hop count and store the results in ``networks``.

    One network and one Adam optimiser are created for every hop count from 0
    to ``hops_max_in_train_graphs`` and stored in the module-level ``networks``
    and ``optimizers`` dicts under that hop count. The training node sizes are
    then visited in order; for each size every network is trained for up to
    ``num_epochs`` epochs on that size's dataset, fitting only the nodes at
    its own hop count. A network that has no such node in the current dataset
    is skipped for that size.

    Args:
        hidden_channels_choice: Hidden width of every network.
        seed: Seed applied before each network is built, so all networks start
            from identical initial weights.
        alpha: Weight of the L1 parameter penalty used by ``train_one_epoch``.
        num_epochs: Epochs per (node size, network) pair.
        lr: Adam learning rate.
        eval_node_size: Key into ``testing_data_by_sizes`` selecting the dataset
            used for the loss shown on the progress bars.

    Returns:
        None. The trained models are available through ``networks``.
    """
    set_seed(seed)
    hop_keys = range(hops_max_in_train_graphs+1)

    # Fixed evaluation set, used only for the loss readout on the progress bars.
    test_loader_temp = pyg.loader.DataLoader(testing_data_by_sizes[eval_node_size], batch_size=64, shuffle=False)

    for key in hop_keys:
        set_seed(seed)
        networks[key] = ShortestPathGNN(hidden_channels=hidden_channels_choice, num_layers=K)
        networks[key].to(device)
        optimizers[key] = torch.optim.Adam(networks[key].parameters(), lr=lr)

    # Overall progress across every (node size, network, epoch) combination.
    pbar = tqdm(total=num_epochs*len(hop_keys)*len(training_node_sizes))

    for training_node_size in training_node_sizes: # makes sure all data is run through
        train_loader = pyg.loader.DataLoader(training_data_by_sizes[training_node_size], batch_size=64, shuffle=True)
        # Iterate the hop counts built above rather than every key in `networks`,
        # so entries left over from an earlier run are not touched.
        for key in hop_keys:
            key_pbar = tqdm(range(num_epochs))
            for epoch in key_pbar:
                train_loss = train_one_epoch(key, networks[key], optimizers[key], train_loader, alpha)
                if train_loss is None:
                    # No node at this hop count in this dataset: mark the
                    # remaining epochs as done on both bars and move on.
                    pbar.update(num_epochs - epoch)
                    key_pbar.update(num_epochs - epoch)
                    break
                test_loss_temp = temp_test(networks[key], test_loader_temp)
                key_pbar.set_description(f"Key: {key}, Epoch {epoch}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss_temp:.4f}")
                pbar.set_description(f"Key: {key}")
                pbar.update()
            key_pbar.close()

    pbar.close()

def test_model(network, loss_function, test_loader):
    network.eval()
    node_results = []
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            pred = network(batch.x, batch.edge_index, batch.edge_attr)

            individual_losses = loss_function(pred.flatten(), batch.y)
            num_hops_batch = batch.num_hops

            for i in range(pred.shape[0]):
                node_results.append([num_hops_batch[i].item(), individual_losses[i].item()])
    return node_results

def run_testing_by_model(network, network_num):
    """Average the per-node test loss of ``network`` by hop count over all testing datasets.

    Args:
        network: Model to evaluate.
        network_num: Hop count the model was trained on; only shown on the progress bar.

    Returns:
        Dict mapping hop count to mean loss, with keys in order of first appearance.
    """
    loss_totals = {}
    node_counts = {}

    pbar = tqdm(range(len(testing_data_by_sizes)))
    pbar.set_description(f"Network Num Hops: {network_num}")

    for testing_data in testing_data_by_sizes.values():
        test_loader = pyg.loader.DataLoader(testing_data, batch_size=64, shuffle=False)
        for hops, node_loss in test_model(network, loss_function, test_loader):
            if hops in node_counts:
                loss_totals[hops] += node_loss
                node_counts[hops] += 1
            else:
                loss_totals[hops] = node_loss
                node_counts[hops] = 1
        pbar.update()

    return {hops: loss_totals[hops] / count for hops, count in node_counts.items()}


In [None]:
# Hidden width used for every network built by runTraining.
hidden_channels = 4
# Weight of the L1 parameter penalty that train_one_epoch adds to the loss.
alpha = 1.5

# The second argument is the random seed.
runTraining(hidden_channels, 0, alpha) # Runs all training simultaneously to make better use of data loading

In [None]:
# For every trained network (keyed by the hop count it was trained on):
# its average test loss per hop count.
test_losses_by_training_num_hops = {}

for trained_hops, trained_network in networks.items():
    test_losses_by_training_num_hops[trained_hops] = run_testing_by_model(trained_network, trained_hops)
    print(f"Completed testing network: {trained_hops}")

In [None]:
# Rows: hop count each network was trained on. Columns: hop count of the evaluated nodes.
row_labels = list(test_losses_by_training_num_hops.keys())
test_losses_data = [
    list(per_hop_losses.values())
    for per_hop_losses in test_losses_by_training_num_hops.values()
]
# Column labels come from the first non-empty result. Values are placed positionally,
# which assumes every network reports the same hop counts in the same order.
column_labels = next(
    (
        list(per_hop_losses.keys())
        for per_hop_losses in test_losses_by_training_num_hops.values()
        if per_hop_losses
    ),
    [],
)

test_losses_array = np.array(test_losses_data)

panda_array = pd.DataFrame(test_losses_array, index=row_labels, columns=column_labels)
panda_array.to_csv(folder_path + "test_losses_num_hops.csv")