# Node Laveler
This notebook contains the code for compressing graph features (i.e., node and edge features) and structure (via. random walks) into a low dimension space to be sampled and to generate a label for each node. 

In [1]:
import torch
import torch_geometric.datasets as pygdata  # Import datasets submodule
import matplotlib.pyplot as plt
import networkx as nx
from tqdm import tqdm

**Create the Dataset**

In [2]:
"""
Renames a walk from starting at an arbitrary node index to start at 0.

Args:
    walk: A list representing the sequence of nodes visited in the walk.

Returns:
    new_walk: A list representing the sequence of nodes visited in the walk,
    numbered starting at 0.
"""
def re_name_walk(walk):
    mapping = {}
    next_num = 0
    new_walk = []
    for node in walk:
        if node in mapping.keys():
            new_walk.append(mapping[node])
        mapping[node] = next_num
        new_walk.append(mapping[node])
        next_num = next_num + 1
    return new_walk

In [None]:
import random

"""
Performs a random walk of fixed length on a NetworkX graph.

Args:
    G: A NetworkX graph object.
    start_node: The starting node for the random walk.
    walk_length: The fixed length of the walk (number of steps).

Returns:
    walk: A list representing the sequence of nodes visited in the walk.
"""
def random_walk(G, start_node, walk_length):
    walk = [start_node]
    current_node = start_node
    for _ in range(walk_length):
        # Get neighbors of the current node
        neighbors = list(G.neighbors(current_node))
        if not neighbors:  # Handle case with isolated nodes
            break
        # Randomly choose a neighbor
        next_node = random.choice(neighbors)
        walk.append(next_node)
        current_node = next_node
    return re_name_walk(walk)

In [4]:
# Create NetworkX graph
from torch_geometric.datasets import ZINC
dataset = ZINC(root = '', split='train') # valid, test

In [5]:
def convert_row(row):
    node_feat = row.x
    edge_index = row.edge_index
    edge_attr = row.edge_attr

    # Create NetworkX graph
    G = nx.Graph()

    for node, feat in enumerate(node_feat):
        G.add_node(node, attr=feat)

    for edge_idx, (edge_i, edge_j) in enumerate(zip(edge_index[0], edge_index[1])):
        edge_att = edge_attr[edge_idx].item()
        G.add_edge(edge_i.item(), edge_j.item(), attr=edge_att)
    
    return G

In [6]:
# 13 m
graph_lst = []
for i, row in tqdm(enumerate(dataset[:50])):
    # cr = convert_row(row)
    # torch.save(cr, "crs/" + str(i) + "cr.pth")
    cr = torch.load("crs/" + str(i) + "cr.pth")
    graph_lst.append(cr)

50it [00:01, 32.09it/s]


In [7]:
# 10 s
# torch.save(graph_lst, "node_lab_graph_lst.pth")

In [8]:
# 10 s
# graph_lst = torch.load("node_lab_graph_lst.pth")

In [9]:

mn_nodes = 38

In [10]:
HOPS = 3
THRESH = int(.25 * 1155)
walk_length = HOPS

In [11]:
# make the training dataset
data_pairs = [] # input, target
n_walks = 50 # per graph
n_graphs = 20
for i in tqdm(range(n_graphs)):
    G = torch.load("crs/" + str(i) + "cr.pth")
    n_nodes = len(G.nodes)
    for j in range(n_walks):
        start_node = random.randint(0, n_nodes-1)
        walk = random_walk(G, start_node, walk_length)
        target = G.nodes[0] # Change this to node class
        context = walk[1:walk_length+1] # Change this to node ids
        data_pairs.append((torch.tensor(context).float(), target["attr"]))

100%|██████████| 20/20 [00:01<00:00, 18.29it/s]


In [12]:
data_pairs

[(tensor([1., 2., 3.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 2., 3.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 2., 1.]), tensor([0])),
 (tensor([1., 2., 3.]), tensor([0])),
 (tensor([1., 2., 1.]), tensor([0])),
 (tensor([1., 2., 3.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 2., 1.]), tensor([0])),
 (tensor([1., 2., 1.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 2., 3.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 2., 3.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 0., 2.]), tensor([0])),
 (tensor([1., 2., 1.]), tensor([0])),
 (tensor([1.

In [13]:
class MyDataset(torch.utils.data.Dataset):
  """
  A custom dataset class to handle your list of tuples.
  """
  def __init__(self, data):
    """
    Initializes the dataset with the provided data list.

    Args:
        data: A list of tuples (x, y), where x is the input data and y is the target.
    """
    self.data = data

  def __len__(self):
    """
    Returns the length of the dataset (number of data points).
    """
    return len(self.data)

  def __getitem__(self, index):
    """
    Retrieves a data point at the given index.

    Args:
        index: The index of the data point to retrieve.

    Returns:
        A tuple (x, y) representing the data point at the given index.
    """
    x, y = self.data[index]
    return x, y

In [14]:
dataset = MyDataset(data_pairs)

# Define batch size (number of samples per batch) and other parameters
batch_size = 4
shuffle = True  # Shuffle the data for each epoch (optional)

# Create the DataLoader
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [15]:
from torch import nn

class BaseNetwork(nn.Module):
  """
  A simple neural network with one hidden layer.
  """
  def __init__(self, in_features, hidden_size, out_features):
    """
    Initializes the network architecture.

    Args:
        in_features: Number of features in the input data.
        hidden_size: Number of neurons in the hidden layer.
        out_features: Number of features in the output layer.
    """
    super(BaseNetwork, self).__init__()

    # Define layers
    self.linear1 = nn.Linear(in_features, hidden_size)
    self.linear2 = nn.Linear(hidden_size, out_features)
    self.activation = nn.Softmax(dim=0)  # ReLU activation function (replace if needed)

  def forward(self, x):
    """
    Defines the forward pass of the network.

    Args:
        x: Input tensor of shape (batch_size, in_features).

    Returns:
        out: Output tensor of shape (batch_size, out_features).
    """
    # Hidden layer
    hidden = self.linear1(x)

    # Output layer with activation
    out = self.activation(self.linear2(hidden))

    return out

In [16]:
# Loss function and optimizer
from torch.nn import functional as F  # Functional interface for loss functions

def train_model(model, train_loader, optimizer, epochs):
  """
  Trains the model on the provided data loader.

  Args:
      model: The PyTorch neural network model.
      train_loader: A PyTorch DataLoader for the training data.
      optimizer: An optimizer object for updating model weights.
      epochs: Number of training epochs.
  """
  # Set model to training mode
  model.train()

  for epoch in range(epochs):
    for data, target in train_loader:
      # Clear gradients from previous iteration
      optimizer.zero_grad()

      # Forward pass
      output = model(data)

      print("out: " + str(output[0]))
      print("target: " + str(target[0]))

      # print("shape: " + str(output.squeeze().shape))

      # Compute cross-entropy loss
      loss = F.cross_entropy(output.squeeze(), target.squeeze())

      # print("loss: " + str(loss))

      # Backward pass and parameter update
      loss.backward()
      optimizer.step()

      # Print training information (optional)
      if (i+1) % 100 == 0:  # Print every 100 mini-batches
        print(loss.item())
        # print(f'Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

In [17]:
# Example usage (assuming you have prepared your data and train_loader)
# 3 mins
emb_dim = 28 # number of classes in zinc
model = BaseNetwork(walk_length, emb_dim, 28)  # Your model instance
# model = torch.load("node_label_embeder.pth")
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Example optimizer with learning rate
epochs = 100  # Number of training epochs

In [19]:
def get_node_label_fr_walk(model, walk):
    return model(torch.tensor(walk).float())

In [20]:
cr = torch.load("crs/8cr.pth")

correct = 0
total = 0
for node in range(len(cr.nodes)):
    actual = cr.nodes[node]["attr"]
    walk = random_walk(cr, node, walk_length)
    preds = get_node_label_fr_walk(model, walk[:walk_length])
    pred = torch.argmax(preds)
    if (actual[0].item() == pred.item()):
        correct += 1
    total += 1
    print("a: " + str(actual[0].item()) + "; p: " + str(pred.item()))

print("acc: " + str(correct / total))

a: 1; p: 13
a: 0; p: 13
a: 2; p: 13
a: 0; p: 14
a: 0; p: 14
a: 0; p: 14
a: 0; p: 14
a: 0; p: 13
a: 5; p: 14
a: 1; p: 13
a: 1; p: 13
a: 2; p: 14
a: 0; p: 14
a: 0; p: 14
a: 0; p: 13
a: 0; p: 14
a: 0; p: 13
a: 0; p: 14
a: 0; p: 13
a: 0; p: 14
a: 0; p: 14
a: 3; p: 13
a: 0; p: 14
a: 3; p: 14
a: 0; p: 13
a: 0; p: 13
a: 6; p: 13
acc: 0.0


In [21]:
train_model(model, data_loader, optimizer, epochs)

out: tensor([0.1941, 0.1638, 0.2188, 0.2782, 0.1880, 0.2846, 0.2633, 0.2083, 0.2226,
        0.2269, 0.2175, 0.2298, 0.3184, 0.2398, 0.3273, 0.2309, 0.2371, 0.1764,
        0.2692, 0.2780, 0.2052, 0.1736, 0.2796, 0.1780, 0.2672, 0.2285, 0.3040,
        0.2063], grad_fn=<SelectBackward0>)
target: tensor([0])
out: tensor([0.2527, 0.1258, 0.2051, 0.2562, 0.1719, 0.2856, 0.2813, 0.1850, 0.1987,
        0.2049, 0.1913, 0.1894, 0.3444, 0.2523, 0.3916, 0.2474, 0.2216, 0.1449,
        0.2861, 0.3848, 0.1851, 0.1143, 0.3083, 0.1622, 0.2796, 0.2753, 0.3168,
        0.1945], grad_fn=<SelectBackward0>)
target: tensor([0])
out: tensor([0.2465, 0.3510, 0.2794, 0.2479, 0.3058, 0.2296, 0.2326, 0.2950, 0.2843,
        0.2797, 0.2900, 0.2917, 0.1966, 0.2508, 0.1728, 0.2534, 0.2675, 0.3292,
        0.2296, 0.1760, 0.2950, 0.3588, 0.2166, 0.3141, 0.2334, 0.2355, 0.2116,
        0.2873], grad_fn=<SelectBackward0>)
target: tensor([1])
out: tensor([0.2500, 0.2500, 0.2500, 0.2500, 0.2500, 0.2500, 0.2500, 0.25

In [27]:
# torch.save(model, "node_label_embeder.pth")

In [23]:
# model = torch.load("node_label_embeder.pth")

In [24]:
def one_hot_to_number(encoded_vector):
  """
  Converts a one-hot encoded tensor to the corresponding number.

  Args:
      encoded_vector: A one-hot encoded tensor of shape (1, num_classes).

  Returns:
      The index of the active element (the class number) as an integer.
  """
  # Find the index of the maximum element (assuming only one class is active)
  active_index = encoded_vector.argmax(dim=1)

  # Extract the number (assuming the active class corresponds to the index)
  class_number = active_index.item()

  return class_number

In [25]:
def get_embedding(model, vector):
    return model.linear1(vector)

In [26]:
cr = torch.load("crs/8cr.pth")

correct = 0
total = 0
for node in range(len(cr.nodes)):
    actual = cr.nodes[node]["attr"]
    walk = random_walk(cr, node, walk_length)
    preds = get_node_label_fr_walk(model, walk[:walk_length])
    pred = torch.argmax(preds)
    if (actual[0].item() == pred.item()):
        correct += 1
    total += 1
    print("a: " + str(actual[0].item()) + "; p: " + str(pred.item()))

print("acc: " + str(correct / total))

a: 1; p: 0
a: 0; p: 0
a: 2; p: 1
a: 0; p: 0
a: 0; p: 1
a: 0; p: 1
a: 0; p: 1
a: 0; p: 0
a: 5; p: 0
a: 1; p: 0
a: 1; p: 0
a: 2; p: 0
a: 0; p: 0
a: 0; p: 0
a: 0; p: 0
a: 0; p: 0
a: 0; p: 0
a: 0; p: 1
a: 0; p: 1
a: 0; p: 1
a: 0; p: 0
a: 3; p: 0
a: 0; p: 1
a: 3; p: 0
a: 0; p: 0
a: 0; p: 1
a: 6; p: 1
acc: 0.37037037037037035
