In [36]:
import os

# When the kernel is started from inside the notebooks folder, step up one
# level so that relative paths resolve against the project root.
if os.getcwd().endswith("notebooks"):
    os.chdir(os.pardir)
    print("using project root as working dir")

In [37]:
from dataclasses import dataclass
import math
import random
from typing import List, Optional, Tuple

import networkx as nx
import numpy as np
from tqdm.notebook import tqdm


@dataclass
class Args:
    """Run configuration shared by the graph builder, the model and the evaluator.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field: values can be overridden per instance
    (``Args(batch_size=8)``) and show up in ``repr`` and ``==``. Without
    annotations the decorator sees no fields at all.
    """

    random_seed: Optional[int] = None
    # torch
    batch_size: int = 64
    epochs: int = 30
    layers: int = 10
    layer_size: int = 16
    train_size: float = 0.7  # fraction of the dataset used for training
    wandb: bool = False
    # graph
    graph_size: int = 1000
    graph_shape: str = 'disc'
    rg_radius: float = 0.05
    # dataset manipulation
    ds_padded: bool = True

# module-level configuration instance read by the cells below
args = Args()

In [38]:
import torch
from torch import nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
from sklearn.metrics import classification_report


# Pick the training device: prefer CUDA, then Apple MPS, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"using {device} device")

using cuda device


In [39]:
# Type aliases shared by the graph helpers and the evaluator below.
# A single node position: a pair of floats.
NodePosition = Tuple[float, float]
# A list of node positions; a node is identified by its index in this list.
NodePositions = List[NodePosition]
# A list of (index, index) pairs referring to entries of a NodePositions list.
NodeIndexPairs = List[Tuple[int, int]]

def gen_nodes(size: int, shape: str = "disc") -> NodePositions:
    """Generate ``size`` random node positions laid out on the given shape.

    Args:
        size: number of node positions to generate.
        shape: name of the layout; only ``'disc'`` is supported.

    Returns:
        The generated node positions.

    Raises:
        ValueError: if ``shape`` is not a supported layout name.
    """
    if shape == 'disc':
        return __gen_nodes_disc(size)
    # A bare string cannot be raised in Python 3 (it fails with a TypeError
    # that hides the message), so raise a proper exception instead.
    raise ValueError(f'unsupported node shape: {shape}')


def __gen_nodes_disc(amount: int) -> NodePositions:
    """Sample ``amount`` points inside the disc of radius 0.5 centred at (0.5, 0.5).

    Candidates are drawn uniformly from the unit square and kept only when
    they fall within the disc (rejection sampling). Progress is shown on a
    tqdm bar that advances once per accepted point.
    """
    accepted = []
    progress = tqdm(total=amount, desc="generating random-uniform nodes on disc")
    with progress:
        while len(accepted) < amount:
            x = random.uniform(0, 1)
            y = random.uniform(0, 1)
            dx, dy = x - 0.5, y - 0.5
            # keep the candidate only if it lies on or inside the disc boundary
            if math.sqrt(dx * dx + dy * dy) <= 0.5:
                accepted.append((x, y))
                progress.update(1)
    return accepted


def get_node_pairs(n_nodes: int) -> NodeIndexPairs:
    """Return every unordered index pair ``(i0, i1)`` with ``i0 < i1 < n_nodes``.

    Pairs are ordered by ``i0`` first, then by ``i1``; the progress bar
    advances once per ``i0``.
    """
    pairs = []
    for first in tqdm(range(n_nodes), desc="generating node pairs"):
        pairs.extend((first, second) for second in range(first + 1, n_nodes))
    return pairs


# https://stackoverflow.com/a/36460020/10619052
def list_to_dict(items: list) -> dict:
    """Map each list position to its item: ``{0: items[0], 1: items[1], ...}``."""
    return dict(enumerate(tqdm(items, desc="creating dict from list")))

In [40]:
# Define graph builder
class RandomGeometricGraphBuilder:
    """Builds a random geometric graph from the module-level ``args`` settings.

    Attributes set by the constructor:
        nodes: generated node positions.
        n_nodes: number of generated nodes.
        graph: the networkx random geometric graph built on those positions.
        node_index_pairs: all index pairs returned by ``get_node_pairs``.
        edges: the subset of ``node_index_pairs`` that are edges of ``graph``.
    """

    def __init__(self):
        # sample the node positions; they are handed to networkx as the layout
        node_positions = gen_nodes(args.graph_size, args.graph_shape)
        self.nodes = node_positions
        self.n_nodes = len(node_positions)

        # networkx expects positions as a {node index: position} mapping
        position_by_index = list_to_dict(node_positions)
        self.graph = nx.random_geometric_graph(
            self.n_nodes,
            args.rg_radius,
            pos=position_by_index,
        )

        # enumerate all candidate pairs and keep those connected in the graph
        self.node_index_pairs = get_node_pairs(self.n_nodes)
        connected: NodeIndexPairs = []
        for pair in tqdm(self.node_index_pairs, desc="generating dataset labels from node pairs"):
            if self.graph.has_edge(*pair):
                connected.append(pair)
        self.edges = connected

In [41]:
# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 4 input features -> two hidden layers -> 2 logits.

    The hidden width is read from the module-level ``args.layer_size``.
    """

    def __init__(self):
        super().__init__()
        width = args.layer_size
        stack = [
            nn.Linear(4, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, 2),
        ]
        self.linear_relu_stack = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        return self.linear_relu_stack(x)

In [42]:
# Define evaluator
class EmbeddingEvaluator:
    """Trains a small classifier to predict graph edges from a node embedding.

    For every node index pair ``(i0, i1)`` the two embedding vectors are
    concatenated into one feature row; the label is 1 when the pair is listed
    in ``edges`` and 0 otherwise. The resulting dataset is split into a train
    and a test part according to ``args.train_size``.
    """

    def __init__(self, nodes: NodePositions, edges: NodeIndexPairs, embedding: NodePositions):
        """Build the network, the labelled pair dataset and its data loaders.

        Args:
            nodes: node positions of the graph (stored only, not used for training).
            edges: index pairs that are connected in the graph.
            embedding: one embedding vector per node, indexed like ``nodes``.
        """
        self.nodes = nodes # used for 2D representation of graph (not used in training)
        self.n_nodes = len(nodes)
        self.node_index_pairs = get_node_pairs(self.n_nodes)
        self.edges = edges
        self.embedding = embedding
        # generate net
        self.reset_net()
        # generate dataset
        self.ds_values = self._pair_features(self.embedding, self.node_index_pairs)
        # A set gives constant-time membership tests; scanning the edge list
        # for each of the O(n^2) node pairs is needlessly slow.
        edge_lookup = {tuple(edge) for edge in self.edges}
        self.ds_labels = torch.LongTensor([
            1 if (pair in edge_lookup) else 0
            for pair in tqdm(self.node_index_pairs, desc="generating dataset labels from node pairs")
        ])
        self.dataset = TensorDataset(self.ds_values, self.ds_labels)
        #? do we want to over-fit?
        self.train_dataset, self.test_dataset = torch.utils.data.random_split(self.dataset, [args.train_size, 1 - args.train_size])
        self.train_dataloader = DataLoader(self.train_dataset, batch_size=args.batch_size, num_workers=0, shuffle=True)
        self.test_dataloader = DataLoader(self.test_dataset, batch_size=args.batch_size, num_workers=0, shuffle=False)


    @staticmethod
    def _pair_features(embedding: NodePositions, node_index_pairs: NodeIndexPairs):
        """Return a tensor with one row per pair: both embedding vectors concatenated."""
        return torch.tensor([
            [*embedding[i0], *embedding[i1]] # type: [float, float, float, float]
            for (i0, i1) in tqdm(node_index_pairs, desc="generating dataset values from node pairs")
        ])


    def reset_net(self):
        """Replace ``self.net`` with a freshly initialised network on ``device``."""
        self.net = NeuralNetwork().to(device)


    def train(self, loss_fn, optimizer):
        """Run ``args.epochs`` epochs, each a training pass followed by a test pass.

        Args:
            loss_fn: callable taking ``(predictions, labels)`` and returning the loss.
            optimizer: optimizer stepped after every batch; the calling cell
                constructs it over ``self.net.parameters()``.
        """
        for epoch in range(args.epochs):
            with tqdm(total=len(self.train_dataloader), desc="starting model...") as pbar:
                pbar.set_description(f"Epoch {epoch + 1}")
                self.__train(pbar, loss_fn, optimizer)
                self.__test(pbar, loss_fn)


    def __train(self, pbar, loss_fn, optimizer):
        """Train for one pass over the training loader, reporting progress on ``pbar``."""
        self.net.train()

        n_train_batches = len(self.train_dataloader)
        intv = np.ceil(n_train_batches / 100.0) # interval in which the pbar is updated (every 1%)
        for batch, (X, y) in enumerate(self.train_dataloader):
            X, y = X.to(device), y.to(device)
            # Compute prediction error
            pred = self.net(X)
            loss = loss_fn(pred, y)
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # update progress
            if batch % intv == 0 or batch == n_train_batches - 1:
                # `batch` is 0-based, so batch + 1 batches are done; this lets
                # the bar reach its total after the final batch.
                pbar.update(batch + 1 - pbar.n)
                pbar.set_postfix_str(f"loss: {loss.item():>6f}")


    def __test(self, pbar, loss_fn):
        """Evaluate on the test loader and show accuracy and mean batch loss on ``pbar``."""
        self.net.eval()

        n_test_batches = len(self.test_dataloader)
        n_test_values = len(self.test_dataloader.dataset)
        test_loss, correct = 0, 0
        pbar.set_postfix_str(f"evaluating epoch...")
        with torch.no_grad():
            for x, y in self.test_dataloader:
                x, y = x.to(device), y.to(device)
                pred = self.net(x)
                test_loss += loss_fn(pred, y).item()
                correct += (pred.argmax(1) == y).type(torch.float).sum().item()
        test_loss /= n_test_batches  # mean loss per batch
        correct /= n_test_values     # fraction of correctly classified samples
        pbar.set_postfix_str(f"epoch result: accuracy: {(100*correct):>0.1f}%, avg_loss: {test_loss:>8f}")


    def predict(self, embedding: NodePositions):
        """Return the network's logits for every node pair of ``embedding``.

        Pairs are enumerated with ``get_node_pairs(len(embedding))``, so the
        rows of the result follow that ordering.
        """
        self.net.eval()

        _node_index_pairs = get_node_pairs(len(embedding))
        _ds_values = self._pair_features(embedding, _node_index_pairs)
        with torch.no_grad():
            return self.net(_ds_values.to(device))


    def evaluate(self):
        """Print a classification report for the full dataset.

        Returns:
            A tuple ``(logits, predicted_labels, probabilities)`` where the
            probabilities are the softmax of the logits along dimension 1.
        """
        self.net.eval()

        with torch.no_grad():
            _predictions = self.net(self.ds_values.to(device))
            _, _pred_labels = torch.max(_predictions, 1)
            _probabilities = torch.nn.Softmax(dim=1)(_predictions)

            print(classification_report(self.ds_labels, _pred_labels.cpu()))
            return _predictions, _pred_labels, _probabilities

In [43]:
# build the graph, then train an evaluator on it
graph_builder = RandomGeometricGraphBuilder()
evaluator = EmbeddingEvaluator(
    nodes=graph_builder.nodes,
    edges=graph_builder.edges,
    # for a random geometric graph the node positions themselves are the embedding
    embedding=graph_builder.nodes,
)

evaluator.train(
    nn.CrossEntropyLoss(),
    torch.optim.SGD(evaluator.net.parameters(), lr=1e-3),
)

generating random-uniform nodes on disc:   0%|          | 0/1000 [00:00<?, ?it/s]

creating dict from list:   0%|          | 0/1000 [00:00<?, ?it/s]

generating node pairs:   0%|          | 0/1000 [00:00<?, ?it/s]

generating dataset labels from node pairs:   0%|          | 0/499500 [00:00<?, ?it/s]

generating node pairs:   0%|          | 0/1000 [00:00<?, ?it/s]

generating dataset values from node pairs:   0%|          | 0/499500 [00:00<?, ?it/s]

generating dataset labels from node pairs:   0%|          | 0/499500 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

starting model...:   0%|          | 0/5464 [00:00<?, ?it/s]

In [44]:
# evaluate embedding
# evaluate() prints a classification report over the full dataset and returns
# the raw logits, the predicted labels and the softmax probabilities.
predictions, pred_labels, probabilities = evaluator.evaluate()
# show the per-pair class probabilities
print(probabilities)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00    494700
           1       0.00      0.00      0.00      4800

    accuracy                           0.99    499500
   macro avg       0.50      0.50      0.50    499500
weighted avg       0.98      0.99      0.99    499500

tensor([[0.9899, 0.0101],
        [0.9878, 0.0122],
        [0.9862, 0.0138],
        ...,
        [0.9863, 0.0137],
        [0.9856, 0.0144],
        [0.9876, 0.0124]], device='cuda:0')


  _warn_prf(average, modifier, msg_start, len(result))
