Code from [scale embedding-decoder](https://git.scc.kit.edu/scale/research/embedding-decoder) with slight changes

In [1]:
import os

if os.getcwd().endswith("notebooks"):
    os.chdir("..")
    print("using project root as working dir")

using project root as working dir


In [2]:
import wandb
from wandb.keras import WandbCallback
import tensorflow as tf
import networkx as nx
import math
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import matplotlib as mpl
import seaborn as sns
import random

from src.map import Map
from src.disc import gen_disc_graph, gen_disc_edge

In [3]:
args = Map(
    batch_size = 64,
    epochs = 30,
    random_seed = None,
    graph_size = 1000,
    graph_average_degree = 10,
    rg_radius = 0.05,
    layers = 10,
    layer_size = 16,
    train_size = 0.7,
    wandb = False,
    ds_padded = True,
)

In [4]:
if args.wandb:
    wandb.login()
    wandb.init(project="embedding-eval-framework", entity="hydrofin")
    wandb.run

In [5]:
def parse_graph(graph):
    graph_nodes = nx.nodes(graph)
    # all combinations of x and y (with x > y)
    node_pairs = [ [i_p0, i_p1] for i_p0 in tqdm(range(graph.number_of_nodes()), desc="generating edge pairs") for i_p1 in range(i_p0 + 1, graph.number_of_nodes()) ]
    ds_values = [ [graph_nodes[ei0]['pos'][0], graph_nodes[ei0]['pos'][1], graph_nodes[ei1]['pos'][0], graph_nodes[ei1]['pos'][1]] for [ei0, ei1] in tqdm(node_pairs, desc="mapping edge positions") ]
    ds_labels = [ 1 if graph.has_edge(ei0, ei1) else 0 for [ei0, ei1] in tqdm(node_pairs, desc="creating labels for edges") ]
    return node_pairs, ds_values, ds_labels

In [6]:
## Padding:: Result for now: -> No big difference
## Padding:: maybe remove non-edges
## Padding:: maybe implement into 'tf.data.Dataset.from_tensor_slices' and normalize batches

## run multiple times and average (min 10)
## preprocess with data padding (duplicate edges in buckets so that all buckets have same amount of edges)

In [9]:
def pad_dataset(ds_values, ds_labels):
    label_diff = ds_labels.count(0) - ds_labels.count(1)
    label1_is = [i for i, el in tqdm(enumerate(ds_labels), desc="generating duplicates") if el == 1]
    label1_is_sample = random.sample(label1_is, label_diff, counts=([label_diff] * len(label1_is)))
    pad_values = ds_values + [ds_values[i] for i in tqdm(label1_is_sample, desc="adding duplicates for positions")]
    pad_labels = ds_labels + [ds_labels[i] for i in tqdm(label1_is_sample, desc="adding duplicates for labels")]
    return pad_values, pad_labels


def prepare_dataset(ds_values, ds_labels):
    if args.ds_padded:
        pad_values, pad_labels = pad_dataset(ds_values, ds_labels)
    else:
        pad_values, pad_labels = ds_values, ds_labels

    n_values = len(pad_values)

    full_dataset = tf.data.Dataset\
        .from_tensor_slices((pad_values, pad_labels))\
        .batch(args.batch_size)\
        .shuffle(np.ceil(n_values / 2))
    n_train = int(args.train_size * n_values)
    train_dataset = full_dataset.take(n_train)
    test_dataset = full_dataset.skip(n_train)
    return train_dataset, test_dataset


def run_model(train_dataset, test_dataset):
    # build model
    model_array = [tf.keras.layers.InputLayer(input_shape=4)]
    for i in range(args.layers):
        model_array.append(tf.keras.layers.Dense(args.layer_size, activation='relu'))
    model_array.append(tf.keras.layers.Flatten())
    model_array.append(tf.keras.layers.Dense(1, activation='sigmoid'))
    dense_model = tf.keras.Sequential(model_array)
    dense_model.compile(
        optimizer='adam',
        loss=tf.keras.losses.BinaryCrossentropy(),  # TODO try other loss function
        metrics=[
            'accuracy',
            tf.keras.metrics.Recall(thresholds=0),
            tf.keras.metrics.AUC(
                curve="PR"
            ),
        ]
    )
    callbacks = []
    if args.wandb:
        callbacks.append(WandbCallback())
    # run model
    dense_model.fit(train_dataset, epochs=args.epochs, callbacks=callbacks, verbose=1)
    eval_result = list(dense_model.evaluate(test_dataset, verbose=1)) # list(loss, acc, recall, auc)
    return dense_model, eval_result

## iterate over un-padded edges and calculate https://discord.com/channels/934839185855086662/988688735161946144/1070345614782632017
## - does the padding change the result?
## - weight edges higher than non-edges
## - use as loss function and metric

## mathematically define the BinaryCrossentropy in this model
## - used to compare to fastgae
## - compare to other loss functions

In [11]:
# run multiple times
results = []
models = []
for iteration in range(3):
    print(f'starting iteration {iteration}')
    # generate graph
    graph, node_positions, _ = gen_disc_graph(args.graph_size, args.graph_average_degree, args.rg_radius)
    # create & run model
    node_pairs, ds_values, ds_labels = parse_graph(graph)
    ds_train, ds_test = prepare_dataset(ds_values, ds_labels)
    model, result = run_model(ds_train, ds_test)
    results.append(result)
    models.append((model, graph, node_positions, node_pairs, ds_values, ds_labels))

losses, accs, recalls, aucs = zip(*results)

print(f'finished training models')
print(f'avg_loss: {np.average(losses)}  std_loss: {np.std(losses)}')

starting iteration 0


generating edge pairs: 100%|██████████| 1000/1000 [00:00<00:00, 1786.98it/s]
mapping edge positions: 100%|██████████| 499500/499500 [00:00<00:00, 586821.00it/s]
creating labels for edges: 100%|██████████| 499500/499500 [00:00<00:00, 2720359.37it/s]
generating duplicates: 499500it [00:00, 4757282.42it/s]
adding duplicates for positions: 100%|██████████| 489750/489750 [00:00<00:00, 4604698.00it/s]
adding duplicates for labels: 100%|██████████| 489750/489750 [00:00<00:00, 5364029.30it/s]


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  numdigits = int(np.log10(self.target)) + 1


OverflowError: cannot convert float infinity to integer

In [None]:
# get best model
(model, graph, node_positions, node_pairs, ds_values, ds_labels) = models[(losses.index(min(losses)))]
predictions = [pred[0] for pred in model.predict(ds_values)]

In [None]:
# print best model
fig, ax = plt.subplots(1, 2)

# print original graph
ax[0].set_axis_off()
ax[0].set_aspect('equal')
ax[0].set_title("original graph")
nx.draw_networkx(graph, node_positions, ax=ax[0], node_size=5, with_labels=False, labels={})

# generate predict graph
threshold = 0.1
colors_filtered = np.array([pred for pred in tqdm(predictions, desc=f'generating colors') if pred > threshold])
colormap = sns.color_palette("flare", as_cmap=True)
pred_graph = nx.Graph()
pred_node_pairs = [edge for i, edge in enumerate(node_pairs) if predictions[i] > threshold]
pred_graph.add_edges_from(pred_node_pairs)

# print predicted graph
ax[1].set_axis_off()
ax[1].set_aspect('equal')
ax[1].set_title("reconstructed graph")
nx.draw_networkx(pred_graph, node_positions, ax=ax[1], node_size=5, with_labels=False, labels={}, edge_color=colors_filtered, edge_cmap=colormap)

# add color bar for predictions
cax = fig.add_axes([ax[1].get_position().x1 + 0.01, ax[1].get_position().y0, 0.02, ax[1].get_position().height])
fig.colorbar(mpl.cm.ScalarMappable(cmap=colormap), cax=cax, label="confidence")

plt.savefig('./filename.png', dpi=300)
plt.show()

# OLD

In [None]:
if args.wandb:
    wandb.finish()

### Additional Plots

Additional plots of information about the decoder.

In [None]:
distances = [math.dist([px, py], [qx, qy]) for [px, py, qx, qy] in ds_edges_pos]
fig, ax = plt.subplots()
ax.scatter(distances, edge_prediction, s=0.01)
plt.show()
## smaller points
## also with lines between (has to be sorted first)
## plot into #of-edges per distance (see how much data/information the nn gets per distance)
## more points (for more information around threshold distance)

## get threshold back with ml/wsk-theory

In [None]:
fig, ax = plt.subplots()
ax.hist2d([dist for dist in distances], edge_prediction, bins=(np.arange(0, 1, 0.01), np.arange(0, 1, 0.01)))
ax.set(xlim=(0, 1), ylim=(0, 1))
plt.show()

In [None]:
test_distances = np.arange(0, 1, 0.01)
dist_dup = 10
test_edges = [(d, gen_disc_edge(d)) for d in np.tile(test_distances, dist_dup)]
test_edges

In [None]:
test_predictions = dense_model.predict([edge for (_, edge) in test_edges]) # cartesian
test_edge_prediction = [pred[0] for pred in test_predictions]
len(test_edges), len(test_edge_prediction)

In [None]:
fig, ax = plt.subplots()
ax.scatter([d for (d, _) in test_edges], test_edge_prediction)
plt.show()

In [None]:
sorted_dist, sorted_pred = zip(*sorted(zip([d for (d, _) in test_edges], test_edge_prediction), key = lambda x: x[0]))
fig, ax = plt.subplots()
ax.plot(sorted_dist, sorted_pred, marker='o', linewidth=1, markersize=3)
plt.show()

In [None]:
fig, ax = plt.subplots()
ax.hist2d([d for (d, _) in test_edges], test_edge_prediction, bins=(np.arange(0, 1, 0.1), np.arange(0, 1, 0.1)))
ax.set(xlim=(0, 1), ylim=(0, 1))

plt.show()