In [1]:
import tensorflow as tf

import networkx as nx
from gem.embedding.sdne import SDNE
from gem.embedding.node2vec import node2vec
from gem.embedding.lap import LaplacianEigenmaps
from gem.utils import graph_util


import stellargraph as sg
from stellargraph.core import StellarGraph, StellarDiGraph
from stellargraph.layer import GCN, GraphSAGE
from stellargraph.mapper.full_batch_generators import FullBatchNodeGenerator
from stellargraph.mapper.sampled_node_generators import GraphSAGENodeGenerator

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.manifold import TSNE
from sklearn import preprocessing, model_selection

import pandas as pd
from time import time
import numpy as np
from Scripts.Lab03.starter_code import read_embeddings, split_data, encode_classes, calculate_metrics
import os

Instructions for updating:
non-resource variables are not supported in the long term


Using TensorFlow backend.


In [3]:
# Show which TensorFlow build the kernel picked up, so the run can be reproduced.
print(tf.__version__)
# NOTE: if errors show up with this version, falling back to TensorFlow 2.0 may help.

2.4.1


In [4]:
def read_graph(path, content_path="./data/cora/cora.content", num_features=1433):
    """Load a citation graph and the per-node feature table.

    :param path: path to the tab-separated edge list; the first column is
        read as ``target`` and the second as ``source``
    :type path: str
    :param content_path: path to the tab-separated node feature file
        (defaults to the location that used to be hard-coded here)
    :type content_path: str
    :param num_features: number of ``w_<i>`` feature columns in the feature file
    :type num_features: int
    :return: ``(graph, node_data, feature_names, num_nodes)`` where ``graph``
        is a networkx graph, ``node_data`` the feature/subject DataFrame,
        ``feature_names`` the list of feature column names and ``num_nodes``
        the number of nodes in ``graph``
    :rtype: tuple
    """
    # Load the graph from the edge list; every edge gets the same "cites" label.
    edgelist = pd.read_table(path, header=None, names=["target", "source"])
    edgelist["label"] = "cites"
    graph = nx.from_pandas_edgelist(
        edgelist, source="source", target="target", edge_attr="label"
    )
    nx.set_node_attributes(graph, "paper", "label")

    # Count every node of the graph. The previous implementation counted only
    # the distinct ``source`` ids, which misses nodes that only ever appear as
    # ``target``.
    num_nodes = graph.number_of_nodes()

    # Load the features and subject for the nodes.
    # NOTE(review): presumably the file has one more column than ``names``, so
    # pandas uses the leading column (the paper id) as the index - confirm.
    feature_names = ["w_{}".format(ii) for ii in range(num_features)]
    column_names = feature_names + ["subject"]
    node_data = pd.read_table(content_path, header=None, names=column_names)
    return graph, node_data, feature_names, num_nodes

In [5]:
def save_embeddings(file_path, embs, nodes, n_to_vec = False):
    """Save node embeddings to a text file.

    The first line of the file is ``<rows> <dims>``; each following line is a
    node name followed by the space-separated components of its vector.

    :param file_path: path to the output file
    :type file_path: str
    :param embs: matrix containing the embedding vectors, or, when
        ``n_to_vec`` is true, a mapping whose values are the vectors
    :type embs: numpy.array or dict
    :param nodes: node names, paired positionally with the embedding rows
    :type nodes: list(int)
    :param n_to_vec: set to True when ``embs`` is a dict rather than a matrix
    :type n_to_vec: bool
    :return: None
    """
    if n_to_vec:
        # Stack the dict values directly. Going through
        # ``np.array(list(embs.items()))`` builds a ragged (key, vector) array,
        # which NumPy deprecated and newer releases reject with a ValueError.
        # NOTE(review): rows follow the dict's iteration order and the keys are
        # ignored, exactly as before - confirm that order matches ``nodes``.
        embs = np.array([np.ravel(vector) for vector in embs.values()])
    with open(file_path, 'w') as f:
        f.write(f'{embs.shape[0]} {embs.shape[1]}\n')
        for node, emb in zip(nodes, embs):
            f.write(f'{node} {" ".join(map(str, emb.tolist()))}\n')

In [6]:
# Build the citation graph and read the node feature table.
graph, node_data, feature_names, numNodes = read_graph(
    "./data/cora/cora.cites"
)
# Quick look at the first feature column to confirm the table loaded.
print(node_data.loc[:, "w_0"])

31336      0
1061127    0
1106406    0
13195      0
37879      0
          ..
1128975    0
1128977    0
1128978    0
117328     0
24043      0
Name: w_0, Length: 2708, dtype: int64


In [7]:
# The three embedding methods under comparison, all with d=50.
method1 = LaplacianEigenmaps(d=50)
method2 = node2vec(d=50, max_iter=3, walk_len=20, num_walks=10, con_size=10, ret_p=4, inout_p=1)
method3 = SDNE(
    d=50, beta=5, alpha=1, nu1=1e-6, nu2=1e-6, K=3, n_units=[100, 50, ],
    n_iter=50, xeta=0.01, n_batch=500,
    modelfile=['enc_model.json', 'dec_model.json'],
    weightfile=['enc_weights.hdf5', 'dec_weights.hdf5'],
)
methods = [method1, method2, method3]

# Train each method, report the wall-clock time and write embedding<k>.txt.
for i, method in enumerate(methods):
    tm = time()
    emb, load_time = method.learn_embedding(graph=graph, edge_f=None, is_weighted=False, no_python=True,)
    print(method.get_method_name(), " -> Time for training: ", time() - tm)
    out_path = "Lab03_Output/embedding{}.txt".format(i+1)
    # Only the node2vec result needs the dict-to-matrix conversion.
    is_node2vec = method.get_method_name() == "node2vec_rw"
    save_embeddings(out_path, emb, graph.nodes(), n_to_vec=is_node2vec)

Laplacian matrix recon. error (low rank): 58.807160
lap_eigmap_svd  -> Time for training:  4.602272272109985
node2vec_rw  -> Time for training:  5.412131071090698
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
sdne  -> Time for training:  78.17769503593445


  np_embs = np.array(data)
  y[i + 1] = Dense(n_units[i], activation=activation_fn,
  y[i + 1] = Dense(n_units[i], activation=activation_fn,
  y[K] = Dense(d, activation=activation_fn,
  encoder = Model(input=x, output=y[K])
  y_hat[i] = Dense(n_units[i - 1],
  y_hat[i] = Dense(n_units[i - 1],
  y_hat[0] = Dense(node_num, activation=activation_fn,
  decoder = Model(input=y, output=x_hat)
  autoencoder = Model(input=x, output=[x_hat, y])
  x_diff1 = merge([x_hat1, x1],
  merge_layer = Merge(input_layers, mode=mode,
  x_diff2 = merge([x_hat2, x2],
  y_diff = merge([y2, y1],
  self._model = Model(input=x_in, output=[x_diff1, x_diff2, y_diff])
  self._model.fit_generator(
  self._model.fit_generator(


## Classification

In [8]:
# Read back the three embedding files written by the training cell.
embedding_paths = (
    "Lab03_Output/embedding1.txt",
    "Lab03_Output/embedding2.txt",
    "Lab03_Output/embedding3.txt",
)
emb1, emb2, emb3 = [read_embeddings(emb_path) for emb_path in embedding_paths]

In [9]:
# print(node_data['subject'].value_counts().to_frame())

# Split produced by the starter-code helpers (used with `train_targets` / `test_targets`).
train_data, test_data = split_data(node_data)
train_targets, test_targets = encode_classes(train_data, test_data)
train_features, test_features = train_data[feature_names], test_data[feature_names]
train_nodes = train_features.index.values.tolist()
test_nodes = test_features.index.values.tolist()

# Independent stratified train / validation / test split of the subjects.
# A fixed seed keeps the node-to-split assignment identical across re-runs;
# without it every execution of this cell draws a different split.
SPLIT_SEED = 42
train_subjects, holdout_subjects = model_selection.train_test_split(
    node_data['subject'],
    train_size=0.7,
    test_size=None,
    stratify=node_data['subject'],
    random_state=SPLIT_SEED,
)
# 20% of the hold-out part becomes validation data, the remainder is the test set.
val_subjects, test_subjects = model_selection.train_test_split(
    holdout_subjects,
    train_size=0.2,
    test_size=None,
    stratify=holdout_subjects,
    random_state=SPLIT_SEED,
)
target_encoding = preprocessing.LabelBinarizer()

print(train_targets, "len: {}".format(len(train_targets)))
print(test_targets, "len: {}".format(len(test_targets)))

# The *NEW targets are row-aligned with train_subjects / val_subjects /
# test_subjects, NOT with train_data / test_data above.
train_targetsNEW = target_encoding.fit_transform(train_subjects)
val_targetsNEW = target_encoding.transform(val_subjects)
test_targetsNEW = target_encoding.transform(test_subjects)
print(train_targetsNEW, "len: {}".format(len(train_targetsNEW)))
print(val_targetsNEW, "len: {}".format(len(val_targetsNEW)))
print(test_targetsNEW, "len: {}".format(len(test_targetsNEW)))

[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 0. 0.]] len: 1895
[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]] len: 813
[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 1 0 0]] len: 1895
[[0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 1 0]] len: 162
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]] len: 651


## Zad 2

**TODO:** Which classification models should be used to predict the subject from the embedding inputs?

### Zad 3

In [10]:
# For StellarGraph we are using the FullBatchNodeGenerator.
G = StellarGraph(graph, node_features=node_data[feature_names])
print(G.info())
generator = FullBatchNodeGenerator(G, method="gcn", sparse=False)

# The node ids must come from the same split as the targets: train_targetsNEW
# was encoded from train_subjects, so train_subjects.index is the matching id
# list. train_data comes from a different split (split_data); pairing it with
# these targets attaches labels to the wrong nodes without raising any error.
train_gen = generator.flow(train_subjects.index, train_targetsNEW)
gcn = GCN(layer_sizes=[16, 16], activations=['relu', 'relu'], generator=generator, dropout=0.5)

x_inp, x_out = gcn.node_model()

StellarGraph: Undirected multigraph
 Nodes: 2708, Edges: 5278

 Node types:
  paper: [2708]
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [5278]

Using GCN (local pooling) filters...
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [11]:
print(x_inp)
print(x_out)

# Softmax classification head with one unit per class column of the targets.
pred = layers.Dense(units=train_targetsNEW.shape[1], activation="softmax")(x_out)
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'tuple' object has no attribute 'layer'". Possibly the
# StellarGraph tensors are built with a different Keras implementation than
# the tensorflow.keras `layers` / `Model` used here - TODO confirm and align
# the library versions.
model = Model(inputs=x_inp, outputs=pred)
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
model.compile(
    optimizer=optimizers.Adam(learning_rate=0.01),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)
val_gen = generator.flow(val_subjects.index, val_targetsNEW)
# Stop once validation accuracy has not improved for 50 epochs and restore the best weights.
es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)

history = model.fit(
    train_gen,
    epochs=200,
    validation_data=val_gen,
    verbose=2,
    shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
    callbacks=[es_callback],
)

[<tf.Tensor 'input_4:0' shape=(1, 2708, 1433) dtype=float32>, <tf.Tensor 'input_5:0' shape=(1, ?) dtype=int32>, <tf.Tensor 'input_6:0' shape=(1, 2708, 2708) dtype=float32>]
Tensor("graph_convolution_2/ExpandDims:0", shape=(1, ?, 16), dtype=float32)


AttributeError: 'tuple' object has no attribute 'layer'

In [None]:
# Generator over the held-out test nodes, paired with their encoded targets.
test_node_ids = test_subjects.index
test_gen = generator.flow(test_node_ids, test_targetsNEW)

### Zad 4

In [13]:
# GraphSAGE setup: mini-batches of 50 nodes with sampled neighbourhoods.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: float() argument must be a string or a number, not 'dict'";
# the cause is not visible from this cell - TODO investigate.
batch_size = 50
num_samples = [10, 5]
generator = GraphSAGENodeGenerator(G, batch_size=batch_size, num_samples=num_samples)
train_gen = generator.flow(train_data.index, targets=train_targets, shuffle=True)
graphsage_model = GraphSAGE(
    layer_sizes=[32, 32],
    generator=generator,
    bias=True,
    dropout=0.5,
)

# TODO: 2-D t-SNE projection, not wired up yet.
# trans = TSNE(n_components=2)
# trans.fit_transform(X)


TypeError: float() argument must be a string or a number, not 'dict'