<a href="https://colab.research.google.com/github/LaZzyMan/Notebook/blob/master/sdne.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math
import networkx as nx
import numpy as np
from functools import reduce
import pickle
import os
import numpy
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras import Model, backend as K, regularizers
from tensorflow.keras.layers import Dense, Embedding, Input, Multiply, Subtract, Lambda, BatchNormalization
from tensorflow.keras.layers import LeakyReLU
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed
import psycopg2
import geopandas as gpd

# sdne-keras

In [None]:
def preprocess_nxgraph(graph):
    """Build the lookup tables between graph nodes and integer indices.

    Indices are assigned in the iteration order of ``graph.nodes()``.

    Args:
        graph: object exposing a ``nodes()`` method that yields hashable nodes.

    Returns:
        A tuple ``(idx2node, node2idx)`` where ``idx2node`` is a list mapping
        index -> node and ``node2idx`` is a dict mapping node -> index.
    """
    idx2node = list(graph.nodes())
    node2idx = {node: position for position, node in enumerate(idx2node)}
    return idx2node, node2idx

In [None]:
def build_reconstruction_loss(beta):
    """Create the second-order proximity (reconstruction) loss.

    Args:
        beta: accepted for interface compatibility; the current loss does not
            read it (it was the non-zero-entry weight of an earlier variant).

    Returns:
        A Keras-style loss ``reconstruction_loss(true_y, pred_y)``.
    """
    def reconstruction_loss(true_y, pred_y):
        # Earlier variants tried here and dropped: a beta-weighted absolute
        # difference, and a Pearson-style similarity meant to remove the
        # influence of scale.
        #
        # Current form: a Manhattan-style distance that dampens the influence
        # of edge weights and also penalises sparse predictions. Both the
        # element-wise max and min are shifted by 0.1 before the gap is
        # divided by the shifted min.
        upper = K.maximum(true_y, pred_y) + .1
        lower = K.minimum(true_y, pred_y) + .1
        relative_gap = (upper - lower) / lower
        # Mean over the last axis first, then over what remains.
        per_sample = K.mean(relative_gap, axis=-1)
        return K.mean(per_sample)

    return reconstruction_loss

In [None]:
def edge_wise_loss(true_y, embedding_diff):
    """First-order proximity loss.

    Sums ``embedding_diff`` over its last axis and averages the result.
    ``true_y`` is ignored; it exists only because Keras passes a target to
    every loss function.
    (A previous variant summed the squared differences instead.)
    """
    per_edge = K.sum(embedding_diff, axis=-1)
    return K.mean(per_edge)

In [None]:
class SDNE:
    """Structural Deep Network Embedding (SDNE) model built with Keras.

    Each node index is looked up in a frozen embedding table holding the
    adjacency matrix, pushed through a stack of dense encoding layers down to
    ``encode_dim`` and back up through dense decoding layers to ``N`` outputs.
    Training runs on edges: the adjacency rows of both endpoints are
    reconstructed (second-order term) and a first-order term
    ``-w * log(sigmoid(<y_i, y_j>))`` is computed from the two encodings.

    Attributes:
        model: the full three-output model trained by :meth:`fit`.
        encoder: model mapping a node index to its encoding.
        auto_encoder: model mapping a node index to its reconstructed
            adjacency row; trained by :meth:`pretrain`.
    """

    def __init__(self,
                 graph,
                 encode_dim,
                 encoding_layer_dims,
                 weight='weight',
                 beta=2, alpha=2, theta=2,
                 l2_param=1):
        """Build and compile the models.

        Args:
            graph: networkx graph to embed.
            encode_dim: size of the final encoding (embedding) layer.
            encoding_layer_dims: sizes of the hidden encoding layers that
                precede the ``encode_dim`` layer. The sequence is copied and
                never modified.
            weight: edge attribute holding the edge weight; edges without it
                get weight 1.0 in the first-order term.
            beta: forwarded to ``build_reconstruction_loss``.
            alpha: loss weight of the first-order output.
            theta: loss weight of each of the two reconstruction outputs.
            l2_param: L2 kernel-regularisation factor of every dense layer.
        """
        self.encode_dim = encode_dim
        self.graph = graph
        self.idx2node, self.node2idx = preprocess_nxgraph(self.graph)
        self.N = graph.number_of_nodes()
        # Use the same edge attribute for the adjacency rows as for the
        # per-edge weights below.
        self.adj_mat = nx.adjacency_matrix(self.graph, weight=weight).toarray()
        self.edges = np.array(list(self.graph.edges()))
        weights = [graph[u][v].get(weight, 1.0)
                   for u, v in self.graph.edges()]
        # Column vector aligned row-for-row with ``self.edges``.
        self.weights = np.array(weights, dtype=np.float32)[:, None]

        input_a = Input(shape=(1,), name='input-a', dtype='int32')
        input_b = Input(shape=(1,), name='input-b', dtype='int32')
        edge_weight = Input(shape=(1,), name='edge_weight', dtype='float32')

        encoding_layers = []
        decoding_layers = []
        # Frozen lookup table: row i holds the adjacency row of node i.
        embedding_layer = Embedding(output_dim=self.N, input_dim=self.N,
                                    trainable=False, input_length=1, name='nbr-table')
        embedding_layer.build((None,))
        embedding_layer.set_weights([self.adj_mat])

        encoding_layers.append(embedding_layer)
        # Collapse the length-1 sequence axis produced by the lookup.
        encoding_layers.append(Lambda(lambda x: tf.reduce_sum(x, axis=1, keepdims=False), name='reduce-dim'))

        # encoding -- work on a copy so the caller's list is left untouched
        encoding_layer_dims = list(encoding_layer_dims) + [encode_dim]
        for i, dim in enumerate(encoding_layer_dims):
            layer = Dense(dim, activation=LeakyReLU(alpha=.3),
                          kernel_regularizer=regularizers.l2(l2_param),
                          name='encoding-layer-{}'.format(i))
            encoding_layers.append(layer)

        # decoding -- mirror of the encoder, ending with N outputs
        decoding_layer_dims = encoding_layer_dims[::-1][1:] + [self.N]
        for i, dim in enumerate(decoding_layer_dims):
            layer = Dense(
                dim, activation='relu',
                kernel_regularizer=regularizers.l2(l2_param),
                name='decoding-layer-{}'.format(i))
            decoding_layers.append(layer)
        all_layers = encoding_layers + decoding_layers

        # The same layer objects are applied to both inputs, so the two
        # branches share their weights.
        encoded_a = reduce(lambda arg, f: f(arg), encoding_layers, input_a)
        encoded_b = reduce(lambda arg, f: f(arg), encoding_layers, input_b)
        decoded_a = reduce(lambda arg, f: f(arg), all_layers, input_a)
        decoded_b = reduce(lambda arg, f: f(arg), all_layers, input_b)

        # First-order term: changed from the Laplacian-Eigenmaps form to a
        # KL-divergence form with the constant dropped:
        # -w * log(sigmoid(y_i . y_j)).
        embedding_diff = Multiply()([encoded_a, encoded_b])
        embedding_diff = Lambda(lambda x: - K.log(K.sigmoid(tf.reduce_sum(x, axis=-1, keepdims=True))))(embedding_diff)
        embedding_diff = Multiply(name='1st')([embedding_diff, edge_weight])

        self.model = Model([input_a, input_b, edge_weight], [decoded_a, decoded_b, embedding_diff])

        reconstruction_loss = build_reconstruction_loss(beta)

        self.model.compile(optimizer=optimizers.Adam(),
                           loss=[reconstruction_loss, reconstruction_loss, edge_wise_loss],
                           loss_weights=[theta, theta, alpha])

        self.encoder = Model(input_a, encoded_a)

        # for pre-training
        self.auto_encoder = Model(input_a, decoded_a)
        self.auto_encoder.compile(optimizer=optimizers.Adam(), loss=reconstruction_loss)

        self._embeddings = {}

    def pretrain(self, **kwargs):
        """Pre-train the auto encoder on adjacency rows, without edges.

        Args:
            **kwargs: passed through to ``auto_encoder.fit``.
        """
        nodes = np.arange(self.graph.number_of_nodes())
        node_neighbors = self.adj_mat[nodes]

        self.auto_encoder.fit(nodes[:, None],
                              node_neighbors,
                              shuffle=True,
                              **kwargs)

    def train_data_generator(self, batch_size=32):
        """Yield edge batches forever for :meth:`fit`.

        The edges are shuffled once; every pass then walks them in that
        order. Each item is ``([idx_a, idx_b, weights], [nbrs_a, nbrs_b,
        dummy])`` where ``idx_*`` are ``(batch, 1)`` node indices, ``weights``
        the ``(batch, 1)`` edge weights, ``nbrs_*`` the adjacency rows of the
        endpoints and ``dummy`` a zero target for the first-order output.

        Args:
            batch_size: number of edges per batch (the last batch of a pass
                may be smaller).

        Raises:
            ValueError: if the graph has no edges (the loop below would
                otherwise spin forever without yielding).
        """
        m = self.graph.number_of_edges()
        if m == 0:
            raise ValueError('cannot generate training batches: the graph has no edges')

        # Resolve every endpoint to its row index once instead of per batch.
        edge_idx = np.array([[self.node2idx[u], self.node2idx[v]]
                             for u, v in self.edges])
        # One permutation keeps indices and weights aligned.
        order = np.random.permutation(m)
        edge_idx = edge_idx[order]
        edge_weights = self.weights[order]
        n_batches = math.ceil(m / batch_size)

        while True:
            for i in range(n_batches):
                sel = slice(i * batch_size, (i + 1) * batch_size)
                nodes_a_idx = edge_idx[sel, 0][:, None]
                nodes_b_idx = edge_idx[sel, 1][:, None]
                weights = edge_weights[sel]

                # this can become quadratic if using dense
                neighbors_a = self.adj_mat[nodes_a_idx.ravel()]
                neighbors_b = self.adj_mat[nodes_b_idx.ravel()]

                # Placeholder target for the '1st' output, whose shape is
                # (batch, 1); edge_wise_loss never reads it.
                dummy_output = np.zeros((nodes_a_idx.shape[0], 1))

                yield ([nodes_a_idx, nodes_b_idx, weights],
                       [neighbors_a, neighbors_b, dummy_output])

    def fit(self, log=False, **kwargs):
        """Train the full model on edge batches.

        Args:
            log: when true, add a TensorBoard callback writing to ``./log``.
            **kwargs: keyword arguments passed to ``model.fit``. ``callbacks``
                is merged with the logging callback; ``batch_size`` is taken
                out and used for the batch generator instead (default 32).
        """
        callbacks = []
        if log:
            callbacks.append(keras.callbacks.TensorBoard(
                log_dir='./log', histogram_freq=0,
                write_graph=True, write_images=False))
        callbacks += kwargs.pop('callbacks', [])

        batch_size = kwargs.pop('batch_size', 32)
        gen = self.train_data_generator(batch_size=batch_size)

        self.model.fit(
            gen,
            shuffle=True,
            callbacks=callbacks,
            **kwargs)

    def get_node_embedding(self):
        """Encode every node and return a ``{node: embedding}`` dict.

        The dict is also kept on the instance and updated on every call.
        """
        nodes = np.array([self.node2idx[node] for node in self.graph.nodes()])[:, None]
        embeddings = self.encoder.predict(nodes)
        for embedding, node in zip(embeddings, self.graph.nodes):
            self._embeddings[node] = embedding
        return self._embeddings

    def save(self, path):
        """Save the full training model to ``path`` via ``Model.save``."""
        self.model.save(path)

In [None]:
def train(graph, alpha, theta, l2_param, pretrain_epochs, epochs, beta, encoding_layer_dims, batch_size):
    """Train an SDNE model on ``graph``, then save and evaluate its embeddings.

    Builds a 128-dimensional SDNE, pre-trains the auto encoder, trains the
    full model, pickles the embeddings to Google Drive and runs
    ``evaluate_embeddings`` against the module-level ``labels`` mapping.

    Args:
        graph: networkx graph to embed.
        alpha, theta, l2_param, beta: forwarded to ``SDNE``.
        pretrain_epochs: number of epochs for auto-encoder pre-training
            (which always uses a batch size of 32).
        epochs: number of epochs for the full model.
        encoding_layer_dims: hidden encoding layer sizes; a copy is handed
            to ``SDNE`` so the caller's list cannot be modified.
        batch_size: number of edges per batch for the full model.
    """
    model = SDNE(graph,
                 encode_dim=128,
                 encoding_layer_dims=list(encoding_layer_dims),
                 beta=beta,
                 theta=theta,
                 alpha=alpha,
                 l2_param=l2_param)
    model.pretrain(epochs=pretrain_epochs, batch_size=32)
    # One epoch is one pass over all edges.
    n_batches = math.ceil(graph.number_of_edges() / batch_size)
    model.fit(epochs=epochs, log=True, batch_size=batch_size, steps_per_epoch=n_batches)
    embeddings = model.get_node_embedding()
    # The context manager closes the file; no explicit close() is needed.
    with open('/content/drive/My Drive/Data/sdne_keras_embedding.pickle', 'wb') as f:
        pickle.dump(embeddings, f)
    evaluate_embeddings(embeddings, labels, 'sdne_keras_eva')

# evaluation

In [None]:
class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier that predicts the ``k`` most probable labels."""

    def predict(self, X, top_k_list):
        """Return a 0/1 indicator matrix with the top-``k`` labels per sample.

        Args:
            X: samples, passed to ``predict_proba``.
            top_k_list: for sample ``i``, ``top_k_list[i]`` is the number of
                labels to select. A value of 0 (or less) selects none.

        Returns:
            ``numpy`` array with one row per entry of ``top_k_list``; the
            columns of the ``k`` highest probabilities are 1, all others 0.
        """
        probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            row = probs[i, :]
            ranked = row.argsort()
            # Select by column position rather than through ``classes_``
            # values, which only coincide with positions when the classes are
            # 0..n-1. ``[-0:]`` would select every column, hence the guard.
            top_columns = ranked[-k:] if k > 0 else ranked[:0]
            row[:] = 0
            row[top_columns] = 1
            all_labels.append(row)
        return numpy.asarray(all_labels)

In [None]:
class Classifier(object):
    """Node classifier operating on pre-computed node embeddings.

    Wraps ``clf`` in a ``TopKRanker`` and binarises label sets with a
    ``MultiLabelBinarizer``.
    """

    def __init__(self, embeddings, clf):
        """Store the ``node -> vector`` mapping and wrap the base estimator."""
        self.embeddings = embeddings
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self, X, Y, Y_all):
        """Fit the binarizer on ``Y_all`` and the classifier on nodes ``X`` with labels ``Y``."""
        self.binarizer.fit(Y_all)
        features = [self.embeddings[node] for node in X]
        targets = self.binarizer.transform(Y)
        self.clf.fit(features, targets)

    def evaluate(self, X, Y):
        """Predict one label per node and score against ``Y``.

        Returns:
            dict with keys ``'micro'`` and ``'macro'`` (F1 scores) and
            ``'acc'`` (accuracy).
        """
        # Exactly one label is predicted per sample, whatever the size of
        # the true label set.
        predicted = self.predict(X, [1] * len(Y))
        expected = self.binarizer.transform(Y)
        results = {
            average: f1_score(expected, predicted, average=average)
            for average in ("micro", "macro")
        }
        results['acc'] = accuracy_score(expected, predicted)
        return results

    def predict(self, X, top_k_list):
        """Return the top-k indicator matrix for the nodes in ``X``."""
        features = numpy.asarray([self.embeddings[node] for node in X])
        return self.clf.predict(features, top_k_list=top_k_list)

    def split_train_evaluate(self, X, Y, train_precent):
        """Shuffle, train on the first ``train_precent`` share, evaluate on the rest.

        The global numpy random state is captured before the shuffle and
        restored before evaluation.

        Returns:
            The result dict of :meth:`evaluate` on the held-out part.
        """
        saved_state = numpy.random.get_state()
        training_size = int(train_precent * len(X))
        order = numpy.random.permutation(numpy.arange(len(X)))

        def gather(items, positions):
            # Pick ``items`` in shuffled order for the given positions.
            return [items[order[p]] for p in positions]

        head = range(training_size)
        tail = range(training_size, len(X))
        X_train = gather(X, head)
        Y_train = gather(Y, head)
        X_test = gather(X, tail)
        Y_test = gather(Y, tail)

        self.train(X_train, Y_train, Y)
        numpy.random.set_state(saved_state)
        return self.evaluate(X_test, Y_test)

In [None]:
def evaluate_embeddings(embeddings, labels, filename):
    """Score the embeddings with logistic regression at several train ratios.

    For train fractions 10%, 20%, 50% and 80% a ``Classifier`` is trained and
    evaluated; the micro-F1, macro-F1 and accuracy are printed as a table
    and written to ``/content/drive/My Drive/Data/<filename>.csv``.

    Args:
        embeddings: mapping ``node -> embedding vector``.
        labels: mapping ``node -> label string`` (surrounding whitespace is
            stripped).
        filename: base name (without extension) of the CSV to write.
    """
    X = []
    Y = []
    for node, label in labels.items():
        X.append(node)
        # MultiLabelBinarizer expects an iterable of labels per sample. A
        # bare string would be split into its characters, so wrap it in a
        # one-element list.
        Y.append([label.strip()])
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression(solver='liblinear'))
    results = {'train_percent': [], 'micro-f1': [], 'macro-f1': [], 'acc': []}
    for tr_frac in [.1, .2, .5, .8]:
        result = clf.split_train_evaluate(X, Y, tr_frac)
        results['train_percent'].append('%02d%%' % (tr_frac * 100))
        results['micro-f1'].append(result['micro'])
        results['macro-f1'].append(result['macro'])
        results['acc'].append(result['acc'])
    df = pd.DataFrame(results)
    print(df)
    df.to_csv('/content/drive/My Drive/Data/%s.csv' % filename)

def f1_micro_percent_50(embeddings, labels):
    """Return the micro-F1 of a logistic-regression classifier at a 50% train split.

    Args:
        embeddings: mapping ``node -> embedding vector``.
        labels: mapping ``node -> label string`` (surrounding whitespace is
            stripped).
    """
    X = []
    Y = []
    for node, label in labels.items():
        X.append(node)
        # MultiLabelBinarizer expects an iterable of labels per sample. A
        # bare string would be split into its characters, so wrap it in a
        # one-element list.
        Y.append([label.strip()])
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression(solver='liblinear'))
    result = clf.split_train_evaluate(X, Y, .5)
    return result['micro']

# train

In [None]:
# Load the input graph and the node labels from Google Drive.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
# The context managers close both files; no explicit close() is needed.
with open('/content/drive/My Drive/Data/graph_undirected_20151201.gpickle', 'rb') as f:
    graph = pickle.load(f)
with open('/content/drive/My Drive/Data/label_100.pickle', 'rb') as f:
    labels = pickle.load(f)

In [None]:
# Full training run: 30 pre-training epochs, then 10 epochs on edge batches.
train(
    graph,
    alpha=2.,
    theta=1.,
    l2_param=1e-3,
    pretrain_epochs=30,
    epochs=10,
    beta=3,
    encoding_layer_dims=[500, 200],
    batch_size=128,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
  train_percent  micro-f1  macro-f1       acc
0           10%  0.630466  0.415060  0.630466
1           20%  0.665574  0.424167  0.665574
2           50%  0.664042  0.425106  0.664042
3           80%  0.662295  0.453306  0.662295


In [None]:
def fit_with_param(param):
    """Train one SDNE configuration and score it.

    Args:
        param: dict with keys ``'theta'``, ``'alpha'``, ``'l2_param'`` and
            ``'batch_size'``.

    Returns:
        ``{'params': param, 'score': micro-F1 at a 50% train split}``; the
        same dict is also printed.
    """
    model = SDNE(
        graph,
        encode_dim=128,
        encoding_layer_dims=[200],
        beta=3,
        theta=param['theta'],
        alpha=param['alpha'],
        l2_param=param['l2_param'],
    )
    model.pretrain(epochs=10, batch_size=32)

    batch_size = param['batch_size']
    # One epoch is one pass over all edges.
    steps = math.ceil(graph.number_of_edges() / batch_size)
    model.fit(epochs=10, log=True, batch_size=batch_size, steps_per_epoch=steps)

    node_vectors = model.get_node_embedding()
    outcome = {'params': param, 'score': f1_micro_percent_50(node_vectors, labels)}
    print(outcome)
    return outcome
# Exhaustive grid search: every combination below is trained and scored by
# fit_with_param, using all available cores.
param_dist = {
    'alpha': np.linspace(0.01, 1, 10),
    'theta': np.linspace(0.01, 1, 10),
    'l2_param': [1e-1, 1e-3, 1e-5],
    'batch_size': [32, 64, 128, 256, 512, 1024],
}
param_grid = ParameterGrid(param_dist)
results = Parallel(n_jobs=-1, verbose=1)(
    delayed(fit_with_param)(param) for param in param_grid
)

In [None]:
# Display the grid-search results (the list returned by the Parallel call).
results

[{'params': {'alpha': 0.1,
   'batch_size': 128,
   'l2_param': 0.001,
   'theta': 0.01},
  'score': 0.6060037523452159},
 {'params': {'alpha': 0.2,
   'batch_size': 128,
   'l2_param': 0.001,
   'theta': 0.01},
  'score': 0.6067415730337079}]

In [None]:
# Export saved node embeddings and labels for the TensorBoard embedding projector.
# `projector` was never imported in this notebook; it ships with TensorBoard.
from tensorboard.plugins import projector

# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
# NOTE(review): train() in this notebook writes 'sdne_keras_embedding.pickle',
# while this cell reads 'sdne_keras_embedding.pkl' -- confirm which file is meant.
with open('/content/drive/My Drive/Data/sdne_keras_embedding.pkl', 'rb') as f:
    embeddings = pickle.load(f)
with open('/content/drive/My Drive/Data/label.pickle', 'rb') as f:
    label = pickle.load(f)

# A {node: vector} dict must become one ROW per node; the plain DataFrame
# constructor would turn every node into a column instead.
if isinstance(embeddings, dict):
    emb_df = pd.DataFrame.from_dict(embeddings, orient='index')
else:
    emb_df = pd.DataFrame(embeddings)
emb_df.to_csv('/content/drive/My Drive/Data/sdne_keras_embedding.tsv', sep='\t', header=False, index=False)

LOG_DIR = 'log'
# The metadata file is written before anything else creates the directory.
os.makedirs(LOG_DIR, exist_ok=True)

# Use the labels loaded above (the original referenced `labels`, a name this
# cell never defines).
df = pd.Series(label, name='label')
if isinstance(embeddings, dict) and isinstance(label, dict):
    # Metadata rows must follow the same node order as the embedding rows.
    df = df.reindex(emb_df.index)
df.to_frame().to_csv(LOG_DIR + '/node_labels.tsv', index=False, header=False)

# Session/Saver/FileWriter are graph-mode APIs: reach them through
# tf.compat.v1 and run inside an explicit graph so that the cell does not
# depend on the TF1-only top-level names.
with tf.Graph().as_default():
    embedding_var = tf.Variable(emb_df.values, name='node_embeddings')

    with tf.compat.v1.Session() as sess:
        saver = tf.compat.v1.train.Saver([embedding_var])

        sess.run(embedding_var.initializer)
        saver.save(sess, os.path.join(LOG_DIR, 'embeddings.ckpt'))

        config = projector.ProjectorConfig()
        # One can add multiple embeddings.
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        # Link this tensor to its metadata file (e.g. labels).
        embedding.metadata_path = 'node_labels.tsv'
        # Saves a config file that TensorBoard will read during startup.
        projector.visualize_embeddings(tf.compat.v1.summary.FileWriter(LOG_DIR), config)