# GCN

Semi-Supervised Classification with Graph Convolutional Networks

https://arxiv.org/abs/1609.02907



In [None]:
import sys
sys.path.append("..")
sys.dont_write_bytecode = True

import os
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from deep_recommenders.layers.gnn import GCN

## 1 数据获取与处理

### 1.1 数据集下载

CORA数据集下载地址: https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz

In [None]:
!wget https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz -O cora.tgz
!tar -zxvf cora.tgz

### 1.2 读取数据集
- Features 按行归一化（每一行除以该行元素之和）
- Labels 进行 one-hot 编码


In [None]:
# Load cora.content as a table of strings. Column 0 is used further down as
# the node id, the last column is the class label, and everything in between
# is treated as the node's feature vector.
idx_features_labels = np.genfromtxt(os.path.join("cora", "cora.content"), dtype=np.dtype(str))
features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
# Row-normalise: divide every row by the sum of its entries.
# NOTE(review): a row whose sum is 0 is not guarded against here.
features /= features.sum(axis=1).reshape(-1, 1)

# labels one-hot encoding
labels = idx_features_labels[:, -1]
# Sort the class names so that the class -> column assignment is the same on
# every run; iterating a bare set of strings gives no such guarantee.
classes = sorted(set(labels))
# Row i of the identity matrix is the one-hot vector of the i-th class.
classes_dict = dict(zip(classes, np.identity(len(classes))))
onehot_labels = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)

### 1.3 构建邻接矩阵

In [None]:
def convert_symmetric(X, sparse=True):
    """Return ``X + X.T - diag(X)`` without modifying ``X``.

    Args:
        X: Square matrix. A SciPy sparse matrix when ``sparse`` is true,
            otherwise a dense NumPy array.
        sparse: Selects whether the diagonal term is built with
            ``scipy.sparse.diags`` (sparse input) or ``numpy.diag`` (dense
            input).

    Returns:
        A new matrix holding ``X + X.T`` with the diagonal of ``X`` counted
        only once. The input object is left untouched in both branches
        (previously the dense branch updated the caller's array in place).
    """
    diagonal = sp.diags(X.diagonal()) if sparse else np.diag(X.diagonal())
    # Build a fresh result rather than using ``+=`` so that a dense input is
    # never mutated behind the caller's back.
    return X + (X.T - diagonal)

def normalize_adj(adj, symmetric=True):
    """Scale a sparse adjacency matrix by its row sums.

    Args:
        adj: SciPy sparse adjacency matrix.
        symmetric: If true, compute ``(adj . D)^T . D`` with
            ``D = diag(rowsum ** -0.5)``. Otherwise compute ``D . adj`` with
            ``D = diag(rowsum ** -1)``.

    Returns:
        The scaled matrix in CSR format.

    Rows whose sum is zero get a scale factor of 0 instead of ``inf``, so the
    result does not pick up non-finite entries from them.
    """
    degrees = np.asarray(adj.sum(1)).flatten()
    if not np.issubdtype(degrees.dtype, np.floating):
        # Negative powers of integer arrays are either rejected or truncated
        # by NumPy, so work in floating point.
        degrees = degrees.astype(np.float64)

    exponent = -0.5 if symmetric else -1.0
    with np.errstate(divide="ignore"):
        scale = np.power(degrees, exponent)
    # 0 ** negative == inf; treat zero-sum rows as "no scaling contribution".
    scale[np.isinf(scale)] = 0.0
    d = sp.diags(scale, 0)

    if symmetric:
        return adj.dot(d).transpose().dot(d).tocsr()
    return d.dot(adj).tocsr()

def preprocess_adj(adj, symmetric=True):
    """Add the identity matrix to ``adj`` and normalise the sum.

    Args:
        adj: Square sparse adjacency matrix.
        symmetric: Forwarded unchanged to ``normalize_adj``.

    Returns:
        Whatever ``normalize_adj`` returns for ``adj + I``.
    """
    num_nodes = adj.shape[0]
    return normalize_adj(adj + sp.eye(num_nodes), symmetric)

# Build the adjacency matrix
# Column 0 of cora.content holds the node ids; map each id to its row index.
idx = idx_features_labels[:, 0].astype(np.int32)
idx_map = {node_id: row for row, node_id in enumerate(idx)}

# Each row of cora.cites is a pair of node ids; translate both to row indices.
edges_unordered = np.genfromtxt(os.path.join("cora", "cora.cites"), dtype=np.int32)
edges = np.array(
    [idx_map.get(node_id) for node_id in edges_unordered.flatten()],
    dtype=np.int32,
).reshape(edges_unordered.shape)

# One entry of weight 1 per listed edge, in a (num_nodes x num_nodes) matrix.
num_nodes = onehot_labels.shape[0]
adj = sp.coo_matrix(
    (np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
    shape=(num_nodes, num_nodes),
    dtype=np.float32,
)
# Symmetrise, then add self-loops and normalise.
adj = preprocess_adj(convert_symmetric(adj))

### 1.4 切割数据集

In [None]:
def sample_mask(idx, l):
    """Return a boolean vector of length ``l`` that is True at ``idx``.

    Args:
        idx: Index or sequence of indices to switch on.
        l: Length of the mask.

    Returns:
        1-D ``numpy.ndarray`` of dtype ``bool``.
    """
    # The builtin ``bool`` is used as dtype: the ``np.bool`` alias no longer
    # exists in current NumPy releases.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask


def get_splits(y, train_per_class=20, num_val=500, num_test=1000):
    """Split nodes into train / validation / test sets.

    The first ``train_per_class`` nodes of every class (in row order) form
    the training set. The remaining nodes, kept in ascending index order,
    supply the next ``num_val`` validation nodes followed by ``num_test``
    test nodes.

    Args:
        y: 2-D array of one-hot labels, one row per node.
        train_per_class: Maximum number of training nodes taken per class.
        num_val: Number of validation nodes.
        num_test: Number of test nodes.

    Returns:
        Tuple ``(y_train, y_val, y_test, train_mask, val_mask, test_mask)``.
        Each ``y_*`` is an int32 array shaped like ``y`` that is zero outside
        its split; each ``*_mask`` is a boolean vector over the nodes.
    """
    idx_train = []
    taken_per_class = {}
    for node, row in enumerate(y):
        class_id = int(np.argmax(row))
        taken = taken_per_class.get(class_id, 0)
        if taken < train_per_class:
            idx_train.append(node)
            taken_per_class[class_id] = taken + 1

    # Walk the indices in order instead of going through a set difference, so
    # the validation/test assignment never depends on set iteration order.
    train_set = set(idx_train)
    idx_val_test = [node for node in range(len(y)) if node not in train_set]
    idx_val = idx_val_test[:num_val]
    idx_test = idx_val_test[num_val:num_val + num_test]

    def _labels_for(indices):
        # Copy the label rows of ``indices``; every other row stays zero.
        out = np.zeros(y.shape, dtype=np.int32)
        out[indices] = y[indices]
        return out

    num_nodes = y.shape[0]
    return (
        _labels_for(idx_train),
        _labels_for(idx_val),
        _labels_for(idx_test),
        sample_mask(idx_train, num_nodes),
        sample_mask(idx_val, num_nodes),
        sample_mask(idx_test, num_nodes),
    )

# Split the one-hot labels into per-split label arrays and boolean node masks.
y_train, y_val, y_test, train_mask, val_mask, test_mask = get_splits(onehot_labels)

## 2 模型构建与训练

### 2.1 构建模型

In [None]:
def build_gcn(adj_dim, embddings_dim, num_classes):
    """Assemble a three-layer GCN as a Keras functional model.

    Args:
        adj_dim: Width of one adjacency-matrix row fed to the model.
        embddings_dim: Width of one node-feature row fed to the model.
        num_classes: Number of units in the final softmax GCN layer.

    Returns:
        ``tf.keras.Model`` taking ``[adjacency, features]`` as inputs and
        producing the output of the last GCN layer.
    """
    adj_input = tf.keras.layers.Input(shape=(adj_dim,))
    feature_input = tf.keras.layers.Input(shape=(embddings_dim,))

    # Two hidden GCN layers of 64 units each, all sharing the same adjacency.
    hidden = feature_input
    for units in (64, 64):
        hidden = GCN(units)(hidden, adj_input)

    outputs = GCN(num_classes, activation="softmax")(hidden, adj_input)
    return tf.keras.Model([adj_input, feature_input], outputs)

### 2.2 训练模型

In [None]:
# Build and compile the model; metrics are weighted by the per-node masks.
model = build_gcn(adj.shape[-1], features.shape[-1], y_train.shape[1])
model.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(0.01),
    weighted_metrics=["acc"],
)

# Every node goes through the network in a single, unshuffled batch; the
# masks passed as sample weights pick out the nodes that count for each split.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5)
validation_data = ([adj, features], y_val, val_mask)
model.fit(
    x=[adj, features],
    y=y_train,
    sample_weight=train_mask,
    validation_data=validation_data,
    batch_size=adj.shape[0],
    epochs=200,
    shuffle=False,
    verbose=2,
    callbacks=[early_stopping],
)

### 2.3 模型评估（测试集）

In [None]:
# Evaluate on the test split: one batch with all nodes, weighted by test_mask.
eval_results = model.evaluate(
    x=[adj, features],
    y=y_test,
    sample_weight=test_mask,
    batch_size=adj.shape[0],
    verbose=0,
)
# Index 0 is the loss, index 1 the weighted accuracy metric set at compile time.
test_loss, test_accuracy = eval_results[0], eval_results[1]
print("Test Loss: {:.4f}".format(test_loss))
print("Test Accuracy: {:.4f}".format(test_accuracy))

## 3 可视化Embedding

### 3.1 提取节点Embedding 

In [None]:
# Wrap the trained network so that the output of its last layer can be read
# out for every node in one pass.
# NOTE(review): layers[-1] is the final softmax GCN layer, so these vectors
# are class probabilities; confirm whether a hidden layer was intended.
embedding_model = tf.keras.Model(
    inputs=model.input,
    outputs=model.layers[-1].output,
)
embedding_weights = embedding_model.predict(
    [adj, features],
    batch_size=adj.shape[0],
)

### 3.2 TSNE可视化

In [None]:
def plot_embeddings(embeddings, X, Y, save_path="assets/gcn_embeddings.png"):
    """Project node embeddings to 2-D with t-SNE and plot them by label.

    Args:
        embeddings: Container indexable by the entries of ``X``; each lookup
            yields one embedding vector.
        X: Sequence of keys selecting which embeddings to plot.
        Y: Sequence of labels where ``Y[i]`` belongs to the i-th entry of
            ``X``. Labels must be hashable after slicing (e.g. strings).
        save_path: File the figure is written to. Its parent directory is
            created when it does not exist yet.

    The figure is saved to ``save_path`` and then shown.
    """
    emb_list = np.array([embeddings[k] for k in X])

    # Named ``tsne`` so it does not shadow the notebook-level ``model``.
    tsne = TSNE(n_components=2)
    node_pos = tsne.fit_transform(emb_list)

    # Group point positions by label so each label gets one scatter series.
    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][:], []).append(i)

    for label, members in color_idx.items():
        plt.scatter(node_pos[members, 0], node_pos[members, 1], label=label)
    plt.legend()

    # savefig does not create missing directories itself.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(save_path)
    plt.show()

# Read the class label (last column of cora.content) for every node, then
# plot all node embeddings coloured by that label.
content_path = os.path.join("cora", "cora.content")
y = np.genfromtxt(content_path, dtype=np.dtype(str))[:, -1]
node_ids = np.arange(adj.shape[0])
plot_embeddings(embedding_weights, node_ids, y)