In [1]:
import networkx as nx
import numpy as np
import pickle as pkl
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys

### Data Loading

In [2]:
def sample_mask(index, length):
    """Return a multi-hot vector with the 1 at specified index. Used to select rows from matrix.
    
    Args:
        index: specified index for the 1 cells.
        length: the length of the mask.
    Returns:
        the multi-hot vector with 0->False, 1->True.
    """
    mask = np.zeros(length)
    mask[index] = 1
    return np.array(mask, dtype=np.bool)

def parse_index_file(file_path):
    """Read the index file to get the indices of the test nodes."""
    indices = []
    for line in open(file_path):
        indices.append(int(line.strip()))
    return indices

def load_citeseer(data_dir):
    """Loads input data from citeseer data directory
    
    Args:
        data_dir: under this folder ...
            ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
            ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
            ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
                (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
            ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
            ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
            ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
            ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
                object;
            ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
            All objects above must be saved using python pickle module.
    Returns: 
        All data input files loaded (as well the training/test data).
    """
    # Load raw data from files (see Data/ file for details).
    names = ["x", "y", "tx", "ty", "allx", "ally", "graph"]
    objects = []
    for name in names:
        with open(data_dir + "ind.citeseer." + name, "rb") as f:
            objects.append(pkl.load(f, encoding="latin1"))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_index_reorder = parse_index_file(data_dir + "ind.citeseer.test.index") # unordered indices.    
    test_index_range = np.sort(test_index_reorder) # ordered indices.
    # Fix citeseer data by assigning 0 vectors to isolated notes.
    test_index_range_full = range(min(test_index_reorder), max(test_index_reorder)+1)
    tx_extended = sp.lil_matrix((len(test_index_range_full), x.shape[1])) # linked list based matrix.
    tx_extended[test_index_range - min(test_index_range), :] = tx
    tx = tx_extended
    ty_extended = np.zeros((len(test_index_range_full), y.shape[1]))
    ty_extended[test_index_range - min(test_index_range), :] = ty
    ty = ty_extended
    # Load features
    features = sp.vstack((allx, tx)).tolil() # to linked list based matrix.
    features[test_index_reorder, :] = features[test_index_range, :] # sort the test nodes related rows.
    # Load adjacent matrix
    adj_matrix = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) # number_nodes x number_nodes shape.
    # Load labels
    labels = np.vstack((ally, ty)) # number_nodes x number_classes.
    labels[test_index_reorder, :] = labels[test_index_range, :] # sort the test nodes related rows.
    # Make mask for row selection
    indices_test = test_index_range.tolist()
    indices_train = range(len(y))
    indices_valid = range(len(y), len(y)+500) # select the first 500 nodes after the labeled nodes as validation.
    train_mask = sample_mask(indices_train, labels.shape[0])
    valid_mask = sample_mask(indices_valid, labels.shape[0])
    test_mask = sample_mask(indices_test, labels.shape[0])
    # Make containers for predictions
    y_train = np.zeros(labels.shape)
    y_valid = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_valid[valid_mask, :] = labels[valid_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]
    
    return adj_matrix, features, y_train, y_valid, y_test, train_mask, valid_mask, test_mask 
    

In [3]:
data_dir = "/home/jacobsuwang/Documents/UTA/Fall2018/LIN389C/GCN/Data/"

In [4]:
adj_matrix, features, y_train, y_valid, y_test, train_mask, valid_mask, test_mask = load_citeseer(data_dir)

In [5]:
def print_info(objects, object_names):
    for obj,obj_name in zip(objects,object_names):
        print(obj_name)
        print(type(obj), obj.shape)

objects = [adj_matrix, features, y_train, y_valid, y_test, train_mask, valid_mask, test_mask]
object_names = ["adj_matrix", "features", "y_train", "y_valid", "y_test", "train_mask", "valid_mask", "test_mask"]
print_info(objects, object_names)

adj_matrix
<class 'scipy.sparse.csr.csr_matrix'> (3327, 3327)
features
<class 'scipy.sparse.lil.lil_matrix'> (3327, 3703)
y_train
<class 'numpy.ndarray'> (3327, 6)
y_valid
<class 'numpy.ndarray'> (3327, 6)
y_test
<class 'numpy.ndarray'> (3327, 6)
train_mask
<class 'numpy.ndarray'> (3327,)
valid_mask
<class 'numpy.ndarray'> (3327,)
test_mask
<class 'numpy.ndarray'> (3327,)


In [6]:
# sample_x = pkl.load(open(data_dir+"ind.citeseer.x", "rb"), encoding="latin1")
# sample_tx = pkl.load(open(data_dir+"ind.citeseer.tx", "rb"), encoding="latin1")
# sample_allx = pkl.load(open(data_dir+"ind.citeseer.allx", "rb"), encoding="latin1")

### Input Formatting

In [7]:
def sparse_to_tuple(sparse_matrix):
    """Decompose a sparse matrix into its coordinates, values and shape.
    
    Args:
        sparse_matrix: a scipy.sparse.csr_matrix object.
    Returns:
        coords: a <number_values, 2> array (2: <x,y> coordinates).
        values: the values that fill the coordinates.
        shape: the shape of the matrix.
    Example:
        >> row  = np.array([0, 3, 1, 0])
        >> col  = np.array([0, 3, 1, 2])
        >> data = np.array([4, 5, 7, 9])
        >> csr_sample = sp.csr_matrix((data, (row, col)), shape=(4, 4))
        >> csr_sample.toarray()
        array([[4, 0, 9, 0],
               [0, 7, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 5]], dtype=int64)
        >> sparse_to_tuple(csr_sample)
        (array([[0, 0],
                [0, 2],
                [1, 1],
                [3, 3]], dtype=int32), array([4, 9, 7, 5], dtype=int64), (4, 4))
    """
    def to_tuple(matrix):
        if not sp.isspmatrix_coo(matrix):
            matrix = matrix.tocoo()
        coords = np.vstack((matrix.row, matrix.col)).transpose()
        values = matrix.data
        shape = matrix.shape
        return coords, values, shape
    if isinstance(sparse_matrix, list): # if is a list of sparse matrices.
        for i in range(len(sparse_matrix)):
            sparse_matrix[i] = to_tuple(sparse_matrix[i])
    else:
        sparse_matrix = to_tuple(sparse_matrix)
    return sparse_matrix
            
def to_A_tilde(A):
    """Add self-connection (Kipf & Welling, 2017, section 2)."""
    return A + sp.eye(A.shape[0])

def to_A_hat(A_tilde):
    """Normalize adjacent matrix (with self-connections), (Kipf & Welling, 2017, section 3.1)."""
    A_tilde = sp.coo_matrix(A_tilde)
    rowsum = np.array(A_tilde.sum(axis=1)) # compute diagonal values for D.
    D_inv_sqrt = np.power(rowsum, -0.5).flatten() # compute inverse square root of D.
    D_inv_sqrt[np.isinf(D_inv_sqrt)] = 0. # convert inf values to 0.
    D_inv_sqrt = sp.diags(D_inv_sqrt) # convert vectorized diagonal values to a diagonal matrix.
    return A_tilde.dot(D_inv_sqrt).transpose().dot(D_inv_sqrt).tocoo() # formula for A_hat.



In [154]:
import torch
from torch.autograd import Variable
import torch.nn as nn

CUDA = torch.cuda.is_available()

class GraphConvolution(nn.Module):
    """Single graph convolution layer."""
    
    def __init__(self, in_features, hidden_size):
        super(GraphConvolution, self).__init__()
        self.linear = nn.Linear(in_features, hidden_size)
        
    def forward(self, A_hat, X):
        return self.linear(A_hat.mm(X))

class GCN(nn.Module):
    """Graph Convolutional Network (Kipf & Welling, 2017, Eq.9)."""
    
    def __init__(self, in_features, hidden_size_1, hidden_size_2):
        super(GCN, self).__init__()
        self.layer_1 = GraphConvolution(in_features, hidden_size_1)
        self.layer_2 = GraphConvolution(hidden_size_1, hidden_size_2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)
    
    def forward(self, A_hat, X):
        out = self.layer_1(A_hat, X)
        out = self.relu(out)
        out = self.layer_2(A_hat, out)
        out = self.softmax(out)
        return out

def to_categorical(one_hot):
    number_classes = one_hot.shape[1]+1
    categorical = []
    for row in one_hot:
        index = np.where(row==1)[0]
        if len(index) == 0:
            categorical.append([number_classes-1])
        else:
            categorical.append([index[0]])
    return np.array(categorical)

    
def to_tensor(raw_inputs, tensor_type=torch.FloatTensor):
    """numpy ndarray or sparse matrix to torch tensor."""
    if type(raw_inputs) != np.ndarray:
        raw_inputs = raw_inputs.toarray()
    tensor = Variable(torch.Tensor(raw_inputs).type(tensor_type))
    if CUDA:
        return tensor.cuda()
    return tensor

TRAIN_FROM, TRAIN_TO = 0, 120
VALID_FROM, VALID_TO = 120, 620
TEST_FROM, TEST_TO = 2312, 3327
    
def train_gcn(adj_matrix, features, 
              y_train, y_valid, y_test,
              train_mask, valid_mask, test_mask,
              number_iterations=100, print_every=5):
    
    A_tilde = to_A_tilde(adj_matrix)
    A_hat = to_tensor(to_A_hat(A_tilde))
    X = to_tensor(features)
    
    in_features = X.shape[1]
    hidden_size_1 = 50
    hidden_size_2 = y_train.shape[1]+1 # one more for all-0 rows.    
    
    y_train = to_tensor(to_categorical(y_train * train_mask[:, np.newaxis]).squeeze())
    y_valid = to_tensor(to_categorical(y_valid * valid_mask[:, np.newaxis]).squeeze())
    y_test = to_tensor(to_categorical(y_test * test_mask[:, np.newaxis]).squeeze())
    number_train = train_mask.sum()
    number_valid = valid_mask.sum()
    number_test = test_mask.sum()
    
    gcn = GCN(in_features, hidden_size_1, hidden_size_2)
    if CUDA:
        gcn = gcn.cuda()
        
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(gcn.parameters(), lr=1e-4)
    
    for i in range(number_iterations):
        out = gcn(A_hat, X)
        # NB: arg1 has type .FloatTensor, arg2 has type .LongTensor.
        train_loss = criterion(out, y_train.type(torch.cuda.LongTensor))
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        if i != 0 and i % print_every == 0:
            print("Iteration " + str(i) + ":\n")
            print("Train loss =", train_loss.cpu().data.numpy()[0], "(at step "+str(i)+")")
            _, predictions = torch.max(out.data, 1)
            number_correct_train = predictions[TRAIN_FROM:TRAIN_TO] \
                .eq(y_train[TRAIN_FROM:TRAIN_TO].type(torch.cuda.LongTensor).data).sum()
            number_correct_valid = predictions[VALID_FROM:VALID_TO] \
                .eq(y_valid[VALID_FROM:VALID_TO].type(torch.cuda.LongTensor).data).sum()
            number_correct_test = predictions[TEST_FROM:TEST_TO] \
                .eq(y_test[TEST_FROM:TEST_TO].type(torch.cuda.LongTensor).data).sum()      
            accuracy_train = number_correct_train / number_train
            accuracy_valid = number_correct_valid / number_valid
            accuracy_test = number_correct_test / number_test
            print("Train/Valid/Test accuracy: %.4f | %.4f | %.4f\n" % (accuracy_train,
                                                                       accuracy_valid,
                                                                       accuracy_test))

In [155]:
train_gcn(adj_matrix, features, y_train, y_valid, y_test, train_mask, valid_mask, test_mask,
          number_iterations=100, print_every=5)

Iteration 5:

Train loss = 1.9540343 (at step 5)
Train/Valid/Test accuracy: 0.2500 | 0.2260 | 0.2290

Iteration 10:

Train loss = 1.9502562 (at step 10)
Train/Valid/Test accuracy: 0.2500 | 0.2200 | 0.2240

Iteration 15:

Train loss = 1.9460775 (at step 15)
Train/Valid/Test accuracy: 0.2417 | 0.2100 | 0.2170

Iteration 20:

Train loss = 1.9414792 (at step 20)
Train/Valid/Test accuracy: 0.2083 | 0.2040 | 0.2120

Iteration 25:

Train loss = 1.936476 (at step 25)
Train/Valid/Test accuracy: 0.1583 | 0.1800 | 0.1700

Iteration 30:

Train loss = 1.9310786 (at step 30)
Train/Valid/Test accuracy: 0.0750 | 0.1120 | 0.1100

Iteration 35:

Train loss = 1.9252805 (at step 35)
Train/Valid/Test accuracy: 0.0333 | 0.0520 | 0.0340

Iteration 40:

Train loss = 1.9190581 (at step 40)
Train/Valid/Test accuracy: 0.0000 | 0.0120 | 0.0100

Iteration 45:

Train loss = 1.9123733 (at step 45)
Train/Valid/Test accuracy: 0.0000 | 0.0040 | 0.0020

Iteration 50:

Train loss = 1.9051807 (at step 50)
Train/Valid/Test

In [164]:
torch.__version__

'0.3.1'

In [None]:
# adj_matrix
# <class 'scipy.sparse.csr.csr_matrix'> (3327, 3327)
# features
# <class 'scipy.sparse.lil.lil_matrix'> (3327, 3703)
# y_train
# <class 'numpy.ndarray'> (3327, 6)
# y_valid
# <class 'numpy.ndarray'> (3327, 6)
# y_test
# <class 'numpy.ndarray'> (3327, 6)
# train_mask
# <class 'numpy.ndarray'> (3327,)
# valid_mask
# <class 'numpy.ndarray'> (3327,)
# test_mask
# <class 'numpy.ndarray'> (3327,)