# Collaboration Network Node Classification

In [5]:
# NOTE(review): machine-specific absolute path; presumably required so that the
# project-local `kuzudb` package imported below can be found — confirm, and
# consider making the path configurable.
%cd /home/joel/repos/cscollab/backend

/home/joel/repos/cscollab/backend


In [11]:
import torch
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
from sklearn.decomposition import TruncatedSVD
import kuzudb.query_kuzu as query
import collections
import torch
import torch_geometric
import torch_geometric.transforms as T
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import numpy as np
import functools


# Plot styling: matplotlib defaults overlaid with seaborn's white grid.
plt.style.use('default')
sns.set_style("whitegrid")

# Pick the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Running on', device)

Running on cuda


## Generate Dataset

First we create some mappings for the kuzudb data:
a mapping from each sub-area id to its area id, and a mapping from each author to their affiliated institution.
Furthermore, each research area and sub-area is one-hot encoded and stored in a dictionary.

In [21]:

""" mapping of sub areas to areas"""
area_mapping = query.get_area_mapping()
subarea_mapping = {
    sub_area: area
    for sub_area, area in zip(area_mapping["sub-area-id"], area_mapping["area-id"])
}

# One encoder instance serves both areas and sub-areas; it is re-fitted for each.
onehot_encoder = OneHotEncoder(sparse_output=False)

# Areas: one float tensor per area id, in order of first appearance in the table.
area_ids = list(area_mapping["area-id"].unique())
n_area = len(area_ids)
onehot_encoded_areas = onehot_encoder.fit_transform(np.arange(n_area).reshape(n_area, 1))
tensor_list_ohe_areas = [torch.tensor(row, dtype=torch.float) for row in onehot_encoded_areas]
area_ohe_mapping = {area: vector for area, vector in zip(area_ids, tensor_list_ohe_areas)}
print(area_ohe_mapping)

# Sub-areas: same scheme, but the rows stay numpy arrays (not tensors) so that
# they can be scaled and summed with numpy later on.
sub_area_ids = area_mapping["sub-area-id"].unique()
n_sub_area = len(sub_area_ids)
onehot_encoded_sub_areas = onehot_encoder.fit_transform(np.arange(n_sub_area).reshape(n_sub_area, 1))
sub_area_ohe_mapping = {
    sub_area: row for sub_area, row in zip(sub_area_ids, onehot_encoded_sub_areas)
}

# Author pid -> affiliated institution.
csauthors_all = query.get_csauthors()
author_inst_map = {
    pid: institution
    for pid, institution in zip(csauthors_all["pid"], csauthors_all["institution"])
}

{'systems': tensor([1., 0., 0., 0.]), 'ai': tensor([0., 1., 0., 0.]), 'theory': tensor([0., 0., 1., 0.]), 'interdiscip': tensor([0., 0., 0., 1.])}


Given a collaboration network, we want to know for each node (author or institution) how frequently it publishes in the different areas and sub-areas, and in which area and sub-area it has the most publications. For this we define a function that counts these frequencies and a function that builds the collaboration network based on a config.

In [16]:
""" count the frequency of sub/areas for a node"""
def area_frequency_counter(collab, node):
    """Tally the areas and sub-areas of all collaborations that involve ``node``.

    Only rows of ``collab`` where ``node`` appears in column ``a`` or ``b`` are
    considered.

    Returns a dict with the keys ``area_freq`` and ``sub_area_freq`` (value ->
    number of rows) and ``top_area`` / ``top_sub_area`` (the most frequent
    value; on ties the one encountered first wins).

    Raises ValueError when ``node`` does not occur in ``collab``.
    """
    involves_node = (collab["a"] == node) | (collab["b"] == node)
    node_rows = collab[involves_node]

    def tally(column):
        # frequency table of one column plus its most frequent entry
        counts = dict(collections.Counter(node_rows[column]))
        return counts, max(counts, key=counts.get)

    area_counts, best_area = tally("rec_area")
    sub_area_counts, best_sub_area = tally("rec_sub_area")

    return {
        "area_freq": area_counts,
        "top_area": best_area,
        "sub_area_freq": sub_area_counts,
        "top_sub_area": best_sub_area,
    }


def get_collab_data(config):
    """Build an index-based description of the collaboration network for ``config``.

    The collaboration records are fetched with ``query.get_collaboration`` and
    annotated with the parent area of each record's sub-area. When
    ``config["institution"]`` is truthy, both endpoints of every record are
    replaced by the institution of the respective author.

    Parameters:
        config: dict passed on to the ``query`` helpers; only the optional
            ``"institution"`` key is read here.

    Returns:
        dict with
        ``"nodes"``   - list of integer node ids ``0..n-1``,
        ``"edges"``   - list of ``[a, b]`` pairs of node ids,
        ``"weights"`` - the ``weight`` column of the weighted collaboration table,
        ``"freq"``    - node id -> result of ``area_frequency_counter``.

    Raises:
        KeyError: for a sub-area, author or weighted-edge endpoint that is
            missing from the respective mapping.
    """
    collab = query.get_collaboration(config)
    # lift every record's sub-area to its parent area
    collab["rec_area"] = [subarea_mapping[sub_area] for sub_area in collab["rec_sub_area"]]

    if config.get("institution", False):
        # institution-level network: swap author ids for their institution
        collab["a"] = [author_inst_map[pid] for pid in collab["a"].values]
        collab["b"] = [author_inst_map[pid] for pid in collab["b"].values]

    # Map each institution or author to an int based on its position.
    # De-duplicate in order of first appearance instead of going through a set:
    # the iteration order of a set of strings changes between interpreter runs
    # (hash randomization), which made node ids and everything derived from
    # them (features, "first node" printouts) irreproducible.
    nodes = list(dict.fromkeys([*collab["a"], *collab["b"]]))
    node_idx = list(range(len(nodes)))
    node_idx_mapping = dict(zip(nodes, node_idx))

    frequency_map = {
        idx: area_frequency_counter(collab, node)
        for idx, node in zip(node_idx, nodes)
    }

    # NOTE(review): presumably the weighted table uses the same node names as
    # `collab` after the institution mapping above — confirm against the query.
    collab_weighted = query.get_weighted_collab(config)
    collab_weighted["a"] = [node_idx_mapping[name] for name in collab_weighted["a"]]
    collab_weighted["b"] = [node_idx_mapping[name] for name in collab_weighted["b"]]

    adjacency_list = [list(pair) for pair in zip(collab_weighted["a"], collab_weighted["b"])]
    weights = collab_weighted["weight"].values

    data = {"nodes": node_idx,
            "edges": adjacency_list,
            "weights": weights,
            "freq": frequency_map}
    return data

We define a config and check the collaboration network.

In [17]:
"""define config as the collabortaion between institutions among Austria, Germany and Switzerland later than 2015"""
# Network selection: institution-level collaborations, region "dach",
# restricted via from_year=2015.
config = {
    "from_year": 2015,
    "region_id": "dach",
    "strict_boundary": True,
    "institution": True,
}

collab_data = get_collab_data(config)
print()
# Peek at the first entries of each part of the graph description.
print(f'Number of first 10 nodes: {collab_data["nodes"][:10]}')
print(f'Number of first 10 edges: {collab_data["edges"][:10]}')
print(f'Number of first 10 weights: {collab_data["weights"][:10]}')
print(f'Frequency mapping of the first node: {collab_data["freq"][0]}')


Number of first 10 nodes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Number of first 10 edges: [[59, 59], [78, 59], [77, 77], [1, 1], [13, 13], [5, 5], [66, 66], [38, 5], [59, 5], [12, 32]]
Number of first 10 weights: [88 81 76 74 73 69 53 48 42 38]
Frequency mapping of the first node: {'area_freq': {'interdiscip': 1, 'ai': 3, 'systems': 1}, 'top_area': 'ai', 'sub_area_freq': {'robotics': 1, 'ml': 3, 'networks': 1}, 'top_sub_area': 'ml'}


Next we create the torch data object:

* y is the one-hot encoding of the research area with the most publications
* x is the share of a node's publications in each sub-area, computed from the one-hot encodings of the sub-areas (concatenated with a one-hot encoding of the node itself)

In [48]:

"""get target label as the area with the most records published"""
def get_y(nodes, freq):
    """Build the target matrix: one row per node, holding the one-hot vector
    of the node's most frequent area (``freq[node]["top_area"]``).

    Returns a float tensor with one row per entry of ``nodes``.
    """
    target_rows = []
    for node in nodes:
        top_area = freq[node]["top_area"]
        # look up the one-hot tensor of that area and unpack it into a plain list
        target_rows.append(list(area_ohe_mapping[top_area]))
    return torch.tensor(target_rows, dtype=torch.float)

def sub_area_frequency(freq,n):
    """Return the relative sub-area frequencies of node ``n`` as a numpy vector.

    Every sub-area's one-hot vector is scaled by how often the node published
    in it; the sum of these vectors is divided by the node's total count, so
    the entries add up to 1.
    """
    counts = freq[n]["sub_area_freq"]

    # accumulate count * one-hot, starting from an all-zero vector
    weighted_sum = np.zeros(n_sub_area)
    for sub_area, count in counts.items():
        weighted_sum = weighted_sum + count * sub_area_ohe_mapping[sub_area]

    # turn absolute counts into shares
    return weighted_sum / sum(counts.values())

"""the percentage of published subareas"""
def get_x(nodes,freq):
    """Stack the relative sub-area frequencies of all ``nodes`` into a float
    tensor with one row per node."""
    rows = [sub_area_frequency(freq, node) for node in nodes]
    return torch.tensor(np.array(rows), dtype=torch.float)

def gen_torch_data(nodes,edges, freq, use_x=True ,weights=None):
    """Assemble the torch_geometric ``Data`` object for the collaboration graph.

    Parameters:
        nodes: node ids; used as keys into ``freq`` and to size the identity
            features.
        edges: list of ``[a, b]`` node-id pairs.
        freq: per-node frequency information as consumed by ``get_y`` / ``get_x``.
        use_x: if True the features are the sub-area shares concatenated with a
            one-hot node identity, otherwise the node identity alone.
        weights: optional edge weights (see the review note in the body).

    Returns:
        The undirected graph with added self-loops, row-normalised features and
        ``train_mask`` / ``val_mask`` / ``test_mask`` from a random node split
        (30 % validation, no test nodes, remaining nodes for training).
    """
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

    y = get_y(nodes, freq)

    # one-hot identity of every node
    ohe = torch.eye(len(nodes))
    x = torch.cat((get_x(nodes, freq), ohe), dim=1) if use_x else ohe

    if weights is not None:
        # NOTE(review): the weights are converted but never attached to the
        # Data object below, so passing them currently has no effect on the
        # result — confirm whether they should become an edge attribute.
        weights = torch.tensor(weights, dtype=torch.long)

    # create torch data object without weights
    data = torch_geometric.data.Data(x=x, y=y, edge_index=edge_index, num_nodes=len(nodes))
    data = T.ToUndirected()(data)  # the collaboration network is undirected
    # self-loops let a node's own features take part in the neighbourhood aggregation
    # NOTE(review): the edge list may already contain [i, i] pairs; confirm
    # whether AddSelfLoops duplicates those loops.
    data = T.AddSelfLoops()(data)
    data = T.NormalizeFeatures()(data)  # features of each node will sum up to 1

    # Train/validation split. Keep the transform's return value: depending on
    # the torch_geometric version a transform may operate on a shallow copy,
    # in which case the masks never reach the object it was called with.
    split = T.RandomNodeSplit(split='train_rest', num_val=0.3, num_test=0)
    data = split(data)
    return data

def collab_to_torch(config, weighted=False, use_x =True):
    """Fetch the collaboration network for ``config`` and convert it into a
    torch_geometric data object.

    ``weighted`` decides whether the edge weights are handed on to
    ``gen_torch_data``; ``use_x`` is passed through unchanged.
    """
    graph = get_collab_data(config)
    return gen_torch_data(
        graph["nodes"],
        graph["edges"],
        graph["freq"],
        use_x=use_x,
        weights=graph["weights"] if weighted else None,
    )

In [49]:
"""torch data object"""
data = collab_to_torch(config)
print(data)
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Number of features: {data.num_features}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

# Take a closer look at the features of the first node.
print(f'Features of first node: {data.x[0]}')

# `collab_data` is the graph description computed in the config cell above.
first_node_sub_freq = collab_data["freq"][0]["sub_area_freq"]
print(f'Sub area frequency of the first node: {first_node_sub_freq}')

# Recompute the un-normalised feature sum from the node's own frequency map
# instead of hard-coding sub-area names, so that the check stays valid
# whichever node ends up at index 0.
first_node_ohe_sum = sum(
    (count * sub_area_ohe_mapping[sub_area] for sub_area, count in first_node_sub_freq.items()),
    np.zeros(n_sub_area),
)
print(f'Sum of ohe of the subfrquencies of the first node: {first_node_ohe_sum}')

Data(x=[81, 104], edge_index=[2, 1129], y=[81, 4], num_nodes=81, train_mask=[81], val_mask=[81], test_mask=[81])
Number of nodes: 81
Number of edges: 1129
Number of features: 104
Number of training nodes: 57
Has isolated nodes: False
Has self-loops: True
Is undirected: True
Features of first node: tensor([0.0000, 0.3000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.1000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.1000, 0.5000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.000

## Implement GNN

In [40]:
class GraphConvolution(torch_geometric.nn.MessagePassing):
    """Graph convolution layer in the style of (Kipf, Welling 2017).

    Node features are linearly transformed, scaled per edge by the symmetric
    degree normalisation and summed over the incoming edges of each node.
    """

    def __init__(self, in_ch, out_ch):
        # messages arriving at a node are combined by summation
        super().__init__(aggr='add')

        # linear map that turns node features into the messages sent to neighbors
        self.linear = torch.nn.Linear(in_ch, out_ch)

    def forward(self, x, edge_index):
        """
        Transform the features of all nodes and propagate them along the
        edges in edge_index, weighting each edge with
        D^{-0.5} A D^{-0.5} as in (Kipf, Welling 2017).
        """
        # linear transformation of the features of *all* nodes
        transformed = self.linear(x)

        # first row of edge_index holds the source, second row the target of each edge
        src, dst = edge_index

        # degree of every node, counted over the target column of the edges
        degree = torch_geometric.utils.degree(dst, transformed.size(0), dtype=transformed.dtype)
        inv_sqrt_degree = degree.pow(-0.5)
        # nodes without any incoming edge give inf here; use 0 for them instead
        inv_sqrt_degree.masked_fill_(inv_sqrt_degree == float('inf'), 0)

        # per-edge normalisation coefficient
        edge_norm = inv_sqrt_degree[src] * inv_sqrt_degree[dst]

        # propagate() internally calls message(), aggregate() and update();
        # the normalisation itself is applied inside message()
        return self.propagate(edge_index, x=transformed, norm=edge_norm)

    def message(self, x_j, norm):
        # x_j is the **lifted** tensor with the source-node features of each edge,
        # i.e. shape (m, out_ch) for m edges; norm is reshaped to (m, 1) so that
        # it scales every feature of an edge's message by the same coefficient
        return x_j * norm.view(-1, 1)



In [41]:
class GCN(torch.nn.Module):
    """Two stacked graph convolutions, each followed by a sigmoid.

    The input width is taken from ``data.num_node_features``; ``hidden_dim``
    and ``out_ch`` set the widths of the hidden and the output layer.
    """

    def __init__(self, data: torch_geometric.data.Data, out_ch, hidden_dim=16):
        super().__init__()

        # layer 1: input features -> hidden representation
        self.input_to_hidden = GraphConvolution(data.num_node_features, hidden_dim)

        # layer 2: hidden representation -> one score per output class
        self.hidden_to_output = GraphConvolution(hidden_dim, out_ch)

    def forward(self, x, edge_index):
        # hidden node representations with a sigmoid non-linearity
        hidden = torch.sigmoid(self.input_to_hidden(x, edge_index))

        # raw per-class scores for every node
        scores = self.hidden_to_output(hidden, edge_index)

        # squash every class score independently into (0, 1)
        return torch.sigmoid(scores)


In [53]:
# Hyperparameters of the training run.
epochs = 10000
lrn_rate = 0.1

# Four output channels (one per research area) and a four-dimensional hidden layer.
model = GCN(data, out_ch=4, hidden_dim=4)

# Plain stochastic gradient descent over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=lrn_rate)
print(model)

GCN(
  (input_to_hidden): GraphConvolution()
  (hidden_to_output): GraphConvolution()
)


Training of the model

Note: the loss does not converge to 0!

In [54]:
indices = np.arange(data.num_nodes)
# Only nodes flagged in the training mask take part in parameter updates, so
# select them once instead of testing the mask for every node in every epoch.
train_indices = indices[data.train_mask.cpu().numpy()]
losses = []
model.train()
for epoch in range(epochs):

    # summed loss over all training nodes of this epoch
    error = 0.0

    # visit the training nodes in a fresh random order every epoch
    np.random.shuffle(train_indices)
    for i in train_indices:

        # set gradients to zero
        optimizer.zero_grad()

        # compute loss function for training sample and backpropagate
        output = model(data.x, data.edge_index)
        loss = torch.nn.functional.binary_cross_entropy(output[i], data.y[i])
        loss.backward()

        # update parameters
        optimizer.step()

        # .item() returns a plain Python float on any device, whereas
        # .detach().numpy() raises for CUDA tensors
        error += loss.item()

    losses.append(error)

# plot evolution of loss function
plt.plot(range(epochs), losses)
plt.xlabel('epoch')
plt.ylabel('summed training loss');

In [52]:
# Per-node class scores of the model for the whole graph.
prediction = model(data.x, data.edge_index)
output = prediction

# A node counts as correct when the highest-scoring class is the class marked in
# its one-hot label; this is evaluated over all nodes, training nodes included.
true_prediction = (data.y.argmax(dim=1) == prediction.argmax(dim=1)).tolist()
accuracy = sum(true_prediction) / len(true_prediction)
print(prediction)

tensor([[0.4790, 0.4799, 0.0467, 0.1288],
        [0.4637, 0.4667, 0.0073, 0.0427],
        [0.4718, 0.4741, 0.0218, 0.0829],
        [0.4630, 0.4649, 0.0052, 0.0348],
        [0.4745, 0.4766, 0.0296, 0.0992],
        [0.4506, 0.4542, 0.0011, 0.0133],
        [0.4625, 0.4650, 0.0056, 0.0364],
        [0.4716, 0.4730, 0.0181, 0.0739],
        [0.4713, 0.4735, 0.0187, 0.0756],
        [0.4690, 0.4710, 0.0125, 0.0592],
        [0.4656, 0.4678, 0.0075, 0.0434],
        [0.4778, 0.4789, 0.0415, 0.1206],
        [0.4701, 0.4720, 0.0154, 0.0671],
        [0.4504, 0.4541, 0.0011, 0.0133],
        [0.4757, 0.4768, 0.0311, 0.1018],
        [0.4606, 0.4639, 0.0048, 0.0334],
        [0.4740, 0.4749, 0.0235, 0.0862],
        [0.4544, 0.4576, 0.0018, 0.0181],
        [0.4596, 0.4629, 0.0041, 0.0303],
        [0.4734, 0.4751, 0.0232, 0.0857],
        [0.4733, 0.4757, 0.0274, 0.0949],
        [0.4679, 0.4705, 0.0129, 0.0605],
        [0.4472, 0.4503, 0.0006, 0.0089],
        [0.4730, 0.4751, 0.0249, 0