# Graph Neural Networks with PyG on Node Classification, Link Prediction, and Anomaly Detection

In this notebook, we will review PyG code implementations on major graph problems including node classification, link prediction, and anomaly detection.

In [None]:
%%capture
import torch
version = torch.__version__
i = version.find('+')
version = version[:i-1] + '0' + version[i:]
url = 'https://data.pyg.org/whl/torch-' + version + '.html'

!pip install torch-scatter -f $url
!pip install torch-sparse -f $url
!pip install torch-geometric
!pip install torch-cluster -f $url
!pip install pygod
!pip install --upgrade scipy

In [None]:
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx
import torch_geometric.transforms as T

import matplotlib.pyplot as plt
import networkx as nx
import random
import numpy as np
from collections import Counter

In [None]:
def make_deterministic(random_seed = 123):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        random_seed: Integer seed applied to all generators.
    """
    # Python's built-in generator is seeded too, so that any code drawing from
    # the `random` module is reproducible as well.
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    # manual_seed_all covers every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

make_deterministic()

# 1. Loading Data
We use the [Cora dataset](https://paperswithcode.com/dataset/cora) for the following GCN implementations. Cora is a paper citation network consisting of 2,708 scientific publications. Each node in the graph represents a publication, and two nodes are connected by an edge if one paper cites the other.

Throughout this notebook, we use [PyG (PyTorch Geometric)](https://www.pyg.org/), one of the popular GNN libraries, to implement GCN. The Cora dataset can also be loaded using a PyG module.

In [None]:
def show_graph_stats(graph):
    """Print the node count, node-feature count and edge count of ``graph``.

    The counts are read from the shapes of ``graph.x`` (rows = nodes,
    columns = features) and ``graph.edge_index`` (columns = edges).
    """
    # Each entry pairs a label with a getter; getters run one at a time so the
    # values are read and printed in the listed order.
    stat_getters = (
        ("Number of nodes", lambda g: g.x.shape[0]),
        ("Number of node features", lambda g: g.x.shape[1]),
        ("Number of edges", lambda g: g.edge_index.shape[1]),
    )
    for label, read_value in stat_getters:
        print(f"{label}: {read_value(graph)}")


In [None]:
from torch_geometric.nn import RGCNConv, GCNConv
import torch.nn.functional as F
class Net(torch.nn.Module):
    """Two-layer GCN encoder paired with a dot-product link decoder."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        """Return node embeddings: first convolution, ReLU, second convolution."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def decode(self, z, edge_label_index):
        """Score each candidate edge by the dot product of its two endpoint embeddings."""
        first_nodes, second_nodes = edge_label_index[0], edge_label_index[1]
        return torch.sum(z[first_nodes] * z[second_nodes], dim=-1)

    def decode_all(self, z):
        """Return the index pairs (one per column) of all node pairs with a positive score."""
        pair_scores = torch.matmul(z, z.t())
        return torch.nonzero(pair_scores > 0, as_tuple=False).t()

class RGCN(torch.nn.Module):
    """Two-layer relational GCN encoder paired with a dot-product link decoder."""

    def __init__(self, in_channels, hidden_channels, out_channels, num_relations):
        super().__init__()
        # Both layers share the same basis count: one per relation, capped at 10.
        basis_count = min(num_relations, 10)
        self.conv1 = RGCNConv(in_channels, hidden_channels, num_relations, num_bases=basis_count)
        self.conv2 = RGCNConv(hidden_channels, out_channels, num_relations, num_bases=basis_count)

    def encode(self, x, edge_index, edge_type):
        """Return node embeddings: first convolution, ReLU, second convolution."""
        hidden = torch.relu(self.conv1(x, edge_index, edge_type))
        return self.conv2(hidden, edge_index, edge_type)

    def decode(self, z, edge_label_index):
        """Score each candidate edge by the dot product of its two endpoint embeddings."""
        first_nodes, second_nodes = edge_label_index[0], edge_label_index[1]
        return torch.sum(z[first_nodes] * z[second_nodes], dim=-1)

    def decode_all(self, z):
        """Return the index pairs (one per column) of all node pairs with a positive score."""
        pair_scores = torch.matmul(z, z.t())
        return torch.nonzero(pair_scores > 0, as_tuple=False).t()

In [None]:
from sklearn.metrics import roc_auc_score
from torch_geometric.utils import negative_sampling


def train_link_predictor(
    model, train_data, val_data, optimizer, criterion, n_epochs=1000, patience=50
):
    """Train a link predictor with early stopping on validation AUC.

    Each epoch encodes the training graph, draws fresh negative edges, scores
    positives and negatives with ``model.decode`` and takes one optimizer step
    on ``criterion``. Training stops once the validation AUC has not improved
    for ``patience`` consecutive epochs. Before returning, the weights of the
    best-scoring epoch are loaded back into ``model``, so the caller does not
    receive a model that has drifted past its best validation score.

    Args:
        model: Model exposing ``encode`` and ``decode``. ``RGCN`` instances are
            additionally given ``train_data.edge_types``.
        train_data: Training split providing ``x``, ``edge_index``,
            ``edge_label_index``, ``edge_label`` and ``num_nodes``.
        val_data: Validation split, passed to ``eval_link_predictor``.
        optimizer: Optimizer over the parameters of ``model``.
        criterion: Loss called as ``criterion(scores, labels)``.
        n_epochs: Maximum number of training epochs.
        patience: Number of consecutive non-improving epochs tolerated.

    Returns:
        The same ``model`` object, holding the weights of the epoch with the
        highest validation AUC (or its current weights if no epoch improved).
    """
    import copy  # local import: only needed to snapshot the best weights

    best_val_auc = 0  # Best observed validation AUC
    best_state = None  # Copy of the weights that produced best_val_auc
    epochs_without_improvement = 0  # Counter for epochs without improvement

    for epoch in range(1, n_epochs + 1):
        # Train the model
        model.train()
        optimizer.zero_grad()

        # Pass edge_type to the model's encode method if RGCN model
        if isinstance(model, RGCN):
            z = model.encode(x=train_data.x, edge_index=train_data.edge_index, edge_type=train_data.edge_types)
        else:
            z = model.encode(x=train_data.x, edge_index=train_data.edge_index)

        # Sample as many negatives as there are positive supervision edges
        neg_edge_index = negative_sampling(
            edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
            num_neg_samples=train_data.edge_label_index.size(1), method='sparse')

        # Positives keep their labels; the sampled negatives are labelled 0
        edge_label_index = torch.cat([train_data.edge_label_index, neg_edge_index], dim=-1)
        edge_label = torch.cat([
            train_data.edge_label,
            train_data.edge_label.new_zeros(neg_edge_index.size(1))
        ], dim=0)

        # Compute loss
        out = model.decode(z, edge_label_index).view(-1)
        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()

        # Compute validation AUC
        val_auc = eval_link_predictor(model, val_data)

        # Check for improvement
        if val_auc > best_val_auc:
            best_val_auc = val_auc  # Update best validation AUC
            # state_dict() returns references to the live tensors, so a deep
            # copy is needed to keep this epoch's weights.
            best_state = copy.deepcopy(model.state_dict())
            epochs_without_improvement = 0  # Reset counter
        else:
            epochs_without_improvement += 1  # Increment counter

        if epoch % 10 == 0:
            print(f"Epoch: {epoch:03d}, Train Loss: {loss:.3f}, Val AUC: {val_auc:.3f}")

        # Check for early stopping
        if epochs_without_improvement >= patience:
            print(f"No improvement in validation AUC for {patience} epochs, stopping")
            break

    # Roll back to the best checkpoint instead of returning the last epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

    return model



@torch.no_grad()
def eval_link_predictor(model, data):
    """Return the ROC AUC of the model's link scores on ``data``.

    The model is switched to eval mode, ``data`` is encoded (``RGCN`` models
    also receive ``data.edge_types``), and the edges in
    ``data.edge_label_index`` are scored, passed through a sigmoid and compared
    with ``data.edge_label``.
    """
    model.eval()

    # Relational models take the edge types as a third encoder argument.
    encoder_inputs = [data.x, data.edge_index]
    if isinstance(model, RGCN):
        encoder_inputs.append(data.edge_types)
    embeddings = model.encode(*encoder_inputs)

    logits = model.decode(embeddings, data.edge_label_index).view(-1)
    probabilities = torch.sigmoid(logits)

    labels = data.edge_label.cpu().numpy()
    return roc_auc_score(labels, probabilities.cpu().numpy())

In [None]:
import pickle
from google.colab import drive
import glob
import os

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The input graphs are read from Google Drive, which must be mounted first.
drive.mount('/content/drive')

# glob returns its matches in arbitrary order; sorting fixes the order in which
# the graphs are processed so that runs are comparable.
pathnames = sorted(glob.glob('/content/drive/MyDrive/full_graphs/paper_update/*.pth'))



Mounted at /content/drive


In [None]:
# Settings shared by every graph processed below.
use_relation_types = False  # True trains the relational RGCN encoder instead of the plain GCN
embedded_dir = '/content/drive/MyDrive/embedded/'
# Create the output folder up front; otherwise a missing folder would only
# surface at torch.save, after a full training run.
os.makedirs(embedded_dir, exist_ok=True)

# The link split is configured once; the same settings apply to every graph.
split = T.RandomLinkSplit(
    num_val=0.05,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=False,
    neg_sampling_ratio=1.0,
)

for path in pathnames:
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code -- only load graphs from a trusted location.
    graph = torch.load(path)
    show_graph_stats(graph)
    graph = graph.to(device)

    train_data, val_data, test_data = split(graph)

    if use_relation_types:
        num_relations = len(graph.edge_types.unique())
        model = RGCN(graph.x.shape[1], 128, 64, num_relations).to(device)
    else:
        model = Net(graph.x.shape[1], 128, 64).to(device)

    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)
    criterion = torch.nn.BCEWithLogitsLoss()

    model = train_link_predictor(model, train_data, val_data, optimizer, criterion)

    test_auc = eval_link_predictor(model, test_data)
    print(f"Test: {test_auc:.3f}")

    # Embed every node using the full edge set of the graph. The relational
    # encoder needs the edge types as a third argument.
    with torch.no_grad():  # we don't need gradients for this operation
        if use_relation_types:
            embeddings = model.encode(graph.x, graph.edge_index, graph.edge_types)
        else:
            embeddings = model.encode(graph.x, graph.edge_index)

    # Save the graph with the learned embeddings as its node features.
    graph.x = embeddings
    # splitext removes only the trailing extension, leaving any other '.pth'
    # inside the file name untouched.
    graph_name = os.path.splitext(os.path.basename(path))[0] + "_embedded.pth"
    torch.save(graph, os.path.join(embedded_dir, graph_name))

Number of nodes: 6449
Number of node features: 5
Number of edges: 8646
Epoch: 010, Train Loss: 0.477, Val AUC: 0.896
Epoch: 020, Train Loss: 0.477, Val AUC: 0.897
Epoch: 030, Train Loss: 0.475, Val AUC: 0.897
Epoch: 040, Train Loss: 0.476, Val AUC: 0.897
Epoch: 050, Train Loss: 0.478, Val AUC: 0.897
Epoch: 060, Train Loss: 0.477, Val AUC: 0.897
Epoch: 070, Train Loss: 0.474, Val AUC: 0.897
Epoch: 080, Train Loss: 0.471, Val AUC: 0.897
No improvement in validation AUC for 50 epochs, stopping
Test: 0.886
Number of nodes: 6918
Number of node features: 5
Number of edges: 9282
Epoch: 010, Train Loss: 0.480, Val AUC: 0.899
Epoch: 020, Train Loss: 0.479, Val AUC: 0.899
Epoch: 030, Train Loss: 0.479, Val AUC: 0.898
Epoch: 040, Train Loss: 0.476, Val AUC: 0.898
Epoch: 050, Train Loss: 0.476, Val AUC: 0.897
Epoch: 060, Train Loss: 0.478, Val AUC: 0.897
No improvement in validation AUC for 50 epochs, stopping
Test: 0.896
