<a href="https://colab.research.google.com/github/Hungpai/ML4G-Project-3/blob/main/TAGCN_Reproducibility_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Make sure to set the runtime to GPU in Colab.

In [None]:
# Ensure that the right PyG is installed and compatible with the recent update of colab!
import torch 

def format_pytorch_version(version):
  """Return `version` without its local build suffix.

  Everything from the first '+' onward is dropped, e.g. '1.10.0+cu111'
  becomes '1.10.0'. A string without '+' is returned unchanged.
  """
  release, _plus, _build_tag = version.partition('+')
  return release
def format_cuda_version(version):
  """Turn a CUDA version string into the tag used in PyG wheel URLs.

  Args:
    version: CUDA version such as '11.1', or None/'' when the installed
      PyTorch build has no CUDA support.

  Returns:
    'cu' followed by the version with its dots removed (e.g. 'cu111'),
    or 'cpu' when no CUDA version is available.
  """
  # A CPU-only PyTorch build reports no CUDA version; the matching wheels
  # are published under the 'cpu' tag, so fall back to it instead of
  # failing on None.
  if not version:
    return 'cpu'
  return 'cu' + version.replace('.', '')

# set runtime to GPU in collab for fast computations!
CUDA_version = torch.version.cuda
#CUDA = "cpu"
CUDA = format_cuda_version(CUDA_version)

!pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric

Looking in links: https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 36.3 MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
Looking in links: https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_sparse-0.6.12-cp37-cp37m-linux_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 27.2 MB/s 
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.12
Looking in links: https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
Collecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_cluster-1.5.9-cp37-cp37m-linux_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2

In [None]:
# imports
import torch
import torch.nn as nn

from torch_geometric.datasets import Planetoid
from torch_geometric.nn import TAGConv, GCNConv, ChebConv

# Datasets

In [None]:
# Load the three Planetoid citation benchmarks (PubMed, CiteSeer, Cora),
# each with its "public" split, into the shared "data" root directory.
pubmed_dataset, citeseer_dataset, cora_dataset = (
    Planetoid(root="data", name=benchmark, split="public")
    for benchmark in ("PubMed", "CiteSeer", "Cora")
)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...
Done!
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/in

# Dataset statistics

In [None]:
def print_statistics(dataset):
  """Print a summary of `dataset` and of the first graph it contains.

  The dataset-level lines (name, number of graphs, features, classes) are
  printed first, followed by the graph itself and its node/edge counts,
  split sizes and structural flags. Nothing is returned.
  """
  # Dataset-level summary.
  print(
      '================',
      f'Dataset: {dataset}',
      f'Number of graphs: {len(dataset)}',
      f'Number of features: {dataset.num_features}',
      f'Number of classes: {dataset.num_classes}',
      sep='\n',
  )

  graph = dataset[0]
  print(graph, '==============================================================', sep='\n')

  # Graph-level summary.
  node_count = graph.num_nodes
  edge_count = graph.num_edges
  train_count = graph.train_mask.sum()
  report = (
      f'Number of nodes: {node_count}',
      f'Number of edges: {edge_count}',
      f'Average node degree: {edge_count / node_count:.2f}',
      f'Number of training nodes: {train_count}',
      f'Number of validation nodes: {graph.val_mask.sum()}',
      f'Number of test nodes: {graph.test_mask.sum()}',
      f'Training node label rate: {int(train_count) / node_count:f}',
      f'Contains isolated nodes: {graph.has_isolated_nodes()}',
      f'Contains self-loops: {graph.has_self_loops()}',
      # The trailing newline leaves a blank line between datasets.
      f'Is undirected: {graph.is_undirected()}\n',
  )
  for line in report:
    print(line)

# Summarise each benchmark in turn: PubMed, CiteSeer, then Cora.
for planetoid_dataset in (pubmed_dataset, citeseer_dataset, cora_dataset):
  print_statistics(planetoid_dataset)

Dataset: PubMed()
Number of graphs: 1
Number of features: 500
Number of classes: 3
Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])
Number of nodes: 19717
Number of edges: 88648
Average node degree: 4.50
Number of training nodes: 60
Number of validation nodes: 500
Number of test nodes: 1000
Training node label rate: 0.003043
Contains isolated nodes: False
Contains self-loops: False
Is undirected: True

Dataset: CiteSeer()
Number of graphs: 1
Number of features: 3703
Number of classes: 6
Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])
Number of nodes: 3327
Number of edges: 9104
Average node degree: 2.74
Number of training nodes: 120
Number of validation nodes: 500
Number of test nodes: 1000
Training node label rate: 0.036069
Contains isolated nodes: True
Contains self-loops: False
Is undirected: True

Dataset: Cora()
Number of graphs: 1
Number of features: 1433
Numbe

# Models: TAGCN, GCN, ChebNet

In [None]:
class TAGCNNet(nn.Module):
  """Two-layer TAGCN node classifier: TAGConv -> ReLU -> Dropout -> TAGConv.

  `forward` returns raw, unnormalised class scores (logits). The model is
  trained with nn.CrossEntropyLoss, which applies log-softmax itself, so a
  Softmax layer here would normalise twice and flatten the gradients.
  Predictions taken with argmax are the same with or without softmax.

  Args:
    num_features: size of each input node feature vector.
    num_classes: number of output classes.
    filter_number: number of hidden channels produced by the first layer.
    filter_size: third positional argument forwarded to both TAGConv layers.
  """

  def __init__(self, num_features, num_classes, filter_number, filter_size):
    super().__init__()
    self.conv1 = TAGConv(num_features, filter_number, filter_size)
    self.relu  = nn.ReLU()
    self.dropout = nn.Dropout()
    self.conv2 = TAGConv(filter_number, num_classes, filter_size)

  def forward(self, x, edge_index, edge_attr):
    """Return per-node class logits for node features `x` on the given graph.

    `edge_attr` is forwarded as the third positional argument of each
    convolution.
    """
    x = self.conv1(x, edge_index, edge_attr)
    x = self.relu(x)
    x = self.dropout(x)
    # No softmax: the loss function expects unnormalised scores.
    return self.conv2(x, edge_index, edge_attr)

class GCNNet(torch.nn.Module):
    """Two-layer GCN node classifier: GCNConv -> ReLU -> Dropout -> GCNConv.

    `forward` returns raw, unnormalised class scores (logits). The model is
    trained with nn.CrossEntropyLoss, which applies log-softmax itself, so a
    Softmax layer here would normalise twice and flatten the gradients.
    Predictions taken with argmax are the same with or without softmax.

    Args:
        num_features: size of each input node feature vector.
        num_classes: number of output classes.
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        # Hidden width is fixed at 16 channels.
        self.conv1 = GCNConv(num_features, 16, cached=True, normalize=True)
        self.relu  = nn.ReLU()
        self.dropout = nn.Dropout()
        self.conv2 = GCNConv(16, num_classes, cached=True, normalize=True)

    def forward(self, x, edge_index, edge_attr):
        """Return per-node class logits for node features `x` on the given graph.

        `edge_attr` is forwarded as the third positional argument of each
        convolution.
        """
        x = self.conv1(x, edge_index, edge_attr)
        x = self.relu(x)
        x = self.dropout(x)
        # No softmax: the loss function expects unnormalised scores.
        return self.conv2(x, edge_index, edge_attr)

class ChebNet(torch.nn.Module):
    """Two-layer ChebNet node classifier: ChebConv -> ReLU -> Dropout -> ChebConv.

    `forward` returns raw, unnormalised class scores (logits). The model is
    trained with nn.CrossEntropyLoss, which applies log-softmax itself, so a
    Softmax layer here would normalise twice and flatten the gradients.
    Predictions taken with argmax are the same with or without softmax.

    Args:
        num_features: size of each input node feature vector.
        num_classes: number of output classes.
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        # Both layers use K=2; hidden width is fixed at 16 channels.
        self.conv1 = ChebConv(num_features, 16, K=2)
        self.relu  = nn.ReLU()
        self.dropout = nn.Dropout()
        self.conv2 = ChebConv(16, num_classes, K=2)

    def forward(self, x, edge_index, edge_attr):
        """Return per-node class logits for node features `x` on the given graph.

        `edge_attr` is forwarded as the third positional argument of each
        convolution.
        """
        x = self.conv1(x, edge_index, edge_attr)
        x = self.relu(x)
        x = self.dropout(x)
        # No softmax: the loss function expects unnormalised scores.
        return self.conv2(x, edge_index, edge_attr)

# Training Structure

In [None]:
def train(model, optimizer, loss_func, x, y, train_mask, edge_attr, edge_index):
    """Perform a single optimisation step using only the training nodes.

    The model is put in training mode, evaluated on the whole graph, and the
    loss is computed on the rows selected by `train_mask` before one
    optimizer step is taken. Returns None.
    """
    model.train()
    optimizer.zero_grad()

    # Forward pass over every node; only the masked rows enter the loss.
    output = model(x, edge_index, edge_attr)
    masked_loss = loss_func(output[train_mask], y[train_mask])

    masked_loss.backward()
    optimizer.step()

def test(model, data, x, y, edge_attr, edge_index):
    model.eval()
    logits, accs = model(x, edge_index, edge_attr), []
    for _, mask in data('train_mask', 'val_mask', 'test_mask'):
        pred = logits[mask].argmax(1)
        acc = (pred == y[mask]).sum().item() / mask.sum().item()
        accs.append(acc)
    return accs

def training_loop(dataset, model=0, filter_number=16, filter_size=2, stats=True):
  """Train one model on `dataset[0]` with early stopping on validation accuracy.

  Args:
    dataset: dataset exposing `num_features` and `num_classes`; its first
      element is the graph that is trained on.
    model: architecture selector. 0 builds TAGCNNet, 1 builds GCNNet, any
      other value builds ChebNet.
    filter_number: hidden channel count, used by TAGCNNet only.
    filter_size: filter size, used by TAGCNNet only.
    stats: when True, print progress every 10 epochs and a notice when
      training stops early.

  Returns:
    The trained model with the weights of the best-validation epoch restored.

  Side effects:
    Writes a checkpoint file '<dataset name>.pt' in the working directory
    every time the validation accuracy improves.
  """
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  if model == 0:
    net = TAGCNNet(dataset.num_features, dataset.num_classes, filter_number, filter_size).to(device)
  elif model == 1:
    net = GCNNet(dataset.num_features, dataset.num_classes).to(device)
  else:
    net = ChebNet(dataset.num_features, dataset.num_classes).to(device)

  data = dataset[0].to(device)

  epochs = 200
  optimizer = torch.optim.Adam(net.parameters(), lr=0.01, weight_decay=5e-4)
  loss_func = nn.CrossEntropyLoss()
  # str(dataset) looks like 'Cora()'; drop the trailing '()' for the file name.
  file_name = f'{str(dataset)[0:-2]}.pt'

  # Early stopping: give up once more than `patience` consecutive epochs
  # pass without a new best validation accuracy.
  counter = 0
  best_val_acc = -float('inf')
  patience = 45

  for epoch in range(epochs):
    train(net, optimizer, loss_func, data.x, data.y, data.train_mask, data.edge_attr, data.edge_index)
    train_acc, val_acc, tmp_test_acc = test(net, data, data.x, data.y, data.edge_attr, data.edge_index)

    if val_acc > best_val_acc:
      # New best epoch: remember its test accuracy and checkpoint the weights.
      counter = 0
      best_val_acc = val_acc
      test_acc = tmp_test_acc
      torch.save(net.state_dict(), file_name)
    else:
      counter += 1
      if counter > patience:
        if stats:
          print(f"Validation accuracy over the last {patience} epochs not improved, terminate training\n")
        break

    if epoch % 10 == 9 and stats:
      print(f'Epoch: [{epoch+1}/{epochs}], Train: {train_acc:.4f}, Val: {best_val_acc:.4f}, Test: {test_acc:.4f}')

  # Restore the best checkpoint once, whether or not training stopped early.
  net.load_state_dict(torch.load(file_name, map_location=device))
  return net

# Training TAGCN models with different configurations

In [None]:
datasets = [pubmed_dataset, citeseer_dataset, cora_dataset]
configs = [(1,16),(2,16),(3,16),(4,16),(2,8)] # (filter_size, filter_number)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
runs = 10

# For every dataset and TAGCN configuration, train `runs` models and report
# the mean test accuracy.
for dataset in datasets:
  data = dataset[0].to(device)
  for filter_size, filter_number in configs:
    run_accs = []
    for run in range(runs):
      model = training_loop(dataset, 0, filter_number, filter_size, False)
      # test() returns [train, val, test] accuracies; keep the test one.
      split_accs = test(model, data, data.x, data.y, data.edge_attr, data.edge_index)
      run_accs.append(split_accs[-1])

    avg_acc = sum(run_accs) / runs
    print(f"dataset: {str(dataset):<12} filter_size:{filter_size} \tfilter_number:{filter_number} \taccuracy:{avg_acc*100:.2f}%")
  print('')

dataset: PubMed()     filter_size:1 	filter_number:16 	accuracy:77.74%
dataset: PubMed()     filter_size:2 	filter_number:16 	accuracy:78.95%
dataset: PubMed()     filter_size:3 	filter_number:16 	accuracy:78.70%
dataset: PubMed()     filter_size:4 	filter_number:16 	accuracy:79.19%
dataset: PubMed()     filter_size:2 	filter_number:8 	accuracy:78.46%

dataset: CiteSeer()   filter_size:1 	filter_number:16 	accuracy:67.93%
dataset: CiteSeer()   filter_size:2 	filter_number:16 	accuracy:67.82%
dataset: CiteSeer()   filter_size:3 	filter_number:16 	accuracy:68.72%
dataset: CiteSeer()   filter_size:4 	filter_number:16 	accuracy:68.24%
dataset: CiteSeer()   filter_size:2 	filter_number:8 	accuracy:64.26%

dataset: Cora()       filter_size:1 	filter_number:16 	accuracy:78.15%
dataset: Cora()       filter_size:2 	filter_number:16 	accuracy:81.01%
dataset: Cora()       filter_size:3 	filter_number:16 	accuracy:80.70%
dataset: Cora()       filter_size:4 	filter_number:16 	accuracy:80.75%
datase

# Training GCN and ChebNet models

In [None]:
models = [1,2] #1: GCN, 2: Chebnet

# For every baseline and dataset, train `runs` models and report the mean
# test accuracy.
for mod in models:
  m = 'GCN' if mod == 1 else 'ChebNet'
  for dataset in datasets:
    data = dataset[0].to(device)
    run_accs = []
    for run in range(runs):
      model = training_loop(dataset, mod, stats=False)
      # test() returns [train, val, test] accuracies; keep the test one.
      split_accs = test(model, data, data.x, data.y, data.edge_attr, data.edge_index)
      run_accs.append(split_accs[-1])

    avg_acc = sum(run_accs) / runs
    print(f"dataset: {str(dataset):<12} model:{m} \taccuracy:{avg_acc*100:.2f}%")
  print('')


dataset: PubMed()     model:GCN 	accuracy:78.38%
dataset: CiteSeer()   model:GCN 	accuracy:68.47%
dataset: Cora()       model:GCN 	accuracy:80.41%

dataset: PubMed()     model:ChebNet 	accuracy:77.85%
dataset: CiteSeer()   model:ChebNet 	accuracy:63.26%
dataset: Cora()       model:ChebNet 	accuracy:78.70%

