In [36]:
import torch
import torch.nn as nn
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

In [37]:
dataset = Planetoid(root = 'data', name = 'cora')
len(dataset)

1

In [38]:
data = dataset[0]

**Feature Manipulation**

In [40]:
len(data.x) # 2708 -> rows or nodes
len(data.x[0])  # 1433 -> bag of word encoding dim or feture dim
len(data.x[0][data.x[0] == 1]) # 9 -> only 9 words of 1433 are included
len(data.x[0][data.x[0] == 0]) # 1424

1424

In [41]:
lst_high = []
lst_low = []
for i in range(len(data.x)):
  n = len(data.x[i][data.x[i] > 0])
  lst_high.append(n)
  lst_low.append(1433 - n)

In [42]:
import numpy as np
highs = np.array(lst_high)
np.max(highs)
highs[:10]

array([ 9, 23, 19, 21, 18, 13, 18, 14, 20,  3])

In [50]:
X = data.x
num_features = X.size(1) # embedding dim -> 1433
word_counts = torch.sum(X, dim = 0)
counts = [(i, word_counts[i].item()) for i in range(num_features)]
counts_sorted = sorted(counts, key=lambda x: x[1], reverse=True)

def normalize_features(X):
  row_sum = X.sum(dim = 1, keepdim = True)
  X = X/row_sum.clamp(min = 1)
  return X

# choosing top k
def choose_top_k(k = 600, data = data, X = X):
  top_k_indices = [idx for idx, freq in counts_sorted[:k]]

  # filter feature matrix
  x_top_k = X[:, top_k_indices] # new embeddings -> 2708 x k

  lst_high_top = []
  lst_low_top = []
  for i in range(len(data.x)):
    n = len(x_top_k[i][x_top_k[i] == 1])
    lst_high_top.append(n)
    lst_low_top.append(1433 - n)
  x_top_k = normalize_features(x_top_k)

  return x_top_k

**Building Adjacency Matrix**

In [51]:
def build_adjacency(data):
  N = data.num_nodes 
  edge_index =  data.edge_index

  A = torch.zeros((N,N))
  A[edge_index[0], edge_index[1]] = 1
  A = A + A.T
  A[A > 1] = 1
  return A

def normalise_adj(A):
  N = A.size(0)
  I = torch.eye(N)
  A_hat = A + I # adding self loops

  D_hat = torch.diag(A_hat.sum(dim = 1))
  D_hat_inv_sqrt = torch.linalg.inv(torch.sqrt(D_hat))

  A_norm = D_hat_inv_sqrt @ A_hat @ D_hat_inv_sqrt
  return A_norm

A = build_adjacency(data)
A_norm = normalise_adj(A)
A_norm.shape

torch.Size([2708, 2708])

In [52]:
class GCN(nn.Module):
  def __init__(self, in_dim, hidden_dim, out_dim):
    super().__init__()
    self.W1 = torch.nn.Parameter(torch.randn(in_dim, hidden_dim))
    self.W2 = torch.nn.Parameter(torch.randn(hidden_dim, hidden_dim))
    self.W3 = torch.nn.Parameter(torch.randn(hidden_dim, out_dim))

  def forward(self, A_norm , X):
    H = torch.relu(A_norm @ X @ self.W1) # 1st Aggreation and projection
    H = torch.relu(A_norm @ H @ self.W2) # 2nd Aggregation & projection
    H = A_norm @ H @ self.W3 # outputs logits for num_classes
    return H # logits

In [53]:
labels_cpu = data.y.cpu()

print(labels_cpu.min())
print(labels_cpu.max())
print(labels_cpu.dtype)
print(dataset.num_classes)


tensor(0)
tensor(6)
torch.int64
7


In [54]:
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

TRAIN_ACC = []
TEST_ACC = []
k_val = [200,300,400,500,600,700,800,1000]

A_norm = A_norm.to(device)
labels = data.y.to(device)
labels = data.y.long().to(device)

train_mask = data.train_mask.to(device)
test_mask = data.test_mask.to(device)

for k_ in k_val: 
  x_top_k = choose_top_k(k= k_)
  model = GCN(in_dim = x_top_k.size(1),hidden_dim = 16,out_dim = dataset.num_classes).to(device)

  X = x_top_k.to(device)

  optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3, weight_decay = 1e-2)
  loss_fn = nn.CrossEntropyLoss()

  train_acc = 0
  for epoch in range(5000):
    model.train()
    optimizer.zero_grad()

    out = model(A_norm, X)
    loss = loss_fn(out[train_mask], labels[train_mask])

    loss.backward()
    optimizer.step()

    # if epoch % 20 == 0:
    with torch.no_grad():
      pred = out.argmax(dim=1)
      train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
    # print(f"Epoch {epoch} | Loss {loss:.4f} | Train Acc {acc:.3f}")
  TRAIN_ACC.append(train_acc.item())

  model.eval()
  with torch.no_grad():
      out = model(A_norm, X)
      pred = out.argmax(dim=1)
      test_acc = (pred[test_mask] == labels[test_mask]).float().mean()
      # print("Test Accuracy:", test_acc.item())
  TEST_ACC.append(test_acc.item())
  del X, x_top_k, out, loss, pred, model, optimizer

In [55]:
print(k_val)
print(TRAIN_ACC)
print([f"{acc:.2f}" for acc in TEST_ACC])


[200, 300, 400, 500, 600, 700, 800, 1000]
[0.9428571462631226, 0.9285714626312256, 0.9142857193946838, 0.9357143044471741, 0.9357143044471741, 0.9142857193946838, 0.9428571462631226, 0.9214285612106323]
['0.76', '0.77', '0.77', '0.77', '0.78', '0.74', '0.77', '0.77']


### GCN using Library

**Model**

In [61]:
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN_lib(torch.nn.Module):
  def __init__(self, in_channels, hidden_channels, out_channels):
    super().__init__()
    self.conv1 = GCNConv(in_channels, hidden_channels)
    self.conv2 = GCNConv(hidden_channels, hidden_channels)
    self.conv3 = GCNConv(hidden_channels, out_channels)

  def forward(self, x, edge_index):
    x = self.conv1(x, edge_index)
    x = F.relu(x)
    x = self.conv2(x, edge_index)
    x = F.relu(x)
    x = self.conv3(x, edge_index)
    return x

**Initializing Model & Training**

In [68]:
data = data.to(device)
def train(model, X, data, optimizer, labels, train_mask):
  model.train()
  optimizer.zero_grad()

  out = model(X, data.edge_index)
  loss = F.cross_entropy(out[train_mask], labels[train_mask])

  loss.backward()
  optimizer.step()
  return loss.item()

def test(model, X, data, labels, test_mask):
  model.eval()
  out = model(X, data.edge_index)
  pred = out.argmax(dim=1)

  correct = (pred[test_mask] == labels[data.test_mask]).sum()
  acc = int(correct) / int(data.test_mask.sum())
  return acc
device

device(type='cuda')

In [71]:

TRAIN_ACC = []
TEST_ACC = []

labels = data.y.to(device)
labels = data.y.long().to(device)

train_mask = data.train_mask.to(device)
test_mask = data.test_mask.to(device)

k_val = [200,300,400,500,600,700,800,1000, 1400]

for k_ in k_val: 
  x_top_k = choose_top_k(k= k_).to(device)
  model = GCN_lib(in_channels = x_top_k.size(1),hidden_channels = 16, out_channels = dataset.num_classes).to(device)

  optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2, weight_decay = 5e-4)

  for epochs in range(5000):
    loss = train(model, x_top_k, data, optimizer, labels, train_mask)
  acc = test(model, x_top_k, data, labels, test_mask)
  TEST_ACC.append(acc)

In [72]:
print(k_val)
print(TRAIN_ACC)
print([f"{acc:.2f}" for acc in TEST_ACC])

[200, 300, 400, 500, 600, 700, 800, 1000, 1400]
[]
['0.74', '0.75', '0.76', '0.79', '0.76', '0.79', '0.80', '0.76', '0.77']
