### Assignment 1. Given the CiteSeer graph in Pytorch Geometric as shown in the following:

Questions:

1) Show some graph statistics of the CiteSeer graph (Number of nodes, Number of edges, and node feature length, ...)
2) Running the GAT model on Node classification on the CiteSeer graph

In [1]:
import torch

from torch_geometric.datasets import Planetoid

# Import dataset from PyTorch Geometric
dataset = Planetoid(root=".", name="CiteSeer")

data = dataset[0]

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...
Done!


In [2]:
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from torch_geometric.datasets import Planetoid

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Extract necessary data and transfer to GPU
features = data.x.to(device)  # Node feature matrix
labels = data.y.to(device)  # Node labels
edge_index = data.edge_index.to(device)  # Edge indices

# Convert edge index to adjacency matrix
num_nodes = features.shape[0]
adj = torch.zeros((num_nodes, num_nodes)).to(device)
adj[edge_index[0], edge_index[1]] = 1

# Add self-loops
adj += torch.eye(num_nodes).to(device)

# Print shapes for verification
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")
print(f"Adjacency matrix shape: {adj.shape}")

Using device: cuda
Features shape: torch.Size([3327, 3703])
Labels shape: torch.Size([3327])
Adjacency matrix shape: torch.Size([3327, 3327])


In [4]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import remove_isolated_nodes

device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

dataset = Planetoid(root=".", name="CiteSeer")
data = dataset[0]

print("Dataset Information:")
print('-------------------')
print(f"Dataset: {dataset}")
print(f"Number of graphs: {len(dataset)}")
print(f"Number of nodes: {data.num_nodes}")
print(f"Number of edges: {data.num_edges}")
print(f"Node feature length: {data.num_node_features}")
print(f"Number of classes: {dataset.num_classes}")

print(f"\nGraph Structure:")
print('-------------------')
print(f"Edges are directed: {data.is_directed()}")
print(f"Graph has isolated nodes: {data.has_isolated_nodes()}")
print(f"Graph has self-loops: {data.has_self_loops()}")

isolated = (remove_isolated_nodes(data.edge_index)[2] == False).sum(dim=0).item()
print(f"Number of isolated nodes = {isolated}")

features = data.x.to(device) 
labels = data.y.to(device)  
edge_index = data.edge_index.to(device)  

num_nodes = features.shape[0]
adj = torch.zeros((num_nodes, num_nodes)).to(device)
adj[edge_index[0], edge_index[1]] = 1

adj += torch.eye(num_nodes).to(device)

print(f"\nMatrix Shapes:")
print('-------------------')
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")
print(f"Adjacency matrix shape: {adj.shape}")


Using device: cuda
Dataset Information:
-------------------
Dataset: CiteSeer()
Number of graphs: 1
Number of nodes: 3327
Number of edges: 9104
Node feature length: 3703
Number of classes: 6

Graph Structure:
-------------------
Edges are directed: False
Graph has isolated nodes: True
Graph has self-loops: False
Number of isolated nodes = 48

Matrix Shapes:
-------------------
Features shape: torch.Size([3327, 3703])
Labels shape: torch.Size([3327])
Adjacency matrix shape: torch.Size([3327, 3327])


In [5]:
class GraphAttentionLayer(torch.nn.Module):
    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(GraphAttentionLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat

        # Learnable weight matrix
        self.W = torch.nn.Parameter(torch.zeros(size=(in_features, out_features)))
        torch.nn.init.xavier_uniform_(self.W.data, gain=1.414)

        # Attention mechanism weight
        self.a = torch.nn.Parameter(torch.zeros(size=(2 * out_features, 1)))
        torch.nn.init.xavier_uniform_(self.a.data, gain=1.414)

        # LeakyReLU non-linearity
        self.leakyrelu = torch.nn.LeakyReLU(self.alpha)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, h, adj):
        # Linear transformation
        Wh = torch.mm(h, self.W)  # (N, out_features)

        # Compute attention coefficients
        e = self._prepare_attentional_mechanism_input(Wh)
        zero_vec = -9e15 * torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.softmax(attention, dim=1)
        attention = self.dropout(attention)

        # Aggregate node features
        h_prime = torch.matmul(attention, Wh)

        # If `concat` is True, apply activation function (ELU)
        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime

    def _prepare_attentional_mechanism_input(self, Wh):
        Wh1 = torch.mm(Wh, self.a[:self.out_features, :])  # (N, 1)
        Wh2 = torch.mm(Wh, self.a[self.out_features:, :])  # (N, 1)
        e = Wh1 + Wh2.T  # Broadcasting (N, N)
        return self.leakyrelu(e)

In [6]:
class GAT(torch.nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout, alpha, nheads):
        super(GAT, self).__init__()
        self.dropout = dropout

        # Layer 1: Multi-head attention (input -> hidden dimension)
        self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in range(nheads)]
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)

        # Layer 2: Single-head attention (concatenated hidden -> output classes)
        self.out_att = GraphAttentionLayer(nhid * nheads, nclass, dropout=dropout, alpha=alpha, concat=False)

    def forward(self, x, adj):
        x = F.dropout(x, self.dropout, training=self.training)
        x = torch.cat([att(x, adj) for att in self.attentions], dim=1)  # Concatenate multiple heads
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.out_att(x, adj)  # Output layer
        return F.log_softmax(x, dim=1)

In [7]:
# Define training parameters
nfeat = features.shape[1]  # Number of input features
nhid = 4  # Number of hidden units per attention head
nclass = dataset.num_classes  # Number of output classes
dropout = 0.6  # Dropout rate
alpha = 0.2  # Alpha for the LeakyReLU
nheads = 4  # Number of attention heads

# Initialize the GAT model
model = GAT(nfeat=nfeat, nhid=nhid, nclass=nclass, dropout=dropout, alpha=alpha, nheads=nheads).to(device)

# Define optimizer
optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

# Mask to split data into train, validation, and test
idx_train = data.train_mask
idx_val = data.val_mask
idx_test = data.test_mask

# Training function
def train():
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])
    loss_train.backward()
    optimizer.step()
    return loss_train.item(), acc_train

# Evaluation function
def evaluate(mask):
    model.eval()
    with torch.no_grad():
        output = model(features, adj)
        loss = F.nll_loss(output[mask], labels[mask])
        acc = accuracy(output[mask], labels[mask])
    return loss.item(), acc

# Accuracy calculation
def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

In [8]:
epochs = 100
train_losses, train_accuracies = [], []  # Initialize lists to store training metrics
val_losses, val_accuracies = [], []  # Initialize lists to store validation metrics

for epoch in range(epochs):
    train_loss, train_acc = train()
    val_loss, val_acc = evaluate(idx_val)

    train_losses.append(train_loss)
    train_accuracies.append(train_acc)
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch: {epoch + 1:03d}, "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

# Final evaluation on test set
test_loss, test_acc = evaluate(idx_test)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

Epoch: 001, Train Loss: 2.0495, Train Acc: 0.1667, Val Loss: 1.7662, Val Acc: 0.2660
Epoch: 002, Train Loss: 1.8903, Train Acc: 0.1667, Val Loss: 1.7288, Val Acc: 0.3620
Epoch: 003, Train Loss: 1.9907, Train Acc: 0.2500, Val Loss: 1.6905, Val Acc: 0.4060
Epoch: 004, Train Loss: 1.6940, Train Acc: 0.2417, Val Loss: 1.6555, Val Acc: 0.4380
Epoch: 005, Train Loss: 1.6932, Train Acc: 0.2917, Val Loss: 1.6223, Val Acc: 0.4660
Epoch: 006, Train Loss: 1.6772, Train Acc: 0.3167, Val Loss: 1.5904, Val Acc: 0.5080
Epoch: 007, Train Loss: 1.5484, Train Acc: 0.4083, Val Loss: 1.5603, Val Acc: 0.5120
Epoch: 008, Train Loss: 1.5660, Train Acc: 0.3417, Val Loss: 1.5313, Val Acc: 0.5400
Epoch: 009, Train Loss: 1.5344, Train Acc: 0.4167, Val Loss: 1.5032, Val Acc: 0.5480
Epoch: 010, Train Loss: 1.4870, Train Acc: 0.4250, Val Loss: 1.4785, Val Acc: 0.5480
Epoch: 011, Train Loss: 1.3781, Train Acc: 0.4833, Val Loss: 1.4553, Val Acc: 0.5680
Epoch: 012, Train Loss: 1.3912, Train Acc: 0.5000, Val Loss: 1.43

### Assignment 2. Load the  CiteSeer dataset from Torch Geometric and then do the node classification task by using the GATv2 model

In [13]:
import torch.nn as nn

class GATv2Layer(nn.Module):
    def __init__(self, in_features, out_features, heads=1, concat=True):
        super(GATv2Layer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.heads = heads
        self.concat = concat

        # Linear transformation for node features
        self.linear = nn.Linear(in_features, out_features * heads, bias=False)

        # Attention mechanism
        self.attention = nn.Parameter(torch.Tensor(1, heads, 2 * out_features))
        nn.init.xavier_uniform_(self.attention.data, gain=1.414)

        # LeakyReLU for attention coefficients
        self.leakyrelu = nn.LeakyReLU(0.2)

    def forward(self, h, adj):
        # Apply linear transformation: (N, in_features) -> (N, heads * out_features)
        Wh = self.linear(h)
        Wh = Wh.reshape(-1, self.heads, self.out_features)  # Using reshape instead of view

        # Create attention scores for self and neighbors
        Wh_i = Wh.unsqueeze(1).repeat(1, adj.size(1), 1, 1)  # Shape: (N, N, heads, out_features)
        Wh_j = Wh.unsqueeze(0).repeat(adj.size(0), 1, 1, 1)  # Shape: (N, N, heads, out_features)

        # Concatenate Wh_i and Wh_j for attention calculation
        a_input = torch.cat([Wh_i, Wh_j], dim=-1)  # Shape: (N, N, heads, 2 * out_features)

        # Compute attention scores using the learned parameter and LeakyReLU
        e = self.leakyrelu((a_input * self.attention).sum(dim=-1))  # Shape: (N, N, heads)

        # Masking: attention scores for unconnected nodes should be -infinity
        zero_vec = -9e15 * torch.ones_like(e)
        attention = torch.where(adj.unsqueeze(-1) > 0, e, zero_vec)  # Shape: (N, N, heads)

        # Softmax along neighbors
        attention = F.softmax(attention, dim=1)

        # Compute the attention-weighted sum of node features
        h_prime = torch.einsum('ijh,jhf->ihf', attention, Wh)  # Shape: (N, heads, out_features)

        if self.concat:
            return h_prime.reshape(-1, self.heads * self.out_features)  # Use reshape instead of view
        else:
            return h_prime.mean(dim=1)  # Average heads

In [14]:
import torch.optim as optim
import torch.nn.functional as F

class GATv2(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout, heads=1):
        super(GATv2, self).__init__()
        self.gat1 = GATv2Layer(nfeat, nhid, heads=heads, concat=True)
        self.gat2 = GATv2Layer(nhid * heads, nclass, heads=1, concat=False)
        self.dropout = dropout

    def forward(self, x, adj):
        x = F.dropout(x, self.dropout, training=self.training)
        x = F.elu(self.gat1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gat2(x, adj)
        return F.log_softmax(x, dim=1)

# Model and optimizer
model = GATv2(nfeat=features.shape[1], nhid=4, nclass=int(labels.max().item()) + 1, dropout=0.6, heads=4).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

In [15]:
def train():
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    loss = F.nll_loss(output[data.train_mask], labels[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate():
    model.eval()
    with torch.no_grad():
        output = model(features, adj)
        preds = output.argmax(dim=1)
        correct = preds[data.val_mask].eq(labels[data.val_mask]).sum().item()
        return correct / data.val_mask.sum().item()

In [16]:
epochs = 100
train_loss = []
val_acc = []

for epoch in range(epochs):
    loss = train()
    acc = evaluate()
    train_loss.append(loss)
    val_acc.append(acc)

    # Print loss and accuracy for each epoch
    print(f"Epoch {epoch+1:03d} | Loss: {loss:.4f} | Val Accuracy: {acc:.4f}")

Epoch 001 | Loss: 1.7923 | Val Accuracy: 0.5120
Epoch 002 | Loss: 1.7324 | Val Accuracy: 0.6360
Epoch 003 | Loss: 1.6561 | Val Accuracy: 0.6540
Epoch 004 | Loss: 1.5935 | Val Accuracy: 0.6700
Epoch 005 | Loss: 1.5264 | Val Accuracy: 0.6700
Epoch 006 | Loss: 1.4514 | Val Accuracy: 0.6740
Epoch 007 | Loss: 1.3754 | Val Accuracy: 0.6800
Epoch 008 | Loss: 1.3063 | Val Accuracy: 0.6780
Epoch 009 | Loss: 1.2154 | Val Accuracy: 0.6800
Epoch 010 | Loss: 1.1376 | Val Accuracy: 0.6820
Epoch 011 | Loss: 1.0736 | Val Accuracy: 0.6780
Epoch 012 | Loss: 1.0486 | Val Accuracy: 0.6800
Epoch 013 | Loss: 0.9495 | Val Accuracy: 0.6780
Epoch 014 | Loss: 0.8877 | Val Accuracy: 0.6780
Epoch 015 | Loss: 0.7989 | Val Accuracy: 0.6760
Epoch 016 | Loss: 0.7576 | Val Accuracy: 0.6720
Epoch 017 | Loss: 0.7031 | Val Accuracy: 0.6740
Epoch 018 | Loss: 0.6740 | Val Accuracy: 0.6740
Epoch 019 | Loss: 0.6077 | Val Accuracy: 0.6780
Epoch 020 | Loss: 0.5738 | Val Accuracy: 0.6760
Epoch 021 | Loss: 0.5351 | Val Accuracy: