In [1]:
import torch
from torch import nn


class GraphAttentionLayer(nn.Module):
    """
    ## Graph attention layer

    This is a single graph attention layer.
    A GAT is made up of multiple such layers.

    It takes
    $$\mathbf{h} = \{ \overrightarrow{h_1}, \overrightarrow{h_2}, \dots, \overrightarrow{h_N} \}$$,
    where $\overrightarrow{h_i} \in \mathbb{R}^F$ as input
    and outputs
    $$\mathbf{h'} = \{ \overrightarrow{h'_1}, \overrightarrow{h'_2}, \dots, \overrightarrow{h'_N} \}$$,
    where $\overrightarrow{h'_i} \in \mathbb{R}^{F'}$.
    """
    def __init__(self, in_features: int, out_features: int, n_heads: int,
                 is_concat: bool = True,
                 dropout: float = 0.6,
                 leaky_relu_negative_slope: float = 0.2):
        """
        * `in_features`, $F$, is the number of input features per node
        * `out_features`, $F'$, is the number of output features per node
        * `n_heads`, $K$, is the number of attention heads
        * `is_concat` whether the multi-head results should be concatenated or averaged
        * `dropout` is the dropout probability
        * `leaky_relu_negative_slope` is the negative slope for leaky relu activation
        """
        super().__init__()

        self.is_concat = is_concat
        self.n_heads = n_heads

        # Calculate the number of dimensions per head
        if is_concat:
            assert out_features % n_heads == 0
            # If we are concatenating the multiple heads
            self.n_hidden = out_features // n_heads
        else:
            # If we are averaging the multiple heads
            self.n_hidden = out_features

        # Linear layer for initial transformation;
        # i.e. to transform the node embeddings before self-attention
        self.linear = nn.Linear(in_features, self.n_hidden * n_heads, bias=False)
        # Linear layer to compute attention score $e_{ij}$
        self.attn = nn.Linear(self.n_hidden * 2, 1, bias=False)
        # The activation for attention score $e_{ij}$
        self.activation = nn.LeakyReLU(negative_slope=leaky_relu_negative_slope)
        # Softmax to compute attention $\alpha_{ij}$
        self.softmax = nn.Softmax(dim=1)
        # Dropout layer to be applied for attention
        self.dropout = nn.Dropout(dropout)

    def forward(self, h: torch.Tensor, adj_mat: torch.Tensor):
        """
        * `h`, $\mathbf{h}$ is the input node embeddings of shape `[n_nodes, in_features]`.
        * `adj_mat` is the adjacency matrix of shape `[n_nodes, n_nodes, n_heads]`.
        We use shape `[n_nodes, n_nodes, 1]` since the adjacency is the same for each head.

        Adjacency matrix represent the edges (or connections) among nodes.
        `adj_mat[i][j]` is `True` if there is an edge from node `i` to node `j`.
        """

        # Number of nodes
        n_nodes = h.shape[0]
        # The initial transformation,
        # $$\overrightarrow{g^k_i} = \mathbf{W}^k \overrightarrow{h_i}$$
        # for each head.
        # We do single linear transformation and then split it up for each head.
        g = self.linear(h).view(n_nodes, self.n_heads, self.n_hidden)

        # #### Calculate attention score
        #
        # We calculate these for each head $k$. *We have omitted $\cdot^k$ for simplicity*.
        #
        # $$e_{ij} = a(\mathbf{W} \overrightarrow{h_i}, \mathbf{W} \overrightarrow{h_j}) =
        # a(\overrightarrow{g_i}, \overrightarrow{g_j})$$
        #
        # $e_{ij}$ is the attention score (importance) from node $j$ to node $i$.
        # We calculate this for each head.
        #
        # $a$ is the attention mechanism, that calculates the attention score.
        # The paper concatenates
        # $\overrightarrow{g_i}$, $\overrightarrow{g_j}$
        # and does a linear transformation with a weight vector $\mathbf{a} \in \mathbb{R}^{2 F'}$
        # followed by a $\text{LeakyReLU}$.
        #
        # $$e_{ij} = \text{LeakyReLU} \Big(
        # \mathbf{a}^\top \Big[
        # \overrightarrow{g_i} \Vert \overrightarrow{g_j}
        # \Big] \Big)$$

        # First we calculate
        # $\Big[\overrightarrow{g_i} \Vert \overrightarrow{g_j} \Big]$
        # for all pairs of $i, j$.
        #
        # `g_repeat` gets
        # $$\{\overrightarrow{g_1}, \overrightarrow{g_2}, \dots, \overrightarrow{g_N},
        # \overrightarrow{g_1}, \overrightarrow{g_2}, \dots, \overrightarrow{g_N}, ...\}$$
        # where each node embedding is repeated `n_nodes` times.
        g_repeat = g.repeat(n_nodes, 1, 1)
        # `g_repeat_interleave` gets
        # $$\{\overrightarrow{g_1}, \overrightarrow{g_1}, \dots, \overrightarrow{g_1},
        # \overrightarrow{g_2}, \overrightarrow{g_2}, \dots, \overrightarrow{g_2}, ...\}$$
        # where each node embedding is repeated `n_nodes` times.
        g_repeat_interleave = g.repeat_interleave(n_nodes, dim=0)
        # Now we concatenate to get
        # $$\{\overrightarrow{g_1} \Vert \overrightarrow{g_1},
        # \overrightarrow{g_1} \Vert \overrightarrow{g_2},
        # \dots, \overrightarrow{g_1}  \Vert \overrightarrow{g_N},
        # \overrightarrow{g_2} \Vert \overrightarrow{g_1},
        # \overrightarrow{g_2} \Vert \overrightarrow{g_2},
        # \dots, \overrightarrow{g_2}  \Vert \overrightarrow{g_N}, ...\}$$
        g_concat = torch.cat([g_repeat_interleave, g_repeat], dim=-1)
        # Reshape so that `g_concat[i, j]` is $\overrightarrow{g_i} \Vert \overrightarrow{g_j}$
        g_concat = g_concat.view(n_nodes, n_nodes, self.n_heads, 2 * self.n_hidden)

        # Calculate
        # $$e_{ij} = \text{LeakyReLU} \Big(
        # \mathbf{a}^\top \Big[
        # \overrightarrow{g_i} \Vert \overrightarrow{g_j}
        # \Big] \Big)$$
        # `e` is of shape `[n_nodes, n_nodes, n_heads, 1]`
        e = self.activation(self.attn(g_concat))
        # Remove the last dimension of size `1`
        e = e.squeeze(-1)

        # The adjacency matrix should have shape
        # `[n_nodes, n_nodes, n_heads]` or`[n_nodes, n_nodes, 1]`
        assert adj_mat.shape[0] == 1 or adj_mat.shape[0] == n_nodes
        assert adj_mat.shape[1] == 1 or adj_mat.shape[1] == n_nodes
        assert adj_mat.shape[2] == 1 or adj_mat.shape[2] == self.n_heads
        # Mask $e_{ij}$ based on adjacency matrix.
        # $e_{ij}$ is set to $- \infty$ if there is no edge from $i$ to $j$.
        e = e.masked_fill(adj_mat == 0, float('-inf'))

        # We then normalize attention scores (or coefficients)
        # $$\alpha_{ij} = \text{softmax}_j(e_{ij}) =
        # \frac{\exp(e_{ij})}{\sum_{k \in \mathcal{N}_i} \exp(e_{ik})}$$
        #
        # where $\mathcal{N}_i$ is the set of nodes connected to $i$.
        #
        # We do this by setting unconnected $e_{ij}$ to $- \infty$ which
        # makes $\exp(e_{ij}) \sim 0$ for unconnected pairs.
        a = self.softmax(e)

        # Apply dropout regularization
        a = self.dropout(a)

        # Calculate final output for each head
        # $$\overrightarrow{h'^k_i} = \sum_{j \in \mathcal{N}_i} \alpha^k_{ij} \overrightarrow{g^k_j}$$
        #
        # *Note:* The paper includes the final activation $\sigma$ in $\overrightarrow{h_i}$
        # We have omitted this from the Graph Attention Layer implementation
        # and use it on the GAT model to match with how other PyTorch modules are defined -
        # activation as a separate layer.
        attn_res = torch.einsum('ijh,jhf->ihf', a, g)

        # Concatenate the heads
        if self.is_concat:
            # $$\overrightarrow{h'_i} = \Bigg\Vert_{k=1}^{K} \overrightarrow{h'^k_i}$$
            return attn_res.reshape(n_nodes, self.n_heads * self.n_hidden)
        # Take the mean of the heads
        else:
            # $$\overrightarrow{h'_i} = \frac{1}{K} \sum_{k=1}^{K} \overrightarrow{h'^k_i}$$
            return attn_res.mean(dim=1)

In [2]:
"""
---
title: Train a Graph Attention Network (GAT) on Cora dataset
summary: >
  This trains is a  Graph Attention Network (GAT) on Cora dataset
---

# Train a Graph Attention Network (GAT) on Cora dataset
"""

from typing import Dict

import numpy as np
import torch
from torch import nn

from pathlib import Path

from download import download_file, extract_tar

# from labml import lab, monit, tracker, experiment
# from labml.configs import BaseConfigs, option, calculate
# from labml.utils import download
# from labml_helpers.device import DeviceConfigs
# from labml_helpers.module import Module
# from labml_nn.graphs.gat import GraphAttentionLayer
# from labml_nn.optimizers.configs import OptimizerConfigs


class CoraDataset:
    def __init__(self, include_edges: bool = True):
        self.include_edges = include_edges

        # Download dataset
        self._download()

        # Read the paper ids, feature vectors, and labels
        content = np.genfromtxt('cora/cora/cora.content', dtype=np.dtype(str))

        # Load the citations, it's a list of pairs of integers.
        citations = np.genfromtxt('cora/cora/cora.cites', dtype=np.int32)

        features = torch.tensor(np.array(content[:, 1:-1], dtype=np.float32))
        self.features = features / features.sum(dim=1, keepdim=True)

        self.classes = {s: i for i, s in enumerate(set(content[:, -1]))}
        self.labels = torch.tensor([self.classes[i] for i in content[:, -1]], dtype=torch.long)

        paper_ids = np.array(content[:, 0], dtype=np.int32)
        ids_to_idx = {id_: i for i, id_ in enumerate(paper_ids)}

        self.adj_mat = torch.eye(len(self.labels), dtype=torch.bool)

        if self.include_edges:
            for e in citations:
                e1, e2 = ids_to_idx[e[0]], ids_to_idx[e[1]]
                self.adj_mat[e1][e2] = True
                self.adj_mat[e2][e1] = True

    def _download(self):
        data_dir = Path('cora')
        if not data_dir.exists():
            data_dir.mkdir(parents=True)

        tgz_file = data_dir / 'cora.tgz'
        content_file = data_dir / 'cora.content'

        if not content_file.exists():
            download_file('https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz', tgz_file)
            extract_tar(tgz_file, data_dir)


class GAT(nn.Module):
    def __init__(self, in_features: int, n_hidden: int, n_classes: int, n_heads: int, dropout: float):
        super().__init__()

        self.layer1 = GraphAttentionLayer(in_features, n_hidden, n_heads, is_concat=True, dropout=dropout)
        self.activation = nn.ELU()
        self.output = GraphAttentionLayer(n_hidden, n_classes, 1, is_concat=False, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, adj_mat):
        x = self.dropout(x)
        x = self.layer1(x, adj_mat)
        x = self.activation(x)
        x = self.dropout(x)
        return self.output(x, adj_mat)


def accuracy(output: torch.Tensor, labels: torch.Tensor):
    """
    A simple function to calculate the accuracy
    """
    return output.argmax(dim=-1).eq(labels).sum().item() / len(labels)


class CoraConfig:
    def __init__(self):
        self.include_edges = True
        self.epochs = 10 #1000
        self.model = None
        self.training_samples = 5 #500
        self.in_features = None
        self.n_hidden = 64
        self.n_heads = 8
        self.n_classes = None
        self.dropout = 0.6
        self.dataset = CoraDataset(self.include_edges)
        self.loss_func = nn.CrossEntropyLoss()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.optimizer = None #torch.optim.Adam(self.model.parameters())

    def run(self):
        features = self.dataset.features.to(self.device)
        labels = self.dataset.labels.to(self.device)
        edges_adj = self.dataset.adj_mat.to(self.device)
        edges_adj = edges_adj.unsqueeze(-1)

        idx_rand = torch.randperm(len(labels))
        idx_train = idx_rand[:self.training_samples]
        idx_valid = idx_rand[self.training_samples:]

        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}')
            self.model.train()
            self.optimizer.zero_grad()
            output = self.model(features, edges_adj)
            loss = self.loss_func(output[idx_train], labels[idx_train])
            loss.backward()
            self.optimizer.step()

            self.model.eval()
            with torch.no_grad():
                output = self.model(features, edges_adj)
                loss = self.loss_func(output[idx_valid], labels[idx_valid])


# Initialize the configuration
config = CoraConfig()


from torch.optim import Adam

# Define your GAT model
class GATModel(nn.Module):
    def __init__(self, in_features, n_hidden, n_classes, n_heads, dropout):
        super(GATModel, self).__init__()
        self.gat = GAT(in_features, n_hidden, n_classes, n_heads, dropout)

    def forward(self, x, adj_mat):
        return self.gat(x, adj_mat)

# Create your GAT model and optimizer
config.in_features = config.dataset.features.shape[1]
config.n_classes = len(config.dataset.classes)
config.model = GATModel(config.in_features, config.n_hidden, config.n_classes, config.n_heads, config.dropout)
config.optimizer = Adam(config.model.parameters(), lr=5e-3, weight_decay=5e-4)

PATH="weights"
config.model.load_state_dict(torch.load(PATH))

<All keys matched successfully>

In [4]:
from torch.utils.data import DataLoader

def evaluate(model, data, idx_valid):
    features = data.features.to(config.device)
    labels = data.labels.to(config.device)
    edges_adj = data.adj_mat.to(config.device)
    edges_adj = edges_adj.unsqueeze(-1)
    model.eval()
    with torch.no_grad():
        output = config.model(features, edges_adj)
        loss = config.loss_func(output[idx_valid], labels[idx_valid])
        acc = accuracy(output[idx_valid], labels[idx_valid])
    return loss.item(), acc

# Define your evaluation set (validation set)
labels = config.dataset.labels.to(config.device)
features = config.dataset.features.to(config.device)

# Use the remaining samples as the validation set
idx_valid = range(config.training_samples, len(labels))

# Perform model evaluation on the validation set
validation_loss, validation_accuracy = evaluate(config.model, config.dataset, idx_valid)

print(f'Validation Loss: {validation_loss:.4f}')
print(f'Validation Accuracy: {validation_accuracy * 100:.2f}%')


RuntimeError: [enforce fail at alloc_cpu.cpp:80] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3754631168 bytes.

In [1]:
def evaluate_individual(model, data, idx):
    features = data.features.to(config.device)
    labels = data.labels.to(config.device)
    edges_adj = data.adj_mat.to(config.device)
    edges_adj = edges_adj.unsqueeze(-1)
    print(labels[idx])
    print(features[idx])
    print(edges_adj[idx])
    #model.eval()
    #with torch.no_grad():
    #    output = config.model(features, edges_adj)
    #    loss = config.loss_func(output[idx_valid], labels[idx_valid])
    #    acc = accuracy(output[idx_valid], labels[idx_valid])
  
evaluate_individual(config.model, config.dataset, 5)

NameError: name 'config' is not defined