**GRAPH ATTENTION NETWORK**


This code is a simplified version of a Graph Attention Network. It starts by implementing a single Graph Attention Layer. This layer takes as input the features of the nodes in a graph and the graph's adjacency matrix, then applies a linear transformation to the node features. The transformed features are used to compute attention coefficients between each pair of connected nodes with a learnable attention mechanism that involves a Leaky ReLU activation. These attention coefficients are normalized with a softmax function, and dropout is applied to them for regularization. Finally, the layer aggregates the features of each node's neighbors, weighted by these attention coefficients, to produce new, potentially more informative, feature representations for each node. The forward method carries out these steps, and the example usage demonstrates how to initialize the layer with the specified input and output feature sizes, dropout probability, and Leaky ReLU alpha, and then perform a forward pass on simulated node features and an adjacency matrix to obtain the transformed output node features.


Structure


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as f

In [3]:
class GATlayer(nn.Module):
    """Skeleton of a simple PyTorch GAT layer.

    Only the two methods a layer needs are sketched here; ``forward`` does
    not compute anything yet.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input, adj):
        # Placeholder body: prints an empty line and implicitly returns None.
        print("")

In [4]:
# Toy dimensions: 4 nodes with 5 input features each, projected to 2 features.
nb_nodes = 4
in_features = 5
out_features = 2

# Learnable projection matrix with Xavier-uniform initialization
# (helps avoid vanishing or exploding gradients).
W = nn.Parameter(torch.zeros(in_features, out_features))
nn.init.xavier_uniform_(W.data, gain=1.414)

# Linear transformation of random node features.
input = torch.rand(nb_nodes, in_features)
h = input @ W
N = h.shape[0]  # number of nodes

print(h.shape)

torch.Size([4, 2])


Attention 

In [5]:
# Attention implementation
# Learnable attention vector: one weight per entry of a concatenated pair of
# transformed node features, hence 2 * out_features rows.
a = nn.Parameter(torch.zeros(size=(2 * out_features, 1)))
# Xavier parameter initializer
nn.init.xavier_uniform_(a.data, gain=1.414)
print(a.shape)
# Activation function: LeakyReLU with negative slope 0.2.
# The slope must be written as the float 0.2; `nn.LeakyReLU(0, 2)` would mean
# negative_slope=0 with a truthy `inplace` argument.
leakyrelu = nn.LeakyReLU(0.2)

torch.Size([4, 1])


In [6]:
# Attention coefficients
# Pair every node i with every node j: row i * N + j of the concatenation
# holds [h_i || h_j]; the view then makes a_input[i, j] that pair.
a_input = torch.cat(
    [h.repeat_interleave(N, dim=0), h.repeat(N, 1)],
    dim=1,
).view(N, -1, 2 * out_features)
# Raw (unnormalized) score for each pair, passed through the activation.
e = leakyrelu((a_input @ a).squeeze(2))
print(a_input.shape, a.shape)
print((a_input @ a).shape)
print((a_input @ a).squeeze(2).shape)
e

torch.Size([4, 4, 4]) torch.Size([4, 1])
torch.Size([4, 4, 1])
torch.Size([4, 4])


tensor([[1.2363, 1.3500, 1.2268, 1.1665],
        [0.5699, 0.6836, 0.5603, 0.5001],
        [1.3419, 1.4556, 1.3323, 1.2721],
        [1.8977, 2.0114, 1.8882, 1.8279]], grad_fn=<AsStridedBackward0>)

Masked Attention


In [7]:
# Adjacency matrix with random 0/1 entries
adj = torch.randint(0, 2, size=(nb_nodes, nb_nodes))
# Large negative filler with the same shape as the score matrix `e`
zero_vec = torch.full_like(e, -9e15)
print(zero_vec.shape)

torch.Size([4, 4])


In [8]:
# Apply mask: keep the score where adj is positive, otherwise take the
# large negative filler so the entry vanishes after softmax.
attention = torch.where(adj.gt(0), e, zero_vec)
print(adj, "\n", e, "\n", zero_vec)
attention

tensor([[0, 1, 1, 0],
        [0, 1, 1, 1],
        [0, 1, 1, 1],
        [1, 0, 1, 1]]) 
 tensor([[1.2363, 1.3500, 1.2268, 1.1665],
        [0.5699, 0.6836, 0.5603, 0.5001],
        [1.3419, 1.4556, 1.3323, 1.2721],
        [1.8977, 2.0114, 1.8882, 1.8279]], grad_fn=<AsStridedBackward0>) 
 tensor([[-9.0000e+15, -9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15, -9.0000e+15]])


tensor([[-9.0000e+15,  1.3500e+00,  1.2268e+00, -9.0000e+15],
        [-9.0000e+15,  6.8359e-01,  5.6032e-01,  5.0005e-01],
        [-9.0000e+15,  1.4556e+00,  1.3323e+00,  1.2721e+00],
        [ 1.8977e+00, -9.0000e+15,  1.8882e+00,  1.8279e+00]],
       grad_fn=<WhereBackward0>)

In [9]:
# Normalize attention scores: each row becomes weights that sum to 1
attention = attention.softmax(dim=1)
# Weighted node features: attention-weighted sum of the transformed features
h_prime = attention @ h
print(h_prime, "\n", h)

tensor([[-0.3083, -0.7552],
        [-0.4786, -0.9165],
        [-0.4786, -0.9165],
        [-0.7139, -1.0121]], grad_fn=<MmBackward0>) 
 tensor([[-0.6325, -0.8479],
        [-0.0020, -0.6057],
        [-0.6548, -0.9243],
        [-0.8641, -1.2816]], grad_fn=<MmBackward0>)


Layer Build


In [10]:
class GATLayer(nn.Module):
    """Single-head graph attention layer.

    The layer linearly transforms the node features, scores every ordered
    pair of nodes with a learnable attention vector followed by LeakyReLU,
    masks out pairs whose adjacency entry is not positive, normalizes the
    scores row-wise with softmax, applies dropout to the attention weights
    and returns the attention-weighted sum of the transformed features.

    Args:
        in_features: Size of each input node feature vector.
        out_features: Size of each output node feature vector.
        dropout: Dropout probability applied to the attention weights
            (only active while the module is in training mode).
        alpha: Negative slope of the LeakyReLU used on the raw scores.
        concat: If True, ELU is applied to the output; otherwise the raw
            weighted sum is returned.
    """

    def __init__(self, in_features, out_features, dropout, alpha=0.2, concat=True):
        super().__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat

        # Feature projection and attention scorer, both without bias.
        self.linear = nn.Linear(in_features, out_features, bias=False)
        self.attn_fc = nn.Linear(2 * out_features, 1, bias=False)
        self.leaky_relu = nn.LeakyReLU(self.alpha)

        self._init_weights()

    def _init_weights(self):
        """Initialize both weight matrices with Xavier-uniform values."""
        # `nn.init` is reachable through the file's `torch.nn as nn` import.
        nn.init.xavier_uniform_(self.linear.weight)
        nn.init.xavier_uniform_(self.attn_fc.weight)

    def forward(self, input, adj):
        """Compute new node features.

        Args:
            input: Node features; the first dimension indexes nodes and the
                last has size ``in_features``.
            adj: Matrix whose entry [i, j] is > 0 when node i may attend to
                node j; expected to be N x N for N nodes.

        Returns:
            Tensor with one row of ``out_features`` values per node.

        Note:
            A row of ``adj`` without any positive entry is filled entirely
            with the same large negative value, so softmax gives that node
            uniform weights over all nodes.
        """
        h = self.linear(input)
        N = h.size(0)

        # Attention inputs: attn_input[i, j] is the concatenation [h_i || h_j].
        attn_input = torch.cat(
            [h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1
        ).view(N, -1, 2 * self.out_features)
        e = self.leaky_relu(self.attn_fc(attn_input).squeeze(2))

        # Masked attention: a very large negative score becomes a zero
        # weight after softmax.
        zero_vec = -9e15 * torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)

        # Normalize attention scores, then regularize them with dropout.
        # `f` is the file's alias for torch.nn.functional.
        attention = f.softmax(attention, dim=1)
        attention = f.dropout(attention, self.dropout, training=self.training)

        # Weighted sum of neighbor features
        h_prime = torch.matmul(attention, h)

        if self.concat:
            return f.elu(h_prime)
        return h_prime

Example Use

In [11]:
# Hyper-parameters for the demo
num_nodes = 3
in_feat = 5
out_feat = 2
dropout_prob = 0.6
alpha_val = 0.2

# Build the GAT layer
gat_layer = GATLayer(
    in_features=in_feat,
    out_features=out_feat,
    dropout=dropout_prob,
    alpha=alpha_val,
)

# Random node features and a random 0/1 adjacency matrix stored as floats
input_features = torch.randn(num_nodes, in_feat)
adjacency_matrix = torch.randint(0, 2, (num_nodes, num_nodes)).to(torch.float32)

# Forward pass (feed the input to the network)
output_features = gat_layer(input_features, adjacency_matrix)

print("Input Features Shape:", input_features.shape)
print("Adjacency Matrix:\n", adjacency_matrix)
print("Output Features Shape:", output_features.shape)
print("Output Features:\n", output_features)

Input Features Shape: torch.Size([3, 5])
Adjacency Matrix:
 tensor([[1., 0., 1.],
        [1., 0., 1.],
        [1., 1., 0.]])
Output Features Shape: torch.Size([3, 2])
Output Features:
 tensor([[ 0.6170,  1.4479],
        [ 0.0000,  0.0000],
        [-0.1414,  0.1540]], grad_fn=<EluBackward0>)
