In [226]:
import copy

import torch
from halfhop import HalfHop
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import WikipediaNetwork
from torch_geometric.datasets import WebKB
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
import torch.optim as optim

# Implementing a GNN and halfhop

Loading Dataset

In [369]:
# Four different datasets
CiteSeer_dataset = Planetoid(root='/tmp/CiteSeer', name='CiteSeer')
cora_dataset = Planetoid(root='/tmp/Cora', name='Cora')
chameleon_dataset = WikipediaNetwork(root='/tmp/Chameleon', name='Chameleon')
texas_dataset = WebKB(root='/tmp/Texas', name='Texas')

# Select one dataset
dataset = texas_dataset

# Assign data
raw_data = dataset[0]

# Extract data and information
num_node_features = dataset.num_node_features
num_classes = dataset.num_classes

# Reshape the masks to be 1-dimensional if it is 2-dimensional
if len(raw_data.train_mask.shape) == 2:
    raw_data.train_mask = raw_data.train_mask[:, 0]
    raw_data.val_mask = raw_data.val_mask[:, 0]
    raw_data.test_mask = raw_data.test_mask[:, 0]

original_data = copy.deepcopy(raw_data)
print(original_data)

Data(x=[183, 1703], edge_index=[2, 325], y=[183], train_mask=[183], val_mask=[183], test_mask=[183])


Modify graph with Half-Hop

In [370]:
transform = HalfHop(alpha=0.5, p=0.99)
data = transform(raw_data)
print(data)

Data(x=[487, 1703], edge_index=[2, 933], y=[183], train_mask=[183], val_mask=[183], test_mask=[183], slow_node_mask=[487])


### Define GNN Model

In [278]:
class GNN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)  # First GCN layer
        self.conv2 = GCNConv(16, 32)  # Second GCN layer
        self.conv3 = GCNConv(32, num_classes)  # Output layer

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # First layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        # Second layer
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        # Output layer
        x = self.conv3(x, edge_index)
        return F.log_softmax(x, dim=1)

### Define GAT Model

In [279]:
class GATGNN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GATGNN, self).__init__()
        self.gat1 = GATConv(num_node_features, 8, heads=8, dropout=0.6)
        self.gat2 = GATConv(8 * 8, num_classes, heads=1, dropout=0.6)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.gat1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.gat2(x, edge_index)

        return F.log_softmax(x, dim=1)

### Define GraphSAGE Mdoel

In [280]:
class GraphSAGEGNN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GraphSAGEGNN, self).__init__()
        self.sage1 = SAGEConv(num_node_features, 16)
        self.sage2 = SAGEConv(16, 32)
        self.sage3 = SAGEConv(32, num_classes)  # Output layer

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # First GraphSAGE layer
        x = F.relu(self.sage1(x, edge_index))
        x = F.dropout(x, training=self.training)

        # Second GraphSAGE layer
        x = F.relu(self.sage2(x, edge_index))
        x = F.dropout(x, training=self.training)

        # Output layer
        x = self.sage3(x, edge_index)
        return F.log_softmax(x, dim=1)

### Training

In [371]:
# Detect cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Change the model_to_use to select a different model
model_to_use = GNN # Select among: GNN, GATGNN, GraphSAGEGNN
model = model_to_use(num_node_features, num_classes)

# to cuda
model.to(device)
data = data.to(device)
original_data = original_data.to(device)

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.005)  # Learning rate can be adjusted

# Training loop
def training(num_iter, origin=False):
    for epoch in range(num_iter):  # Number of epochs can be adjusted
        model.train()
        optimizer.zero_grad()  # Clear existing gradients
        if origin:
            out = model(original_data)  # Perform a forward pass
        else:
            out = model(data)  # Perform a forward pass
            out = out[~data.slow_node_mask] # Get rid of slow nodes added

        # Compute the loss
        loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])

        loss.backward()  # Perform backpropagation
        optimizer.step()  # Update the model's parameters

        # Optionally print the loss every few epochs
        if epoch % 50 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item()}")


### Evaluation

In [372]:
def evaluate(model, origin=False):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        if origin:
            out = model(original_data)  # Perform a forward pass
        else:
            out = model(data)  # Perform a forward pass
            out = out[~data.slow_node_mask] # Get rid of slow nodes added
        preds = out.argmax(dim=1)  # Get the index of the max log-probability
        correct = preds[data.test_mask] == data.y[data.test_mask]
        acc = int(correct.sum()) / int(data.test_mask.sum())
    return acc

# Initialize a list to store accuracies of multiple initialization of model and trainings
accuracies = []
HH_accuracies = []

# Function for multiple times of trainings
def multiple_trainings(num_iter, origin=False):
    for i in range(num_iter):
        print("Currently training:", i, "origin:", origin)
        if origin:
            training(300, True)
            accuracy = evaluate(model, True)
            accuracies.append(accuracy)
        else:
            training(300)
            accuracy = evaluate(model)
            HH_accuracies.append(accuracy)
        print(f'Accuracy: {accuracy}')

# 20 initializations
multiple_trainings(20)
multiple_trainings(20, True)

# Print average accuracy of 20 initializations
average_accuracy = sum(accuracies)/len(accuracies)
average_accuracy_deviation = (sum([((x - average_accuracy) ** 2) for x in accuracies]) / len(accuracies)) ** 0.5
HH_average_accuracy = sum(HH_accuracies)/len(HH_accuracies)
HH_average_accuracy_deviation = (sum([((x - HH_average_accuracy) ** 2) for x in HH_accuracies]) / len(HH_accuracies)) ** 0.5
print(f'Average accuracy without half-hop: {average_accuracy}')
print(f'Standard deviation without half-hop: {average_accuracy_deviation}')
print(f'Average accuracy with half-hop: {HH_average_accuracy}')
print(f'Standard deviation with half-hop: {HH_average_accuracy_deviation}')

Currently training: 0 origin: False
Epoch 0, Loss: 1.6580770015716553
Epoch 50, Loss: 0.35917413234710693
Epoch 100, Loss: 0.17813044786453247
Epoch 150, Loss: 0.06577899307012558
Epoch 200, Loss: 0.09886477142572403
Epoch 250, Loss: 0.05726570263504982
Accuracy: 0.4594594594594595
Currently training: 1 origin: False
Epoch 0, Loss: 0.02751295082271099
Epoch 50, Loss: 0.025687916204333305
Epoch 100, Loss: 0.0572870634496212
Epoch 150, Loss: 0.05590810999274254
Epoch 200, Loss: 0.14235062897205353
Epoch 250, Loss: 0.05857732519507408
Accuracy: 0.43243243243243246
Currently training: 2 origin: False
Epoch 0, Loss: 0.06296152621507645
Epoch 50, Loss: 0.1407254934310913
Epoch 100, Loss: 0.028070420026779175
Epoch 150, Loss: 0.07443662732839584
Epoch 200, Loss: 0.12099496275186539
Epoch 250, Loss: 0.1065288707613945
Accuracy: 0.40540540540540543
Currently training: 3 origin: False
Epoch 0, Loss: 0.05563979968428612
Epoch 50, Loss: 0.11408531665802002
Epoch 100, Loss: 0.06674143671989441
Epoc