### Outline:
    Import Libraries and datasets from PyTorch Geometric
    1. Extracting subgraphs from set of nodes
    2. Extracting k_hop_subgraph from nodes
    3. Dropping a random walk with a probability
    4. MixHop

### Import Libraries

In [39]:
import torch
torchversion = torch.__version__

# Install PyTorch Scatter, PyTorch Sparse, and PyTorch Geometric
!pip install torch_geometric
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-{torchversion}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-{torchversion}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# pip install wandb
# pip install ogb

# Numpy for matrices
import numpy as np
np.random.seed(0)

# Visualization
import networkx as nx
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt



### Dataset

In [40]:
from torch_geometric.datasets import Planetoid
!pip install numpy==1.24.1
# Import dataset from PyTorch Geometric
dataset = Planetoid(root=".", name="cora")

data = dataset[0]




### Print information about the dataset

In [41]:
# Print information about the dataset
print(f'Dataset: {dataset}')
print('-------------------')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {data.x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# Print information about the graph
print(f'\nGraph:')
print('------')
print(f'Edges are directed: {data.is_directed()}')
print(f'Graph has isolated nodes: {data.has_isolated_nodes()}')
print(f'Graph has loops: {data.has_self_loops()}')

Dataset: cora()
-------------------
Number of graphs: 1
Number of nodes: 2708
Number of features: 1433
Number of classes: 7

Graph:
------
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: False


In [42]:
from torch_geometric.utils import remove_isolated_nodes

isolated = (remove_isolated_nodes(data['edge_index'])[2] == False).sum(dim=0).item()
print(f'Number of isolated nodes = {isolated}')

Number of isolated nodes = 0


### 1. Extracting subgraphs from set of nodes

In [43]:

from typing import List, Optional, Tuple, Union

import torch
from torch import Tensor

from torch_geometric.typing import OptTensor, PairTensor
from torch_geometric.utils import scatter
from torch_geometric.utils.map import map_index
from torch_geometric.utils.mask import index_to_mask
from torch_geometric.utils.num_nodes import maybe_num_nodes

In [44]:
import torch
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.utils import subgraph

# Load Cora dataset
dataset = Planetoid(root='.', name='Cora', transform=T.NormalizeFeatures())
data = dataset[0]
edge_index  = data.edge_index

# Example: Extract subgraph for nodes 0, 1, 2, 3
sampled_nodes = torch.tensor([0, 1, 2, 3])
subgraph_data = subgraph(sampled_nodes, edge_index, edge_attr=None)

print("Original Graph:")
print(data)

print("\nSubgraph:")
print(subgraph_data)

Original Graph:
Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

Subgraph:
(tensor([[1, 2],
        [2, 1]]), None)


### 2. Extracting k_hop_subgraph from nodes

In [45]:
#node_idx: The target node(s).
#num_hops: The number of hops :math:`k`.
#edge_index: The edge indices.
def k_hop_subgraph(
    node_idx: Union[int, List[int], Tensor],
    num_hops: int,
    edge_index: Tensor,
    relabel_nodes: bool = False,
    num_nodes: Optional[int] = None,
    flow: str = 'source_to_target',
    directed: bool = False,) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
 
    num_nodes = maybe_num_nodes(edge_index, num_nodes)

    assert flow in ['source_to_target', 'target_to_source']
    if flow == 'target_to_source':
        row, col = edge_index
    else:
        col, row = edge_index

    node_mask = row.new_empty(num_nodes, dtype=torch.bool)
    edge_mask = row.new_empty(row.size(0), dtype=torch.bool)

    if isinstance(node_idx, (int, list, tuple)):
        node_idx = torch.tensor([node_idx], device=row.device).flatten()
    else:
        node_idx = node_idx.to(row.device)

    subsets = [node_idx]

    for _ in range(num_hops):
        node_mask.fill_(False)
        node_mask[subsets[-1]] = True
        torch.index_select(node_mask, 0, row, out=edge_mask)
        subsets.append(col[edge_mask])

    subset, inv = torch.cat(subsets).unique(return_inverse=True)
    inv = inv[:node_idx.numel()]

    node_mask.fill_(False)
    node_mask[subset] = True

    if not directed:
        edge_mask = node_mask[row] & node_mask[col]

    edge_index = edge_index[:, edge_mask]

    if relabel_nodes:
        node_idx = row.new_full((num_nodes, ), -1)
        node_idx[subset] = torch.arange(subset.size(0), device=row.device)
        edge_index = node_idx[edge_index]

    return subset, edge_index, inv, edge_mask

#### Get subgraphs from target node 6 with 2 hops

In [46]:
import torch
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.utils import subgraph

# Load Cora dataset
dataset = Planetoid(root='.', name='Cora', transform=T.NormalizeFeatures())
data = dataset[0]
edge_index  = data.edge_index

subset, edge_index, mapping, edge_mask = k_hop_subgraph(6, 2, edge_index, relabel_nodes=True)

In [47]:
print(edge_index)

tensor([[ 0,  0,  0,  0,  1,  2,  3,  3,  3,  3,  3,  4,  4,  4,  4,  5,  6,  7,
          8,  8,  9,  9,  9, 10, 11, 12, 13, 14, 14, 15, 15, 16, 16, 17, 17, 17,
         17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
         17, 17, 18, 18, 19, 19, 19, 20, 21, 22, 23, 23, 23, 23, 23, 23, 23, 23,
         23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 25, 26, 26, 27, 27, 27, 27,
         27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 29, 29, 30, 30, 31, 32, 33, 33,
         34, 34, 34, 34, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40,
         40, 40, 41, 41, 42, 42, 43, 43],
        [ 9, 17, 23, 27, 23, 23, 17, 19, 23, 26, 36, 27, 28, 37, 41, 27, 23, 23,
         27, 29,  0, 16, 17, 17, 23, 17, 23, 17, 40, 23, 30,  9, 17,  0,  3,  9,
         10, 12, 14, 16, 18, 19, 20, 21, 25, 26, 28, 33, 34, 35, 36, 37, 38, 40,
         42, 43, 17, 43,  3, 17, 23, 17, 17, 27,  0,  1,  2,  3,  6,  7, 11, 13,
         15, 19, 24, 27, 29, 30, 31, 32, 33, 34, 23, 17,  3, 17,  0

### 3. Drop a random walk path

Drops edges from the adjacency matrix  based on random walks. The source nodes to start random walks from ar   sampled froedge index with probability p, following a Bernoulli distribution. on.

#### Construct GCN model

In [48]:
class GCN_dropout_path(torch.nn.Module):
  """Graph Convolutional Network"""
  def __init__(self, dim_in, dim_h, dim_out):
    super().__init__()
    self.gcn1 = GCNConv(dim_in, dim_h)
    self.gcn2 = GCNConv(dim_h, dim_out)
    self.optimizer = torch.optim.Adam(self.parameters(),
                                      lr=0.01,
                                      weight_decay=5e-4)

  def forward(self, x, edge_index):
    h = F.dropout(x, p=0.5, training=self.training)
    h = self.gcn1(h, edge_index)
    h = torch.relu(h)
    h = F.dropout(h, p=0.5, training=self.training)
    h = self.gcn2(h, edge_index)
    return h, F.log_softmax(h, dim=1)

#### Training the model

In [49]:
from torch_geometric.utils import dropout_path

def train_dropout_path(model, data):
    """Train a GNN model and return the trained model."""
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = model.optimizer
    epochs = 5

    model.train()
    for epoch in range(epochs+1):
        # Training
        optimizer.zero_grad()
        edge_index1, _ = dropout_path(data.edge_index, p= 0.2,walks_per_node = 1, walk_length = 3)
        _, out = model(data.x, edge_index1 )
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        acc = accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        # Validation
        val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
        val_acc = accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])

        # Print metrics every 10 epochs
        if(epoch % 1 == 0):
            print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: '
                  f'{acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                  f'Val Acc: {val_acc*100:.2f}%')

    return model

In [50]:
def accuracy(pred_y, y):
    """Calculate accuracy."""
    return ((pred_y == y).sum() / len(y)).item()
def test(model, data):
    """Evaluate the model on test set and print the accuracy score."""
    model.eval()
    _, out = model(data.x, data.edge_index)
    acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
    return acc

In [52]:

%%time
from torch_geometric.nn import GCNConv
# Create GCN model
gcn_dropout_path = GCN_dropout_path(dataset.num_features, 16, dataset.num_classes).to(device)
print(gcn_dropout_path)

# Train
train_dropout_path(gcn_dropout_path, data.to(device))

# Test
acc = test(gcn_dropout_path, data)
print(f'\nGCN test accuracy: {acc*100:.2f}%\n')

GCN_dropout_path(
  (gcn1): GCNConv(1433, 16)
  (gcn2): GCNConv(16, 7)
)
Epoch   0 | Train Loss: 1.945 | Train Acc:  16.43% | Val Loss: 1.94 | Val Acc: 19.00%
Epoch   1 | Train Loss: 1.937 | Train Acc:  28.57% | Val Loss: 1.94 | Val Acc: 13.60%
Epoch   2 | Train Loss: 1.931 | Train Acc:  33.57% | Val Loss: 1.94 | Val Acc: 20.60%
Epoch   3 | Train Loss: 1.923 | Train Acc:  44.29% | Val Loss: 1.94 | Val Acc: 24.60%
Epoch   4 | Train Loss: 1.916 | Train Acc:  40.00% | Val Loss: 1.93 | Val Acc: 30.40%
Epoch   5 | Train Loss: 1.907 | Train Acc:  55.00% | Val Loss: 1.92 | Val Acc: 36.60%

GCN test accuracy: 64.10%

CPU times: total: 1.12 s
Wall time: 139 ms


### 4. MixHop Model

#### Import libraries

In [53]:
import os.path as osp

import torch
import torch.nn.functional as F

from torch_geometric.datasets import Planetoid
from torch_geometric.nn import BatchNorm, Linear, MixHopConv

if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

path = osp.join('.', 'data', 'Planetoid')
dataset = Planetoid(path, name='Cora')
data = dataset[0]

#### Class Mixhop

In [54]:
class MixHop(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = MixHopConv(dataset.num_features, 60, powers=[0, 1, 2])
        self.norm1 = BatchNorm(3 * 60)

        self.conv2 = MixHopConv(3 * 60, 60, powers=[0, 1, 2])
        self.norm2 = BatchNorm(3 * 60)

        self.conv3 = MixHopConv(3 * 60, 60, powers=[0, 1, 2])
        self.norm3 = BatchNorm(3 * 60)

        self.lin = Linear(3 * 60, dataset.num_classes)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.7, training=self.training)

        x = self.conv1(x, edge_index)
        x = self.norm1(x)
        x = F.dropout(x, p=0.9, training=self.training)

        x = self.conv2(x, edge_index)
        x = self.norm2(x)
        x = F.dropout(x, p=0.9, training=self.training)

        x = self.conv3(x, edge_index)
        x = self.norm3(x)
        x = F.dropout(x, p=0.9, training=self.training)

        return self.lin(x)


#### Construct model

In [55]:

model, data = MixHop().to(device), data.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.5, weight_decay=0.005)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=40,
                                            gamma=0.01)


#### Train function

In [56]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    scheduler.step()
    return float(loss)



#### Test function

In [57]:
@torch.no_grad()
def test():
    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=-1)

    accs = []
    for mask in [data.train_mask, data.val_mask, data.test_mask]:
        accs.append(int((pred[mask] == data.y[mask]).sum()) / int(mask.sum()))
    return accs


#### training

In [58]:
best_val_acc = test_acc = 0
for epoch in range(1, 50):
    loss = train()
    train_acc, val_acc, tmp_test_acc = test()
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_acc:.4f}, '
          f'Val: {best_val_acc:.4f}, Test: {test_acc:.4f}')

Epoch: 001, Loss: 2.6880, Train: 0.2214, Val: 0.1520, Test: 0.1490
Epoch: 002, Loss: 3.9377, Train: 0.2643, Val: 0.1780, Test: 0.2000
Epoch: 003, Loss: 5.0827, Train: 0.4357, Val: 0.2620, Test: 0.2610
Epoch: 004, Loss: 5.2836, Train: 0.3214, Val: 0.2620, Test: 0.2610
Epoch: 005, Loss: 5.8246, Train: 0.3500, Val: 0.3400, Test: 0.3890
Epoch: 006, Loss: 5.9115, Train: 0.4429, Val: 0.3920, Test: 0.3990
Epoch: 007, Loss: 5.7034, Train: 0.5929, Val: 0.4700, Test: 0.4980
Epoch: 008, Loss: 6.1699, Train: 0.5000, Val: 0.4700, Test: 0.4980
Epoch: 009, Loss: 4.3237, Train: 0.5429, Val: 0.4700, Test: 0.4980
Epoch: 010, Loss: 5.3870, Train: 0.5714, Val: 0.4700, Test: 0.4980
Epoch: 011, Loss: 4.6217, Train: 0.6929, Val: 0.5480, Test: 0.5740
Epoch: 012, Loss: 4.6847, Train: 0.5929, Val: 0.5480, Test: 0.5740
Epoch: 013, Loss: 4.2669, Train: 0.8429, Val: 0.6880, Test: 0.7140
Epoch: 014, Loss: 3.8027, Train: 0.8143, Val: 0.6880, Test: 0.7140
Epoch: 015, Loss: 4.1063, Train: 0.8143, Val: 0.6880, Test: 0.