# Prepare Dataset

In [None]:
import os
import math
import copy

import numpy as np
import pickle
import networkx as nx
import pandas as pd

import torch
from torch_geometric.data import Data, InMemoryDataset
from torch_geometric.datasets import QM9
import matplotlib.pyplot as plt
from torch_geometric.utils import from_networkx, to_networkx, to_undirected, to_scipy_sparse_matrix
from torch_geometric.data.data import Data
from torch_geometric.loader import DataLoader

qm9 = QM9(root='dataset/QM9')
qm9_dataset = qm9[:10000]

y_target = pd.DataFrame(qm9_dataset.data.y.numpy())
qm9_dataset.data.y = torch.Tensor(y_target[0])   # Change target value

qm9_dataset = qm9_dataset.shuffle()

Downloading https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/molnet_publish/qm9.zip
Extracting dataset/QM9/raw/qm9.zip
Downloading https://ndownloader.figshare.com/files/3195404


# GNN Models

In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GINEConv, global_add_pool
from torch.nn import Sequential, Linear, ReLU, BatchNorm1d

class GINENet(torch.nn.Module):
    def __init__(self, num_node_features, dim_h, edge_attr):
        super(GINENet, self).__init__()
        
        # Define GINE layers with the specified edge_dim
        self.conv1 = GINEConv(
            Sequential(Linear(num_node_features, dim_h),
                       BatchNorm1d(dim_h), ReLU(),
                       Linear(dim_h, dim_h), ReLU()),
            edge_dim=edge_attr)
        self.conv2 = GINEConv(
            Sequential(Linear(dim_h, dim_h), BatchNorm1d(dim_h), ReLU(),
                       Linear(dim_h, dim_h), ReLU()),
            edge_dim=edge_attr)
        self.conv3 = GINEConv(
            Sequential(Linear(dim_h, dim_h), BatchNorm1d(dim_h), ReLU(),
                       Linear(dim_h, dim_h), ReLU()),
            edge_dim=edge_attr)
        
        # Define linear layers for classification or regression
        self.lin1 = Linear(dim_h * 3, dim_h * 3)
        self.lin2 = Linear(dim_h * 3, 1)

    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        
        # Pass node features and edge attributes through GINE layers
        h1 = self.conv1(x, edge_index, edge_attr)
        h2 = self.conv2(h1, edge_index, edge_attr)
        h3 = self.conv3(h2, edge_index, edge_attr)

        # Apply global pooling for graph-level output
        h1 = global_add_pool(h1, batch)
        h2 = global_add_pool(h2, batch)
        h3 = global_add_pool(h3, batch)

        # Concatenate pooled features and pass through final linear layers
        h = torch.cat((h1, h2, h3), dim=1)
        h = self.lin1(h).relu()
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.lin2(h)
        return h

In [3]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import PNAConv, global_add_pool
from torch.nn import Sequential, Linear, ReLU, BatchNorm1d

class PNANet(torch.nn.Module):
    def __init__(self, num_node_features, dim_h, edge_attr, aggregators, scalers, deg):
        super(PNANet, self).__init__()
        
        # Define PNA layers with specified aggregators, scalers, and degree tensor
        self.conv1 = PNAConv(
            in_channels=num_node_features,
            out_channels=dim_h,
            aggregators=aggregators,
            scalers=scalers,
            deg=deg,
            edge_dim=edge_attr
        )
        self.conv2 = PNAConv(
            in_channels=dim_h,
            out_channels=dim_h,
            aggregators=aggregators,
            scalers=scalers,
            deg=deg,
            edge_dim=edge_attr
        )
        self.conv3 = PNAConv(
            in_channels=dim_h,
            out_channels=dim_h,
            aggregators=aggregators,
            scalers=scalers,
            deg=deg,
            edge_dim=edge_attr
        )
        
        # Define linear layers for final graph-level output
        self.lin1 = Linear(dim_h * 3, dim_h * 3)
        self.lin2 = Linear(dim_h * 3, 1)

    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        
        # Pass node features and edge attributes through PNA layers
        h1 = self.conv1(x, edge_index, edge_attr)
        h2 = self.conv2(h1, edge_index, edge_attr)
        h3 = self.conv3(h2, edge_index, edge_attr)

        # Apply global pooling for graph-level output
        h1 = global_add_pool(h1, batch)
        h2 = global_add_pool(h2, batch)
        h3 = global_add_pool(h3, batch)

        # Concatenate pooled features and pass through final linear layers
        h = torch.cat((h1, h2, h3), dim=1)
        h = self.lin1(h).relu()
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.lin2(h)
        return h

# Transformations

In [6]:
import torch
import copy
import networkx as nx
import random
import numpy as np
from torch_geometric.transforms import VirtualNode, AddLaplacianEigenvectorPE
from torch_geometric.utils import from_networkx, to_networkx, to_undirected, to_dense_adj, degree
from torch_geometric.data import InMemoryDataset
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import eigsh
from scipy.sparse import csgraph


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CustomQM9Dataset(InMemoryDataset):
    def __init__(self, data_list):
        super(CustomQM9Dataset, self).__init__()
        self.data, self.slices = self.collate(data_list)
        

###Visual Node
def apply_vn(pyg_dataset):
    vn_dataset = copy.deepcopy(pyg_dataset)  # Make a deep copy to preserve the original dataset
    transform = VirtualNode()

    # Create a list to store transformed graphs
    transformed_data_list = []

    for data in vn_dataset:
        transformed_data_list.append(transform(data))

    vn_dataset = CustomQM9Dataset(transformed_data_list)

    return vn_dataset

###Centrality
def add_centrality_to_node_features(data, centrality_measure='degree'):
    G = to_networkx(data, node_attrs=['x'], to_undirected=True)

    # Compute the centrality measure
    if centrality_measure == 'degree':
        centrality = nx.degree_centrality(G)
    elif centrality_measure == 'closeness':
        centrality = nx.closeness_centrality(G)
    elif centrality_measure == 'betweenness':
        centrality = nx.betweenness_centrality(G)
    elif centrality_measure == 'eigenvector':
        if not nx.is_connected(G):
        # Handle connected components separately
            centrality = {}
            for component in nx.connected_components(G):
                subgraph = G.subgraph(component)
                sub_centrality = nx.eigenvector_centrality(subgraph, max_iter=500, tol=1e-4)
                centrality.update(sub_centrality)
        else:
            centrality = nx.eigenvector_centrality(G, max_iter=500, tol=1e-4)
    else:
        raise ValueError(f'Unknown centrality measure: {centrality_measure}')
    
    # Convert centrality to tensor and add as node feature
    centrality_values = list(centrality.values())
    centrality_tensor = torch.tensor(centrality_values, dtype=torch.float).view(-1, 1)
    centrality_tensor = (centrality_tensor - centrality_tensor.mean()) / (centrality_tensor.std() + 1e-8)
    data.x = torch.cat([data.x, centrality_tensor], dim=-1)

    return data

def centrality(dataset, centrality_measure='degree'):
    original_dataset = copy.deepcopy(dataset)
    addCentrality_list = []
    for data in original_dataset:
        if centrality_measure == 'degree':
            data = add_centrality_to_node_features(data, centrality_measure='degree')
            addCentrality_list.append(data)
        elif centrality_measure == 'closeness':
            data = add_centrality_to_node_features(data, centrality_measure='closeness')
            addCentrality_list.append(data)
        elif centrality_measure == 'betweenness':
            data = add_centrality_to_node_features(data, centrality_measure='betweenness')
            addCentrality_list.append(data)
        elif centrality_measure == 'eigenvector':
            data = add_centrality_to_node_features(data, centrality_measure='eigenvector')
            addCentrality_list.append(data)
        else:
            raise ValueError(f'Unknown centrality measure: {centrality_measure}')
        
    addCentrality_dataset = CustomQM9Dataset(addCentrality_list)

    return addCentrality_dataset

###Distance Encoding
def distance_encoding_node_augmentation(data):
    G = to_networkx(data, node_attrs=['x'], to_undirected = True)
    num_nodes = data.num_nodes

    # Initialize the distance matrix with infinity
    distance_matrix = [[float('inf')] * num_nodes for _ in range(num_nodes)]
    shortest_paths = dict(nx.all_pairs_shortest_path_length(G))
    
    # Populate the distance matrix with actual shortest path lengths
    for i in range(num_nodes):
        distance_matrix[i][i] = 0  # Distance to self is 0
        if i in shortest_paths:
            for j, d in shortest_paths[i].items():
                distance_matrix[i][j] = d

    # Convert the distance matrix to a tensor
    distance_tensor = torch.tensor(distance_matrix, dtype=torch.float)
    
    # Example: Add average distance to node features
    finite_distances = torch.where(distance_tensor == float('inf'), torch.tensor(float('nan')), distance_tensor)
    average_distance = torch.nanmean(finite_distances, dim=1).view(-1, 1)  # Use nanmean to ignore infinities
    data.x = torch.cat([data.x, average_distance], dim=1)
    
    return data

def distance_encoding_edge_rewiring(data):
    """
    Add edges between all pairs of nodes with shortest path distance as a new edge attribute,
    while preserving original edge attributes.
    """
    G = to_networkx(data, node_attrs=['x'], edge_attrs=['edge_attr'], to_undirected=True)
    G_transformed = G.copy()

    connected_components = list(nx.connected_components(G))
    shortest_paths = {}

    # Compute shortest paths for each connected component
    for component in connected_components:
        subgraph = G.subgraph(component)
        component_paths = dict(nx.all_pairs_shortest_path_length(subgraph))
        shortest_paths.update(component_paths)

    num_edge_attrs = data.edge_attr.shape[1] if data.edge_attr is not None else 0

    nodes = list(G.nodes)
    for i in nodes:
        for j in nodes:
            if i != j:
                if G.has_edge(i, j):
                    original_attr = G[i][j].get('edge_attr', [])
                    if not isinstance(original_attr, list):
                        original_attr = [original_attr]
                    G_transformed[i][j]['edge_attr'] = original_attr + [1]
                else:
                    if j in shortest_paths[i]:
                        distance = shortest_paths[i][j]
                    else:
                        distance = 1e9  # Replace inf with a large finite value
                    new_attr = [0] * num_edge_attrs + [distance]
                    G_transformed.add_edge(i, j, edge_attr=new_attr)

    new_data = from_networkx(G_transformed, group_node_attrs=['x'], group_edge_attrs=['edge_attr'])
    new_data.edge_attr = torch.tensor(new_data.edge_attr, dtype=torch.float)

    # Check for invalid values
    if torch.isnan(new_data.edge_attr).any() or torch.isinf(new_data.edge_attr).any():
        raise ValueError("Edge attributes contain invalid values!")

    new_data.y = data.y
    return new_data

def distance_encoding(dataset, method = 'node_augmentation'):
    original_dataset = copy.deepcopy(dataset)
    distance_encoding_list = []
    for data in original_dataset:
        if method == 'node_augmentation':
            data = distance_encoding_node_augmentation(data)
            distance_encoding_list.append(data)
        elif method == 'edge_rewiring':
            data = distance_encoding_edge_rewiring(data)
            distance_encoding_list.append(data)
        else:
            raise ValueError(f'Unknown distance encoding method: {method}')
    distance_encoding_dataset = CustomQM9Dataset(distance_encoding_list)
    return distance_encoding_dataset

###Subgraph Extraction
def extract_local_subgraph_features(data, radius=2):
    # Convert PyG data to NetworkX graph
    G = to_networkx(data, node_attrs=['x'], edge_attrs=['edge_attr'], to_undirected=True)

    # Initialize a list to store subgraph features for each node
    subgraph_sizes = []
    subgraph_degrees = []
    
    for node in G.nodes():
        # Extract the ego graph (subgraph) around the node
        subgraph = nx.ego_graph(G, node, radius=radius)
        
        # Example feature 1: Size of the subgraph (number of nodes)
        subgraph_size = subgraph.number_of_nodes()
        subgraph_sizes.append(subgraph_size)
        
        # Example feature 2: Average degree of the subgraph
        subgraph_degree = np.mean([d for n, d in subgraph.degree()])
        subgraph_degrees.append(subgraph_degree)
        
    # Convert the features to tensors and add them as node features
    subgraph_sizes_tensor = torch.tensor(subgraph_sizes, dtype=torch.float).view(-1, 1)
    subgraph_degrees_tensor = torch.tensor(subgraph_degrees, dtype=torch.float).view(-1, 1)
    
    # Concatenate the new features to the existing node features
    data.x = torch.cat([data.x, subgraph_sizes_tensor, subgraph_degrees_tensor], dim=-1)
    
    return data

def subgraph_extraction(dataset, radius=2):
    original_dataset = copy.deepcopy(dataset)
    subgraph_extraction_list = []
    for data in original_dataset:
        data = extract_local_subgraph_features(data, radius=radius)
        subgraph_extraction_list.append(data)
    subgraph_extraction_dataset = CustomQM9Dataset(subgraph_extraction_list)
    return subgraph_extraction_dataset

def canonicalize_eigenvectors(eigenvectors):
    """
    Canonicalize eigenvectors by fixing their signs for consistency.
    This ensures that isomorphic graphs will have the same eigenvectors.
    """
    for i in range(eigenvectors.shape[1]):
        if eigenvectors[0, i] < 0:  # Flip sign if the first element is negative
            eigenvectors[:, i] = -eigenvectors[:, i]
    return eigenvectors

def add_canonicalized_laplacian_pe_pyg(data, k=5, max_features=12):
    """
    Add canonicalized Laplacian positional encoding to a PyG data object.

    Args:
        data (torch_geometric.data.Data): PyG data object.
        k (int): Number of Laplacian eigenvectors to compute.
        max_features (int): Total desired node feature dimensions after encoding.

    Returns:
        data (torch_geometric.data.Data): PyG data object with Laplacian PE appended to node features.
    """
    # Step 1: Convert PyG graph to NetworkX graph
    G = to_networkx(data, to_undirected=True)

    # Step 2: Compute sparse adjacency matrix
    adj = nx.to_scipy_sparse_array(G, format='csr').astype(np.float64)

    # Step 3: Compute normalized Laplacian matrix
    laplacian = csgraph.laplacian(adj, normed=True)

    # Step 4: Handle small graphs
    num_nodes = adj.shape[0]
    if num_nodes <= 1:
        raise ValueError(f"Graph has too few nodes ({num_nodes}) for Laplacian PE.")
    num_eigenvectors = min(k, num_nodes - 1)  # Ensure k < num_nodes

    # Step 5: Compute the smallest eigenvectors using sparse methods
    eigenvalues, eigenvectors = eigsh(laplacian, k=num_eigenvectors, which='SM')  # Smallest magnitude eigenvalues

    # Step 6: Canonicalize eigenvectors
    eigenvectors = canonicalize_eigenvectors(torch.tensor(eigenvectors, dtype=torch.float))

    # Step 7: Pad eigenvectors if less than max_features
    padding = torch.zeros((eigenvectors.shape[0], max_features - num_eigenvectors))
    eigenvectors = torch.cat([eigenvectors, padding], dim=1)

    # Step 8: Append the eigenvectors as new node features
    if 'x' in data:
        num_existing_features = data.x.shape[1]
        padding_existing = torch.zeros((data.x.shape[0], max_features - num_existing_features))
        data.x = torch.cat([data.x, padding_existing, eigenvectors], dim=1)
    else:
        data.x = eigenvectors

    return data

def graph_encoding(dataset, k=3, max_features=12, batch_size=100):
    """
    Apply canonicalized Laplacian positional encoding to a PyG dataset in batches.

    Args:
        dataset (list of torch_geometric.data.Data): List of PyG data objects.
        k (int): Number of Laplacian eigenvectors to compute.
        max_features (int): Total desired node feature dimensions after encoding.
        batch_size (int): Number of graphs to process in each batch.

    Returns:
        encoded_dataset (list of torch_geometric.data.Data): List of PyG data objects with Laplacian PE added.
    """
    encoded_dataset = []
    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i + batch_size]
        for data in batch:
            data_copy = data.clone()  # Ensure original dataset remains unchanged
            try:
                graph_pe = add_canonicalized_laplacian_pe_pyg(data_copy, k=k, max_features=max_features)
                encoded_dataset.append(graph_pe)
            except Exception as e:
                print(f"Error processing graph (index {i}): {e}")
    ge_dataset = CustomQM9Dataset(encoded_dataset)
    return ge_dataset

###Add Extra Node on Each Edge
def add_extra_node_on_each_edge(data):
    # Convert PyG data to a NetworkX graph for easier manipulation
    G = to_networkx(data, node_attrs=['x'], edge_attrs = ['edge_attr'])
    
    # Original number of nodes
    num_original_nodes = G.number_of_nodes()
    
    # Prepare lists for new features
    edges = list(G.edges(data=True))
    new_node_features = []
    new_edges_src = []
    new_edges_dst = []
    new_edge_features = []

    for u, v, edge_data in edges:
        # Remove the original edge
        G.remove_edge(u, v)

        # Create new node as the mean of connected node features
        new_node_id = num_original_nodes + len(new_node_features)
        new_node_feature = (data.x[u] + data.x[v]) / 2
        new_node_features.append(new_node_feature)
        
        # Add new node with feature
        G.add_node(new_node_id, x=new_node_feature)

        # Add edges from new node to each original node
        G.add_edge(u, new_node_id)
        G.add_edge(new_node_id, v)

        # Use original edge feature for each new edge
        edge_feature = edge_data['edge_attr']
        edge_feature_tensor = (
            edge_feature if isinstance(edge_feature, torch.Tensor) else torch.tensor(edge_feature)
        )
        new_edge_features.append(edge_feature_tensor)  # for edge (u, new_node_id)
        new_edge_features.append(edge_feature_tensor)  # for edge (new_node_id, v)
    
    # Convert back to PyG Data object
    modified_data = from_networkx(G)

    # Update node features
    modified_data.x = torch.cat([data.x, torch.stack(new_node_features)], dim=0)

    # Update edge features to include only the new edges
    modified_data.edge_attr = torch.stack(new_edge_features)  # Only include new edge features

    # Preserve any additional global attributes
    modified_data.y = data.y
    
    return modified_data

def extra_node(dataset):
    original_dataset = copy.deepcopy(dataset)
    extra_node_list = []
    for data in original_dataset:
        data = add_extra_node_on_each_edge(data)
        extra_node_list.append(data)
    extra_node_dataset = CustomQM9Dataset(extra_node_list)
    return extra_node_dataset

def count_3_star(G):
    """Count 3-star graphlets for each node."""
    star_counts = {}
    for node in G.nodes():
        neighbors = list(G.neighbors(node))
        degree = len(neighbors)
        # Count the number of 3-combinations of neighbors
        star_counts[node] = max(0, (degree * (degree - 1) * (degree - 2)) // 6)
    return star_counts

def count_tailed_triangle(G):
    """Count tailed triangle graphlets for each node."""
    tail_counts = {node: 0 for node in G.nodes()}
    for node in G.nodes():
        neighbors = list(G.neighbors(node))
        for neighbor in neighbors:
            for other in neighbors:
                if neighbor != other and G.has_edge(neighbor, other):
                    for extra in G.neighbors(node):
                        if extra not in {neighbor, other}:
                            tail_counts[node] += 1
    return tail_counts

def count_4_cycle(G):
    """Count 4-cycle graphlets for each node in an undirected graph G."""
    cycle_counts = {node: 0 for node in G.nodes()}
    
    for node in G.nodes():
        neighbors = list(G.neighbors(node))
        # Iterate over all unique neighbor pairs
        for i, neighbor1 in enumerate(neighbors):
            for neighbor2 in neighbors[i + 1:]:
                # Find common neighbors of neighbor1 and neighbor2
                shared_neighbors = set(G.neighbors(neighbor1)).intersection(G.neighbors(neighbor2))
                # Add the count of shared neighbors to node's 4-cycle count
                cycle_counts[node] += len(shared_neighbors)
    
    # Each 4-cycle is counted 4 times (once per node in the cycle)
    cycle_counts = {node: count // 4 for node, count in cycle_counts.items()}
    
    return cycle_counts

def graphlet_based_encoding_pyg(data):
    """
    Add graphlet-based features (3-star, triangle, tailed triangle, 4-cycle) to node features in PyG.

    Args:
        data: PyG Data object.

    Returns:
        data: PyG Data object with graphlet-based features added.
    """
    # Convert PyG graph to NetworkX
    G = to_networkx(data, to_undirected=True)

    # Count graphlets
    triangle_counts = nx.triangles(G)  # Triangle counts
    star_counts = count_3_star(G)  # 3-star graphlets
    tail_counts = count_tailed_triangle(G)  # Tailed triangles
    cycle_counts = count_4_cycle(G)  # 4-cycles

    # Combine features into tensors
    num_nodes = data.num_nodes
    triangle_tensor = torch.tensor([triangle_counts[node] for node in range(num_nodes)], dtype=torch.float).view(-1, 1)
    star_tensor = torch.tensor([star_counts[node] for node in range(num_nodes)], dtype=torch.float).view(-1, 1)
    tail_tensor = torch.tensor([tail_counts[node] for node in range(num_nodes)], dtype=torch.float).view(-1, 1)
    cycle_tensor = torch.tensor([cycle_counts[node] for node in range(num_nodes)], dtype=torch.float).view(-1, 1)

    # Concatenate all graphlet features
    graphlet_features = torch.cat([triangle_tensor, star_tensor, tail_tensor, cycle_tensor], dim=1)

    # Add to node features
    if data.x is not None:
        data.x = torch.cat([data.x, graphlet_features], dim=1)
    else:
        data.x = graphlet_features

    return data

def graphlet_encoding_dataset(dataset):
    """
    Apply graphlet-based encoding to a PyG dataset.

    Args:
        dataset: List of PyG Data objects.

    Returns:
        encoded_dataset: List of PyG Data objects with graphlet-based features added.
    """
    encoded_dataset = []
    for data in dataset:
        data_copy = data.clone()  # Use PyG's clone method for deep copy
        graph_encoded = graphlet_based_encoding_pyg(data_copy)
        encoded_dataset.append(graph_encoded)
    gle_dataset = CustomQM9Dataset(encoded_dataset)
    return gle_dataset

In [7]:
qm9_vn = apply_vn(qm9_dataset)
qm9_deg = centrality(qm9_dataset, centrality_measure='degree')
qm9_clo = centrality(qm9_dataset, centrality_measure='closeness')
qm9_bet = centrality(qm9_dataset, centrality_measure='betweenness')
qm9_eig = centrality(qm9_dataset, centrality_measure='eigenvector')
qm9_de_n = distance_encoding(qm9_dataset, method='node_augmentation')
qm9_de_g = distance_encoding(qm9_dataset, method='edge_rewiring')
qm9_ge = graph_encoding(qm9_dataset, k=3)
qm9_se = subgraph_extraction(qm9_dataset, radius=3)
qm9_exN = extra_node(qm9_dataset)
qm9_gle = graphlet_encoding_dataset(qm9_dataset)

  new_data.edge_attr = torch.tensor(new_data.edge_attr, dtype=torch.float)


# Training, Validation, Testing

In [4]:
def training(loader, model, loss, optimizer):
    """Training one epoch

    Args:
        loader (DataLoader): loader (DataLoader): training data divided into batches
        model (nn.Module): GNN model to train on
        loss (nn.functional): loss function to use during training
        optimizer (torch.optim): optimizer during training

    Returns:
        float: training loss
    """
    model.train()

    current_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        data.x = data.x.float()

        out = model(data)

        l = loss(out, torch.reshape(data.y, (len(data.y), 1)))
        current_loss += l / len(loader)
        l.backward()
        optimizer.step()
    return current_loss, model

def validation(loader, model, loss):
    """Validation

    Args:
        loader (DataLoader): validation set in batches
        model (nn.Module): current trained model
        loss (nn.functional): loss function

    Returns:
        float: validation loss
    """
    model.eval()
    val_loss = 0
    for data in loader:
        data = data.to(device)
        out = model(data)
        l = loss(out, torch.reshape(data.y, (len(data.y), 1)))
        val_loss += l / len(loader)
    return val_loss

@torch.no_grad()
def testing(loader, model):
    """Testing

    Args:
        loader (DataLoader): test dataset
        model (nn.Module): trained model

    Returns:
        float: test loss
    """
    loss = torch.nn.MSELoss()
    test_loss = 0
    for data in loader:
        data = data.to(device)
        out = model(data)
        # NOTE
        # out = out.view(d.y.size())
        l = loss(out, torch.reshape(data.y, (len(data.y), 1)))
        test_loss += l / len(loader)


    return test_loss

In [11]:
def train_epochs(epochs, model, train_loader, val_loader, path):
    """Training over all epochs

    Args:
        epochs (int): number of epochs to train for
        model (nn.Module): the current model
        train_loader (DataLoader): training data in batches
        val_loader (DataLoader): validation data in batches
        path (string): path to save the best model

    Returns:
        array: returning train and validation losses over all epochs, prediction and ground truth values for training data in the last epoch
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
    loss = torch.nn.L1Loss()

    train_loss = np.empty(epochs)
    val_loss = np.empty(epochs)
    best_loss = math.inf

    for epoch in range(1, epochs):
        epoch_loss, model = training(train_loader, model, loss, optimizer)
        v_loss = validation(val_loader, model, loss)
        if v_loss < best_loss:
            best_loss = v_loss
            torch.save(model.state_dict(), path)
        for data in train_loader:
            data = data.to(device)
            out = model(data)

        train_loss[epoch] = epoch_loss.detach().cpu().numpy()
        val_loss[epoch] = v_loss.detach().cpu().numpy()

        # print current train and val loss
        if epoch % 10 == 0:
            print(
                "Epoch: "
                + str(epoch)
                + ", Train loss: "
                + str(epoch_loss.item())
                + ", Val loss: "
                + str(v_loss.item())
            )
    return best_loss, train_loss, val_loss

# Conduct Experiments

In [None]:
import torch
from torch_geometric.transforms import AddLaplacianEigenvectorPE
from torch_geometric.data import DataLoader
from torch_geometric.utils import degree
import copy

# List of datasets with their respective names and transformations
datasets = [
    ('Base', qm9_dataset),
    ('VN', qm9_vn),
    ('DEG', qm9_deg),
    ('BET', qm9_bet),
    ('CLO', qm9_clo),
    ('DE_N', qm9_de_n),
    ('DE_G', qm9_de_g),
    ('EXN', qm9_exN),
    ('SE', qm9_se),
    ('GE', qm9_ge),    # this is referred to Laplacian PE
    ('EIG', qm9_eig),
    ('GLE', qm9_gle)
]


# Common parameters
data_size = 10000
train_index = int(data_size * 0.8)
test_index = train_index + int(data_size * 0.1)
val_index = test_index + int(data_size * 0.1)
epochs = 101
dim_h = 64
aggregators = ['mean', 'min', 'max', 'std']
scalers = ['identity', 'amplification', 'attenuation']
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loop through each dataset transformation
for name, dataset in datasets:
    
    # Normalize target
    data_mean = dataset.data.y[:train_index].mean()
    data_std = dataset.data.y[:train_index].std()
    dataset.data.y = (dataset.data.y - data_mean) / data_std

    # Create data loaders
    train_loader = DataLoader(dataset[:train_index], batch_size=64, shuffle=True)
    test_loader = DataLoader(dataset[train_index:test_index], batch_size=64, shuffle=True)
    val_loader = DataLoader(dataset[test_index:val_index], batch_size=64, shuffle=True)

    # Determine maximum degree and degree histogram
    max_degree = int(max([degree(data.edge_index[1], num_nodes=data.num_nodes).max().item() for data in dataset[:train_index]]))
    deg = torch.zeros(max_degree + 1, dtype=torch.long)
    for data in dataset[:train_index]:
        d = degree(data.edge_index[1], num_nodes=data.num_nodes, dtype=torch.long)
        deg += torch.bincount(d, minlength=deg.numel())

    # Set model parameters
    num_features = dataset[0].x.shape[1]
    edge_attr = dataset[0].edge_attr.shape[1]
    model = PNANet(num_features, dim_h=64, edge_attr=edge_attr, aggregators=aggregators, scalers=scalers, deg=deg).to(device)

    # Train and save model with dataset-specific path
    model_path = f"./data/QM9/PNA_0_model_{name}.pt"         # the number 0 represents the target number
    pna_best_loss, pna_train_loss, pna_val_loss = train_epochs(epochs, model, train_loader, val_loader, model_path)
    

    # Print summary for each dataset
    print(f"\nCompleted training for dataset: {name}")
    print(f"  Model saved at: {model_path}")
    print(f"  Best Validation Loss: {pna_best_loss}")
    print(f"  Final Training Loss: {pna_train_loss[-1]}")
    print(f"  Final Validation Loss: {pna_val_loss[-1]}\n")

    # load our model
    model = PNANet(num_features, dim_h, edge_attr, aggregators, scalers, deg).to(device)
    model.load_state_dict(torch.load(f"./data/QM9/PNA_0_model_{name}.pt"))   

    # calculate test loss
    pna_test_loss = testing(test_loader, model)

    print("Test Loss for PNA: " + str(pna_test_loss.item()))

    model = GINENet(num_features, dim_h=64, edge_attr=edge_attr).to(device)

    # Train and save model with dataset-specific path
    model_path = f"./data/QM9/GIN_0_model_{name}.pt"
    gin_best_loss, gin_train_loss, gin_val_loss = train_epochs(epochs, model, train_loader, val_loader, model_path)

    
    # Print summary for each dataset
    print(f"\nCompleted training for dataset: {name}")
    print(f"  Model saved at: {model_path}")
    print(f"  Best Validation Loss: {gin_best_loss}")
    print(f"  Final Training Loss: {gin_train_loss[-1]}")
    print(f"  Final Validation Loss: {gin_val_loss[-1]}\n")

    # load our model
    model = GINENet(num_features, dim_h=64, edge_attr=edge_attr).to(device)
    model.load_state_dict(torch.load(f"./data/QM9/GIN_0_model_{name}.pt"))

    # calculate test loss
    gin_test_loss = testing(test_loader, model)

    print("Test Loss for GIN: " + str(gin_test_loss.item()))