# Preprocessing

**Preprocessing Functions**

In [None]:
from torch_geometric.utils import add_self_loops, remove_self_loops, degree, get_laplacian, to_scipy_sparse_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.nn.functional import normalize
from torch.nn.functional import normalize
from collections import Counter
import scipy.sparse as sp
import numpy as np


def normalize_features(data):
    """Scale every node's feature row so that its entries sum to 1.

    If ``data.x`` is ``None``, a single constant feature (all ones, one
    column, ``data.num_nodes`` rows) is created first.

    Rows whose entries sum to exactly zero are left unscaled instead of being
    divided by zero, so the result never contains NaN/inf produced by this
    step.

    Args:
        data: Graph object exposing ``x`` (feature tensor or ``None``) and
            ``num_nodes``. It is modified in place.

    Returns:
        The same ``data`` object, with ``data.x`` replaced by the normalized
        features.
    """
    # Ensure node features exist
    if data.x is None:
        print("No node features found; initializing default features.")
        data.x = torch.ones((data.num_nodes, 1), dtype=torch.float)

    # Divide zero-sum rows by 1 rather than 0: an all-zero row stays all-zero
    # instead of turning into NaN and poisoning later steps.
    row_sum = data.x.sum(dim=1, keepdim=True)
    safe_sum = torch.where(row_sum == 0, torch.ones_like(row_sum), row_sum)

    data.x = data.x / safe_sum
    print(f"[normalize_features] Normalized features. Shape: {data.x.shape}")
    return data


def split_dataset(data, train_ratio=0.6, val_ratio=0.2, seed=42, stratify=True):
    """Create boolean train/validation/test node masks on ``data``.

    Nodes are first split into a training set and a held-out set, then the
    held-out set is split into validation and test nodes. The remaining
    fraction ``1 - train_ratio - val_ratio`` becomes the test set.

    Args:
        data: Graph object exposing ``num_nodes`` and a label tensor ``y``.
            It is modified in place.
        train_ratio: Fraction of nodes used for training; must lie in (0, 1).
        val_ratio: Fraction of nodes used for validation; must be >= 0.
        seed: Random state passed to both ``train_test_split`` calls.
        stratify: If True, keep class proportions in each split. Falls back
            to a random split when some class has fewer than two members.

    Returns:
        The same ``data`` object with ``train_mask``, ``val_mask`` and
        ``test_mask`` set.

    Raises:
        ValueError: If the ratios are out of range or sum to more than 1.
    """
    # Tolerance so that e.g. 0.8 + 0.2 is not rejected (or mistaken for a
    # tiny negative test fraction) because of floating-point rounding.
    eps = 1e-9

    # Validate split ratios
    if train_ratio + val_ratio > 1.0 + eps:
        raise ValueError("Train and validation ratios must sum to 1.0 or less.")
    if not 0.0 < train_ratio < 1.0 or val_ratio < 0.0:
        raise ValueError("train_ratio must be in (0, 1) and val_ratio must be non-negative.")

    num_nodes = data.num_nodes
    # .cpu() so labels living on an accelerator can still be handed to sklearn.
    labels = data.y.detach().cpu().numpy()

    def has_small_class(values):
        # Stratified splitting needs at least two members per class.
        return any(count < 2 for count in Counter(values).values())

    # Check stratification
    stratify_labels = labels if stratify else None
    if stratify and has_small_class(labels):
        print("Small class detected; switching to random split.")
        stratify_labels = None

    # Train / held-out split
    train_idx, holdout_idx = train_test_split(
        torch.arange(num_nodes).numpy(),
        test_size=1 - train_ratio,
        stratify=stratify_labels,
        random_state=seed,
    )

    # Validation/test split of the held-out nodes
    test_ratio = 1 - train_ratio - val_ratio
    no_nodes = holdout_idx[:0]
    if test_ratio <= eps:
        # Nothing is left for testing; sklearn rejects test_size=0, so assign directly.
        val_idx, test_idx = holdout_idx, no_nodes
    elif val_ratio <= eps:
        # No validation set requested; every held-out node is a test node.
        val_idx, test_idx = no_nodes, holdout_idx
    else:
        holdout_labels = stratify_labels[holdout_idx] if stratify_labels is not None else None
        # A class can pass the global check yet have a single member among
        # the held-out nodes, which would make a stratified split raise.
        if holdout_labels is not None and has_small_class(holdout_labels):
            print("Small class detected in held-out nodes; splitting them randomly.")
            holdout_labels = None
        val_idx, test_idx = train_test_split(
            holdout_idx,
            test_size=test_ratio / (1 - train_ratio),
            stratify=holdout_labels,
            random_state=seed,
        )

    def build_mask(indices):
        # Masks are created on the same device as the labels.
        mask = torch.zeros(num_nodes, dtype=torch.bool, device=data.y.device)
        mask[torch.as_tensor(indices, dtype=torch.long, device=data.y.device)] = True
        return mask

    # split masks
    data.train_mask = build_mask(train_idx)
    data.val_mask = build_mask(val_idx)
    data.test_mask = build_mask(test_idx)

    # split sizes
    print(f"Training nodes: {len(train_idx)}, Validation nodes: {len(val_idx)}, Test nodes: {len(test_idx)}, Total nodes: {data.num_nodes}")

    return data


def edge_weights(data):
    """Attach inverse-source-degree weights to every edge.

    Each edge receives ``1 / deg(source)``, where the degree is counted from
    the first row of ``data.edge_index``. The weights are then divided by the
    per-source sum of weights (plus a small epsilon).

    Args:
        data: Graph object exposing ``edge_index`` and ``num_nodes``. It is
            modified in place.

    Returns:
        The same ``data`` object with ``data.edge_weight`` set (one value per
        column of ``edge_index``).
    """
    # Calculate node degrees from the source index of every edge
    row = data.edge_index[0]
    deg = degree(row, data.num_nodes, dtype=torch.float)

    # Compute edge weights (inverse degree)
    source_degrees = deg[row]
    edge_weight = torch.where(source_degrees > 0, 1.0 / source_degrees, torch.zeros_like(source_degrees))

    # Row-wise normalization. The accumulator is allocated on the same device
    # as the edge indices; a CPU-only buffer would make scatter_add_ fail for
    # graphs stored on an accelerator.
    row_sum = torch.zeros(data.num_nodes, dtype=torch.float, device=row.device).scatter_add_(0, row, edge_weight)
    edge_weight = edge_weight / (row_sum[row] + 1e-8)

    # Assign edge weights to the graph
    data.edge_weight = edge_weight
    print(f"[edge_weights] Calculated edge weights. Total edges: {data.edge_index.size(1)}")
    return data



from torch_geometric.utils import add_self_loops, remove_self_loops

def self_loops(data):
    """
    Replace any existing self-loops with exactly one self-loop per node and
    keep ``data.edge_weight`` aligned with ``data.edge_index``.

    Existing self-loops are removed together with their weights; every newly
    added self-loop gets weight 1. If the graph has no edge weights yet, all
    edges (including the new self-loops) get weight 1.

    Args:
        data: Graph object exposing ``edge_index``, ``num_nodes`` and,
            optionally, ``edge_weight``. It is modified in place.

    Returns:
        The same ``data`` object with updated ``edge_index`` and
        ``edge_weight``.

    Raises:
        ValueError: If ``edge_weight`` is present but does not hold one value
            per edge.
    """
    num_nodes = data.num_nodes
    num_self_loops_before = torch.sum(data.edge_index[0] == data.edge_index[1]).item()

    weights = getattr(data, 'edge_weight', None)
    if weights is not None:
        if weights.size(0) != data.edge_index.size(1):
            raise ValueError("edge_weight must contain exactly one entry per edge.")
        # Pass the weights through both helpers so the entries of removed
        # self-loops are dropped as well; otherwise edge_weight would end up
        # longer than edge_index.
        edge_index, weights = remove_self_loops(data.edge_index, weights)
        edge_index, weights = add_self_loops(edge_index, weights, fill_value=1.0, num_nodes=num_nodes)
    else:
        edge_index, _ = remove_self_loops(data.edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=num_nodes)
        # Initialize all edge weights if not present
        weights = torch.ones(edge_index.size(1), dtype=torch.float, device=edge_index.device)

    data.edge_index = edge_index
    data.edge_weight = weights

    num_self_loops_after = torch.sum(data.edge_index[0] == data.edge_index[1]).item()
    print(f"[self_loops] Self-loops added. Before: {num_self_loops_before}, After: {num_self_loops_after}")

    return data



def node_degree_as_feature(data, normalize=True, method="max"):
    """Append each node's degree as an extra feature column.

    The degree is counted from the first row of ``data.edge_index``.

    Args:
        data: Graph object exposing ``edge_index``, ``num_nodes`` and ``x``
            (feature tensor or ``None``). It is modified in place.
        normalize: If True, rescale the degrees using ``method``. (This
            parameter shadows any imported ``normalize`` function inside the
            body; that function is not used here.)
        method: ``"max"`` divides by the largest degree; ``"zscore"``
            subtracts the mean and divides by the standard deviation.

    Returns:
        The same ``data`` object. ``data.x`` gains one column unless its last
        column already equals the degree feature.

    Raises:
        ValueError: If ``normalize`` is True and ``method`` is not one of the
            supported names.
    """
    # Calculate node degrees
    node_degrees = degree(data.edge_index[0], data.num_nodes)

    # Normalize degrees if required
    if normalize:
        if method == "max":
            max_degree = node_degrees.max() + 1e-8
            node_degrees = node_degrees / max_degree
        elif method == "zscore":
            mean_degree = node_degrees.mean()
            std_degree = node_degrees.std() + 1e-8
            node_degrees = (node_degrees - mean_degree) / std_degree
        else:
            raise ValueError("Normalization method must be 'max' or 'zscore'.")

    degree_column = node_degrees.unsqueeze(1)

    # Graphs without features: the degree becomes the only feature.
    if data.x is None:
        data.x = degree_column
        print(f"[node_degree_as_feature] Added node degree as a feature. Shape: {data.x.shape}")
        return data

    # Bring both tensors to a common floating dtype/device so they can be
    # compared and concatenated.
    features = data.x if data.x.is_floating_point() else data.x.to(degree_column.dtype)
    degree_column = degree_column.to(dtype=features.dtype, device=features.device)

    # Avoid duplicate concatenation. Compare column against column ([N, 1]
    # with [N, 1]); comparing a [N] slice with a [N, 1] tensor would broadcast
    # to an N x N matrix and give the wrong answer at quadratic memory cost.
    if features.size(1) > 0 and torch.allclose(features[:, -1:], degree_column, atol=1e-8):
        return data

    # Add node degree as a feature
    data.x = torch.cat([features, degree_column], dim=1)
    print(f"[node_degree_as_feature] Added node degree as a feature. Shape: {data.x.shape}")

    return data


def positional_encoding(data, num_encodings=None, proportion=0.1):
    """Append Laplacian-eigenvector positional encodings to the node features.

    The symmetric-normalized graph Laplacian is built from ``data.edge_index``
    and decomposed densely; eigenvector columns ``1 .. num_encodings`` (in
    ascending eigenvalue order, skipping the first column) are used as
    additional node features.

    Args:
        data: Graph object exposing ``edge_index``, ``num_nodes`` and ``x``
            (feature tensor or ``None``). It is modified in place.
        num_encodings: Number of eigenvector columns to add. If None it is
            derived from ``proportion`` (or capped at 10 when ``proportion``
            is also None).
        proportion: Fraction of ``num_nodes`` used to choose
            ``num_encodings`` when the latter is None.

    Returns:
        The same ``data`` object with the encodings concatenated to
        ``data.x`` (or stored as ``data.x`` when no features existed).

    Note:
        The decomposition uses a dense ``num_nodes x num_nodes`` matrix, so
        memory grows quadratically with the number of nodes. Eigenvector
        signs are arbitrary. Skipping only the first column presumably
        assumes a connected graph (one trivial eigenvector) - TODO confirm
        for graphs with several components.
    """
    num_nodes = data.num_nodes

    # Dynamic set number of encodings
    if num_encodings is None:
        if proportion is not None:
            num_encodings = max(1, int(num_nodes * proportion))
        else:
            num_encodings = min(10, num_nodes - 1)

    # After dropping the first column only num_nodes - 1 eigenvectors remain;
    # clamp so the reported count matches what is actually added.
    num_encodings = max(0, min(num_encodings, num_nodes - 1))

    # get_laplacian already returns the normalized Laplacian as an edge list.
    # It must be used as-is: feeding it to csgraph.laplacian again would
    # treat the Laplacian as an adjacency matrix with negative weights and
    # yield NaNs from the square root of negative degrees.
    lap_index, lap_weight = get_laplacian(data.edge_index, normalization='sym', num_nodes=num_nodes)
    laplacian = to_scipy_sparse_matrix(lap_index, edge_attr=lap_weight, num_nodes=num_nodes).toarray()

    # eigh assumes a symmetric matrix and reads a single triangle. Averaging
    # with the transpose is a no-op for undirected graphs and makes the
    # result well-defined when edge_index is not symmetric.
    laplacian = 0.5 * (laplacian + laplacian.T)

    _, eigenvectors = np.linalg.eigh(laplacian)

    positional_encodings = torch.tensor(eigenvectors[:, 1:num_encodings + 1], dtype=torch.float)

    if data.x is not None:
        # The encodings are built on CPU; move them next to the features.
        data.x = torch.cat([data.x, positional_encodings.to(data.x.device)], dim=1)
    else:
        data.x = positional_encodings

    print(f"[positional_encoding] Added {num_encodings} positional encodings. Shape: {data.x.shape}")
    return data



def feature_augmentation(data, noise_level=0.01):
    """Perturb the node features in place with scaled standard-normal noise.

    Args:
        data: Graph object exposing ``x`` (feature tensor or ``None``).
        noise_level: Multiplier applied to the standard-normal noise.

    Returns:
        The same ``data`` object. When ``data.x`` is ``None`` nothing is
        changed.
    """
    features = data.x

    if features is not None:
        # In-place add: the existing feature tensor itself is modified.
        features += torch.randn_like(features) * noise_level
        data.x = features
        print(f"[feature_augmentation] Added noise to features. Shape: {data.x.shape}")
    else:
        print("No node features found; skipping feature augmentation.")

    return data


def graph_denoising(data, threshold=0.01, preserve_self_loops=True):
    """Drop edges whose weight does not exceed ``threshold``.

    Args:
        data: Graph object exposing ``edge_index`` and, optionally,
            ``edge_weight``. It is modified in place. Missing weights are
            initialized to 1 for every edge.
        threshold: Edges are kept only if their weight is strictly greater
            than this value.
        preserve_self_loops: If True, self-loops are kept regardless of their
            weight.

    Returns:
        The same ``data`` object with ``edge_index`` and ``edge_weight``
        filtered consistently.

    Raises:
        ValueError: If ``edge_weight`` does not hold one value per edge.
    """
    edge_index = data.edge_index
    num_edges = edge_index.size(1)

    # Handle missing edge weights. getattr also covers objects that simply
    # lack the attribute; the default lives on the same device as the edges.
    if getattr(data, 'edge_weight', None) is None:
        data.edge_weight = torch.ones(num_edges, dtype=torch.float, device=edge_index.device)

    # Handle empty graphs
    if num_edges == 0:
        print("Graph has no edges; skipping denoising.")
        return data

    edge_weight = data.edge_weight
    if edge_weight.size(0) != num_edges:
        raise ValueError("edge_weight must contain exactly one entry per edge.")

    # Apply threshold and optionally preserve self-loops
    mask = edge_weight > threshold
    if preserve_self_loops:
        mask = mask | (edge_index[0] == edge_index[1])

    # Filter edges and weights
    num_removed = mask.numel() - mask.sum().item()
    data.edge_index = edge_index[:, mask]
    data.edge_weight = edge_weight[mask]

    print(f"[graph_denoising] Removed {num_removed} edges. Remaining edges: {data.edge_index.size(1)}")

    return data



**Saving Datasets**

In [None]:
import os

def save_dataset(data, dataset_name, is_heterophilic, root_dir=None):
    """Preprocess a copy of ``data`` and save it with ``torch.save``.

    The result is written to
    ``<base>/processed_<dataset_name>/new_<dataset_name>.pt``. The input
    object is cloned first, so ``data`` itself is not modified.

    Args:
        data: Graph object providing ``clone()``.
        dataset_name: Name used for the output directory and file.
        is_heterophilic: Selects the heterophilic pipeline when True and the
            homophilic pipeline otherwise.
        root_dir: Base output directory. When None (the default) the
            module-level ``root`` variable is used, which must already be
            defined at call time.
    """
    # Resolve the base directory; the global is only looked up when no
    # explicit directory is given.
    base_dir = root if root_dir is None else root_dir

    # Create directory
    dataset_dir = os.path.join(base_dir, f"processed_{dataset_name}")
    os.makedirs(dataset_dir, exist_ok=True)

    # Preprocessing based on dataset type; steps run in the listed order.
    if is_heterophilic:
        pipeline = (
            edge_weights,
            graph_denoising,
            self_loops,
            positional_encoding,
            normalize_features,
            split_dataset,
            feature_augmentation,
        )
    else:
        pipeline = (
            self_loops,
            edge_weights,
            graph_denoising,
            node_degree_as_feature,
            normalize_features,
            split_dataset,
        )

    new_data = data.clone()
    for step in pipeline:
        new_data = step(new_data)

    # Save preprocessed data
    new_save_path = os.path.join(dataset_dir, f"new_{dataset_name}.pt")
    torch.save(new_data, new_save_path)
    print(f"Saved preprocessed data: {new_save_path}")


# Heterophilic Datasets

**TEXAS**

In [None]:
from torch_geometric.datasets import WebKB

# Base directory for the dataset files. save_dataset also reads this global
# to decide where the processed copy is written.
root = '/content/dataset'
texas_dataset = WebKB(root=root, name='Texas')
# Take the first (index 0) graph object of the dataset.
data = texas_dataset[0]

# Run the heterophilic preprocessing pipeline on a copy and save the result.
save_dataset(data, "Texas", is_heterophilic=True)


Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/new_data/texas/out1_node_feature_label.txt
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/new_data/texas/out1_graph_edges.txt
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_0.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_1.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_2.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_3.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_4.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/splits/texas_split_0.6_0.2_5.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master/spl

[edge_weights] Calculated edge weights. Total edges: 325
[graph_denoising] Removed 104 edges. Remaining edges: 221
[self_loops] Self-loops added. Before: 16, After: 183
[positional_encoding] Added 18 positional encodings. Shape: torch.Size([183, 1721])
[normalize_features] Normalized features. Shape: torch.Size([183, 1721])
Small class detected; switching to random split.
Training nodes: 109, Validation nodes: 37, Test nodes: 37, Total nodes: 183
[feature_augmentation] Added noise to features. Shape: torch.Size([183, 1721])
Saved preprocessed data: /content/dataset/processed_Texas/new_Texas.pt


Done!
  w = np.where(isolated_node_mask, 1, np.sqrt(w))


**CHAMELEON**

In [None]:
from torch_geometric.datasets import WikipediaNetwork

# Base directory for the dataset files. save_dataset also reads this global
# to decide where the processed copy is written.
root = '/content/dataset'
chameleon_dataset = WikipediaNetwork(root=root, name='chameleon')
# Take the first (index 0) graph object of the dataset.
data = chameleon_dataset[0]

# Run the heterophilic preprocessing pipeline on a copy and save the result.
save_dataset(data, "Chameleon", is_heterophilic=True)


Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/new_data/chameleon/out1_node_feature_label.txt
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/new_data/chameleon/out1_graph_edges.txt
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/chameleon_split_0.6_0.2_0.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/chameleon_split_0.6_0.2_1.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/chameleon_split_0.6_0.2_2.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/chameleon_split_0.6_0.2_3.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14

[edge_weights] Calculated edge weights. Total edges: 36101
[graph_denoising] Removed 0 edges. Remaining edges: 36101
[self_loops] Self-loops added. Before: 50, After: 2277
[positional_encoding] Added 227 positional encodings. Shape: torch.Size([2277, 2552])
[normalize_features] Normalized features. Shape: torch.Size([2277, 2552])
Training nodes: 1366, Validation nodes: 455, Test nodes: 456, Total nodes: 2277
[feature_augmentation] Added noise to features. Shape: torch.Size([2277, 2552])
Saved preprocessed data: /content/dataset/processed_Chameleon/new_Chameleon.pt


**SQUIRREL**

In [None]:
# Import the dataset
from torch_geometric.datasets import WikipediaNetwork

# Define the base directory and load the dataset. save_dataset also reads the
# global `root` to decide where the processed copy is written.
root = '/content/dataset'
squirrel_dataset = WikipediaNetwork(root=root, name='squirrel')
data = squirrel_dataset[0] # First (index 0) graph object of the dataset

# Run the heterophilic preprocessing pipeline on a copy and save the result.
save_dataset(data, "Squirrel", is_heterophilic=True)


Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/new_data/squirrel/out1_node_feature_label.txt
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/new_data/squirrel/out1_graph_edges.txt
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/squirrel_split_0.6_0.2_0.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/squirrel_split_0.6_0.2_1.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/squirrel_split_0.6_0.2_2.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019c562737240d06ec83b07d16a8f/splits/squirrel_split_0.6_0.2_3.npz
Downloading https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/f1fc0d14b3b019

[edge_weights] Calculated edge weights. Total edges: 217073
[graph_denoising] Removed 165400 edges. Remaining edges: 51673
[self_loops] Self-loops added. Before: 140, After: 5201
[positional_encoding] Added 520 positional encodings. Shape: torch.Size([5201, 2609])
[normalize_features] Normalized features. Shape: torch.Size([5201, 2609])
Training nodes: 3120, Validation nodes: 1040, Test nodes: 1041, Total nodes: 5201
[feature_augmentation] Added noise to features. Shape: torch.Size([5201, 2609])
Saved preprocessed data: /content/dataset/processed_Squirrel/new_Squirrel.pt


# Homophilic Datasets

**CORA**

In [None]:
from torch_geometric.datasets import Planetoid

# Base directory for the dataset files. save_dataset also reads this global
# to decide where the processed copy is written.
root = '/content/dataset'
cora_dataset = Planetoid(root=root, name='Cora')
# Take the first (index 0) graph object of the dataset.
data = cora_dataset[0]

# Run the homophilic preprocessing pipeline on a copy and save the result.
save_dataset(data, "Cora", is_heterophilic=False)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


[self_loops] Self-loops added. Before: 0, After: 2708
[edge_weights] Calculated edge weights. Total edges: 13264
[graph_denoising] Removed 168 edges. Remaining edges: 13096
[node_degree_as_feature] Added node degree as a feature. Shape: torch.Size([2708, 1434])
[normalize_features] Normalized features. Shape: torch.Size([2708, 1434])
Training nodes: 1624, Validation nodes: 542, Test nodes: 542, Total nodes: 2708
Saved preprocessed data: /content/dataset/processed_Cora/new_Cora.pt


**CITESEER**

In [None]:
from torch_geometric.datasets import Planetoid

# Base directory for the dataset files. save_dataset also reads this global
# to decide where the processed copy is written.
root = '/content/dataset'
citeseer_dataset = Planetoid(root=root, name='Citeseer')
# Take the first (index 0) graph object of the dataset.
data = citeseer_dataset[0]

# Run the homophilic preprocessing pipeline on a copy and save the result.
save_dataset(data, "Citeseer", is_heterophilic=False)


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...
Done!


[self_loops] Self-loops added. Before: 0, After: 3327
[edge_weights] Calculated edge weights. Total edges: 12431
[graph_denoising] Removed 0 edges. Remaining edges: 12431
[node_degree_as_feature] Added node degree as a feature. Shape: torch.Size([3327, 3704])
[normalize_features] Normalized features. Shape: torch.Size([3327, 3704])
Training nodes: 1996, Validation nodes: 665, Test nodes: 666, Total nodes: 3327
Saved preprocessed data: /content/dataset/processed_Citeseer/new_Citeseer.pt


**PUBMED**

In [None]:
from torch_geometric.datasets import Planetoid

# Base directory for the dataset files. save_dataset also reads this global
# to decide where the processed copy is written.
root = '/content/dataset'
pubmed_dataset = Planetoid(root=root, name='PubMed')
# Take the first (index 0) graph object of the dataset.
data = pubmed_dataset[0]

# Run the homophilic preprocessing pipeline on a copy and save the result.
save_dataset(data, "PubMed", is_heterophilic=False)


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...
Done!


[self_loops] Self-loops added. Before: 0, After: 19717
[edge_weights] Calculated edge weights. Total edges: 108365
[graph_denoising] Removed 832 edges. Remaining edges: 107533
[node_degree_as_feature] Added node degree as a feature. Shape: torch.Size([19717, 501])
[normalize_features] Normalized features. Shape: torch.Size([19717, 501])
Training nodes: 11830, Validation nodes: 3943, Test nodes: 3944, Total nodes: 19717
Saved preprocessed data: /content/dataset/processed_PubMed/new_PubMed.pt
