## Load Data and Setup

In [5]:
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
from pathlib import Path
import time

# Fix for PyTorch 2.6 torch.load compatibility: from 2.6 on, torch.load
# defaults to weights_only=True and rejects checkpoints that pickle anything
# beyond plain tensors. Force the legacy behaviour for this notebook.
# NOTE(security): weights_only=False executes arbitrary pickle code, so only
# load checkpoint files you trust.
#
# The guard keeps the cell safe to re-run. Without it, a second execution
# would rebind `original_load` to the wrapper itself and every later
# torch.load call would recurse until RecursionError.
if not getattr(torch.load, '_forces_weights_only_false', False):
    original_load = torch.load

    def _load_with_full_unpickling(*args, _wrapped_load=original_load, **kwargs):
        """Call the real torch.load with weights_only forced to False."""
        # The real loader is captured as a default argument so that later
        # rebinding of the global name cannot change what is called here.
        return _wrapped_load(*args, **{**kwargs, 'weights_only': False})

    _load_with_full_unpickling._forces_weights_only_false = True
    torch.load = _load_with_full_unpickling

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [6]:
# Load the dataset
from torch_geometric.datasets import OGB_MAG
from ogb.nodeproppred import NodePropPredDataset

# Open ogbn-products through the OGB loader and fetch its predefined split
print("Loading dataset (this may take a moment)...")
dataset = NodePropPredDataset(name='ogbn-products', root='../dataset')
split_idx = dataset.get_idx_split()

# One long tensor of node ids per split
train_idx, val_idx, test_idx = (
    torch.tensor(split_idx[split_name], dtype=torch.long)
    for split_name in ('train', 'valid', 'test')
)

# dataset[0] yields the graph dict together with the label array
graph, labels = dataset[0]
labels = torch.tensor(labels, dtype=torch.long).squeeze()
edge_index = torch.tensor(graph['edge_index'], dtype=torch.long)
node_features = torch.tensor(graph['node_feat'], dtype=torch.float)

num_classes = len(torch.unique(labels))
num_nodes = node_features.shape[0]

# Size summary, one line per statistic
print(
    "Dataset loaded:",
    f"  Nodes: {num_nodes:,}",
    f"  Edges: {edge_index.shape[1]:,}",
    f"  Features: {node_features.shape[1]}",
    f"  Classes: {num_classes}",
    f"  Test nodes: {test_idx.shape[0]:,}",
    sep="\n",
)

Loading dataset (this may take a moment)...
Dataset loaded:
  Nodes: 2,449,029
  Edges: 123,718,280
  Features: 100
  Classes: 47
  Test nodes: 2,213,091
Dataset loaded:
  Nodes: 2,449,029
  Edges: 123,718,280
  Features: 100
  Classes: 47
  Test nodes: 2,213,091


In [7]:
# Build adjacency list for neighbor sampling
def build_adjacency_list(edge_index, num_nodes):
    """Return ``{node: [neighbor, ...]}`` treating every edge as undirected.

    Args:
        edge_index: integer tensor of shape [2, E]; column e is the edge
            (edge_index[0, e], edge_index[1, e]).
        num_nodes: number of nodes; every id in ``edge_index`` must lie in
            ``[0, num_nodes)``.

    Returns:
        dict with one key per node id in ``range(num_nodes)``. Each value is
        a sorted list of distinct neighbor ids (empty for isolated nodes).

    Raises:
        ValueError: if an edge references a node id outside ``[0, num_nodes)``.

    The grouping is done with vectorised NumPy calls instead of a Python loop
    over every edge. Repeated (node, neighbor) pairs are collapsed: the
    reverse of every edge is added here, so an ``edge_index`` that already
    stores both directions would otherwise list each neighbor twice.
    """
    if num_nodes == 0:
        return {}

    src, dst = edge_index.cpu().numpy().astype(np.int64, copy=False)
    if src.size and (
        min(src.min(), dst.min()) < 0 or max(src.max(), dst.max()) >= num_nodes
    ):
        raise ValueError("edge_index contains node ids outside [0, num_nodes)")

    # Both directions of every edge (undirected graph).
    heads = np.concatenate([src, dst])
    tails = np.concatenate([dst, src])

    # Pack each (head, tail) pair into a single integer so that one np.unique
    # call both sorts by head and drops repeated pairs.
    pair_keys = np.unique(heads * num_nodes + tails)
    heads, tails = np.divmod(pair_keys, num_nodes)

    # `tails` is now grouped by head; cut it at each node's cumulative degree.
    degrees = np.bincount(heads, minlength=num_nodes)
    boundaries = np.cumsum(degrees)[:-1]
    return {
        node: nbrs.tolist()
        for node, nbrs in enumerate(np.split(tails, boundaries))
    }


adj_list = build_adjacency_list(edge_index, num_nodes)

print("Adjacency list created")

Adjacency list created


## Define Model Architecture

In [8]:
class SAGEConvLayer(torch.nn.Module):
    """GraphSAGE convolution layer - mean aggregation

    Each node's output is a linear map of its own features concatenated with
    the mean of the features of its neighbors.

    Args:
        in_features: width of the input node features.
        out_features: width of the output node features.
    """
    def __init__(self, in_features, out_features):
        super().__init__()
        # Input to the linear map is [self | neighbor mean], hence 2x width.
        self.linear = torch.nn.Linear(2 * in_features, out_features)

    def forward(self, x, edge_index):
        """Apply one round of mean aggregation followed by the linear map.

        Args:
            x: node features, shape [N, in_features].
            edge_index: long tensor of shape [2, E]. Row 0 holds the node
                that receives the message, row 1 the neighbor whose
                features are aggregated.

        Returns:
            Tensor of shape [N, out_features].
        """
        row, col = edge_index

        # new_zeros / new_ones inherit dtype and device from x. index_add_
        # needs matching dtypes, so hard-coded float32 buffers would fail for
        # double or half-precision inputs.
        agg = x.new_zeros(x.size(0), x.size(1))
        deg = x.new_zeros(x.size(0))

        # Sum neighbor features and count neighbors per receiving node
        agg.index_add_(0, row, x[col])
        deg.index_add_(0, row, x.new_ones(row.size(0)))

        # Mean aggregation; clamp so nodes without neighbors divide by 1
        # and keep an all-zero aggregate instead of producing NaN.
        deg = deg.clamp(min=1).unsqueeze(1)
        agg = agg / deg

        # Concatenate self features with aggregated neighbor features
        out = torch.cat([x, agg], dim=1)
        return self.linear(out)


class GraphSAGE(torch.nn.Module):
    """Stack of SAGEConvLayer blocks with ReLU + dropout between them.

    Args:
        in_channels: width of the input node features.
        hidden_channels: width of every intermediate representation.
        out_channels: width of the final output (e.g. number of classes).
        num_layers: number of convolution layers, at least 1. With 1 the
            model is a single in_channels -> out_channels layer.
        dropout: dropout probability applied after every layer except the
            last one.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2,
                 dropout=0.5):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")
        self.num_layers = num_layers
        self.dropout = dropout

        # Layer widths: input, (num_layers - 1) hidden widths, output.
        # Building from this list keeps len(self.convs) == num_layers, so the
        # "skip activation on the last layer" test in forward() is always
        # consistent with the layers that actually exist.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = torch.nn.ModuleList([
            SAGEConvLayer(fan_in, fan_out)
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        ])

    def forward(self, x, edge_index):
        """Run all layers; returns the raw output of the last layer.

        Args:
            x: node features, shape [N, in_channels].
            edge_index: edge tensor passed unchanged to every layer.
        """
        last = len(self.convs) - 1
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i < last:
                x = F.relu(x)
                # Dropout is only active while self.training is True
                x = F.dropout(x, p=self.dropout, training=self.training)
        return x

print("Model architecture defined")

Model architecture defined


## Sampling and Evaluation Functions

In [9]:
def sample_neighbors(nodes, adj_list, num_samples=(10, 5), rng=None):
    """
    Sample neighbors for multiple hops

    nodes: list of node indices to start from
    adj_list: dict mapping node -> list of neighbors; nodes missing from the
        dict are treated as having no neighbors
    num_samples: sequence with the number of neighbors to sample per hop
    rng: optional numpy.random.Generator used for the draws; when None the
        global numpy.random state is used (seed it with np.random.seed for
        reproducible runs)

    Returns: (all sampled nodes, subgraph edges). The node list contains the
    start nodes plus every sampled neighbor, each once. Every edge is a
    (node, sampled_neighbor) tuple; a node expanded in more than one hop
    contributes edges from each expansion.
    """
    sampler = np.random if rng is None else rng

    all_nodes = set(nodes)
    current_layer = set(nodes)
    edges = []

    for k in num_samples:
        next_layer = set()
        for node in current_layer:
            neighbors = adj_list.get(node, [])
            if len(neighbors) == 0:
                continue
            if len(neighbors) <= k:
                # Every neighbor is kept, so no random draw is needed.
                sampled = neighbors
            else:
                # Sample k distinct neighbors
                sampled = sampler.choice(neighbors, size=k, replace=False)
            for neighbor in sampled:
                edges.append((node, neighbor))
            next_layer.update(sampled)
        all_nodes |= next_layer
        # Only nodes reached in this hop are expanded in the next one
        current_layer = next_layer

    return list(all_nodes), edges


@torch.no_grad()
def evaluate_with_sampling(model, node_feats, node_labels, mask, device, 
                           adj_list, batch_size=2048, num_samples=(10, 5), 
                           verbose=False):
    """
    Evaluate model with specified neighbor sampling strategy

    model: module called as model(x, edge_index); put into eval mode here
    node_feats: feature tensor indexed by global node id
    node_labels: label tensor indexed by global node id
    mask: 1-D boolean tensor, True for every node to evaluate
    device: device the per-batch subgraph is moved to before the forward pass
    adj_list: dict mapping node -> list of neighbors (used for sampling)
    batch_size: number of target nodes per forward pass
    num_samples: neighbors to sample per hop, forwarded to sample_neighbors
    verbose: print a one-line summary when True

    Returns: (accuracy, inference time in seconds). The time covers neighbor
    sampling as well as the forward passes. Accuracy is NaN when the mask
    selects no nodes.
    """
    model.eval()
    start_time = time.time()

    # Get indices of nodes to evaluate
    eval_indices = mask.nonzero(as_tuple=True)[0].cpu().numpy()
    num_eval = len(eval_indices)

    all_preds = []
    all_labels = []

    # Process in batches with neighbor sampling
    for i in range(0, num_eval, batch_size):
        batch_target_nodes = eval_indices[i:i + batch_size].tolist()

        # Sample neighbors for this batch
        sampled_nodes, sampled_edges = sample_neighbors(
            batch_target_nodes, adj_list, num_samples
        )
        # Global node id -> row in the batch feature matrix
        node_map = {n: idx for idx, n in enumerate(sampled_nodes)}

        # Get subgraph features; an index tensor avoids slow list indexing
        node_ids = torch.from_numpy(np.asarray(sampled_nodes, dtype=np.int64))
        batch_x = node_feats[node_ids].to(device)

        # Create subgraph edge_index in batch-local numbering
        subgraph_edges = [
            (node_map[src], node_map[dst])
            for src, dst in sampled_edges
            if src in node_map and dst in node_map
        ]
        if subgraph_edges:
            batch_edge_index = torch.tensor(subgraph_edges, dtype=torch.long).t().to(device)
        else:
            batch_edge_index = torch.empty((2, 0), dtype=torch.long, device=device)

        # Forward pass
        out = model(batch_x, batch_edge_index)

        # Get predictions for target nodes only. Predictions and labels are
        # selected through the same filtered list so they stay aligned even
        # if a target were missing from the sampled subgraph.
        kept_targets = [n for n in batch_target_nodes if n in node_map]
        target_rows = torch.tensor(
            [node_map[n] for n in kept_targets], dtype=torch.long, device=out.device
        )
        pred = out[target_rows].argmax(dim=1).cpu()
        labels_batch = node_labels[torch.tensor(kept_targets, dtype=torch.long)].cpu()

        all_preds.append(pred)
        all_labels.append(labels_batch)

        # Clear GPU memory
        del out, batch_x, batch_edge_index
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Compute accuracy; torch.cat rejects an empty list, so an empty
    # evaluation set is reported as NaN instead of crashing.
    if all_labels:
        all_preds = torch.cat(all_preds)
        all_labels = torch.cat(all_labels)
    num_scored = len(all_labels)
    if num_scored > 0:
        correct = (all_preds == all_labels).sum()
        acc = correct.item() / num_scored
    else:
        acc = float('nan')

    inference_time = time.time() - start_time

    if verbose:
        print(f"Evaluated {num_scored:,} nodes in {inference_time:.2f}s")

    return acc, inference_time

print("Sampling and evaluation functions defined")

Sampling and evaluation functions defined


## Load Saved GraphSAGE Model

In [10]:
# Initialize model with same architecture as the saved checkpoint
in_channels = node_features.shape[1]
hidden_channels = 256

model = GraphSAGE(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=num_classes,
    num_layers=2
).to(device)

# Check for model file and load saved weights.
# The path is built from components: a literal with backslashes is a
# directory path only on Windows and a single odd file name elsewhere.
model_path = Path('..') / 'notebooks' / 'models' / 'graphsage_model.pth'

if not model_path.exists():
    print("❌ ERROR: Model file not found!")
    print(f"   Expected location: {model_path.absolute()}")
    print("\n📝 To use this notebook, you need to:")
    print("   1. Open first_iteration.ipynb")
    print("   2. Train the GraphSAGE model (run all training cells)")
    print("   3. Make sure the model is saved to models/graphsage_model.pth")
    print("\nAlternatively, check if the model was saved to a different location.")
    raise FileNotFoundError(f"Model file not found: {model_path}")

checkpoint = torch.load(model_path, map_location=device)
# Accept both a wrapped checkpoint ({'model_state_dict': ..., 'epoch': ...})
# and a bare state_dict saved directly.
model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
model.eval()

# Training metadata is optional; missing entries are shown as N/A
print("✓ GraphSAGE model loaded from checkpoint")
print(f"  Epoch: {checkpoint.get('epoch', 'N/A')}")
print(f"  Train accuracy: {checkpoint.get('train_acc', 'N/A')}")
print(f"  Val accuracy: {checkpoint.get('val_acc', 'N/A')}")
print(f"  Test accuracy: {checkpoint.get('test_acc', 'N/A')}")

‚úì GraphSAGE model loaded from checkpoint
  Epoch: N/A
  Train accuracy: N/A
  Val accuracy: N/A
  Test accuracy: N/A

  Epoch: N/A
  Train accuracy: N/A
  Val accuracy: N/A
  Test accuracy: N/A


## Create Test Mask

In [11]:
# Boolean mask over all nodes: True exactly at the test-split node ids
test_positions = test_idx.cpu()
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask[test_positions] = True

num_test_nodes = test_mask.sum()
print(f"Test set: {num_test_nodes:,} nodes")

Test set: 2,213,091 nodes


## Test Different Sampling Strategies

In [None]:
# Define different sampling strategies to test - REDUCED FOR SPEED
sampling_strategies = [
    [5, 3],      # Fast (fewer samples)
    [10, 5],     # Baseline
    [15, 10],    # More samples
    # Commented out to save time - uncomment if needed:
    # [20, 10],    # Even more first-hop
    # [25, 15],    # High sampling
    # [30, 15],    # Very high sampling
    # [15, 5],     # More first-hop, fewer second-hop
    # [10, 10],    # Balanced
]

# Evaluation passes per strategy. With a single run the reported standard
# deviation is 0 by construction; raise this for a meaningful spread.
NUM_RUNS = 1
# Neighbor sampling draws from NumPy's global RNG, so it is seeded before
# every run to make the reported numbers reproducible.
SAMPLING_SEED = 42

print(f"🚀 FAST MODE: Testing {len(sampling_strategies)} sampling strategies (reduced from 8)")
print(f"💡 To test more strategies, uncomment lines in the cell above\n")

# Loop-invariant: move features and labels to the CPU once
features_cpu = node_features.cpu()
labels_cpu = labels.cpu()

results = []

for num_samples in sampling_strategies:
    print(f"Testing sampling: {num_samples}")

    accs = []
    times = []
    for run in range(NUM_RUNS):
        # Run r uses the same seed for every strategy
        np.random.seed(SAMPLING_SEED + run)
        acc, inf_time = evaluate_with_sampling(
            model,
            features_cpu,
            labels_cpu,
            test_mask,
            device,
            adj_list,
            batch_size=4096,  # Increased from 2048 for speed
            num_samples=num_samples,
            verbose=False
        )
        accs.append(acc)
        times.append(inf_time)

    avg_acc = np.mean(accs)
    std_acc = np.std(accs)
    avg_time = np.mean(times)

    results.append({
        'sampling_strategy': str(num_samples),
        'hop1_samples': num_samples[0],
        'hop2_samples': num_samples[1],
        'test_accuracy': avg_acc,
        'accuracy_std': std_acc,
        'inference_time': avg_time
    })

    print(f"  → Accuracy: {avg_acc:.4f} (±{std_acc:.4f}), Time: {avg_time:.2f}s\n")

print("\n" + "="*60)
print("All tests complete!")
print("="*60)

üöÄ FAST MODE: Testing 3 sampling strategies (reduced from 8)
üí° To test more strategies, uncomment lines in the cell above

Testing sampling: [5, 3]


## Results Summary

In [None]:
# Create results dataframe (one row per sampling strategy), best first
if not results:
    # Without rows the frame has no 'test_accuracy' column and the sort
    # below would fail with an unhelpful KeyError.
    raise RuntimeError("No sampling results to summarise - run the evaluation cell first")

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('test_accuracy', ascending=False)

print("\nResults ranked by test accuracy:")
print(results_df.to_string(index=False))

# Find best strategy
best_idx = results_df['test_accuracy'].idxmax()
best_result = results_df.loc[best_idx]

print("\n" + "="*60)
print("BEST SAMPLING STRATEGY:")
print(f"  Samples: {best_result['sampling_strategy']}")
print(f"  Accuracy: {best_result['test_accuracy']:.4f} (±{best_result['accuracy_std']:.4f})")
print(f"  Time: {best_result['inference_time']:.2f}s")
print("="*60)

## Visualize Results

In [None]:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Columns shared by both panels
bar_positions = range(len(results_df))
accuracies = results_df['test_accuracy']
run_times = results_df['inference_time']

# Plot 1: one bar per strategy, red error bars from accuracy_std
x_labels = results_df['sampling_strategy'].values
ax1.bar(bar_positions, accuracies, color='skyblue')
ax1.errorbar(bar_positions, accuracies,
             yerr=results_df['accuracy_std'], fmt='none', color='red', capsize=5)
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(x_labels, rotation=45, ha='right')
ax1.set_xlabel('Sampling Strategy [hop1, hop2]')
ax1.set_ylabel('Test Accuracy')
ax1.set_title('Test Accuracy by Sampling Strategy')
ax1.grid(axis='y', alpha=0.3)

# Plot 2: accuracy against inference time, each point labelled with its strategy
scatter = ax2.scatter(run_times, accuracies,
                      s=100, alpha=0.6, c=range(len(results_df)), cmap='viridis')
for strategy_label, seconds, accuracy in zip(
    results_df['sampling_strategy'], run_times, accuracies
):
    ax2.annotate(strategy_label, (seconds, accuracy), fontsize=8, alpha=0.7)
ax2.set_xlabel('Inference Time (seconds)')
ax2.set_ylabel('Test Accuracy')
ax2.set_title('Accuracy vs Speed Trade-off')
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("Plots generated!")

## Test with Even More Aggressive Sampling (Optional - SKIP TO SAVE TIME)

In [None]:
# Optional: aggressive sampling strategies. Disabled by default to save time.
# A flag replaces the former string-wrapped code so the optional branch stays
# syntax-checked and can be enabled by changing a single value.
RUN_AGGRESSIVE_SAMPLING = False

aggressive_strategies = [
    [50, 25],
    [40, 30],
    [100, 50],
]

if RUN_AGGRESSIVE_SAMPLING:
    print("Testing aggressive sampling strategies (may be slower)...\n")

    aggressive_results = []

    for num_samples in aggressive_strategies:
        print(f"Testing sampling: {num_samples}")

        acc, inf_time = evaluate_with_sampling(
            model,
            node_features.cpu(),
            labels.cpu(),
            test_mask,
            device,
            adj_list,
            batch_size=2048,
            num_samples=num_samples,
            verbose=True
        )

        aggressive_results.append({
            'sampling_strategy': str(num_samples),
            'test_accuracy': acc,
            'inference_time': inf_time
        })

        print(f"  → Accuracy: {acc:.4f}, Time: {inf_time:.2f}s\n")

    aggressive_df = pd.DataFrame(aggressive_results)
    print("\nAggressive sampling results:")
    print(aggressive_df.to_string(index=False))
else:
    print("⏩ Aggressive sampling section SKIPPED to save time")
    print("   Set RUN_AGGRESSIVE_SAMPLING = True above if you want to test [50,25], [40,30], [100,50]")

## Save Results

In [None]:
# Persist the ranked results table as CSV (row index omitted)
results_csv_path = '../notebooks/sampling_strategy_results.csv'
results_df.to_csv(results_csv_path, index=False)
print("Results saved to sampling_strategy_results.csv")