# Testing Graph Data Loader

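This notebook tests the graph data loader with all four social network datasets (Facebook, YouTube, Twitch, and Enron): it checks that each dataset loads and prints basic graph statistics for it.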

In [1]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt

# Add the parent directory to the import path so our local modules can be imported.
# The path is resolved to an absolute one up front so it keeps pointing at the
# same directory even if the working directory changes later, and it is only
# appended when missing so re-running this cell does not pile up duplicates.
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
from data import graph_data_loader

Cache directory: /scratches/cartwright/mz473/Efficient-Gaussian-Process-on-Graphs/graph_bo/data/processed_data


## List Available Datasets

In [3]:
# Ask the loader which datasets it knows about. `datasets` is kept at cell
# scope because the loading loop later in this notebook iterates over it.
datasets = graph_data_loader.list_available_datasets()
print(f"Available datasets: {datasets}")

# Report the cache status of each dataset.
# `info` is read with .get(), so a missing 'cached' key is reported as False
# instead of raising (presumably `info` is a dict — confirm in the loader).
for dataset in datasets:
    info = graph_data_loader.get_dataset_info(dataset)
    print(f"{dataset}: cached = {info.get('cached', False)}")

Available datasets: ['facebook', 'youtube', 'twitch', 'enron']
facebook: cached = True
youtube: cached = True
twitch: cached = True
enron: cached = True


## Load and Test Each Dataset

In [4]:
# Load every dataset and collect summary statistics.
# Results are stored in `dataset_stats`, keyed by dataset name. A dataset that
# fails to load is recorded as None so that one failure does not stop the
# remaining datasets from being checked.
dataset_stats = {}

for dataset_name in datasets:
    print(f"\n{'='*50}")
    print(f"Loading {dataset_name.upper()} dataset")
    print(f"{'='*50}")
    
    try:
        # The loader object is called directly and yields three values:
        # A (a sparse matrix: it exposes .nnz and .format), X and y.
        A, X, y = graph_data_loader(dataset_name)
        
        # Compute statistics.
        # - num_edges halves the stored-entry count, which assumes A is
        #   symmetric with no self-loops — TODO confirm in the loader.
        # - y is summarised under "degree" keys; presumably the loader
        #   returns node degree as the target — confirm.
        # NOTE(review): in the recorded YouTube output X.max() is 1157827
        # while A has only 1134890 rows, so X does not look like a
        # 0..n-1 row index there — confirm nothing downstream indexes A with X.
        stats = {
            'num_nodes': len(X),
            'num_edges': A.nnz // 2,
            'density': A.nnz / (A.shape[0] * A.shape[1]),
            'min_degree': y.min(),
            'max_degree': y.max(),
            'mean_degree': y.mean(),
            'std_degree': y.std()
        }
        
        dataset_stats[dataset_name] = stats
        
        print(f"✓ Successfully loaded {dataset_name}")
        print(f"  Nodes: {stats['num_nodes']:,}")
        print(f"  Edges: {stats['num_edges']:,}")
        print(f"  Density: {stats['density']:.6f}")
        print(f"  Degree - min: {stats['min_degree']}, max: {stats['max_degree']}, mean: {stats['mean_degree']:.2f}")
        print(f"  Adjacency matrix shape: {A.shape}")
        print(f"  Adjacency matrix format: {A.format}")
        print(f"  Node indices range: {X.min()} to {X.max()}")
        
    except Exception as e:
        # Deliberately broad: this is a smoke test and should move on to the
        # next dataset. The repr is printed (not str) so the exception type is
        # visible — str(KeyError('x')) is just "'x'", which hides what failed.
        print(f"✗ Error loading {dataset_name}: {e!r}")
        dataset_stats[dataset_name] = None


Loading FACEBOOK dataset
Loading facebook from cache...
✓ Successfully loaded facebook
  Nodes: 22,470
  Edges: 170,912
  Density: 0.000677
  Degree - min: 1, max: 709, mean: 15.22
  Adjacency matrix shape: (22470, 22470)
  Adjacency matrix format: csr
  Node indices range: 0 to 22469

Loading YOUTUBE dataset
Loading youtube from cache...
✓ Successfully loaded youtube
  Nodes: 1,134,890
  Edges: 2,987,624
  Density: 0.000005
  Degree - min: 1, max: 28754, mean: 5.27
  Adjacency matrix shape: (1134890, 1134890)
  Adjacency matrix format: csr
  Node indices range: 1 to 1157827

Loading TWITCH dataset
Loading twitch from cache...
✓ Successfully loaded twitch
  Nodes: 168,114
  Edges: 6,797,557
  Density: 0.000481
  Degree - min: 1, max: 35279, mean: 80.87
  Adjacency matrix shape: (168114, 168114)
  Adjacency matrix format: csr
  Node indices range: 0 to 168113

Loading ENRON dataset
Loading enron from cache...
✓ Successfully loaded enron
  Nodes: 36,692
  Edges: 183,831
  Density: 0.000