# Testing Graph Data Loader

This notebook tests the graph data loader against all available datasets, including the social network datasets and the wind interpolation datasets.

In [5]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt

# Add parent directory to path to import our modules.
# Guarded so that re-running this cell does not append a duplicate entry.
if '..' not in sys.path:
    sys.path.append('..')

In [6]:
from data.database import GraphDataLoader

# The notebook runs from graph_bo/notebooks while raw_data lives under
# graph_bo/data, hence the ../data/... relative paths below.
graph_data_loader = GraphDataLoader(
    data_root="../data/raw_data",
    cache_dir="../data/processed_data",
)

Cache directory: /scratches/cartwright/mz473/Efficient-Gaussian-Process-on-Graphs/graph_bo/data/processed_data


## List Available Datasets

In [7]:
# Ask the loader which datasets it knows about
datasets = graph_data_loader.list_available_datasets()
print(f"Available datasets: {datasets}")

# Report, per dataset, the 'cached' flag from its info record
for dataset in datasets:
    info = graph_data_loader.get_dataset_info(dataset)
    is_cached = info.get('cached', False)  # a missing 'cached' key counts as not cached
    print(f"{dataset}: cached = {is_cached}")

Available datasets: ['facebook', 'youtube', 'twitch', 'enron', '500hpa', '800hpa', '1000hpa']
facebook: cached = True
youtube: cached = True
twitch: cached = True
enron: cached = True
500hpa: cached = False
800hpa: cached = False
1000hpa: cached = False


## Load and Test Each Dataset

In [8]:
# Load all datasets and collect statistics.
# `dataset_stats` maps each dataset name to its stats dict, or to None when
# loading or summarising that dataset raised an exception.
dataset_stats = {}

# Names reported as wind datasets; every other name is reported as 'graph'.
# Defined once here because it does not change between loop iterations.
WIND_DATASETS = ('500hpa', '800hpa', '1000hpa')

for dataset_name in datasets:
    print(f"\n{'='*60}")
    print(f"Loading {dataset_name.upper()} dataset")
    print(f"{'='*60}")

    try:
        # Load dataset: the loader object is called directly with the name and
        # its result is unpacked into adjacency A, node indices X and targets y.
        A, X, y = graph_data_loader(dataset_name)

        # Determine dataset type based on new naming scheme
        is_wind_dataset = dataset_name in WIND_DATASETS

        # Compute basic statistics
        stats = {
            'dataset_type': 'wind' if is_wind_dataset else 'graph',
            'num_nodes': len(X),
            # Halving nnz assumes A is symmetric (each undirected edge stored
            # twice) with no self-loops -- TODO confirm against the loader.
            'num_edges': A.nnz // 2,
            # Fraction of stored entries over all possible entries of A.
            'density': A.nnz / (A.shape[0] * A.shape[1]),
            'adjacency_shape': A.shape,
            'adjacency_format': A.format,
            'node_indices_range': (X.min(), X.max()),
            'target_min': y.min(),
            'target_max': y.max(),
            'target_mean': y.mean(),
            'target_std': y.std()
        }

        dataset_stats[dataset_name] = stats

        print(f"✓ Successfully loaded {dataset_name}")
        print(f"  Dataset type: {stats['dataset_type']}")
        print(f"  Nodes: {stats['num_nodes']:,}")
        print(f"  Edges: {stats['num_edges']:,}")
        print(f"  Density: {stats['density']:.6f}")
        print(f"  Adjacency matrix: {stats['adjacency_shape']} ({stats['adjacency_format']})")
        print(f"  Node indices: {stats['node_indices_range'][0]} to {stats['node_indices_range'][1]}")

        # The target values are labelled as wind speeds for wind datasets and
        # as node degrees for all other datasets.
        if is_wind_dataset:
            print(f"  Wind speeds - min: {stats['target_min']:.3f} m/s, max: {stats['target_max']:.3f} m/s")
            print(f"  Wind speeds - mean: {stats['target_mean']:.3f} m/s, std: {stats['target_std']:.3f} m/s")
        else:
            print(f"  Node degrees - min: {stats['target_min']}, max: {stats['target_max']}")
            print(f"  Node degrees - mean: {stats['target_mean']:.2f}, std: {stats['target_std']:.2f}")

    except Exception as e:
        # Best-effort: keep going with the remaining datasets, but report the
        # exception type as well, since str(e) alone can be empty or ambiguous
        # (e.g. a KeyError prints only the missing key).
        print(f"✗ Error loading {dataset_name}: {type(e).__name__}: {e}")
        dataset_stats[dataset_name] = None


Loading FACEBOOK dataset
Loading facebook from cache...
✓ Successfully loaded facebook
  Dataset type: graph
  Nodes: 22,470
  Edges: 170,912
  Density: 0.000677
  Adjacency matrix: (22470, 22470) (csr)
  Node indices: 0 to 22469
  Node degrees - min: 1, max: 709
  Node degrees - mean: 15.22, std: 26.41

Loading YOUTUBE dataset
Loading youtube from cache...
✓ Successfully loaded youtube
  Dataset type: graph
  Nodes: 1,134,890
  Edges: 2,987,624
  Density: 0.000005
  Adjacency matrix: (1134890, 1134890) (csr)
  Node indices: 1 to 1157827
  Node degrees - min: 1, max: 28754
  Node degrees - mean: 5.27, std: 50.75

Loading TWITCH dataset
Loading twitch from cache...
✓ Successfully loaded twitch
  Dataset type: graph
  Nodes: 168,114
  Edges: 6,797,557
  Density: 0.000481
  Adjacency matrix: (168114, 168114) (csr)
  Node indices: 0 to 168113
  Node degrees - min: 1, max: 35279
  Node degrees - mean: 80.87, std: 314.16

Loading ENRON dataset
Loading enron from cache...
✓ Successfully load