In [1]:
# Imports and setup
import os
from pathlib import Path
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

# Optional dependency: ogb is only needed when the raw CSVs are not present.
# Start from "available" and flip the flag if the import fails for any reason.
_OGB_AVAILABLE = True
try:
    from ogb.nodeproppred import NodePropPredDataset
except Exception:
    _OGB_AVAILABLE = False

# Input and output locations, relative to the current working directory.
DATA_RAW, DATA_PROCESSED = Path('data/raw'), Path('data/processed')
# Make sure the output directory exists before any loader tries to write to it.
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

print('OGB available:', _OGB_AVAILABLE)
print('Raw data dir exists:', DATA_RAW.exists())

OGB available: False
Raw data dir exists: False


In [2]:
def load_from_csv(data_dir=DATA_RAW, save_processed=True):
    """Load a graph from raw CSVs, clean its edge list and compute node degrees.

    Three header-less files are expected in ``data_dir``: ``node-feat.csv``,
    ``node-label.csv`` and ``edge.csv`` (two columns: source id, destination id).

    Edge cleaning:
      1. self-loops are removed;
      2. edges are treated as undirected and de-duplicated;
      3. every remaining undirected edge is emitted in both directions.

    Parameters
    ----------
    data_dir : path-like
        Directory holding the raw CSVs. Strings are accepted and converted
        to ``Path``.
    save_processed : bool, default True
        When true, write the features and labels as Parquet, the cleaned
        edges as CSV and the degree array as ``.npy`` under ``DATA_PROCESSED``.

    Returns
    -------
    (node_feat, node_label, cleaned_edges, degrees)
        ``node_feat`` / ``node_label`` are the frames as read from disk,
        ``cleaned_edges`` has columns ``src`` and ``dst``, and ``degrees`` is
        an integer array with one entry per row of ``node_feat``.

    Raises
    ------
    FileNotFoundError
        If any of the three expected CSV files is missing.
    """
    data_dir = Path(data_dir)
    feat_path = data_dir / 'node-feat.csv'
    label_path = data_dir / 'node-label.csv'
    edge_path = data_dir / 'edge.csv'

    if not all(p.exists() for p in (feat_path, label_path, edge_path)):
        raise FileNotFoundError('One or more expected CSVs are missing in ' + str(data_dir))

    node_feat = pd.read_csv(feat_path, header=None)
    node_label = pd.read_csv(label_path, header=None)
    edge_index = pd.read_csv(edge_path, header=None).astype(int)
    edge_index.columns = ['src', 'dst']

    # Work on plain arrays: element-wise min/max is much cheaper than a
    # row-wise DataFrame reduction on large edge lists.
    src = edge_index['src'].to_numpy()
    dst = edge_index['dst'].to_numpy()
    not_loop = src != dst

    # Canonical (low, high) ordering so that (a, b) and (b, a) collapse together.
    dedup = pd.DataFrame({
        'u': np.minimum(src, dst)[not_loop],
        'v': np.maximum(src, dst)[not_loop],
    }).drop_duplicates().reset_index(drop=True)

    # Re-expand each undirected edge into both directed edges.
    edges_u = pd.DataFrame({'src': dedup['u'], 'dst': dedup['v']})
    edges_v = pd.DataFrame({'src': dedup['v'], 'dst': dedup['u']})
    cleaned_edges = pd.concat([edges_u, edges_v], ignore_index=True)

    num_nodes = node_feat.shape[0]
    num_edges = cleaned_edges.shape[0]
    print(f'Nodes: {num_nodes}, Edges (directed, cleaned): {num_edges}')

    # Out-degree per node, counted in one vectorised pass. Source ids outside
    # [0, num_nodes) are still ignored, but are now reported instead of being
    # dropped silently.
    src_ids = cleaned_edges['src'].to_numpy()
    in_range = (src_ids >= 0) & (src_ids < num_nodes)
    if not in_range.all():
        skipped = int((~in_range).sum())
        print(f'Warning: ignored {skipped} directed edges whose source id is outside [0, {num_nodes})')
    degrees = np.bincount(src_ids[in_range], minlength=num_nodes).astype(int)

    if save_processed:
        DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
        # Frames read with header=None carry integer column labels, which
        # Parquet does not support. Stringify them on a shallow copy so the
        # returned frames keep their original labels and no data is duplicated.
        for frame, stem in ((node_feat, 'node-feat'), (node_label, 'node-label')):
            out = frame.copy(deep=False)
            out.columns = out.columns.astype(str)
            out.to_parquet(DATA_PROCESSED / f'{stem}.parquet', index=False)
        cleaned_edges.to_csv(DATA_PROCESSED / 'edge-cleaned.csv', index=False)
        np.save(DATA_PROCESSED / 'degrees.npy', degrees)

    return node_feat, node_label, cleaned_edges, degrees

In [3]:
def load_from_ogb(name='ogbn-products', save_processed=True):
    """Load a node-property dataset through OGB, clean its edges and compute degrees.

    The edge cleaning mirrors the CSV loader: self-loops are removed, edges
    are de-duplicated as undirected pairs, and each remaining pair is emitted
    in both directions.

    Parameters
    ----------
    name : str, default 'ogbn-products'
        Dataset name passed to ``NodePropPredDataset``.
    save_processed : bool, default True
        When true, write the features and labels as Parquet, the cleaned
        edges as CSV and the degree array as ``.npy`` under ``DATA_PROCESSED``.

    Returns
    -------
    (node_feat, node_label, cleaned_edges, degrees)
        Feature and label frames, the cleaned edge frame with columns
        ``src`` / ``dst``, and an integer out-degree array with one entry
        per node.

    Raises
    ------
    RuntimeError
        If the ``ogb`` package could not be imported.
    """
    if not _OGB_AVAILABLE:
        raise RuntimeError('OGB is not installed in this environment')

    ds = NodePropPredDataset(name=name)
    # Indexing the dataset yields (graph_dict, labels). The labels are NOT
    # stored inside the graph dict, so graph['node_label'] would raise KeyError.
    graph, labels = ds[0]
    node_feat = pd.DataFrame(graph['node_feat'])
    node_label = pd.DataFrame(labels)

    # edge_index is laid out as (2, num_edges): row 0 = sources, row 1 = destinations.
    edge_arr = np.asarray(graph['edge_index'])
    src = edge_arr[0].astype(int)
    dst = edge_arr[1].astype(int)
    not_loop = src != dst

    # Canonical (low, high) ordering so that (a, b) and (b, a) collapse together.
    dedup = pd.DataFrame({
        'u': np.minimum(src, dst)[not_loop],
        'v': np.maximum(src, dst)[not_loop],
    }).drop_duplicates().reset_index(drop=True)

    # Re-expand each undirected edge into both directed edges.
    edges_u = pd.DataFrame({'src': dedup['u'], 'dst': dedup['v']})
    edges_v = pd.DataFrame({'src': dedup['v'], 'dst': dedup['u']})
    cleaned_edges = pd.concat([edges_u, edges_v], ignore_index=True)

    # Prefer the node count reported by the graph itself; fall back to the
    # number of feature rows when the graph dict does not provide one.
    reported_nodes = graph.get('num_nodes')
    num_nodes = int(reported_nodes) if reported_nodes is not None else node_feat.shape[0]

    # Out-degree per node in one vectorised pass instead of a Python loop
    # over every edge.
    degrees = np.bincount(cleaned_edges['src'].to_numpy(), minlength=num_nodes).astype(int)

    if save_processed:
        DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
        # Frames built from arrays carry integer column labels, which Parquet
        # does not support. Stringify them on a shallow copy so the returned
        # frames keep their original labels and no data is duplicated.
        for frame, stem in ((node_feat, 'node-feat'), (node_label, 'node-label')):
            out = frame.copy(deep=False)
            out.columns = out.columns.astype(str)
            out.to_parquet(DATA_PROCESSED / f'{stem}.parquet', index=False)
        cleaned_edges.to_csv(DATA_PROCESSED / 'edge-cleaned.csv', index=False)
        np.save(DATA_PROCESSED / 'degrees.npy', degrees)

    print(f'Loaded {name}: nodes={num_nodes}, cleaned edges={cleaned_edges.shape[0]}')
    return node_feat, node_label, cleaned_edges, degrees

In [4]:
# Data source priority: local CSVs under DATA_RAW first, then OGB.
# Any failure is reported as a message rather than a traceback, so the
# notebook keeps running even when no data source is available.
try:
    if not (DATA_RAW.exists() and any(DATA_RAW.glob('*.csv'))):
        # No local CSVs: OGB is the only remaining option.
        if not _OGB_AVAILABLE:
            raise FileNotFoundError('No CSVs in data/raw/ and OGB not available')
        feat, label, edges, degrees = load_from_ogb()
    else:
        feat, label, edges, degrees = load_from_csv(DATA_RAW)
except Exception as err:
    print('Load failed:', err)

Load failed: No CSVs in data/raw/ and OGB not available


In [5]:
# Quick EDA: summary statistics plus a histogram of node degrees
# (linear x axis, logarithmic count axis).
try:
    print('Degree mean:', degrees.mean())
    print('Degree median:', np.median(degrees))
    print('Degree min/max:', degrees.min(), degrees.max())
    # Explicit figure/axes handles instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(7,4))
    sns.histplot(degrees, bins=100, log_scale=(False, True), ax=ax)
    ax.set_xlabel('Degree')
    ax.set_ylabel('Count (log)')
    ax.set_title('Degree distribution (processed)')
    plt.show()
except NameError:
    # `degrees` only exists once the load cell has succeeded.
    print('No `degrees` array available. Run the load cell first.')

No `degrees` array available. Run the load cell first.


## Next steps
- Use `data/processed/node-feat.parquet`, `node-label.parquet`, and `edge-cleaned.csv` for model training or further feature engineering.
- Consider converting the cleaned edges to a sparse adjacency matrix or PyTorch Geometric format for faster GNN pipelines.
- TODO: add `ogb`, `pyarrow` (required for Parquet output), and the plotting libraries (`matplotlib`, `seaborn`) to `uv_project/requirements.txt`.