In [2]:
import networkx as nx
import gzip
from collections import defaultdict
import numpy as np

In [1]:
# Path to the gzip-compressed edge list consumed by the loading cell.
# It is a relative path, so it is resolved against the notebook's working directory.
graph_path = 'email-Enron.txt.gz'

In [3]:
# Build an undirected graph from the gzip-compressed, whitespace-separated
# edge list at `graph_path`. Lines whose first non-blank character is '#' are
# treated as header comments; every other non-empty line must contain exactly
# two integer node ids (anything else raises ValueError).
G = nx.Graph()

with gzip.open(graph_path, 'rt') as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line at EOF) and comments
        # instead of failing on the two-value unpack below.
        if not line or line.startswith('#'):
            continue
        u, v = map(int, line.split())
        # nx.Graph stores one undirected edge per node pair, so an edge that
        # is listed again (including as "v u") does not create a duplicate.
        G.add_edge(u, v)

print(f"Loaded graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")


Loaded graph with 36692 nodes and 183831 edges


In [4]:
# ---------------------------------------------------------------------------
# Derive ML-ready structures from G.
#
# X, y and the rows/columns of the adjacency matrix all follow G.nodes()
# iteration order (networkx uses that order when adjacency_matrix is called
# without a nodelist). That order is not necessarily sorted, so row i of the
# matrix describes node X[i], not node i.
# ---------------------------------------------------------------------------

# X: node labels, one entry per node.
X = np.array(list(G))

# y: degree of each node, aligned element-wise with X.
y = np.array([deg for _node, deg in G.degree()])

# Sparse adjacency matrix; `A` and `adj_matrix` are two names for one object.
A = nx.adjacency_matrix(G)
adj_matrix = A
num_nodes = G.number_of_nodes()

# Report summary statistics for each structure.
print(f"X (node indices) shape: {X.shape}")
print(f"Node indices range: {X.min()} to {X.max()}")

print(f"y (node degrees) shape: {y.shape}")
print(f"Degree statistics - min: {y.min()}, max: {y.max()}, mean: {y.mean():.2f}")

print(f"adj_matrix shape: {adj_matrix.shape}")
print(f"adj_matrix format: {adj_matrix.format}")
# Density = stored non-zero entries divided by the total number of matrix cells.
print(f"adj_matrix density: {adj_matrix.nnz / (adj_matrix.shape[0] * adj_matrix.shape[1]):.6f}")

X (node indices) shape: (36692,)
Node indices range: 0 to 36691
y (node degrees) shape: (36692,)
Degree statistics - min: 1, max: 1383, mean: 10.02
adj_matrix shape: (36692, 36692)
adj_matrix format: csr
adj_matrix density: 0.000273
