In [1]:
# Imports
import pickle
import networkx as nx
import pandas as pd
import os
from tqdm import tqdm
from networkx.algorithms import approximation as approx

# Make sure the output/ directory exists before any cell writes a CSV into it;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs("output", exist_ok=True)

In [2]:
# Load the pickled road graph from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open road_graph.pkl if it comes from a trusted source.
with open("road_graph.pkl", "rb") as graph_file:
    G_main = pickle.load(graph_file)

In [3]:
# --- Largest weakly connected component ---
# Take the weak component with the most nodes and keep an independent copy of
# the subgraph it induces; the remaining cells work on that copy.
largest_cc = max(nx.weakly_connected_components(G_main), key=len)
G_largest = G_main.subgraph(largest_cc).copy()

n_nodes = G_largest.number_of_nodes()
n_edges = G_largest.number_of_edges()
print(f"Largest WCC: {n_nodes} nodes, {n_edges} edges\n")

Largest WCC: 162634 nodes, 399630 edges



In [4]:
# --- Fast centralities ---
# In-/out-degree centrality and PageRank on the largest component. Each result
# is a node -> score mapping and is written to its own two-column CSV.
print("Computing fast centralities: in-degree, out-degree, PageRank...")
in_deg = nx.in_degree_centrality(G_largest)
out_deg = nx.out_degree_centrality(G_largest)
pagerank = nx.pagerank(G_largest, alpha=0.85)

# (scores, CSV column name, destination) for every metric computed above.
fast_outputs = [
    (in_deg, "in_degree", "output/in_degree.csv"),
    (out_deg, "out_degree", "output/out_degree.csv"),
    (pagerank, "pagerank", "output/pagerank.csv"),
]
for scores, score_col, csv_path in fast_outputs:
    score_frame = pd.DataFrame(scores.items(), columns=["node", score_col])
    score_frame.to_csv(csv_path, index=False)
print("✅ Fast centralities saved.\n")

Computing fast centralities: in-degree, out-degree, PageRank...
✅ Fast centralities saved.



In [5]:
# --- Slow centralities ---
# Approximate betweenness and HITS. Progress is reported with plain print
# statements only; no progress bar is attached to either call.
print("Computing slow centralities: betweenness (approx), HITS...")

# Betweenness is approximated by passing k=100; the fixed seed keeps the
# result repeatable between runs.
print("Calculating betweenness centrality (approx, k=100)...")
bet = nx.betweenness_centrality(G_largest, k=100, seed=42)
bet_frame = pd.DataFrame(bet.items(), columns=["node", "betweenness"])
bet_frame.to_csv("output/betweenness_centrality.csv", index=False)
print("✅ Betweenness centrality saved.\n")

# HITS yields two node -> score mappings (hubs, authorities); each one gets
# its own CSV.
print("Calculating HITS (hubs & authorities, max_iter=100)...")
hubs, authorities = nx.hits(G_largest, max_iter=100, tol=1e-4)
hits_outputs = (
    (hubs, "hub_score", "output/hubs.csv"),
    (authorities, "authority_score", "output/authorities.csv"),
)
for scores, score_col, csv_path in hits_outputs:
    pd.DataFrame(scores.items(), columns=["node", score_col]).to_csv(csv_path, index=False)
print("✅ HITS saved.\n")

Computing slow centralities: betweenness (approx), HITS...
Calculating betweenness centrality (approx, k=100)...
✅ Betweenness centrality saved.

Calculating HITS (hubs & authorities, max_iter=100)...
✅ HITS saved.



In [6]:
# --- Merge existing centralities ---
centrality_files = {
    "in_degree": "output/in_degree.csv",
    "out_degree": "output/out_degree.csv",
    "pagerank": "output/pagerank.csv",
    "betweenness": "output/betweenness_centrality.csv",
    "hub_score": "output/hubs.csv",
    "authority_score": "output/authorities.csv"
}

df = pd.DataFrame()
for name, path in centrality_files.items():
    if os.path.exists(path):
        tmp = pd.read_csv(path)
        tmp = tmp.rename(columns={tmp.columns[1]: name})
        if df.empty:
            df = tmp
        else:
            df = df.merge(tmp, on="node", how="outer")

df.to_csv("output/combined_metrics.csv", index=False)
print("✅ Combined metrics saved to /output/combined_metrics.csv\n")

# Top-10 previews
for col in ["in_degree", "out_degree", "pagerank", "betweenness", "hub_score", "authority_score"]:
    if col in df.columns:
        print(f"Top 10 by {col}:")
        print(df.sort_values(col, ascending=False).head(10)[["node", col]], "\n")

✅ Combined metrics saved to /output/combined_metrics.csv

Top 10 by in_degree:
             node  in_degree
12675  4813567397   0.000037
5687   1905643476   0.000037
86061  8609853337   0.000037
56220  7488855891   0.000037
79424  8341138742   0.000037
54783  7279195366   0.000031
38445  5719116676   0.000031
54761  7279156158   0.000031
64426  8332788739   0.000031
38660  5719237807   0.000031 

Top 10 by out_degree:
              node  out_degree
12675   4813567397    0.000037
56220   7488855891    0.000037
86061   8609853337    0.000037
5687    1905643476    0.000037
79424   8341138742    0.000037
8513    2706474833    0.000031
86068   8609853353    0.000031
123737  8831704986    0.000031
20909   5678424303    0.000031
65502   8332942736    0.000031 

Top 10 by pagerank:
               node  pagerank
53993    7226570628  0.000025
2986      445252475  0.000018
139077  10183503218  0.000017
157457  12116337047  0.000017
36716    5716511033  0.000016
67307    8333205997  0.000016
90439