In [1]:
import torch
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from torch_geometric.data import Data
from scipy.stats import pearsonr
from tqdm import tqdm 

from publib import set_style, fix_style

from utils import (
    compute_k_hop_homophily, 
    compute_k_hop_eccentricity,
    compute_k_hop_euclidean,
    create_partition_labels,
)

# Select publib's 'article' style for the figures produced later in this notebook
# (the plotting cell calls fix_style('article') once the figure is drawn).
set_style(['article'])

In [2]:
# --- Analysis parameters ---------------------------------------------------
# Hop radius: selects the `node_labels_nhop-{k_hop}.pt` file and is forwarded
# as `k` to the k-hop Euclidean / eccentricity helpers.
k_hop = 16
# Number of partitions handed to create_partition_labels (also the one-hot width).
n_chunks = 10
# Forwarded to compute_k_hop_homophily.
K = 5

# --- Paths -----------------------------------------------------------------
dataset_name = "paris"
dataset_path = "./road_data/" + dataset_name
save_path = "./"

# --- Per-city plot cosmetics (all keyed by dataset name) -------------------
# Legend text.
d_labels = dict(paris="Paris", shanghai="Shanghai", la="L.A.", london="London")

# Line / marker / bar colour.
d_color = {
    # Red   -- alternative "#9c251c"; rgb(197, 55, 62)
    "paris": "#c5373e",
    # Blue  -- alternative "#00498d"; rgb(0, 110, 174)
    "shanghai": "#006eae",
    # Green -- alternative "#1c6e2b"; rgb(67, 145, 48)
    "la": "#439130",
    # Grey  -- alternative "#43536a"; rgb(110, 120, 142)
    "london": "#6e788e",
}

# Scatter marker.
d_marker = dict(paris="x", shanghai="o", la="^", london="s")

In [3]:
# Per-city result containers, all keyed by dataset name.
homophily_scores, eccentricities, euclidean_distances = {}, {}, {}
k_hop_distances_dict, label_dict, sampled_nodes = {}, {}, {}

# Switches for the optional analyses. All three are off here, so
# `euclidean_distances`, `eccentricities`, `sampled_nodes` and `homophily_scores`
# stay empty unless a flag is flipped. The correlation and plotting cells further
# down read `euclidean_distances` and `eccentricities`.
compute_homophily = compute_eccentricity = compute_euclidean = False

for dataset_name in ["paris", "shanghai", "la", "london"]:
    dataset_path = f'./road_data/{dataset_name}'

    # Tensors stored on disk for this city.
    k_hop_distances = torch.load(f'{dataset_path}/node_labels_nhop-{k_hop}.pt')
    node_feat = torch.load(f'{dataset_path}/node_features.pt')
    edge_feat = torch.load(f'{dataset_path}/edge_features.pt')
    edge_index = torch.load(f'{dataset_path}/edge_indices.pt')
    k_hop_distances_dict[dataset_name] = k_hop_distances

    # Keep the integer partition labels; `label` itself becomes their one-hot encoding.
    label_dict[dataset_name] = create_partition_labels(k_hop_distances, n_chunks)
    label = torch.nn.functional.one_hot(label_dict[dataset_name], num_classes=n_chunks)

    # Re-seed for every city so each node sample is reproducible on its own.
    torch.manual_seed(0)
    sample_size = 1000
    # Node count is taken as the largest index appearing in edge_index, plus one.
    num_nodes = edge_index.max().item() + 1
    random_samples = torch.randperm(num_nodes)[:sample_size]

    if compute_euclidean:
        distances, nodes = compute_k_hop_euclidean(
            node_feat,
            edge_index,
            edge_weight=edge_feat[:, 0],
            k=k_hop,
            node_samples=random_samples,
            sample_rate=0.1
        )
        euclidean_distances[dataset_name] = distances
        sampled_nodes[dataset_name] = nodes

    if compute_eccentricity:
        ecc_values, nodes = compute_k_hop_eccentricity(
            edge_index,
            edge_weight=edge_feat[:, 0],
            k=k_hop,
            node_samples=random_samples,
            sample_rate=0.1
        )
        eccentricities[dataset_name] = ecc_values
        # Overwrites the entry written by the Euclidean branch, if any.
        sampled_nodes[dataset_name] = nodes

    if compute_homophily:
        data = Data(y=label, edge_index=edge_index)
        # All-ones mask with one entry per row of the one-hot label matrix.
        mask = torch.ones(label.shape[0], dtype=torch.long)

        score = compute_k_hop_homophily(data.y, data.edge_index, mask, K, sampling_rate=0.1)
        homophily_scores[dataset_name] = score
        print(f"Homophily score for {dataset_name} is: {score:.4f}")

Creating labels by splitting data into 10 chunks
Creating labels by splitting data into 10 chunks
Creating labels by splitting data into 10 chunks
Creating labels by splitting data into 10 chunks


In [None]:
# Pearson correlation between the sampled k-hop Euclidean distances and the
# k-hop eccentricities of each city.
#
# Result layout: correlations[city] = [euclidean_list, eccentricity_km_list, pearson_r]
#
# Both inputs are only filled by the loading cell when `compute_euclidean` and
# `compute_eccentricity` are enabled. Fail with an actionable message instead of
# a bare KeyError('paris') when they were left off.
correlations = {}
for dataset_name in ["paris", "shanghai", "la", "london"]:
    missing = [
        store_name
        for store_name, store in (
            ("euclidean_distances", euclidean_distances),
            ("eccentricities", eccentricities),
        )
        if dataset_name not in store
    ]
    if missing:
        raise KeyError(
            f"No entry for '{dataset_name}' in {', '.join(missing)}: set compute_euclidean "
            "and compute_eccentricity to True in the loading cell and re-run it first."
        )

    # as_tensor accepts lists, arrays and tensors alike and, unlike torch.tensor,
    # does not emit a copy-construct warning when the helper already returned a tensor.
    x1 = torch.as_tensor(euclidean_distances[dataset_name])
    x2 = torch.as_tensor(eccentricities[dataset_name]) / 1000  # Change from meters to km

    # Keep only samples with a non-negative Euclidean distance.
    # NOTE(review): negative values are presumably an "invalid sample" sentinel --
    # confirm against compute_k_hop_euclidean.
    mask = x1 >= 0
    x1, x2 = x1[mask], x2[mask]

    corr, _ = pearsonr(x1, x2)
    correlations[dataset_name] = [x1.tolist(), x2.tolist(), corr]

n_chunk=10 {'paris': 0.43517953157424927, 'shanghai': 0.4792105555534363, 'la': 0.46932077407836914, 'london': 0.47633564472198486}

n_chunk=20 {'paris': 0.5819949507713318, 'shanghai': 0.5968075394630432, 'la': 0.5921060442924, 'london': 0.5986458659172058}



In [2]:
num_bins = 50
fontsize = 26

# Two side-by-side panels: (a) distance histograms, (b) distance vs. eccentricity.
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
hist_ax, corr_ax = ax

# ---- Panel (a): one overlaid, density-normalised histogram per city ----------
hist_style = dict(
    bins=num_bins,
    range=(0, 15),  # restrict the bins to the main part of the distribution
    density=True,
    alpha=0.7,
    edgecolor='black',
)
for dataset_name in ["paris", "shanghai", "la", "london"]:
    hist_ax.hist(
        x=euclidean_distances[dataset_name],
        label=d_labels[dataset_name],
        color=d_color[dataset_name],
        **hist_style
    )

hist_ax.set_title("(a) Distributions of distance", fontsize=fontsize + 2)
hist_ax.set_xlabel("Euclidean distance (km)", fontsize=fontsize)
hist_ax.set_ylabel("Density", fontsize=fontsize)
hist_ax.tick_params(axis='both', labelsize=fontsize - 3)
hist_ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f'))
hist_ax.grid(visible=True, linestyle='--', alpha=0.6)
hist_ax.legend(fontsize=fontsize - 5, frameon=True)

# ---- Panel (b): scatter + linear fit on the first 200 samples of each city ---
for dataset_name in ["paris", "shanghai", "la", "london"]:
    dist_values, ecc_values, pearson_r = correlations[dataset_name]
    x1, x2 = dist_values[:200], ecc_values[:200]
    sns.regplot(
        x=x1,
        y=x2,
        ax=corr_ax,
        ci=95,
        scatter=True,
        marker=d_marker[dataset_name],
        color=d_color[dataset_name],
        label=f"{d_labels[dataset_name]}; Corr={pearson_r:.2f}",
        line_kws={"linewidth": 3},
        scatter_kws={"alpha": 0.6},
    )

corr_ax.set_xlim(-0.5, 8)
corr_ax.set_ylim(-0.5, 12)

corr_ax.set_title("(b) Correlation plots", fontsize=fontsize + 2)
corr_ax.set_xlabel("Euclidean distance (km)", fontsize=fontsize)
corr_ax.set_ylabel("Eccentricity (km)", fontsize=fontsize)
corr_ax.tick_params(axis='both', labelsize=fontsize - 3)
corr_ax.legend(fontsize=fontsize - 6, frameon=True)
corr_ax.grid(True, linestyle='--', alpha=0.7)

# ---- Finalise: layout, publib style fix-up, write to disk, display -----------
plt.tight_layout()
fix_style('article')
plt.savefig(f"{save_path}/label_distance_distributions.jpg", bbox_inches='tight')
plt.show()

NameError: name 'plt' is not defined