## Librerías

In [1]:
!pip install torch_geometric -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
import networkx as nx
import numpy as np
import os
import pandas as pd
import random
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.data import Batch, Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import global_mean_pool, TransformerConv
from torch_geometric.utils import from_networkx

## Lectura desde Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Graph Transformer

In [4]:
class GraphormerEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim=128, num_layers=4, num_heads=4, out_dim=128):
        super().__init__()
        # Proyección lineal del embedding de Alpha Earth
        self.input_proj = nn.Linear(in_dim, hidden_dim)

        # Capas del Graphormer
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(
                TransformerConv(
                    in_channels=hidden_dim,
                    out_channels=hidden_dim,
                    heads=num_heads,
                    concat=False,
                )
            )

        # Normalizaciones por capa (una LayerNorm por cada TransformerConv)
        self.norms = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])

        # Proyección final del embedding del grafo al espacio de salida (embedding de ciudad)
        self.out_proj = nn.Linear(hidden_dim, out_dim)

    def forward(self, x, edge_index, batch):
        """
        x: [N, in_dim]     - features de nodos
        edge_index: [2, E] - aristas del grafo en formato COO
        batch: [N]         - asigna cada nodo a un grafo (aquí, subgrafos / ciudades)
        """
        # Pasamos las features de los nodos al espacio oculto
        h = self.input_proj(x) # [N, hidden_dim]

        # Aplicamos num_layers veces: TransformerConv + residual + LayerNorm + ReLU
        for conv, norm in zip(self.layers, self.norms):
            h_res = h                      # conexión residual
            h = conv(h, edge_index)        # mensaje + atención entre nodos
            h = norm(h + h_res)            # residual + normalización
            h = F.relu(h)                  # no linealidad

        # Readout: agregamos todos los nodos de cada grafo en un solo vector (mean pooling)
        g = global_mean_pool(h, batch)

        # Proyección al espacio de embedding final
        g = self.out_proj(g)

        # Normalizamos para que los embeddings queden en la esfera unitaria (útil para contraste)
        g = F.normalize(g, p=2, dim=-1)
        return g

## Augmentaciones

In [5]:
# Devuelve un grafo que tiene drop_prob de sus aristas enmascaradas
def random_edge_dropout(data, drop_prob=0.2):
    edge_index = data.edge_index
    num_edges = edge_index.size(1)

    mask = torch.rand(num_edges, device=edge_index.device) > drop_prob
    new_edge_index = edge_index[:, mask]

    new_data = data.clone()
    new_data.edge_index = new_edge_index
    return new_data

# Devuelve un grafo que tiene mask_prob de sus features enmascaradas
def random_feature_masking(data, mask_prob=0.2):
    x = data.x.clone()
    mask = torch.rand_like(x) < mask_prob
    x[mask] = 0.0

    new_data = data.clone()
    new_data.x = x
    return new_data

# Aplica las dos transformaciones anteriores.
def graphcl_augment(data):
    v1 = random_edge_dropout(data, drop_prob=0.2)
    v1 = random_feature_masking(v1, mask_prob=0.2)

    v2 = random_edge_dropout(data, drop_prob=0.2)
    v2 = random_feature_masking(v2, mask_prob=0.2)

    return v1, v2

## Contrastive Loss

In [6]:
def nt_xent_loss(z1, z2, temperature=0.2):
    # Número de grafos en z1 (mismos que z2)
    batch_size = z1.size(0)

    # Concatenamos las 2 augmentaciones
    z = torch.cat([z1, z2], dim=0)

    # Similitud mediante producto punto
    sim = torch.matmul(z, z.t())

    # Escalamos con temperatura. Temperaturas más bajas hacen el contraste más agresivo
    sim = sim / temperature

    # Eliminamos la diagonal
    mask = torch.eye(2 * batch_size, device=z.device, dtype=torch.bool)
    sim = sim.masked_fill(mask, -1e9)

    # Construcción de labels
    # Las vistas augmentadas son vistas como pares positivos (i + batch_size)
    labels = torch.arange(2 * batch_size, device=z.device)
    labels = (labels + batch_size) % (2 * batch_size)

    # Cross-entropy
    loss = F.cross_entropy(sim, labels)
    return loss

## Procesamiento del dataset y obtención de subgrafos

In [7]:
# Atributos correctos que TODOS los nodos deben tener
required_attrs = [
    'A00','A01','A02','A03','A04','A05','A06','A07','A08','A09',
    'A10','A11','A12','A13','A14','A15','A16','A17','A18','A19',
    'A20','A21','A22','A23','A24','A25','A26','A27','A28','A29',
    'A30','A31','A32','A33','A34','A35','A36','A37','A38','A39',
    'A40','A41','A42','A43','A44','A45','A46','A47','A48','A49',
    'A50','A51','A52','A53','A54','A55','A56','A57','A58','A59',
    'A60','A61','A62','A63','label','lat','lon','nombre','tipo'
]
required_set = set(required_attrs)

def clean_graph(G):
    """
    Elimina nodos que NO tienen exactamente los atributos requeridos.
    """
    bad_nodes = []

    for n, data in G.nodes(data=True):
        if set(data.keys()) != required_set:
            bad_nodes.append(n)

    # Eliminar
    G.remove_nodes_from(bad_nodes)

    return bad_nodes

In [8]:
A_keys = [f"A{i:02d}" for i in range(64)]

def process_city_graph(path, num_lat_bins=8, num_lon_bins=8, min_nodes=50):
    print(f"\n---------------------------------------")
    print(f"Procesando ciudad: {path}")

    # 1) Leer grafo
    Gx = nx.read_gexf(path)
    total = Gx.number_of_nodes()

    # 2) Limpiar nodos con atributos incompletos
    bad_nodes = clean_graph(Gx)
    remaining = Gx.number_of_nodes()

    print(f"   Nodos totales:         {total}")
    print(f"   Nodos eliminados:      {len(bad_nodes)}")
    print(f"   Proporción eliminada:  {len(bad_nodes)/total:.4%}")
    print(f"   Nodos restantes:       {remaining}")

    # 3) Convertir grafo completo a PyG
    data_raw_full = from_networkx(Gx)

    feat_tensors_full = []
    for k in A_keys:
        t = getattr(data_raw_full, k).view(-1, 1).float()
        feat_tensors_full.append(t)

    x_full = torch.cat(feat_tensors_full, dim=1)

    data_full_city = Data(
        x=x_full,
        edge_index=data_raw_full.edge_index,
        num_nodes=data_raw_full.num_nodes
    )

    # -----------------------
    # 3) Subgrafos por grilla
    # -----------------------
    node_ids = list(Gx.nodes())
    lats = np.array([Gx.nodes[n]['lat'] for n in node_ids])
    lons = np.array([Gx.nodes[n]['lon'] for n in node_ids])

    lat_bins = np.linspace(lats.min(), lats.max(), num_lat_bins + 1)
    lon_bins = np.linspace(lons.min(), lons.max(), num_lon_bins + 1)

    subgraphs_pyg = []

    # helper para subgrafos (reutiliza A_keys)
    def build_data_from_nx(G_sub):
        data_raw = from_networkx(G_sub)

        feat_tensors = []
        for k in A_keys:
            t = getattr(data_raw, k).view(-1, 1).float()
            feat_tensors.append(t)

        x = torch.cat(feat_tensors, dim=1)

        return Data(
            x=x,
            edge_index=data_raw.edge_index,
            num_nodes=data_raw.num_nodes
        )

    # recorrer grilla lat/lon
    for i in range(num_lat_bins):
        for j in range(num_lon_bins):

            mask = (
                (lats >= lat_bins[i]) & (lats < lat_bins[i + 1]) &
                (lons >= lon_bins[j]) & (lons < lon_bins[j + 1])
            )

            idxs = np.where(mask)[0]
            if len(idxs) < min_nodes:
                continue

            nodes_bin = [node_ids[k] for k in idxs]
            G_sub = Gx.subgraph(nodes_bin).copy()

            if G_sub.number_of_edges() == 0:
                continue

            data_sub = build_data_from_nx(G_sub)
            subgraphs_pyg.append(data_sub)

    print("Subgrafos creados:", len(subgraphs_pyg))

    return data_full_city, subgraphs_pyg

In [10]:
folder = "/content/drive/MyDrive/graph_cities/"

full_cities = []     # lista con cada grafo completo en formato PyG
all_subgraphs = []   # lista global de todos los subgrafos de todas las ciudades

city_files = [f for f in os.listdir(folder)]

print(f"Encontrados {len(city_files)} archivos .gexf\n")

for fname in tqdm(city_files, desc="Procesando ciudades"):
    path = os.path.join(folder, fname)

    data_full, subs = process_city_graph(path)

    full_cities.append((fname, data_full))
    all_subgraphs.extend(subs)

print("Total ciudades cargadas:", len(full_cities))
print("Total subgrafos globales:", len(all_subgraphs))

Encontrados 11 archivos .gexf



Procesando ciudades:   0%|          | 0/11 [00:00<?, ?it/s]


---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/lima.gexf
   Nodos totales:         31474
   Nodos eliminados:      139
   Proporción eliminada:  0.4416%
   Nodos restantes:       31335


Procesando ciudades:   9%|▉         | 1/11 [00:49<08:10, 49.04s/it]

Subgrafos creados: 30

---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/paris.gexf
   Nodos totales:         66156
   Nodos eliminados:      0
   Proporción eliminada:  0.0000%
   Nodos restantes:       66156


Procesando ciudades:  18%|█▊        | 2/11 [04:45<23:53, 159.33s/it]

Subgrafos creados: 64

---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/quito.gexf
   Nodos totales:         2064
   Nodos eliminados:      0
   Proporción eliminada:  0.0000%
   Nodos restantes:       2064


Procesando ciudades:  27%|██▋       | 3/11 [04:51<11:55, 89.47s/it] 

Subgrafos creados: 14

---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/washington.gexf
   Nodos totales:         5127
   Nodos eliminados:      0
   Proporción eliminada:  0.0000%
   Nodos restantes:       5127


Procesando ciudades:  36%|███▋      | 4/11 [05:02<06:47, 58.26s/it]

Subgrafos creados: 34

---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/berlin.gexf
   Nodos totales:         49370
   Nodos eliminados:      151
   Proporción eliminada:  0.3059%
   Nodos restantes:       49219


Procesando ciudades:  45%|████▌     | 5/11 [06:29<06:52, 68.76s/it]

Subgrafos creados: 58

---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/bogota.gexf
   Nodos totales:         30627
   Nodos eliminados:      353
   Proporción eliminada:  1.1526%
   Nodos restantes:       30274


Procesando ciudades:  55%|█████▍    | 6/11 [07:45<05:55, 71.02s/it]

Subgrafos creados: 15

---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/buenos_aires.gexf
   Nodos totales:         45526
   Nodos eliminados:      0
   Proporción eliminada:  0.0000%
   Nodos restantes:       45526


Procesando ciudades:  64%|██████▎   | 7/11 [10:02<06:10, 92.65s/it]

Subgrafos creados: 54

---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/roma.gexf
   Nodos totales:         21754
   Nodos eliminados:      0
   Proporción eliminada:  0.0000%
   Nodos restantes:       21754


Procesando ciudades:  73%|███████▎  | 8/11 [10:37<03:42, 74.23s/it]

Subgrafos creados: 49

---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/londres.gexf
   Nodos totales:         79500
   Nodos eliminados:      2459
   Proporción eliminada:  3.0931%
   Nodos restantes:       77041


Procesando ciudades:  82%|████████▏ | 9/11 [12:43<03:00, 90.39s/it]

Subgrafos creados: 64

---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/santiago.gexf
   Nodos totales:         29267
   Nodos eliminados:      0
   Proporción eliminada:  0.0000%
   Nodos restantes:       29267


Procesando ciudades:  91%|█████████ | 10/11 [13:27<01:16, 76.15s/it]

Subgrafos creados: 46

---------------------------------------
Procesando ciudad: /content/drive/MyDrive/graph_cities/johannesburgo.gexf
   Nodos totales:         6307
   Nodos eliminados:      16
   Proporción eliminada:  0.2537%
   Nodos restantes:       6291


Procesando ciudades: 100%|██████████| 11/11 [13:35<00:00, 74.13s/it]

Subgrafos creados: 26
Total ciudades cargadas: 11
Total subgrafos globales: 454





In [14]:
save_path = "/content/drive/MyDrive/precomputed_graphs.pt"

torch.save(
    {
        "full_cities": full_cities,
        "all_subgraphs": all_subgraphs,
    },
    save_path,
)
print("Guardado en:", save_path)

Guardado en: /content/drive/MyDrive/precomputed_graphs.pt


## Definición del Dataset (pytorch)

In [11]:
class CitySubgraphDataset(Dataset):
    def __init__(self, graphs):
        super().__init__()
        self.graphs = graphs

    def len(self):
        return len(self.graphs)

    def get(self, idx):
        return self.graphs[idx]

dataset = CitySubgraphDataset(all_subgraphs)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

## Bucle de entrenamiento

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dimension del embedding del nodo (64)
in_dim = dataset[0].x.size(1)
model = GraphormerEncoder(in_dim=in_dim, hidden_dim=128, num_layers=4, num_heads=4, out_dim=128).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(1, 51):
    model.train()
    total_loss = 0.0
    total_graphs = 0

    for batch_data in loader:
        batch_data = batch_data.to(device)

        # Separar en lista de grafos individuales
        data_list = batch_data.to_data_list()

        # Se crean las augmentaciones
        v1_list, v2_list = [], []
        for g in data_list:
            a1, a2 = graphcl_augment(g)
            v1_list.append(a1)
            v2_list.append(a2)

        # Se modelan como batches independientes
        v1_batch = Batch.from_data_list(v1_list).to(device)
        v2_batch = Batch.from_data_list(v2_list).to(device)

        # Se obtienen sus embeddings
        z1 = model(v1_batch.x, v1_batch.edge_index, v1_batch.batch)
        z2 = model(v2_batch.x, v2_batch.edge_index, v2_batch.batch)

        # Se cálcula la pérdida
        loss = nt_xent_loss(z1, z2, temperature=0.2)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * z1.size(0)
        total_graphs += z1.size(0)

    avg_loss = total_loss / total_graphs
    print(f"Epoch {epoch:03d} | Loss: {avg_loss:.4f}")

Epoch 001 | Loss: 0.7301
Epoch 002 | Loss: 0.5180
Epoch 003 | Loss: 0.4593
Epoch 004 | Loss: 0.4501
Epoch 005 | Loss: 0.4206
Epoch 006 | Loss: 0.3966
Epoch 007 | Loss: 0.3546
Epoch 008 | Loss: 0.3759
Epoch 009 | Loss: 0.3867
Epoch 010 | Loss: 0.3150
Epoch 011 | Loss: 0.3508
Epoch 012 | Loss: 0.3361
Epoch 013 | Loss: 0.2884
Epoch 014 | Loss: 0.3298
Epoch 015 | Loss: 0.3338
Epoch 016 | Loss: 0.3198
Epoch 017 | Loss: 0.3077
Epoch 018 | Loss: 0.3157
Epoch 019 | Loss: 0.3220
Epoch 020 | Loss: 0.2973
Epoch 021 | Loss: 0.3142
Epoch 022 | Loss: 0.2952
Epoch 023 | Loss: 0.2824
Epoch 024 | Loss: 0.2633
Epoch 025 | Loss: 0.2697
Epoch 026 | Loss: 0.2898
Epoch 027 | Loss: 0.2821
Epoch 028 | Loss: 0.2860
Epoch 029 | Loss: 0.2703
Epoch 030 | Loss: 0.2504
Epoch 031 | Loss: 0.2860
Epoch 032 | Loss: 0.2894
Epoch 033 | Loss: 0.2567
Epoch 034 | Loss: 0.2492
Epoch 035 | Loss: 0.2517
Epoch 036 | Loss: 0.2674
Epoch 037 | Loss: 0.2721
Epoch 038 | Loss: 0.2518
Epoch 039 | Loss: 0.2720
Epoch 040 | Loss: 0.2891


In [None]:
save_path = "/content/drive/MyDrive/graphcl_city_encoder.pt"
torch.save(model.state_dict(), save_path)
print("Modelo guardado en:", save_path)


In [12]:
in_dim = dataset[0].x.size(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GraphormerEncoder(in_dim=in_dim, hidden_dim=128, num_layers=4, num_heads=4, out_dim=128).to(device)

load_path = "/content/drive/MyDrive/graphcl_city_encoder.pt"
model.load_state_dict(torch.load(load_path, map_location=device))
model.eval()

GraphormerEncoder(
  (input_proj): Linear(in_features=64, out_features=128, bias=True)
  (layers): ModuleList(
    (0-3): 4 x TransformerConv(128, 128, heads=4)
  )
  (norms): ModuleList(
    (0-3): 4 x LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (out_proj): Linear(in_features=128, out_features=128, bias=True)
)

In [13]:
results = []  # para guardar resultados por ciudad

for fname, city_data in full_cities:
    print(f"\n=== Ciudad: {fname} ===")

    # --- 1) Embedding del grafo completo (original) ---
    g = city_data.to(device)
    batch_full = torch.zeros(g.num_nodes, dtype=torch.long, device=device)

    with torch.no_grad():
        z_orig = model(g.x, g.edge_index, batch_full)[0]  # [128]

    # --- 2) Dos augmentaciones del grafo completo ---
    a1, a2 = graphcl_augment(g)

    b1 = Batch.from_data_list([a1]).to(device)
    b2 = Batch.from_data_list([a2]).to(device)

    with torch.no_grad():
        z1 = model(b1.x, b1.edge_index, b1.batch)[0]
        z2 = model(b2.x, b2.edge_index, b2.batch)[0]

    # --- 3) Similitudes coseno ---
    cos_orig_a1 = F.cosine_similarity(z_orig, z1, dim=0).item()
    cos_orig_a2 = F.cosine_similarity(z_orig, z2, dim=0).item()
    cos_a1_a2   = F.cosine_similarity(z1, z2, dim=0).item()

    print(f"cos(city, aug1) = {cos_orig_a1:.4f}")
    print(f"cos(city, aug2) = {cos_orig_a2:.4f}")
    print(f"cos(aug1, aug2) = {cos_a1_a2:.4f}")

    results.append({
        "city": fname,
        "cos_city_aug1": cos_orig_a1,
        "cos_city_aug2": cos_orig_a2,
        "cos_aug1_aug2": cos_a1_a2,
    })


=== Ciudad: lima.gexf ===
cos(city, aug1) = 0.9609
cos(city, aug2) = 0.9615
cos(aug1, aug2) = 1.0000

=== Ciudad: paris.gexf ===
cos(city, aug1) = 0.9875
cos(city, aug2) = 0.9880
cos(aug1, aug2) = 1.0000

=== Ciudad: quito.gexf ===
cos(city, aug1) = 0.9964
cos(city, aug2) = 0.9963
cos(aug1, aug2) = 0.9999

=== Ciudad: washington.gexf ===
cos(city, aug1) = 0.9879
cos(city, aug2) = 0.9902
cos(aug1, aug2) = 0.9997

=== Ciudad: berlin.gexf ===
cos(city, aug1) = 0.9796
cos(city, aug2) = 0.9795
cos(aug1, aug2) = 0.9999

=== Ciudad: bogota.gexf ===
cos(city, aug1) = 0.9918
cos(city, aug2) = 0.9913
cos(aug1, aug2) = 1.0000

=== Ciudad: buenos_aires.gexf ===
cos(city, aug1) = 0.9776
cos(city, aug2) = 0.9772
cos(aug1, aug2) = 1.0000

=== Ciudad: roma.gexf ===
cos(city, aug1) = 0.9898
cos(city, aug2) = 0.9898
cos(aug1, aug2) = 1.0000

=== Ciudad: londres.gexf ===
cos(city, aug1) = 0.9908
cos(city, aug2) = 0.9908
cos(aug1, aug2) = 1.0000

=== Ciudad: santiago.gexf ===
cos(city, aug1) = 0.9776
cos

In [15]:
city_embeddings = {}

for fname, city_data in full_cities:
    print(f"\n=== Ciudad: {fname} ===")

    g = city_data.to(device)
    batch_full = torch.zeros(g.num_nodes, dtype=torch.long, device=device)

    with torch.no_grad():
        z_orig = model(g.x, g.edge_index, batch_full)[0]  # vector [128]

    # Guardamos en diccionario (como tensor CPU)
    city_embeddings[fname] = z_orig.cpu()

# Guardar el archivo
save_path = "/content/drive/MyDrive/city_embeddings.pt"
torch.save(city_embeddings, save_path)

print("Embeddings guardados en:", save_path)


=== Ciudad: lima.gexf ===

=== Ciudad: paris.gexf ===

=== Ciudad: quito.gexf ===

=== Ciudad: washington.gexf ===

=== Ciudad: berlin.gexf ===

=== Ciudad: bogota.gexf ===

=== Ciudad: buenos_aires.gexf ===

=== Ciudad: roma.gexf ===

=== Ciudad: londres.gexf ===

=== Ciudad: santiago.gexf ===

=== Ciudad: johannesburgo.gexf ===
Embeddings guardados en: /content/drive/MyDrive/city_embeddings.pt


In [16]:
city_embeddings = torch.load("/content/drive/MyDrive/city_embeddings.pt")
print(city_embeddings.keys())
print(city_embeddings["santiago.gexf"].shape)

dict_keys(['lima.gexf', 'paris.gexf', 'quito.gexf', 'washington.gexf', 'berlin.gexf', 'bogota.gexf', 'buenos_aires.gexf', 'roma.gexf', 'londres.gexf', 'santiago.gexf', 'johannesburgo.gexf'])
torch.Size([128])


In [19]:
cities = list(city_embeddings.keys())

# Pila de embeddings -> tensor [num_cities, dim]
emb_matrix = torch.stack([city_embeddings[c] for c in cities])   # shape [C, D]

In [20]:
cos_sim_matrix = emb_matrix @ emb_matrix.t()   # producto punto = coseno por normalización

In [23]:
df_cos = pd.DataFrame(
    cos_sim_matrix.cpu().numpy(),
    index=cities,
    columns=cities
)

In [24]:
df_cos

Unnamed: 0,lima.gexf,paris.gexf,quito.gexf,washington.gexf,berlin.gexf,bogota.gexf,buenos_aires.gexf,roma.gexf,londres.gexf,santiago.gexf,johannesburgo.gexf
lima.gexf,1.0,0.085749,-0.024219,0.013323,0.16713,0.013016,0.107667,0.1532,0.154314,0.013256,0.076943
paris.gexf,0.085749,1.0,-0.046359,-0.051371,0.09245,0.153589,0.128131,0.31102,0.320938,-0.031447,-0.10853
quito.gexf,-0.024219,-0.046359,1.0,-0.056939,-0.132539,0.002274,0.059456,0.094451,0.072713,0.081641,0.157115
washington.gexf,0.013323,-0.051371,-0.056939,1.0,-0.088347,-0.032739,0.093363,-0.016449,0.141339,0.009253,0.039556
berlin.gexf,0.16713,0.09245,-0.132539,-0.088347,1.0,-0.053185,-0.122023,-0.200518,0.114098,-0.198178,-0.007097
bogota.gexf,0.013016,0.153589,0.002274,-0.032739,-0.053185,1.0,0.042902,0.243877,0.074936,-0.075615,-0.111521
buenos_aires.gexf,0.107667,0.128131,0.059456,0.093363,-0.122023,0.042902,1.0,-0.027223,0.234041,0.270748,0.04167
roma.gexf,0.1532,0.31102,0.094451,-0.016449,-0.200518,0.243877,-0.027223,1.0,0.273561,-0.099562,0.093502
londres.gexf,0.154314,0.320938,0.072713,0.141339,0.114098,0.074936,0.234041,0.273561,1.0,-0.056566,-0.032398
santiago.gexf,0.013256,-0.031447,0.081641,0.009253,-0.198178,-0.075615,0.270748,-0.099562,-0.056566,1.0,-0.100316


In [25]:
def top_k_pairs(df, k=10):
    # Convertir matriz a formato largo
    df_long = (
        df.stack()
          .reset_index()
          .rename(columns={"level_0": "city1", "level_1": "city2", 0: "sim"})
    )

    # eliminar pares con la misma ciudad
    df_long = df_long[df_long["city1"] != df_long["city2"]]

    # eliminar duplicados: mantener solo city1 < city2
    df_long = df_long[df_long["city1"] < df_long["city2"]]

    # ordenar y devolver
    return df_long.sort_values("sim", ascending=False).head(k)

top_pairs = top_k_pairs(df_cos, k=10)
top_pairs

Unnamed: 0,city1,city2,sim
89,londres.gexf,paris.gexf,0.320938
18,paris.gexf,roma.gexf,0.31102
95,londres.gexf,roma.gexf,0.273561
75,buenos_aires.gexf,santiago.gexf,0.270748
62,bogota.gexf,roma.gexf,0.243877
74,buenos_aires.gexf,londres.gexf,0.234041
44,berlin.gexf,lima.gexf,0.16713
112,johannesburgo.gexf,quito.gexf,0.157115
8,lima.gexf,londres.gexf,0.154314
56,bogota.gexf,paris.gexf,0.153589


## Sacamos el embedding de Santiago

In [None]:
data_santiago = data_santiago.to(device)
batch_full = torch.zeros(data_santiago.num_nodes, dtype=torch.long, device=device)

model.eval()
with torch.no_grad():
    emb_santiago = model(data_santiago.x, data_santiago.edge_index, batch_full)  # [1, out_dim]
    emb_santiago = emb_santiago[0]
    print("Embedding de Santiago:", emb_santiago.shape)

Embedding de Santiago: torch.Size([128])


## Verificar que acerca augmentaciones y separa subgrafos distintos

### Subgrafos al azar

In [None]:
model.eval()

# Tomamos un subgrafo al azar
g = random.choice(subgraphs_pyg)
g = g.to(device)

# Sacamos embedding del subgrafo original
b_orig = Batch.from_data_list([g]).to(device)
with torch.no_grad():
    z_orig = model(b_orig.x, b_orig.edge_index, b_orig.batch)[0]

# Augmentamos el grafo
a1, a2 = graphcl_augment(g)

b1 = Batch.from_data_list([a1]).to(device)
b2 = Batch.from_data_list([a2]).to(device)

with torch.no_grad():
    z1 = model(b1.x, b1.edge_index, b1.batch)[0]
    z2 = model(b2.x, b2.edge_index, b2.batch)[0]

# Cosenos entre original y augmentaciones
cos_orig_a1 = F.cosine_similarity(z_orig, z1, dim=0).item()
cos_orig_a2 = F.cosine_similarity(z_orig, z2, dim=0).item()
cos_a1_a2   = F.cosine_similarity(z1, z2, dim=0).item()

print("cosine(original, vista1) =", cos_orig_a1)
print("cosine(original, vista2) =", cos_orig_a2)
print("cosine(vista1, vista2)   =", cos_a1_a2)

# Sacamos otro grafo al azar
g2 = random.choice([h for h in subgraphs_pyg if h is not g]).to(device)
b3 = Batch.from_data_list([g2]).to(device)

with torch.no_grad():
    z3 = model(b3.x, b3.edge_index, b3.batch)[0]

# Comparamos su similaridad
cos_diff = F.cosine_similarity(z_orig, z3, dim=0).item()
print("cosine(original, otro subgrafo) =", cos_diff)


cosine(original, vista1) = 0.9545512199401855
cosine(original, vista2) = 0.9691970348358154
cosine(vista1, vista2)   = 0.995003342628479
cosine(original, otro subgrafo) = -0.06205920875072479


### Grafo Santiago

In [None]:
model.eval()

# grafo completo
g = data_santiago.to(device)

# augmentaciones del grafo completo
a1, a2 = graphcl_augment(g)

b_orig = Batch.from_data_list([g]).to(device)
b1     = Batch.from_data_list([a1]).to(device)
b2     = Batch.from_data_list([a2]).to(device)

with torch.no_grad():
    z_orig = model(b_orig.x, b_orig.edge_index, b_orig.batch)[0]
    z1     = model(b1.x, b1.edge_index, b1.batch)[0]
    z2     = model(b2.x, b2.edge_index, b2.batch)[0]

print("cos(original, vista1) =", F.cosine_similarity(z_orig, z1, dim=0).item())
print("cos(original, vista2) =", F.cosine_similarity(z_orig, z2, dim=0).item())
print("cos(vista1, vista2)   =", F.cosine_similarity(z1, z2, dim=0).item())

cos(original, vista1) = 0.9369595646858215
cos(original, vista2) = 0.9363678693771362
cos(vista1, vista2)   = 0.9998849630355835


In [None]:
g = random.choice(subgraphs_pyg).to(device)
b = Batch.from_data_list([g]).to(device)
with torch.no_grad():
    z_sub = model(b.x, b.edge_index, b.batch)[0]

cos_orig_sub = F.cosine_similarity(z_orig, z_sub, dim=0).item()
print("cos(Santiago completo, subgrafo al azar) =", cos_orig_sub)

cos(Santiago completo, subgrafo al azar) = 0.08978444337844849
