In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# Leer archivos CORA
citations = pd.read_csv("cora/cora.cites", sep="\t", header=None, names=["target", "source"])

column_names = ["paper_id"] + [f"term_{idx}" for idx in range(1433)] + ["subject"]
papers = pd.read_csv("cora/cora.content", sep="\t", header=None, names=column_names)

In [2]:
# Generar set de train y test
class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}

papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

train_data, test_data = [], []

for _, group_data in papers.groupby("subject"):
    # Select around 50% of the dataset for training.
    random_selection = np.random.rand(len(group_data.index)) <= 0.5
    train_data.append(group_data[random_selection])
    test_data.append(group_data[~random_selection])

train_data = pd.concat(train_data).sample(frac=1)
test_data = pd.concat(test_data).sample(frac=1)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (1362, 1435)
Test data shape: (1346, 1435)


In [3]:
train_data.head()

Unnamed: 0,paper_id,term_0,term_1,term_2,term_3,term_4,term_5,term_6,term_7,term_8,...,term_1424,term_1425,term_1426,term_1427,term_1428,term_1429,term_1430,term_1431,term_1432,subject
1232,1583,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1911,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,5
1037,1012,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1009,2675,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
274,851,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,4


# Manejo de datos específico para nuestras GNNs.
Lo primero es que ahora las GNNs van a funcionar en base a las conexiones entro los papers (además de los features obviamente). La GNN se compila con la info del grado, por lo que el x_train y x_test solo deben tener los id de los nodos relevantes.

In [4]:
feature_names = list(set(papers.columns) - {"paper_id", "subject"})
num_features = len(feature_names)
num_classes = len(class_idx)

# Crear train y test features (X).
x_train = train_data["paper_id"].to_numpy()
x_test = test_data["paper_id"].to_numpy()
# Crear train y test targets (y).
y_train = train_data["subject"]
y_test = test_data["subject"]

In [5]:
# Matriz en forma de lista de pares
edges = citations[["source", "target"]].to_numpy().T

# Agregar peso a cada arista, por ahora son todos 1s, todas valen lo mismo.
edge_weights = torch.ones(edges.shape[1])

# Crear los features para cada nodo.
node_features = torch.tensor(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=torch.float32
)

# El grafo es la unión de estas tres cosas
graph_info = (node_features, edges, edge_weights)

print("Edges shape:", edges.shape)
print("Nodes shape:", node_features.shape)

# El primer vector es la lista de los índices de los nodos source de edges,
# El segundo vector es la lista de los índices de los nodos target
node_indices, neighbour_indices = edges[0], edges[1]

# Imprimir los índices de los nodos y sus vecinos
print("Node indices:", node_indices)
print("Neighbour indices:", neighbour_indices)

Edges shape: (2, 5429)
Nodes shape: torch.Size([2708, 1433])
Node indices: [  21  905  906 ... 2586 1874 2707]
Neighbour indices: [   0    0    0 ... 1874 1876 1897]


## Un modelo para una capa de la GNN
Esta es la capa que va a hacer los pasos de agregación y update. Se van a definir estas operaciones, y las necesarias para que funcione el modelo

In [6]:
def create_MLP(layers_dims, dropout_rate):
    layers = []
    for i in range(len(layers_dims) - 1):
        layers.append(nn.Linear(layers_dims[i], layers_dims[i+1]))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(dropout_rate))
    return nn.Sequential(*layers)

In [7]:
class GNNLayer(nn.Module):
    def __init__(self, input_dim, output_dim, dropout_rate=0.2):
        super(GNNLayer, self).__init__()

        # Preprocesador que transforma las representaciones de nodos antes de la agregación
        self.preprocesador = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )
        # Updater que combina las representaciones de nodos con los mensajes agregados
        self.updater = nn.Sequential(
            nn.Linear(input_dim + output_dim, output_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

    def prepare(self, node_representations, weights=None):
        # Preprocesar las representaciones de los vecinos para la agregación
        messages = self.preprocesador(node_representations)
        if weights is not None:
            messages = messages * weights.unsqueeze(-1)
        return messages

    def aggregate(self, node_indices, neighbour_messages, node_representations):
        node_indices = torch.tensor(node_indices, dtype=torch.long)
        num_nodes = node_representations.size(0)
        # Tensor para almacenar los mensajes agregados
        aggregated_message = torch.zeros((num_nodes, neighbour_messages.size(1)), dtype=neighbour_messages.dtype)
        # Agregar los mensajes de los vecinos a los nodos correspondientes
        aggregated_message.index_add_(0, node_indices, neighbour_messages)
        return aggregated_message

    def update(self, node_representations, aggregated_messages):
        # Concatenar las representaciones de los nodos con los mensajes agregados
        h = torch.cat([node_representations, aggregated_messages], dim=1)
        # Actualizar las representaciones de los nodos utilizando el Updater
        node_embeddings = self.updater(h)
        return node_embeddings

    def forward(self, inputs):
        node_representations, edges, edge_weights = inputs
        node_indices, neighbour_indices = edges
        node_indices = torch.tensor(node_indices, dtype=torch.long)
        neighbour_indices = torch.tensor(neighbour_indices, dtype=torch.long)

        # Obtener las representaciones de los vecinos
        neighbour_representations = node_representations[neighbour_indices]
        # Preprocesar los mensajes de los vecinos
        neighbour_messages = self.prepare(neighbour_representations, edge_weights)
        # Agregar los mensajes de los vecinos
        aggregated_messages = self.aggregate(node_indices, neighbour_messages, node_representations)
        # Actualizar las representaciones de los nodos
        return self.update(node_representations, aggregated_messages)


## Juntando todo en un clasificador
Ahora si, definimos un modelo igual que la vez anterior, solo que ahora usa nuestra capa!



In [8]:
class GNNBasica(nn.Module):
    def __init__(self, graph_info, num_classes, hidden_dims=[32, 32], dropout_rate=0.2):
        super(GNNBasica, self).__init__()
        # Desempaquetar la información del grafo
        node_features, edges, edge_weights = graph_info
        self.node_features = node_features
        self.edges = edges
        self.edge_weights = edge_weights / edge_weights.sum()

        input_dim = node_features.size(1)
        # Crear MLP para preprocesar las características de los nodos
        self.preprocesar = create_MLP([input_dim] + hidden_dims, dropout_rate)

        # Crear capas de la red neuronal gráfica (GNN)
        self.capa1 = GNNLayer(hidden_dims[-1], hidden_dims[-1], dropout_rate)
        self.capa2 = GNNLayer(hidden_dims[-1], hidden_dims[-1], dropout_rate)

        # Crear MLP para el postprocesamiento de los nodos
        self.postprocess = create_MLP(hidden_dims, dropout_rate)
        self.clas = nn.Linear(hidden_dims[-1], num_classes)

    def forward(self, node_indices):
        nodos_preprocesados = self.preprocesar(self.node_features)
        # Pasar las características preprocesadas a la primera capa GNN
        paso_mens1 = self.capa1((nodos_preprocesados, self.edges, self.edge_weights))
        skip1 = nodos_preprocesados + paso_mens1 # Residual connection

        # Pasar a la segunda capa GNN
        paso_mens2 = self.capa2((skip1, self.edges, self.edge_weights))
        skip2 = paso_mens2 + skip1 # Residual connection
        postprocesado = self.postprocess(skip2)

        # Obtener las representaciones para los índices dados
        node_embeddings = postprocesado[node_indices]
        return self.clas(node_embeddings)

In [9]:
# Inicializar modelo
gnn_model = GNNBasica(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_dims=[32, 32],
    dropout_rate=0.2
)

print(gnn_model)

GNNBasica(
  (preprocesar): Sequential(
    (0): Linear(in_features=1433, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=32, out_features=32, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
  )
  (capa1): GNNLayer(
    (preprocesador): Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
    )
    (updater): Sequential(
      (0): Linear(in_features=64, out_features=32, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
    )
  )
  (capa2): GNNLayer(
    (preprocesador): Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
    )
    (updater): Sequential(
      (0): Linear(in_features=64, out_features=32, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
    )
  )
  (postprocess): Sequential(
    (0): Linear(i

In [10]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gnn_model.parameters(), lr=0.01)

class EarlyStopping:
    def __init__(self, patience=50, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = None
        self.early_stop = False
        self.counter = 0
        self.best_weights = None

    def __call__(self, val_acc, model):
        score = val_acc

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
                model.load_state_dict(self.best_weights)
        else:
            self.best_score = score
            self.save_checkpoint(model)
            self.counter = 0

    def save_checkpoint(self, model):
        self.best_weights = model.state_dict()

# Entrenamiento del modelo
x_train_tensor = torch.tensor(x_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)

x_test_tensor = torch.tensor(x_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

# Dividir los datos de entrenamiento en entrenamiento y validación (Keras lo separa internamente con "validation_split")
x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(x_train_tensor, y_train_tensor, test_size=0.15)
x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(x_train_tensor, y_train_tensor, test_size=0.15)

epochs = 300
early_stopping = EarlyStopping(patience=50)

for epoch in range(epochs):
    gnn_model.train()

    optimizer.zero_grad()
    outputs = gnn_model(x_train_split)
    loss = criterion(outputs, y_train_split)
    loss.backward()
    optimizer.step()

    # Validación
    gnn_model.eval()
    val_acc = 0
    with torch.no_grad():
        outputs = gnn_model(x_val_split)
        _, predicted = torch.max(outputs.data, 1)
        correct = (predicted == y_val_split).sum().item()
        total = y_val_split.size(0)
        val_acc = correct / total

    print(f"Epoch {epoch+1:03d}, Loss: {loss.item():.5f}, Val Acc: {val_acc:.4f}")

    early_stopping(val_acc, gnn_model)
    if early_stopping.early_stop:
        print("Early stopping")
        break


  node_indices = torch.tensor(node_indices, dtype=torch.long)


Epoch 001, Loss: 1.93900, Val Acc: 0.2829
Epoch 002, Loss: 1.91375, Val Acc: 0.2829
Epoch 003, Loss: 1.89442, Val Acc: 0.2829
Epoch 004, Loss: 1.87075, Val Acc: 0.2829
Epoch 005, Loss: 1.82836, Val Acc: 0.2829
Epoch 006, Loss: 1.77332, Val Acc: 0.2829
Epoch 007, Loss: 1.72553, Val Acc: 0.2829
Epoch 008, Loss: 1.65797, Val Acc: 0.2878
Epoch 009, Loss: 1.56678, Val Acc: 0.3512
Epoch 010, Loss: 1.48786, Val Acc: 0.3707
Epoch 011, Loss: 1.38816, Val Acc: 0.4049
Epoch 012, Loss: 1.29528, Val Acc: 0.4244
Epoch 013, Loss: 1.20073, Val Acc: 0.4439
Epoch 014, Loss: 1.11611, Val Acc: 0.4732
Epoch 015, Loss: 1.06174, Val Acc: 0.4780
Epoch 016, Loss: 0.96783, Val Acc: 0.5171
Epoch 017, Loss: 0.90052, Val Acc: 0.5171
Epoch 018, Loss: 0.81517, Val Acc: 0.5512
Epoch 019, Loss: 0.74735, Val Acc: 0.5854
Epoch 020, Loss: 0.67847, Val Acc: 0.6049
Epoch 021, Loss: 0.63345, Val Acc: 0.6049
Epoch 022, Loss: 0.53654, Val Acc: 0.6000
Epoch 023, Loss: 0.50542, Val Acc: 0.5902
Epoch 024, Loss: 0.46925, Val Acc:

In [11]:
# Convertir los datos de prueba a tensores
x_test_tensor = torch.tensor(x_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

gnn_model.eval()

# Evaluación
with torch.no_grad():
    outputs = gnn_model(x_test_tensor)
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == y_test_tensor).sum().item()
    total = y_test_tensor.size(0)
    test_acc = correct / total

print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 0.6902


  node_indices = torch.tensor(node_indices, dtype=torch.long)
