## **Practico 3: Entrenamos graph neural networks**

Se entrenarán tres modelos de GNN:
1. GCN
2. GraphSAGE
3. GAT

In [19]:
# CELDA 1: Imports y configuración básica
# ------------------------------------------------------------
import os
import random
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn, optim
from torch_geometric.datasets import EllipticBitcoinDataset
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


In [20]:
# CELDA 2: Cargar dataset (PyG)
# ------------------------------------------------------------
root = 'data/elliptic'   # carpeta donde se guardará/descargará
dataset = EllipticBitcoinDataset(root=root)

# El dataset entregará un objeto Data (gráfico completo) en dataset[0]
data = dataset[0]
print(data)
# típicamente: Data(x=[N, 166], edge_index=[2, E], y=[N], ...)


Data(x=[203769, 165], edge_index=[2, 234355], y=[203769], train_mask=[203769], test_mask=[203769])


In [21]:
# CELDA 3: Exploración rápida de etiquetas / clases
# ------------------------------------------------------------
# Observa distribución de etiquetas (en la mayoría de implementaciones: 0 unknown, 1 illicit, 2 licit)
ys = data.y.cpu().numpy()
(unique, counts) = np.unique(ys, return_counts=True)
print(dict(zip(unique, counts)))
# Verifica número de nodos, features:
print("N nodos:", data.num_nodes)
print("N edges:", data.num_edges)
print("dim features:", data.num_node_features if hasattr(data, 'num_node_features') else data.x.shape[1])


{np.int64(0): np.int64(42019), np.int64(1): np.int64(4545), np.int64(2): np.int64(157205)}
N nodos: 203769
N edges: 234355
dim features: 165


In [None]:
labeled_indices = torch.where(data.y >= 0)[0]  # todos los nodos con label válido


In [22]:
# CELDA 4: Pipeline de preprocesamiento (sklearn-style) para tensores
# ------------------------------------------------------------
# Vamos a construir un Transformer compatible con sklearn que acepta y devuelve
# tensores de torch (o arrays) para integrarlo en un Pipeline.
class TorchFeatureTransformer(BaseEstimator, TransformerMixin):
    """
    Encapsula transformaciones sklearn (StandardScaler) para features de nodos
    representadas como torch.Tensor.
    El pipeline internamente convierte a numpy, aplica scaler y vuelve a torch.
    """
    def __init__(self, scaler=None):
        self.scaler = scaler if scaler is not None else StandardScaler()
        self.fitted = False

    def fit(self, X, y=None):
        # X puede ser torch.Tensor o np.array de shape (N, F)
        arr = X.detach().cpu().numpy() if torch.is_tensor(X) else np.asarray(X)
        self.scaler.fit(arr)
        self.fitted = True
        return self

    def transform(self, X):
        arr = X.detach().cpu().numpy() if torch.is_tensor(X) else np.asarray(X)
        arr_t = self.scaler.transform(arr)
        return torch.tensor(arr_t, dtype=torch.float32)

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y)
        return self.transform(X)

# Crea pipeline (útil si más pasos se agregan)
feature_pipeline = Pipeline([
    ('torch_transform', TorchFeatureTransformer(scaler=StandardScaler()))
])


In [23]:
# CELDA 5: Preparar X, y y máscaras de entrenamiento/validación/test
# ------------------------------------------------------------
# Etiquetas: 0 = unknown, 1 = illicit, 2 = licit  (verifica en tu versión si cambia)
# Nosotros definimos problema binario: illicit (1) vs licit (0). Filtramos 'unknown'.
labels = data.y.clone().cpu()  # tensor (N,)
# Crea máscara de nodos etiquetados (no-unknown)
labeled_mask = labels != 0

# Mapear: illicit -> 1, licit -> 0
# asumimos: 1 = illicit, 2 = licit -> transformar a {0,1}
y_binary = labels.clone()
y_binary[labels == 2] = 0  # licit -> 0
y_binary[labels == 1] = 1  # illicit -> 1
y_binary[labels == 0] = -1  # unknown marcado como -1 para ignorar

# Extraer índices etiquetados
labeled_idx = torch.where(labeled_mask)[0].cpu().numpy()
y_labeled = y_binary[labeled_idx].cpu().numpy().astype(int)

# Hacemos un split estratificado sobre los nodos etiquetados
train_idx_rel, test_idx_rel, y_train_rel, y_test_rel = train_test_split(
    np.arange(len(labeled_idx)),
    y_labeled,
    test_size=0.2,
    stratify=y_labeled,
    random_state=42
)
train_idx = labeled_idx[train_idx_rel]
test_idx = labeled_idx[test_idx_rel]

# De test separamos validación (80/20 sobre test)
val_rel, test_rel, y_val_rel, y_test_rel = train_test_split(
    np.arange(len(test_idx)),
    y_labeled[test_idx_rel],
    test_size=0.5,
    stratify=y_labeled[test_idx_rel],
    random_state=42
)
val_idx = test_idx[val_rel]
test_idx = test_idx[test_rel]

# Construimos boolean masks en formato torch (útiles para PyG training)
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
val_mask   = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask  = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

print("N train:", train_mask.sum().item(), "N val:", val_mask.sum().item(), "N test:", test_mask.sum().item())


N train: 129400 N val: 16175 N test: 16175


In [24]:
# CELDA 6: Aplicar pipeline de features (fit sólo con nodos de entrenamiento o con todos?)
# ------------------------------------------------------------
# Opción 1 (recomendado): ajustar scaler usando sólo nodos de entrenamiento etiquetados
X = data.x  # tensor [N, F]
feature_pipeline.fit(X[train_mask])   # ajustamos con distrib. de train
X_trans = feature_pipeline.transform(X)  # tensor transformado (N, F)
data.x = X_trans  # reemplazamos features en el objeto Data
print("Features transformadas. shape:", data.x.shape)


Features transformadas. shape: torch.Size([203769, 165])




In [25]:
# CELDA 7: Definir modelo GCN simple (2 capas)
# ------------------------------------------------------------
class GCNNet(nn.Module):
    def __init__(self, in_channels, hidden_channels=64, out_channels=2, dropout=0.5):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x  # logits (N, out_channels)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCNNet(in_channels=data.num_node_features, hidden_channels=64, out_channels=2, dropout=0.5).to(device)
data = data.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()


In [26]:
# CELDA 8: Training loop (usando masks)
# ------------------------------------------------------------
def train(model, data, train_mask, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(data)  # logits for all nodes (N, 2)
    # aplicamos mask para loss (solo nodos de entrenamiento)
    loss = criterion(out[train_mask], data.y[train_mask].long())
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(model, data, mask):
    model.eval()
    out = model(data)
    probs = F.softmax(out, dim=1)
    preds = probs.argmax(dim=1)
    y_true = data.y[mask].cpu().numpy()
    y_pred = preds[mask].cpu().numpy()
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', pos_label=1)
    return {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'preds': preds}

# Ajuste: las etiquetas de data.y deben ser 0 o 1 para nodo etiquetado.
# En nuestro pipeline, transformamos labels (2->0, 1->1). Asegurémonos:
# los nodos unknown deben quedar fuera por las masks (no se usarán).
data.y = y_binary.to(device)  # contiene -1 para unknown; convertimos a device

# Entrenamiento
n_epochs = 200
best_val_f1 = 0.0
best_state = None
for epoch in range(1, n_epochs + 1):
    loss = train(model, data, train_mask.to(device), optimizer)
    eval_train = evaluate(model, data, train_mask.to(device))
    eval_val = evaluate(model, data, val_mask.to(device))
    if eval_val['f1'] > best_val_f1:
        best_val_f1 = eval_val['f1']
        best_state = model.state_dict()
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | Train F1: {eval_train['f1']:.4f} | Val F1: {eval_val['f1']:.4f}")


Epoch 001 | Loss: 2.0239 | Train F1: 0.0458 | Val F1: 0.0455
Epoch 010 | Loss: 0.2134 | Train F1: 0.0000 | Val F1: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 020 | Loss: 0.1834 | Train F1: 0.0277 | Val F1: 0.0336
Epoch 030 | Loss: 0.1460 | Train F1: 0.0347 | Val F1: 0.0300
Epoch 040 | Loss: 0.1297 | Train F1: 0.0256 | Val F1: 0.0298
Epoch 050 | Loss: 0.1158 | Train F1: 0.0793 | Val F1: 0.0677
Epoch 060 | Loss: 0.1118 | Train F1: 0.1031 | Val F1: 0.0882
Epoch 070 | Loss: 0.1052 | Train F1: 0.0983 | Val F1: 0.0842
Epoch 080 | Loss: 0.1036 | Train F1: 0.1067 | Val F1: 0.0882
Epoch 090 | Loss: 0.1012 | Train F1: 0.1077 | Val F1: 0.1002
Epoch 100 | Loss: 0.0997 | Train F1: 0.1131 | Val F1: 0.1042
Epoch 110 | Loss: 0.0980 | Train F1: 0.1087 | Val F1: 0.1042
Epoch 120 | Loss: 0.0974 | Train F1: 0.1174 | Val F1: 0.1159
Epoch 130 | Loss: 0.0962 | Train F1: 0.1194 | Val F1: 0.1237
Epoch 140 | Loss: 0.0949 | Train F1: 0.1179 | Val F1: 0.1276
Epoch 150 | Loss: 0.0947 | Train F1: 0.1237 | Val F1: 0.1276
Epoch 160 | Loss: 0.0940 | Train F1: 0.1223 | Val F1: 0.1314
Epoch 170 | Loss: 0.0936 | Train F1: 0.1324 | Val F1: 0.1391
Epoch 180 | Loss: 0.0926

In [27]:
# CELDA 9: Evaluación final sobre test set (usar mejor modelo por val)
# ------------------------------------------------------------
if best_state is not None:
    model.load_state_dict(best_state)

eval_test = evaluate(model, data, test_mask.to(device))
print("Test results:")
print(f"Accuracy: {eval_test['accuracy']:.4f}")
print(f"Precision: {eval_test['precision']:.4f}")
print(f"Recall: {eval_test['recall']:.4f}")
print(f"F1: {eval_test['f1']:.4f}")


Test results:
Accuracy: 0.9740
Precision: 1.0000
Recall: 0.0749
F1: 0.1393


In [28]:
# CELDA EXTRA: Centralidades considerando grafo dirigido
# -------------------------------------------------------
import networkx as nx
from torch_geometric.utils import to_networkx

# Convertimos a grafo dirigido
G_dir = to_networkx(data, to_undirected=False)

# In-degree y Out-degree centrality
print("Calculando in-degree y out-degree centrality...")
in_deg_cent = nx.in_degree_centrality(G_dir)
out_deg_cent = nx.out_degree_centrality(G_dir)

# Betweenness centrality dirigido (aproximado para acelerar)
print("Calculando betweenness centrality dirigido (aprox)...")
# Betweenness centrality aproximada en grafo dirigido
btw_cent = nx.betweenness_centrality(G_dir, k=500, seed=42, normalized=True)

# Closeness centrality dirigido
print("Calculando closeness centrality dirigido...")
clo_cent = nx.closeness_centrality(G_dir)

# Eigenvector centrality dirigido (puede tardar)
#print("Calculando eigenvector centrality dirigido...")
#eig_cent = nx.eigenvector_centrality_numpy(G_dir, max_iter=500)

# Convertimos a tensores en orden de índices de nodos
n_nodes = data.num_nodes
in_deg_feat = torch.tensor([in_deg_cent[i] for i in range(n_nodes)], dtype=torch.float32).view(-1,1)
out_deg_feat = torch.tensor([out_deg_cent[i] for i in range(n_nodes)], dtype=torch.float32).view(-1,1)
btw_feat = torch.tensor([btw_cent[i] for i in range(n_nodes)], dtype=torch.float32).view(-1,1)
clo_feat = torch.tensor([clo_cent[i] for i in range(n_nodes)], dtype=torch.float32).view(-1,1)
#eig_feat = torch.tensor([eig_cent[i] for i in range(n_nodes)], dtype=torch.float32).view(-1,1)

# Concatenamos todas las centralidades
centrality_feats = torch.cat([in_deg_feat, out_deg_feat, btw_feat, clo_feat], dim=1)

# Normalizamos centralidades usando pipeline ya definido
centrality_feats = feature_pipeline.fit_transform(centrality_feats)

# Concatenamos con features originales
data.x = torch.cat([data.x, centrality_feats], dim=1)
print("Nuevo shape de data.x con centralidades dirigidas:", data.x.shape)


Calculando in-degree y out-degree centrality...
Calculando betweenness centrality dirigido (aprox)...
Calculando closeness centrality dirigido...
Nuevo shape de data.x con centralidades dirigidas: torch.Size([203769, 169])


In [29]:
# CELDA 6: Aplicar pipeline de features (fit sólo con nodos de entrenamiento o con todos?)
# ------------------------------------------------------------
# Opción 1 (recomendado): ajustar scaler usando sólo nodos de entrenamiento etiquetados
X = data.x  # tensor [N, F]
feature_pipeline.fit(X[train_mask])   # ajustamos con distrib. de train
X_trans = feature_pipeline.transform(X)  # tensor transformado (N, F)
data.x = X_trans  # reemplazamos features en el objeto Data
print("Features transformadas. shape:", data.x.shape)


Features transformadas. shape: torch.Size([203769, 169])




In [30]:
# CELDA 7: Definir modelo GCN simple (2 capas)
# ------------------------------------------------------------
class GCNNet(nn.Module):
    def __init__(self, in_channels, hidden_channels=64, out_channels=2, dropout=0.5):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x  # logits (N, out_channels)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCNNet(in_channels=data.num_node_features, hidden_channels=64, out_channels=2, dropout=0.5).to(device)
data = data.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()


In [31]:
# CELDA 8: Training loop (usando masks)
# ------------------------------------------------------------
def train(model, data, train_mask, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(data)  # logits for all nodes (N, 2)
    # aplicamos mask para loss (solo nodos de entrenamiento)
    loss = criterion(out[train_mask], data.y[train_mask].long())
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(model, data, mask):
    model.eval()
    out = model(data)
    probs = F.softmax(out, dim=1)
    preds = probs.argmax(dim=1)
    y_true = data.y[mask].cpu().numpy()
    y_pred = preds[mask].cpu().numpy()
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', pos_label=1)
    return {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'preds': preds}

# Ajuste: las etiquetas de data.y deben ser 0 o 1 para nodo etiquetado.
# En nuestro pipeline, transformamos labels (2->0, 1->1). Asegurémonos:
# los nodos unknown deben quedar fuera por las masks (no se usarán).
data.y = y_binary.to(device)  # contiene -1 para unknown; convertimos a device

# Entrenamiento
n_epochs = 200
best_val_f1 = 0.0
best_state = None
for epoch in range(1, n_epochs + 1):
    loss = train(model, data, train_mask.to(device), optimizer)
    eval_train = evaluate(model, data, train_mask.to(device))
    eval_val = evaluate(model, data, val_mask.to(device))
    if eval_val['f1'] > best_val_f1:
        best_val_f1 = eval_val['f1']
        best_state = model.state_dict()
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | Train F1: {eval_train['f1']:.4f} | Val F1: {eval_val['f1']:.4f}")


Epoch 001 | Loss: 0.8135 | Train F1: 0.0203 | Val F1: 0.0196


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 010 | Loss: 0.1802 | Train F1: 0.0016 | Val F1: 0.0087
Epoch 020 | Loss: 0.1327 | Train F1: 0.0707 | Val F1: 0.0669
Epoch 030 | Loss: 0.1135 | Train F1: 0.0895 | Val F1: 0.0996
Epoch 040 | Loss: 0.1052 | Train F1: 0.0731 | Val F1: 0.0720
Epoch 050 | Loss: 0.1003 | Train F1: 0.1092 | Val F1: 0.1198
Epoch 060 | Loss: 0.0982 | Train F1: 0.0823 | Val F1: 0.0720
Epoch 070 | Loss: 0.0965 | Train F1: 0.0958 | Val F1: 0.0882
Epoch 080 | Loss: 0.0953 | Train F1: 0.0893 | Val F1: 0.0842
Epoch 090 | Loss: 0.0935 | Train F1: 0.1052 | Val F1: 0.1042
Epoch 100 | Loss: 0.0925 | Train F1: 0.1042 | Val F1: 0.1042
Epoch 110 | Loss: 0.0920 | Train F1: 0.1037 | Val F1: 0.1042
Epoch 120 | Loss: 0.0912 | Train F1: 0.1052 | Val F1: 0.1118
Epoch 130 | Loss: 0.0908 | Train F1: 0.1101 | Val F1: 0.1079
Epoch 140 | Loss: 0.0895 | Train F1: 0.1314 | Val F1: 0.1311
Epoch 150 | Loss: 0.0890 | Train F1: 0.1252 | Val F1: 0.1273
Epoch 160 | Loss: 0.0884 | Train F1: 0.1271 | Val F1: 0.1347
Epoch 170 | Loss: 0.0874

In [32]:
# CELDA 9: Evaluación final sobre test set (usar mejor modelo por val)
# ------------------------------------------------------------
if best_state is not None:
    model.load_state_dict(best_state)

eval_test = evaluate(model, data, test_mask.to(device))
print("Test results:")
print(f"Accuracy: {eval_test['accuracy']:.4f}")
print(f"Precision: {eval_test['precision']:.4f}")
print(f"Recall: {eval_test['recall']:.4f}")
print(f"F1: {eval_test['f1']:.4f}")


Test results:
Accuracy: 0.9743
Precision: 1.0000
Recall: 0.0859
F1: 0.1582


## Balanceo de clases

In [34]:
# Calcular distribución de clases en nodos de entrenamiento
y_train = data.y[train_mask].cpu().numpy()
classes, counts = np.unique(y_train, return_counts=True)
print(dict(zip(classes, counts)))

# Clases: 0 = licit, 1 = illicit
# Peso inversamente proporcional a frecuencia
weight_licit = 1.0
weight_illicit = counts[0] / counts[1]  # mayor peso para la clase minoritaria

class_weights = torch.tensor([weight_licit, weight_illicit], dtype=torch.float32).to(device)
print("Pesos de clase:", class_weights)


{np.int64(0): np.int64(125764), np.int64(1): np.int64(3636)}
Pesos de clase: tensor([ 1.0000, 34.5886])


In [35]:
# Aplicar pipeline de features sobre las features nuevas (originales + centralidades)
X = data.x
feature_pipeline.fit(X[train_mask])   # ajustamos con nodos de entrenamiento
X_trans = feature_pipeline.transform(X)
data.x = X_trans
print("Features transformadas. shape:", data.x.shape)


Features transformadas. shape: torch.Size([203769, 169])




device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCNNet(
    in_channels=data.num_node_features,  # actualizar in_channels
    hidden_channels=64,
    out_channels=2,
    dropout=0.5
).to(device)
data = data.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [36]:
n_epochs = 200
best_val_f1 = 0.0
best_state = None

for epoch in range(1, n_epochs + 1):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[train_mask], data.y[train_mask].long())
    loss.backward()
    optimizer.step()
    
    eval_train = evaluate(model, data, train_mask)
    eval_val = evaluate(model, data, val_mask)
    
    if eval_val['f1'] > best_val_f1:
        best_val_f1 = eval_val['f1']
        best_state = model.state_dict()
    
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Train F1: {eval_train['f1']:.4f} | Val F1: {eval_val['f1']:.4f}")


Epoch 001 | Loss: 0.0865 | Train F1: 0.1323 | Val F1: 0.1309
Epoch 010 | Loss: 0.0864 | Train F1: 0.1497 | Val F1: 0.1535
Epoch 020 | Loss: 0.0857 | Train F1: 0.1393 | Val F1: 0.1385
Epoch 030 | Loss: 0.0857 | Train F1: 0.1497 | Val F1: 0.1501
Epoch 040 | Loss: 0.0857 | Train F1: 0.1585 | Val F1: 0.1498
Epoch 050 | Loss: 0.0850 | Train F1: 0.1530 | Val F1: 0.1347
Epoch 060 | Loss: 0.0848 | Train F1: 0.1650 | Val F1: 0.1498
Epoch 070 | Loss: 0.0846 | Train F1: 0.1607 | Val F1: 0.1535
Epoch 080 | Loss: 0.0843 | Train F1: 0.1728 | Val F1: 0.1501
Epoch 090 | Loss: 0.0840 | Train F1: 0.1816 | Val F1: 0.1687
Epoch 100 | Loss: 0.0839 | Train F1: 0.1705 | Val F1: 0.1650
Epoch 110 | Loss: 0.0836 | Train F1: 0.1696 | Val F1: 0.1613
Epoch 120 | Loss: 0.0833 | Train F1: 0.1786 | Val F1: 0.1720
Epoch 130 | Loss: 0.0837 | Train F1: 0.1826 | Val F1: 0.1793
Epoch 140 | Loss: 0.0831 | Train F1: 0.1796 | Val F1: 0.1793
Epoch 150 | Loss: 0.0832 | Train F1: 0.1718 | Val F1: 0.1756
Epoch 160 | Loss: 0.0826

In [37]:
if best_state is not None:
    model.load_state_dict(best_state)

eval_test = evaluate(model, data, test_mask)
print("Test results con centralidades dirigidas:")
print(f"Accuracy: {eval_test['accuracy']:.4f}")
print(f"Precision: {eval_test['precision']:.4f}")
print(f"Recall: {eval_test['recall']:.4f}")
print(f"F1: {eval_test['f1']:.4f}")


Test results con centralidades dirigidas:
Accuracy: 0.9742
Precision: 0.9091
Recall: 0.0881
F1: 0.1606


Focal loss

In [38]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, weight=self.alpha, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss


In [43]:
# Pesos para la clase minoritaria
# Alpha suavizado
alpha = torch.tensor([1.0, 5.0], dtype=torch.float32).to(device)  # menor que antes (~30-40)
criterion = FocalLoss(alpha=alpha, gamma=1.5)



In [44]:
n_epochs = 200
best_val_f1 = 0.0
best_state = None

for epoch in range(1, n_epochs + 1):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[train_mask], data.y[train_mask].long())
    loss.backward()
    optimizer.step()
    
    eval_train = evaluate(model, data, train_mask)
    eval_val = evaluate(model, data, val_mask)
    
    if eval_val['f1'] > best_val_f1:
        best_val_f1 = eval_val['f1']
        best_state = model.state_dict()
    
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Train F1: {eval_train['f1']:.4f} | Val F1: {eval_val['f1']:.4f}")


Epoch 001 | Loss: 0.2202 | Train F1: 0.3354 | Val F1: 0.3045
Epoch 010 | Loss: 0.1418 | Train F1: 0.4231 | Val F1: 0.3721
Epoch 020 | Loss: 0.1318 | Train F1: 0.4524 | Val F1: 0.3997
Epoch 030 | Loss: 0.1252 | Train F1: 0.4689 | Val F1: 0.4081
Epoch 040 | Loss: 0.1229 | Train F1: 0.4846 | Val F1: 0.4265
Epoch 050 | Loss: 0.1215 | Train F1: 0.5029 | Val F1: 0.4390
Epoch 060 | Loss: 0.1204 | Train F1: 0.5028 | Val F1: 0.4436
Epoch 070 | Loss: 0.1197 | Train F1: 0.5059 | Val F1: 0.4502
Epoch 080 | Loss: 0.1196 | Train F1: 0.5101 | Val F1: 0.4451
Epoch 090 | Loss: 0.1192 | Train F1: 0.5114 | Val F1: 0.4571
Epoch 100 | Loss: 0.1188 | Train F1: 0.5149 | Val F1: 0.4576
Epoch 110 | Loss: 0.1180 | Train F1: 0.5209 | Val F1: 0.4601
Epoch 120 | Loss: 0.1182 | Train F1: 0.5194 | Val F1: 0.4611
Epoch 130 | Loss: 0.1175 | Train F1: 0.5245 | Val F1: 0.4554
Epoch 140 | Loss: 0.1170 | Train F1: 0.5254 | Val F1: 0.4578
Epoch 150 | Loss: 0.1166 | Train F1: 0.5282 | Val F1: 0.4624
Epoch 160 | Loss: 0.1166

In [42]:
eval_test = evaluate(model, data, test_mask)
print("Test results con focal loss:")
print(f"Accuracy: {eval_test['accuracy']:.4f}")
print(f"Precision: {eval_test['precision']:.4f}")
print(f"Recall: {eval_test['recall']:.4f}")
print(f"F1: {eval_test['f1']:.4f}")


Test results con focal loss:
Accuracy: 0.8161
Precision: 0.1250
Recall: 0.9251
F1: 0.2202


## Random shuffle

In [48]:
# ------------------------------------------------------------
# CELDA COMPLETA: Procesamiento, split, modelo, entrenamiento y evaluación
# ------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.nn import GCNConv
import numpy as np

# -------------------------
# 1️⃣ Preparar labels y nodos etiquetados
# -------------------------
# Mapear etiquetas a binario: illicit=1, licit=0, unknown=-1
labels = data.y.clone().cpu()
y_binary = labels.clone()


# Nodos etiquetados (no unknown)
labeled_indices = torch.where(y_binary >= 0)[0]

# -------------------------
# 2️⃣ Split 60/20/20 con shuffle reproducible
# -------------------------
num_labeled = labeled_indices.shape[0]
num_train = int(0.6 * num_labeled)
num_val   = int(0.2 * num_labeled)
num_test  = num_labeled - num_train - num_val

torch.manual_seed(42)
perm = torch.randperm(num_labeled)

train_idx = labeled_indices[perm[:num_train]]
val_idx   = labeled_indices[perm[num_train:num_train + num_val]]
test_idx  = labeled_indices[perm[num_train + num_val:]]

# Construir máscaras booleanas
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
val_mask   = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask  = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

# Asignar al objeto data
data.train_mask = train_mask
data.val_mask   = val_mask
data.test_mask  = test_mask
data.y_binary   = y_binary

print("N train:", train_mask.sum().item(),
      "N val:", val_mask.sum().item(),
      "N test:", test_mask.sum().item())

# -------------------------
# 3️⃣ Definir modelo GCN
# -------------------------
class GCNNet(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5):
        super(GCNNet, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# -------------------------
# 4️⃣ Definir Focal Loss opcional
# -------------------------
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, weight=self.alpha, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

# -------------------------
# 5️⃣ Inicializar modelo y optimizer
# -------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
in_channels = data.x.shape[1]
hidden_channels = 64
out_channels = 2
model = GCNNet(in_channels, hidden_channels, out_channels).to(device)
data = data.to(device)

# Pesos de clase (opcional para Focal Loss)
class_counts = torch.bincount(data.y_binary[train_mask])
class_weights = (class_counts.sum() / class_counts).float().to(device)
criterion = FocalLoss(alpha=class_weights, gamma=1.5)  # gamma ajustable
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# -------------------------
# 6️⃣ Función de evaluación
# -------------------------
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(model, data, mask):
    model.eval()
    with torch.no_grad():
        logits = model(data)
        preds = logits[mask].argmax(dim=1).cpu().numpy()
        labels_true = data.y_binary[mask].cpu().numpy()
        acc = accuracy_score(labels_true, preds)
        prec = precision_score(labels_true, preds, zero_division=0)
        rec = recall_score(labels_true, preds, zero_division=0)
        f1 = f1_score(labels_true, preds, zero_division=0)
    return {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1}

# -------------------------
# 7️⃣ Loop de entrenamiento
# -------------------------
n_epochs = 200
best_val_f1 = 0.0
best_state = None

for epoch in range(1, n_epochs + 1):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[train_mask], data.y_binary[train_mask].long())
    loss.backward()
    optimizer.step()

    eval_train = evaluate(model, data, train_mask)
    eval_val = evaluate(model, data, val_mask)

    if eval_val['f1'] > best_val_f1:
        best_val_f1 = eval_val['f1']
        best_state = model.state_dict()

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | "
              f"Train F1: {eval_train['f1']:.4f} | Val F1: {eval_val['f1']:.4f}")

# -------------------------
# 8️⃣ Evaluación final en test set
# -------------------------
if best_state is not None:
    model.load_state_dict(best_state)

eval_test = evaluate(model, data, test_mask)
print("Test results:")
print(f"Accuracy: {eval_test['accuracy']:.4f}")
print(f"Precision: {eval_test['precision']:.4f}")
print(f"Recall: {eval_test['recall']:.4f}")
print(f"F1: {eval_test['f1']:.4f}")


N train: 97050 N val: 32350 N test: 32350
Epoch 001 | Loss: 1.7423 | Train F1: 0.0729 | Val F1: 0.0717
Epoch 010 | Loss: 0.7867 | Train F1: 0.1387 | Val F1: 0.1407
Epoch 020 | Loss: 0.6549 | Train F1: 0.1419 | Val F1: 0.1439
Epoch 030 | Loss: 0.5988 | Train F1: 0.1641 | Val F1: 0.1664
Epoch 040 | Loss: 0.5465 | Train F1: 0.1802 | Val F1: 0.1800
Epoch 050 | Loss: 0.5262 | Train F1: 0.1855 | Val F1: 0.1840
Epoch 060 | Loss: 0.5055 | Train F1: 0.1973 | Val F1: 0.1927
Epoch 070 | Loss: 0.4857 | Train F1: 0.2056 | Val F1: 0.2007
Epoch 080 | Loss: 0.4678 | Train F1: 0.2203 | Val F1: 0.2155
Epoch 090 | Loss: 0.4585 | Train F1: 0.2224 | Val F1: 0.2165
Epoch 100 | Loss: 0.4496 | Train F1: 0.2315 | Val F1: 0.2268
Epoch 110 | Loss: 0.4372 | Train F1: 0.2420 | Val F1: 0.2320
Epoch 120 | Loss: 0.4282 | Train F1: 0.2517 | Val F1: 0.2390
Epoch 130 | Loss: 0.4178 | Train F1: 0.2540 | Val F1: 0.2417
Epoch 140 | Loss: 0.4198 | Train F1: 0.2663 | Val F1: 0.2538
Epoch 150 | Loss: 0.4111 | Train F1: 0.2595

In [47]:
print("Total nodos en data:", data.num_nodes)
print("Total nodos con label original 1 o 2:", ((labels==0) | (labels==1)).sum().item())
print("Total nodos después de mapear y_binary >=0:", (y_binary >=0).sum().item())


Total nodos en data: 203769
Total nodos con label original 1 o 2: 161750
Total nodos después de mapear y_binary >=0: 4545
