In [2]:
#!pip install torch_geometric
#!pip install networkx

Collecting torch_geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.5.3


# Obiettivi
Creare una rete neurale capace di calcolare la **betweenness centrality** di un nodo all'interno di un grafo con una precisione accettabile, comparabile agli algoritmi di approssimazione tradizionali, ma con una significativa riduzione dei tempi di calcolo, quindi della complessità computazionale.

## Attività principali:
1. **Selezione del dataset (grafo) di training**
2. **Calcolo della betweenness centrality esatta** per il grafo di training
3. **Data labelling**: Aggiunta della feature *betweenness centrality* al grafo
4. **Suddivisione Dataset in test e val**
5. parallelismo -> immagine/grafo nodo/pixel
5. **Sviluppo del modello**:
    - Modello di regressione
    - Training supervisionato
6. **Training del modello**: con test e validazione
7. **Test delle prestazioni del modello**
8. **Confronto** dei risultati ottenuti con gli approcci classici


### 1. **Selezione del dataset (grafo) di training**

The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words.



In [None]:
import torch
import torch_geometric
import networkx as nx
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx

In [None]:
# Utilizza tutti i dataset disponibili su Planetoid
datasets = ['Cora', 'CiteSeer', 'PubMed']
data_list = []

for dataset_name in datasets:
    dataset = Planetoid(root=f'/tmp/{dataset_name}', name=dataset_name)
    data_list.append(dataset[0])

In [2]:
# Stampa il tipo di ogni dataset caricato
for data in data_list:
    print(f'Numero di nodi: {data.num_nodes}')
    print(f'Train mask: {data.train_mask.sum()} nodi')
    print(f'Validation mask: {data.val_mask.sum()} nodi')
    print(f'Test mask: {data.test_mask.sum()} nodi')

Numero di nodi: 2708
Train mask: 140 nodi
Validation mask: 500 nodi
Test mask: 1000 nodi
Numero di nodi: 3327
Train mask: 120 nodi
Validation mask: 500 nodi
Test mask: 1000 nodi
Numero di nodi: 19717
Train mask: 60 nodi
Validation mask: 500 nodi
Test mask: 1000 nodi


### 2. Calcolo della betweenness centrality esatta per il grafo di training
### 3. Data Labelling


In [3]:
for data in data_list:
    G = to_networkx(data, to_undirected=True)

    # Calcola la betweenness centrality dei nodi
    betweenness = nx.betweenness_centrality(G)

    data.y = torch.tensor([betweenness[i] for i in range(data.num_nodes)], dtype=torch.float)

#### Save betweenness computation offline

In [8]:
import pickle
dataset_path = "/tmp/dataset_with_betweenness.pkl"

In [4]:
with open(dataset_path, 'wb') as f:
    pickle.dump(data_list, f)

**import from offline**

In [9]:
with open(dataset_path, 'rb') as f:
    data_list = pickle.load(f)

print(data_list)

[Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708]), Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327]), Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])]


### 4. Suddivisione Dataset in test e val


In [8]:
# nulla da fare, i datasets sono già suddivisi.

### 5. Sviluppo del modello

**guardare nn_model.py**

In [27]:
from nn_model import *
import torch


### 6. Training
#### Model setup

In [28]:
def add_padding_to_features(data, target_num_features):
    num_features = data.x.shape[1]
    if num_features < target_num_features:
        padding = torch.zeros((data.num_nodes, target_num_features - num_features), dtype=torch.float)
        data.x = torch.cat([data.x, padding], dim=1)
    return data

In [32]:
from torch_geometric.loader import DataLoader

# Definizione dei parametri del modello
target_num_features = max(data.num_node_features for data in data_list)
hidden_channels = 64
out_channels = 1

data_list = [add_padding_to_features(data, target_num_features) for data in data_list]

train_loader = DataLoader([data for data in data_list if data.train_mask.sum() > 0], batch_size=32, shuffle=True)
val_loader = DataLoader([data for data in data_list if data.val_mask.sum() > 0], batch_size=32, shuffle=False)
test_loader = DataLoader([data for data in data_list if data.test_mask.sum() > 0], batch_size=32, shuffle=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [33]:
model = GATRegression(target_num_features, hidden_channels, out_channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)


#### Training functions

In [34]:
def train(loader):
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        loss = F.mse_loss(out[data.train_mask], data.y[data.train_mask].unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def test(loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data)
            loss = F.mse_loss(out[data.test_mask], data.y[data.test_mask].unsqueeze(1))
            total_loss += loss.item()
    return total_loss / len(loader)

#### Training Phase

In [35]:
num_epochs = 300

In [36]:
for epoch in range(1, num_epochs + 1):
    train_loss = train(train_loader)
    val_loss = test(val_loader)
    test_loss = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch: 001, Train Loss: 0.0825, Val Loss: 0.0073, Test Loss: 0.0073
Epoch: 002, Train Loss: 0.0887, Val Loss: 0.0061, Test Loss: 0.0061
Epoch: 003, Train Loss: 0.0748, Val Loss: 0.0049, Test Loss: 0.0049
Epoch: 004, Train Loss: 0.0828, Val Loss: 0.0047, Test Loss: 0.0047
Epoch: 005, Train Loss: 0.0710, Val Loss: 0.0046, Test Loss: 0.0046
Epoch: 006, Train Loss: 0.0635, Val Loss: 0.0042, Test Loss: 0.0042
Epoch: 007, Train Loss: 0.0490, Val Loss: 0.0041, Test Loss: 0.0041
Epoch: 008, Train Loss: 0.0767, Val Loss: 0.0043, Test Loss: 0.0043
Epoch: 009, Train Loss: 0.0683, Val Loss: 0.0050, Test Loss: 0.0050
Epoch: 010, Train Loss: 0.1065, Val Loss: 0.0060, Test Loss: 0.0060
Epoch: 011, Train Loss: 0.0722, Val Loss: 0.0068, Test Loss: 0.0068
Epoch: 012, Train Loss: 0.0855, Val Loss: 0.0068, Test Loss: 0.0068
Epoch: 013, Train Loss: 0.0497, Val Loss: 0.0061, Test Loss: 0.0061
Epoch: 014, Train Loss: 0.0590, Val Loss: 0.0053, Test Loss: 0.0053
Epoch: 015, Train Loss: 0.0841, Val Loss: 0.0048

## Comparazione

In [38]:
from torch_geometric.datasets import KarateClub

# Carica il dataset Karate Club
dataset_k = KarateClub()
data_k = dataset_k[0]

# Aggiungi padding alle feature dei nodi per uniformare il numero di feature
data_k = add_padding_to_features(data_k, target_num_features)

# Metti il modello in modalità di valutazione
model.eval()

# Calcola la betweenness centrality usando il modello
with torch.no_grad():
    data_k = data_k.to(device)
    predicted_betweenness = model(data_k).squeeze()

# Ordina i nodi per betweenness centrality predetta
sorted_indices = torch.argsort(predicted_betweenness, descending=True)

# Stampa i nodi classificati per betweenness centrality
print("Nodi classificati per betweenness centrality predetta:")
for idx in sorted_indices:
    print(f"Nodo {idx.item()}: Betweenness = {predicted_betweenness[idx].item():.4f}")

Nodi classificati per betweenness centrality predetta:
Nodo 14: Betweenness = 0.0029
Nodo 26: Betweenness = 0.0027
Nodo 29: Betweenness = 0.0020
Nodo 18: Betweenness = 0.0017
Nodo 9: Betweenness = 0.0012
Nodo 33: Betweenness = 0.0010
Nodo 2: Betweenness = 0.0006
Nodo 23: Betweenness = 0.0003
Nodo 28: Betweenness = 0.0000
Nodo 32: Betweenness = 0.0000
Nodo 15: Betweenness = -0.0003
Nodo 27: Betweenness = -0.0003
Nodo 30: Betweenness = -0.0003
Nodo 19: Betweenness = -0.0004
Nodo 8: Betweenness = -0.0004
Nodo 7: Betweenness = -0.0005
Nodo 13: Betweenness = -0.0007
Nodo 22: Betweenness = -0.0007
Nodo 3: Betweenness = -0.0007
Nodo 1: Betweenness = -0.0011
Nodo 12: Betweenness = -0.0014
Nodo 16: Betweenness = -0.0017
Nodo 5: Betweenness = -0.0019
Nodo 21: Betweenness = -0.0020
Nodo 31: Betweenness = -0.0021
Nodo 6: Betweenness = -0.0027
Nodo 0: Betweenness = -0.0032
Nodo 25: Betweenness = -0.0033
Nodo 4: Betweenness = -0.0034
Nodo 24: Betweenness = -0.0035
Nodo 17: Betweenness = -0.0035
Nodo

In [39]:
# Funzione per calcolare la betweenness centrality esatta
def calculate_exact_betweenness(data):
    G = to_networkx(data, to_undirected=True)
    betweenness = nx.betweenness_centrality(G)
    return torch.tensor([betweenness[i] for i in range(data.num_nodes)], dtype=torch.float)

# Calcola la betweenness centrality esatta
exact_betweenness = calculate_exact_betweenness(data_k)

# Ordina i nodi per betweenness centrality predetta
sorted_indices_predicted = torch.argsort(predicted_betweenness, descending=True)

# Ordina i nodi per betweenness centrality esatta
sorted_indices_exact = torch.argsort(exact_betweenness, descending=True)

# Confronta i nodi classificati per betweenness centrality predetta ed esatta
print("Confronto tra betweenness centrality predetta ed esatta:")
for idx in range(data_k.num_nodes):
    node_pred = sorted_indices_predicted[idx].item()
    node_exact = sorted_indices_exact[idx].item()
    print(f"Nodo {node_pred} (Predetto): Betweenness = {predicted_betweenness[node_pred]:.4f} | Nodo {node_exact} (Esatto): Betweenness = {exact_betweenness[node_exact]:.4f}")

Confronto tra betweenness centrality predetta ed esatta:
Nodo 14 (Predetto): Betweenness = 0.0029 | Nodo 0 (Esatto): Betweenness = 0.4376
Nodo 26 (Predetto): Betweenness = 0.0027 | Nodo 33 (Esatto): Betweenness = 0.3041
Nodo 29 (Predetto): Betweenness = 0.0020 | Nodo 32 (Esatto): Betweenness = 0.1452
Nodo 18 (Predetto): Betweenness = 0.0017 | Nodo 2 (Esatto): Betweenness = 0.1437
Nodo 9 (Predetto): Betweenness = 0.0012 | Nodo 31 (Esatto): Betweenness = 0.1383
Nodo 33 (Predetto): Betweenness = 0.0010 | Nodo 8 (Esatto): Betweenness = 0.0559
Nodo 2 (Predetto): Betweenness = 0.0006 | Nodo 1 (Esatto): Betweenness = 0.0539
Nodo 23 (Predetto): Betweenness = 0.0003 | Nodo 13 (Esatto): Betweenness = 0.0459
Nodo 28 (Predetto): Betweenness = 0.0000 | Nodo 19 (Esatto): Betweenness = 0.0325
Nodo 32 (Predetto): Betweenness = 0.0000 | Nodo 6 (Esatto): Betweenness = 0.0300
Nodo 15 (Predetto): Betweenness = -0.0003 | Nodo 5 (Esatto): Betweenness = 0.0300
Nodo 27 (Predetto): Betweenness = -0.0003 | Nodo