In [7]:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cu121.html

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu121.html


In [8]:
!pip install torch_geometric



## **GAE**

In [9]:
import torch
from torch_geometric.nn import GCNConv, SAGEConv, GATv2Conv

class Encoder(torch.nn.Module):
    """Stack of graph-convolution layers used as the encoder of a graph autoencoder.

    One convolution is created per entry of ``hidden``; every layer (including
    the last one) is followed by a ReLU in ``forward``.

    Args:
        in_features: Width of the input node-feature vectors.
        hidden: Iterable with the output width of each layer, in order.
        conv_type: One of ``"GCN"``, ``"GRAPHSAGE"`` or ``"GAT"``.
        **kwargs: Layer options. ``aggr`` (default ``"mean"``) is used by
            GRAPHSAGE; ``heads`` (default ``1``) is used by GAT. Any other
            key (e.g. ``lr``, ``patience`` coming from a hyperparameter dict)
            is ignored.

    Raises:
        ValueError: If ``conv_type`` is not one of the supported names.
    """

    _SUPPORTED_CONVS = ("GCN", "GRAPHSAGE", "GAT")

    def __init__(self, in_features, hidden, conv_type="GRAPHSAGE", **kwargs):
        super().__init__()

        # Fail early: without this check an unknown type would leave `conv`
        # unbound (or silently reuse the previous layer) inside the loop.
        if conv_type not in self._SUPPORTED_CONVS:
            raise ValueError(
                f"Unknown conv_type {conv_type!r}; expected one of {self._SUPPORTED_CONVS}"
            )

        # Optional layer settings get defaults so the default conv_type works
        # without extra keyword arguments.
        aggr = kwargs.get("aggr", "mean")
        heads = kwargs.get("heads", 1)

        convs = []
        for hidden_dim in hidden:
            out_features = hidden_dim
            if conv_type == "GCN":
                conv = GCNConv(in_features, hidden_dim)
            elif conv_type == "GRAPHSAGE":
                # Max aggregation is paired with the learned projection, as before.
                conv = SAGEConv(in_features, hidden_dim, aggr=aggr, project=(aggr == "max"))
            else:  # "GAT"
                per_head = hidden_dim // heads
                conv = GATv2Conv(in_features, per_head, heads=heads)
                # Heads are concatenated, so the real output width is
                # per_head * heads, which differs from hidden_dim when
                # hidden_dim is not divisible by heads.
                out_features = per_head * heads

            convs.append(conv)
            in_features = out_features

        self.convs = torch.nn.ModuleList(convs)

    def forward(self, x, edge_index):
        """Apply every convolution followed by ReLU and return the node embeddings."""
        for conv in self.convs:
            x = conv(x, edge_index).relu()

        return x

In [10]:
from copy import deepcopy

def train_gae_one_epoch(model, optimizer, train_data):
    """Perform one full-batch optimisation step and return its loss.

    Args:
        model: Autoencoder exposing ``__call__(x, edge_index)`` and
            ``recon_loss(z, pos_edge_index, neg_edge_index)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_data: Object with ``x``, ``edge_index``, ``edge_label_index``
            and ``edge_label`` attributes.

    Returns:
        The reconstruction loss of this step as a Python ``float``.
    """
    model.train()
    optimizer.zero_grad()

    embeddings = model(train_data.x, train_data.edge_index)

    # Supervision edges are split by their label: 1 = positive, 0 = negative.
    labels = train_data.edge_label
    candidates = train_data.edge_label_index
    positives = candidates[:, labels == 1]
    negatives = candidates[:, labels == 0]

    step_loss = model.recon_loss(embeddings, positives, negatives)
    step_loss.backward()
    optimizer.step()

    return float(step_loss)

def test_gae(model, train_data, test_data):
    model.eval()
    with torch.no_grad():
        z = model(train_data.x, train_data.edge_index)

    pos_edge_index = test_data.edge_label_index[:, test_data.edge_label == 1]
    neg_edge_index = test_data.edge_label_index[:, test_data.edge_label == 0]

    return model.test(z, pos_edge_index, neg_edge_index)[0]

def train_gae(model, optimizer, train_data, val_data, patience, nepochs = 1500):
    """Train with early stopping and return the embeddings of the best epoch.

    Each epoch runs one optimisation step (``train_gae_one_epoch``) and one
    evaluation on ``val_data`` (``test_gae``). Whenever the validation score
    improves, the node embeddings of the training graph are recomputed and
    kept. Training stops once more than ``patience`` epochs have passed
    since the last improvement, or after ``nepochs`` epochs.

    Args:
        model: Autoencoder to train.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_data: Training split (also used to compute the embeddings).
        val_data: Split whose labelled edges drive early stopping.
        patience: Number of non-improving epochs tolerated before stopping.
        nepochs: Upper bound on the number of epochs.

    Returns:
        ``(loss_history, val_history, representations)`` where the histories
        have one entry per executed epoch and ``representations`` is the
        embedding tensor captured at the best epoch. If the score never
        exceeded the initial threshold (or no epoch ran), the embeddings of
        the model in its final state are returned instead.
    """
    best_epoch = -1
    best_val = 0.0
    # Explicitly initialised so the return statement cannot hit an unbound
    # name when the validation score never improves or nepochs == 0.
    representations = None

    loss_history = []
    val_history = []

    for epoch in range(nepochs):
        loss = train_gae_one_epoch(model, optimizer, train_data)
        val = test_gae(model, train_data, val_data)

        loss_history.append(loss)
        val_history.append(val)

        if val > best_val:
            best_epoch = epoch
            best_val = val
            # Snapshot the embeddings produced by the best model so far.
            with torch.no_grad():
                model.eval()
                representations = model(train_data.x, train_data.edge_index)

        if (epoch - best_epoch) > patience:
            break

    if representations is None:
        # No epoch ever improved on the initial score: fall back to the
        # embeddings of the model as it currently stands.
        with torch.no_grad():
            model.eval()
            representations = model(train_data.x, train_data.edge_index)

    return loss_history, val_history, representations

## **Hyperparameters**

In [11]:
from sklearn.model_selection import ParameterGrid

# One sub-grid per encoder architecture. All three share the same `hidden`,
# `lr` and `patience` axes (4 * 3 * 2 = 24 combinations); GRAPHSAGE and GAT
# each add one two-valued axis, giving 24 + 48 + 48 = 120 configurations.
HYPERPARAMETER_GRID = [
    dict(
        conv_type=["GCN"],
        hidden=[[32], [64], [32, 32], [64, 64]],
        lr=[1e-2, 1e-3, 1e-4],
        patience=[50, 100],
    ),
    dict(
        conv_type=["GRAPHSAGE"],
        aggr=['mean', 'max'],
        hidden=[[32], [64], [32, 32], [64, 64]],
        lr=[1e-2, 1e-3, 1e-4],
        patience=[50, 100],
    ),
    dict(
        conv_type=["GAT"],
        heads=[4, 8],
        hidden=[[32], [64], [32, 32], [64, 64]],
        lr=[1e-2, 1e-3, 1e-4],
        patience=[50, 100],
    ),
]

# Expand the sub-grids into a flat list of concrete configurations and
# preview the first few.
hyperparameter_list = list(ParameterGrid(HYPERPARAMETER_GRID))
hyperparameter_list[:6]

[{'conv_type': 'GCN', 'hidden': [32], 'lr': 0.01, 'patience': 50},
 {'conv_type': 'GCN', 'hidden': [32], 'lr': 0.01, 'patience': 100},
 {'conv_type': 'GCN', 'hidden': [32], 'lr': 0.001, 'patience': 50},
 {'conv_type': 'GCN', 'hidden': [32], 'lr': 0.001, 'patience': 100},
 {'conv_type': 'GCN', 'hidden': [32], 'lr': 0.0001, 'patience': 50},
 {'conv_type': 'GCN', 'hidden': [32], 'lr': 0.0001, 'patience': 100}]

## **Graph Embedding**

In [12]:
from google.colab import drive

# Mount Google Drive under /content/drive so the graph pickles can be read and
# the results written back. force_remount is set so an existing mount is
# replaced (per the flag's name — confirm against the Colab docs if in doubt).
drive.mount("/content/drive", force_remount = True)

Mounted at /content/drive


In [13]:
# Drive folder the input graph pickles are read from.
PATH_TO_PICKLE = "/content/drive/MyDrive/Multimodal_Fusion/grafos_reg"
# Drive folder the graphs annotated with GAE embeddings are written to.
PATH_TO_SAVE = "/content/drive/MyDrive/Multimodal_Fusion/grafos_gae"

In [14]:
import numpy as np
import random

# Keep cuDNN from auto-tuning kernels and ask it for deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed every random number generator the notebook touches with the same value.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [None]:
import os
import pickle as pkl
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GAE
from tqdm import tqdm

# Listing kept for reference; the loop below is restricted to a single graph.
pickles = os.listdir(PATH_TO_PICKLE)

# Use the GPU when the runtime has one, otherwise fall back to the CPU
# instead of crashing on a hard-coded 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for pickle in ['fakenews.pkl']:
    graph_name = pickle.split('.')[0]
    print(f"Graph: {graph_name}")

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # from a trusted source.
    with open(f"{PATH_TO_PICKLE}/{pickle}", "rb") as file:
        G = pkl.load(file)

    # Maps a row of the embedding matrix back to its node. Assumes
    # from_networkx numbers nodes in G.nodes() order — TODO confirm.
    index_to_node = dict(enumerate(G.nodes()))

    # Work on a copy that keeps only 'f_features' on each node, so that the
    # original graph retains all of its attributes for the final dump.
    G_copy = deepcopy(G)
    for node in G_copy.nodes():
        for attr in list(G_copy.nodes[node].keys()):
            if attr != 'f_features':
                del G_copy.nodes[node][attr]

    data = from_networkx(G_copy, group_node_attrs = ['f_features'])
    data.x = data.x.to(torch.float32)

    # NOTE(review): there is no separate validation split; the held-out 20%
    # of edges is what drives early stopping in train_gae below.
    train_data, _, test_data = RandomLinkSplit(num_val = 0, num_test = 0.2, is_undirected = True)(data)

    train_data = train_data.to(device)
    test_data = test_data.to(device)

    # makedirs also creates PATH_TO_SAVE itself when it does not exist yet.
    graph_dir = f"{PATH_TO_SAVE}/{graph_name}"
    os.makedirs(graph_dir, exist_ok=True)

    for index, hyperparams in tqdm(enumerate(hyperparameter_list), total = len(hyperparameter_list)):
        if hyperparams["conv_type"] == "GCN": continue  # already ran these

        model = GAE(Encoder(in_features=data.x.shape[1],**hyperparams)).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr = hyperparams['lr'])

        loss_history, val_history, representations = train_gae(model, optimizer, train_data, test_data, patience = hyperparams['patience'])

        # Attach each node's embedding to the original graph; the attribute
        # is overwritten on every configuration before the graph is dumped.
        representations = representations.cpu().numpy()
        for i in range(representations.shape[0]):
            G.nodes[index_to_node[i]]['gae_features'] = representations[i]

        conv_dir = f"{graph_dir}/{hyperparams['conv_type']}"
        os.makedirs(conv_dir, exist_ok=True)

        # One pickle per configuration, named after its index in hyperparameter_list.
        with open(f"{conv_dir}/{index}.pkl", "wb") as file:
            pkl.dump(G, file)

    print("")


Graph: fakenews


  data[key] = torch.tensor(value)
 68%|██████▊   | 82/120 [1:12:33<38:46, 61.22s/it]