In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install torch-geometric
!pip install optuna



In [3]:
import pandas as pd
import torch
import networkx as nx
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
from torch.optim import Adam
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch_geometric.transforms import RandomLinkSplit
from sklearn.metrics import accuracy_score
from torch_geometric.utils import from_networkx, negative_sampling, degree

In [4]:
df = pd.read_csv('/content/drive/MyDrive/1 Maestria/Topicos/Reviews.csv')

In [5]:
df = df.head(50000)
print(df.shape)
df.head()

(50000, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
# Normalizar las calificaciones para que estén entre 0 y 1
df['weight'] = df['Score'] / df['Score'].max()

# Crear mapeo de IDs únicos para usuarios y productos
user_ids = df['UserId'].unique()
product_ids = df['ProductId'].unique()
user_id_map = {uid: idx for idx, uid in enumerate(user_ids)}
product_id_map = {pid: idx + len(user_ids) for idx, pid in enumerate(product_ids)}
node_mapping = {**user_id_map, **product_id_map}

In [7]:
# Crear el grafo
G = nx.DiGraph()
for _, row in df.iterrows():
    user_node = node_mapping[row['UserId']]
    product_node = node_mapping[row['ProductId']]
    G.add_node(user_node, type='user')
    G.add_node(product_node, type='product')
    G.add_edge(user_node, product_node, weight=row['weight'])  # Añadir peso de la arista

data = from_networkx(G)

# Añadir características de nodos
num_nodes = len(node_mapping)
num_node_features = 745
data.x = torch.randn((num_nodes, num_node_features), dtype=torch.float)
data.edge_weight = torch.tensor([d['weight'] for u, v, d in G.edges(data=True)], dtype=torch.float)

transform = RandomLinkSplit(is_undirected=True, num_val=0.2, num_test=0.1)
train_data, val_data, test_data = transform(data)

In [8]:
# Obtener información del dataset
print(f'Dataset: Amazon Food Reviews')
print('----------------------------')
print(f'Numero de grafos: 1')
print(f'Cantidad de Nodos: {data.x.shape[0]}')
print(f'Cantidad de features: {data.x.shape[1]}')
print(f'Cantidad de clases: N/A')  # Si no hay clases específicas en el dataset

# Obtener información del grafo
print(f'\nGraph:')
print('------')
print(f'Se tienen links dirigidos: {data.is_directed()}')
print(f'Grafo tiene nodos aislados: {data.has_isolated_nodes()}')
print(f'Grafo tiene self-loops: {data.has_self_loops()}')

# Obtener los grados de los nodos
degrees = degree(data.edge_index[0], data.num_nodes)
isolated_nodes = (degrees == 0).sum().item()
nodes_with_degree_1 = (degrees == 1).sum().item()

print(f'Cantidad de nodos aislados: {isolated_nodes}')
print(f'Cantidad de nodos con grado = 1: {nodes_with_degree_1}')

Dataset: Amazon Food Reviews
----------------------------
Numero de grafos: 1
Cantidad de Nodos: 46163
Cantidad de features: 745
Cantidad de clases: N/A

Graph:
------
Se tienen links dirigidos: True
Grafo tiene nodos aislados: False
Grafo tiene self-loops: False
Cantidad de nodos aislados: 6115
Cantidad de nodos con grado = 1: 34105


In [9]:
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

In [10]:
# Modelo SAGE
train_data.train_pos_edge_index = train_data.edge_index
val_data.val_pos_edge_index = val_data.edge_label_index[:, val_data.edge_label == 1]
test_data.test_pos_edge_index = test_data.edge_label_index[:, test_data.edge_label == 1]

class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index, edge_weight=None):
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        return x

def negative_sampling(edge_index, num_nodes=None, num_neg_samples=None):
    """Muestrea aristas negativas aleatorias."""
    num_nodes = edge_index.max().item() + 1 if num_nodes is None else num_nodes
    num_neg_samples = edge_index.size(1) if num_neg_samples is None else num_neg_samples

    adj_mask = torch.zeros(num_nodes, num_nodes, dtype=torch.bool)
    adj_mask[edge_index[0], edge_index[1]] = 1

    edges = torch.randint(0, num_nodes, size=(2, num_neg_samples), dtype=torch.long)
    mask = adj_mask[edges[0], edges[1]]
    while mask.sum().item() != 0:
        replacements = torch.randint(0, num_nodes, size=(2, mask.sum().item()), dtype=torch.long)
        edges[:, mask] = replacements
        mask = adj_mask[edges[0], edges[1]]
    return edges

# utilizar GPU, enviar datos a GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

model = GraphSAGE(train_data.x.size(1), 170, 64).to(device)
optimizer = Adam(model.parameters(), lr=0.005)
criterion = torch.nn.BCEWithLogitsLoss()

In [11]:
# Early Stoping
class EarlyStopping:
    def __init__(self, patience=10, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        torch.save(model.state_dict(), 'checkpoint.pt')

In [12]:
# entremiento y testing
def train(data):
    model.train()
    optimizer.zero_grad()
    z = model(data.x, data.train_pos_edge_index)

    pos_logits = (z[data.train_pos_edge_index[0]] * z[data.train_pos_edge_index[1]]).sum(dim=1)
    neg_edge_index = negative_sampling(data.train_pos_edge_index, data.num_nodes, data.train_pos_edge_index.size(1))
    neg_logits = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=1)

    logits = torch.cat([pos_logits, neg_logits])
    labels = torch.cat([torch.ones(pos_logits.size(0)), torch.zeros(neg_logits.size(0))]).to(device)

    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(model, data, pos_edge_index, neg_edge_index):
    model.eval()
    z = model(data.x, data.edge_index)

    pos_logits = (z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim=1)
    neg_logits = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=1)

    logits = torch.cat([pos_logits, neg_logits])
    labels = torch.cat([torch.ones(pos_logits.size(0)), torch.zeros(neg_logits.size(0))]).to(device)

    link_probs = torch.sigmoid(logits)
    preds = (link_probs > 0.5).float()
    correct = preds.eq(labels).sum().item()
    total = len(labels)
    return correct / total

In [13]:
train_neg_edge_index = negative_sampling(train_data.train_pos_edge_index, train_data.x.size(0))

for epoch in range(100):
    train_loss = train(train_data)
    print(f"Epoch {epoch + 1}, Loss: {train_loss:.4f}")

val_neg_edge_index = negative_sampling(val_data.val_pos_edge_index, val_data.x.size(0))
test_neg_edge_index = negative_sampling(test_data.test_pos_edge_index, test_data.x.size(0))

val_accuracy = evaluate(model, val_data, val_data.val_pos_edge_index, val_neg_edge_index)
test_accuracy = evaluate(model, test_data, test_data.test_pos_edge_index, test_neg_edge_index)

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Epoch 1, Loss: 1.1676
Epoch 2, Loss: 0.9676
Epoch 3, Loss: 0.7525
Epoch 4, Loss: 0.6868
Epoch 5, Loss: 0.7033
Epoch 6, Loss: 0.6510
Epoch 7, Loss: 0.5754
Epoch 8, Loss: 0.5470
Epoch 9, Loss: 0.5498
Epoch 10, Loss: 0.5453
Epoch 11, Loss: 0.5425
Epoch 12, Loss: 0.5111
Epoch 13, Loss: 0.4899
Epoch 14, Loss: 0.4807
Epoch 15, Loss: 0.4752
Epoch 16, Loss: 0.4645
Epoch 17, Loss: 0.4634
Epoch 18, Loss: 0.4631
Epoch 19, Loss: 0.4650
Epoch 20, Loss: 0.4534
Epoch 21, Loss: 0.4372
Epoch 22, Loss: 0.4388
Epoch 23, Loss: 0.4268
Epoch 24, Loss: 0.4302
Epoch 25, Loss: 0.4273
Epoch 26, Loss: 0.4269
Epoch 27, Loss: 0.4205
Epoch 28, Loss: 0.4155
Epoch 29, Loss: 0.4171
Epoch 30, Loss: 0.4136
Epoch 31, Loss: 0.4163
Epoch 32, Loss: 0.4109
Epoch 33, Loss: 0.4120
Epoch 34, Loss: 0.4061
Epoch 35, Loss: 0.4058
Epoch 36, Loss: 0.4047
Epoch 37, Loss: 0.4058
Epoch 38, Loss: 0.4014
Epoch 39, Loss: 0.4003
Epoch 40, Loss: 0.4058
Epoch 41, Loss: 0.4010
Epoch 42, Loss: 0.4001
Epoch 43, Loss: 0.3992
Epoch 44, Loss: 0.39

In [14]:
import optuna

In [15]:
def objective(trial):
    in_channels = train_data.x.shape[1]
    hidden_channels = trial.suggest_int('hidden_channels', 64, 256)
    out_channels = 64
    lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)

    model = GraphSAGE(in_channels, hidden_channels, out_channels).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    criterion = torch.nn.BCEWithLogitsLoss()

    def train(data):
        model.train()
        optimizer.zero_grad()
        z = model(data.x, data.train_pos_edge_index)

        pos_logits = (z[data.train_pos_edge_index[0]] * z[data.train_pos_edge_index[1]]).sum(dim=1)
        neg_edge_index = negative_sampling(data.train_pos_edge_index, data.num_nodes, data.train_pos_edge_index.size(1))
        neg_logits = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=1)

        logits = torch.cat([pos_logits, neg_logits])
        labels = torch.cat([torch.ones(pos_logits.size(0)), torch.zeros(neg_logits.size(0))]).to(device)

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        return loss.item()

    @torch.no_grad()
    def evaluate(model, data, pos_edge_index, neg_edge_index):
        model.eval()
        z = model(data.x, data.edge_index)

        pos_logits = (z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim=1)
        neg_logits = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=1)

        logits = torch.cat([pos_logits, neg_logits])
        labels = torch.cat([torch.ones(pos_logits.size(0)), torch.zeros(neg_logits.size(0))]).to(device)

        link_probs = torch.sigmoid(logits)
        preds = (link_probs > 0.5).float()
        correct = preds.eq(labels).sum().item()
        total = len(labels)
        return correct / total

    best_val_acc = 0

    for epoch in range(50):
        train_loss = train(train_data)
        val_neg_edge_index = negative_sampling(val_data.val_pos_edge_index, val_data.x.size(0))
        val_accuracy = evaluate(model, val_data, val_data.val_pos_edge_index, val_neg_edge_index)
        if val_accuracy > best_val_acc:
            best_val_acc = val_accuracy

    return best_val_acc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

print('Mejor Prueba:')
trial = study.best_trial

print('  Valores:', trial.value)
print('  Parametros:')
for key, value in trial.params.items():
    print(f'    {key}: {value}')


[I 2024-05-25 04:46:47,754] A new study created in memory with name: no-name-ae370afd-6317-403b-8fbd-b02489662ec1
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2024-05-25 04:47:48,611] Trial 0 finished with value: 0.5426450276243094 and parameters: {'hidden_channels': 70, 'lr': 0.00023560406952918326}. Best is trial 0 with value: 0.5426450276243094.
[I 2024-05-25 04:48:50,506] Trial 1 finished with value: 0.5398825966850829 and parameters: {'hidden_channels': 82, 'lr': 0.0007108054271855091}. Best is trial 0 with value: 0.5426450276243094.
[I 2024-05-25 04:49:52,694] Trial 2 finished with value: 0.5276243093922652 and parameters: {'hidden_channels': 234, 'lr': 0.0006320887540333347}. Best is trial 0 with value: 0.5426450276243094.
[I 2024-05-25 04:50:53,975] Trial 3 finished with value: 0.5319406077348067 and parameters: {'hidden_channels': 91, 'lr': 0.00017111112502713702}. Best is trial 0 with value: 0.5426450276243094.
[I 2024-05-25 04:51:55,890] Trial 4 finished with value: 

Mejor Prueba:
  Valores: 0.5426450276243094
  Parametros:
    hidden_channels: 70
    lr: 0.00023560406952918326


In [None]:
best_params = trial.params
in_channels = data.x.shape[1]
hidden_channels = best_params['hidden_channels']
out_channels = 2
lr = best_params['lr']

model = GraphSAGE(in_channels, hidden_channels, out_channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()
early_stopping = EarlyStopping(patience=10, min_delta=0.001)

for epoch in range(100):
    train_loss = train()
    acc = test()
    print(f'Epoch {epoch}, Loss: {train_loss}, Accuracy: {acc}')

    early_stopping(train_loss, model)

    if early_stopping.early_stop:
        print("Early stopping")
        break

model.load_state_dict(torch.load('checkpoint.pt'))

# Evaluacion final de modelo
final_acc = test()
print(f'Accuracy: {final_acc*100:.2f}')