## utils

In [1]:
import math
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
from itertools import combinations
import networkx as nx
import random

In [2]:
def construct_graph(movies_fn, ratings_fn,min_rating=5, min_weight=10):
    """ Construit le graphe des films:
    * movies_fn : movies.csv
    * ratings_fn : ratings.csv
    * min_rating : seuil minimal du score pour lier un utilisateur à un film
    * min_weight : seuil minimal du poids d'une arête pour la garder dans le graphe
    """
    movies = pd.read_csv(movies_fn)
    ratings = pd.read_csv(ratings_fn)

    rated_movies = ratings[ratings.rating >=min_rating]
    grouped_movies = rated_movies[['userId','movieId']].groupby('userId').agg(list)
    pair_freq = defaultdict(int)
    item_freq = defaultdict(int)

    for lst_movies in tqdm(grouped_movies['movieId']):
        pairs = combinations(sorted(lst_movies),2)
        for i in lst_movies:
            item_freq[i] += 1
        for (i,j) in pairs:
            pair_freq[(i,j)] += 1

    movies_graph = nx.Graph()
    log_total = math.log(sum(item_freq.values()))
    # Pointwise Mutual Information : pmi(x,y) = log p(x,y)/(p(x)p(y)) = log (p(x,y)) - log(p(x)) -log(p(y))
    for (i,j),f in pair_freq.items():
        pmi = f*(math.log(f) - math.log(item_freq[i]) - math.log(item_freq[j]) + log_total)
        if pmi >= min_weight:
            movies_graph.add_edge(i,j,weight=pmi)

    return movies_graph, movies

In [3]:
def random_walk(graph, num_walks=5, num_steps=10, p=1, q=1):
    """"
        Construit un ensemble de chemins dans le graphe par marche aléatoire biaisée :
        * graph : graphe
        * num_walks: nombre de chemins par noeud
        * num_step : longueur des chemins
        * p : plus p est grand, plus l'exploration est incitée, p  petit -> plus il y a des retours en arriere
        * q : plus q est grand, plus la marche reste localisée, q petit -> s'écarte des noeuds explorés
    """
    def next_step(previous, current):
        def get_pq(n):
            if n == current: return p
            if graph.has_edge(n,previous): return 1
            return q
        weights = [w['weight']/get_pq(n) for n,w in graph[current].items()]
        return random.choices(list(graph[current]),weights=weights)[0]
    walks = []
    nodes = list(graph.nodes())
    for walk_iter in range((num_walks)):
        for node in tqdm(nodes):
            walk = [node]
            cur_node = node
            prev_node = None
            for step  in range(num_steps):
                next_node = next_step(prev_node,cur_node)
                walk.append(next_node)
                prev_node = cur_node
                cur_node = next_node
            walks.append(walk)
    return walks

## TP 10

In [4]:
# from utils import random_walk,construct_graph
import math
from tqdm import tqdm
import networkx as nx
from torch import nn
from torch.utils.data import DataLoader, Dataset
import random
import torch
from torch.utils.tensorboard import SummaryWriter

import time
import logging

logging.basicConfig(level=logging.INFO)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2025-01-07 11:07:28.579753: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736244449.020604    7627 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736244449.145912    7627 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-07 11:07:30.223625: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## TODO

In [5]:
class TripletDataset(Dataset):

    def __init__(self, graph, walks, nodes2id):
        self.graph = graph
        self.walks = walks
        self.nodes2id = nodes2id

        self.positive_neighbors = {node: list(graph.neighbors(node)) for node in graph.nodes()}
        self.all_nodes = list(graph.nodes())

    def __len__(self):
        return len(self.all_nodes)
    
    def __getitem__(self, index):
        # Randomly select a anchor node
        walk = self.walks[index]
        anchor = random.choice(walk)

        # Positive sample: a neighbor of the anchor
        positive = random.choice(self.positive_neighbors[anchor])

        # Neigative sample: a node not connected to the anchor node
        while True:
            negative = random.choice(self.all_nodes)
            if not self.graph.has_edge(anchor, negative):
                break

        # Convert nodes to indices
        anchor_idx = self.nodes2id[anchor]
        positive_idx = self.nodes2id[positive]
        negative_idx = self.nodes2id[negative]

        return anchor_idx, positive_idx, negative_idx

In [6]:
class NodeEmbeddingModel(nn.Module):

    def __init__(self, num_nodes, embedding_dim):
        super(NodeEmbeddingModel, self).__init__()
        self.embeddings = nn.Embedding(num_nodes, embedding_dim)
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, embedding_dim)
        )

    def forward(self, nodes):
        x = self.embeddings(nodes)
        x = self.fc(x)
        return x

In [7]:
def train_triplet_loss_with_visualization(model, dataloader, optimizer, criterion, id2title, num_epochs=10, log_dir='./runs'):

    # 初始化 SummaryWriter
    writer = SummaryWriter(log_dir)

    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for anchor, positive, negative in dataloader:
            anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)

            # Get embeddings
            anchor_emb = model(anchor)
            positive_emb = model(positive)
            negative_emb = model(negative)

            # Triplet loss
            loss = criterion(anchor_emb, positive_emb, negative_emb)
            epoch_loss += loss.item()

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        logging.info(f"Epoch {epoch+1}/{num_epochs}, Loss = {epoch_loss:.4f}")

    # 每 5 个 epoch 可视化一次嵌入到 TensorBoard
    if (epoch + 1) % 5 == 0:
        model.eval()
        embeddings = model.embeddings.weight.detach().cpu().numpy()
        writer.add_embedding(
            torch.tensor(embeddings), metadata=id2title, tag=f"Epoch {epoch+1}"
        )
        model.train()

    writer.close()
    logging.info("TensorBoard embeddings logged.")

In [8]:
if __name__=="__main__":
    PATH = "data/ml-latest-small/"
    logging.info("Constructing graph")
    movies_graph, movies = construct_graph(PATH + "movies.csv", PATH + "ratings.csv")    # 构建 图
    logging.info("Sampling walks")
    walks = random_walk(movies_graph,5,10,1,1)    # 随机游走
    nodes2id = dict(zip(movies_graph.nodes(),range(len(movies_graph.nodes()))))
    id2nodes = list(movies_graph.nodes())
    id2title = [movies[movies.movieId==idx].iloc[0].title for idx in id2nodes]
    ##  TODO: 

    dataset = TripletDataset(movies_graph, walks, nodes2id)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

    num_nodes = len(nodes2id)
    embedding_dim = 64
    model = NodeEmbeddingModel(num_nodes, embedding_dim).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.TripletMarginLoss(margin = 1.0)

    logging.info("Training loss with TensorBoard visualization")
    train_triplet_loss_with_visualization(
        model, dataloader, optimizer, criterion, id2title, num_epochs=10, log_dir="./runs"
    )

INFO:root:Constructing graph
100%|███████████████████████████████████████████████████████████████████████████████| 573/573 [00:00<00:00, 4979.32it/s]
INFO:root:Sampling walks
100%|██████████████████████████████████████████████████████████████████████████████| 1405/1405 [00:01<00:00, 936.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1405/1405 [00:01<00:00, 933.45it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1405/1405 [00:01<00:00, 911.58it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1405/1405 [00:01<00:00, 929.57it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1405/1405 [00:01<00:00, 926.99it/s]
INFO:root:Training loss with TensorBoard visualization
INFO:root:Epoch 1/10, Loss = 43.0571
INFO:root:Epoch 2/10, Loss = 39.9444
INFO:root:Epoch 3/10, Loss = 35.5263
INFO:root:Epoch 4/10, Loss = 33.0590
INFO:root:Epoch 5