## utils

In [None]:
import math
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
from itertools import combinations
import networkx as nx
import random

In [None]:
def construct_graph(movies_fn, ratings_fn, min_rating=5, min_weight=10):
    """Build the movie co-rating graph.

    Two movies are candidates for an edge when at least one user rated both
    of them with a score of at least ``min_rating``. The edge weight is the
    pair count multiplied by a pointwise-mutual-information style log ratio,
    and only edges whose weight reaches ``min_weight`` are kept.

    Args:
        movies_fn: path to ``movies.csv``.
        ratings_fn: path to ``ratings.csv`` (columns ``userId``, ``movieId``
            and ``rating`` are read).
        min_rating: minimal rating for a (user, movie) pair to be counted.
        min_weight: minimal weight for an edge to be kept in the graph.

    Returns:
        A ``(movies_graph, movies)`` tuple: the weighted ``networkx.Graph``
        and the DataFrame read from ``movies_fn``.
    """
    movies = pd.read_csv(movies_fn)
    ratings = pd.read_csv(ratings_fn)

    # Keep only the (user, movie) pairs rated highly enough, then collect
    # the list of such movies for every user.
    liked = ratings[ratings['rating'] >= min_rating]
    movies_per_user = liked[['userId', 'movieId']].groupby('userId').agg(list)['movieId']

    item_freq = defaultdict(int)   # movie -> number of users who liked it
    pair_freq = defaultdict(int)   # (movie_a, movie_b), a < b -> number of users who liked both
    for user_movies in tqdm(movies_per_user):
        for movie in user_movies:
            item_freq[movie] += 1
        # Sorting first gives every unordered pair a single canonical key.
        for pair in combinations(sorted(user_movies), 2):
            pair_freq[pair] += 1

    movies_graph = nx.Graph()
    log_total = math.log(sum(item_freq.values()))
    # Pointwise mutual information:
    #   pmi(x, y) = log(p(x, y) / (p(x) p(y))) = log p(x, y) - log p(x) - log p(y)
    # computed here from raw counts and scaled by the pair count.
    for (first, second), count in pair_freq.items():
        weight = count * (
            math.log(count) - math.log(item_freq[first]) - math.log(item_freq[second]) + log_total
        )
        if weight >= min_weight:
            movies_graph.add_edge(first, second, weight=weight)

    return movies_graph, movies

In [None]:
def random_walk(graph, num_walks=5, num_steps=10, p=1, q=1):
    """Sample biased (node2vec-style) random walks starting from every node.

    Args:
        graph: weighted graph exposing the networkx interface used here
            (``graph.nodes()``, ``graph[node]``, ``graph.has_edge``); every
            edge must carry a ``'weight'`` attribute.
        num_walks: number of walks started from each node.
        num_steps: number of steps per walk, so a complete walk holds
            ``num_steps + 1`` nodes.
        p: return parameter. The larger ``p``, the more exploration is
            encouraged; a small ``p`` produces more steps back to the
            previous node.
        q: in-out parameter. The larger ``q``, the more the walk stays
            local; a small ``q`` pushes it away from explored nodes.

    Returns:
        A list of walks, each a list of nodes beginning with its start node.
        A walk is cut short if it reaches a node without neighbours.
    """
    def next_step(previous, current):
        def bias(candidate):
            # Stepping straight back to where we came from is governed by p.
            if candidate == previous:
                return p
            # Candidate is also adjacent to the previous node: neutral bias.
            if graph.has_edge(candidate, previous):
                return 1
            # Candidate moves away from the previous node: governed by q.
            # (Also reached on the first step, where previous is None; the
            # uniform factor does not change the sampling distribution.)
            return q

        neighbours = graph[current]
        candidates = list(neighbours)
        if not candidates:
            # Dead end: nothing to sample from.
            return None
        weights = [attrs['weight'] / bias(n) for n, attrs in neighbours.items()]
        return random.choices(candidates, weights=weights)[0]

    walks = []
    nodes = list(graph.nodes())
    for _ in range(num_walks):
        for node in tqdm(nodes):
            walk = [node]
            prev_node, cur_node = None, node
            for _ in range(num_steps):
                next_node = next_step(prev_node, cur_node)
                if next_node is None:
                    break
                walk.append(next_node)
                prev_node, cur_node = cur_node, next_node
            walks.append(walk)
    return walks

## TP 10

In [None]:
# from utils import random_walk,construct_graph
import math
from tqdm import tqdm
import networkx as nx
from torch import nn
from torch.utils.data import DataLoader, Dataset
import random
import torch
from torch.utils.tensorboard import SummaryWriter

import time
import logging

# Configure the root logger so that INFO-level messages are emitted.
logging.basicConfig(level=logging.INFO)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2025-01-07 11:07:28.579753: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736244449.020604    7627 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736244449.145912    7627 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-07 11:07:30.223625: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## TODO

In [None]:
class SkipGramDataset(Dataset):
    """Skip-gram training pairs extracted from a collection of walks.

    Every node of every walk is paired with each other position lying at
    most ``window_size`` steps before or after it in the same walk. Pairs
    are stored in ``self.pairs`` as ``(center_id, context_id)`` tuples, with
    ids taken from ``nodes2id``.
    """

    def __init__(self, walks, window_size, nodes2id):
        self.pairs = []
        for walk in walks:
            walk_len = len(walk)
            for pos, center in enumerate(walk):
                # Clamp the context window to the walk boundaries.
                lo = max(0, pos - window_size)
                hi = min(walk_len, pos + window_size + 1)
                self.pairs.extend(
                    (nodes2id[center], nodes2id[walk[k]])
                    for k in range(lo, hi)
                    if k != pos
                )

    def __len__(self):
        """Return the number of (center, context) pairs."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return the pair at ``index`` as two scalar tensors."""
        center_id, context_id = self.pairs[index]
        return torch.tensor(center_id), torch.tensor(context_id)

In [None]:
class SkipGramModel(nn.Module):
    """Skip-gram model with negative sampling, used to learn node embeddings.

    Two embedding tables are kept: ``input_embeddings`` for center nodes and
    ``output_embeddings`` for context and negative nodes.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.input_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center, context, negative):
        """Return the negative-sampling loss of a batch as a scalar tensor.

        Args:
            center: index tensor of center nodes, shape ``(batch,)``.
            context: index tensor of positive context nodes, shape ``(batch,)``.
            negative: index tensor of negative samples, shape
                ``(batch, num_negatives)``.
        """
        center_emb = self.input_embeddings(center)
        context_emb = self.output_embeddings(context)
        negative_emb = self.output_embeddings(negative)

        # Positive pairs: maximise log(sigmoid(score)).
        # logsigmoid is used instead of log(sigmoid(x) + eps): the latter
        # saturates in float32 and yields zero gradients for large |score|.
        pos_score = torch.sum(center_emb * context_emb, dim=1)
        pos_loss = -nn.functional.logsigmoid(pos_score).mean()

        # Negative pairs: maximise log(1 - sigmoid(score)) = log(sigmoid(-score)).
        # squeeze(2) only drops the trailing bmm dimension, so the
        # (batch, num_negatives) shape survives batches or sample counts of 1.
        neg_score = torch.bmm(negative_emb, center_emb.unsqueeze(2)).squeeze(2)
        neg_loss = -nn.functional.logsigmoid(-neg_score).mean()

        return pos_loss + neg_loss

    def get_embeddings(self):
        """Return the center-node embedding matrix as a NumPy array on the CPU."""
        return self.input_embeddings.weight.detach().cpu().numpy()

In [None]:
def train_skipgram(model, dataloader, optimizer, epochs, negative_samples, vocab_size, id2title, log_dir='./runs/Q2'):
    """Train a skip-gram model with uniformly drawn negative samples.

    The mean batch loss is logged after every epoch, and every fifth epoch
    the current embeddings are written to TensorBoard.

    Args:
        model: skip-gram model; called as ``model(center, context, negative)``
            to obtain the loss and expected to provide ``get_embeddings()``.
        dataloader: yields ``(center, context)`` batches of node ids.
        optimizer: optimizer already bound to the model parameters.
        epochs: number of passes over ``dataloader``.
        negative_samples: number of negative ids drawn per center node.
        vocab_size: negative ids are drawn uniformly from ``[0, vocab_size)``.
        id2title: per-node labels passed as metadata to the embedding projector.
        log_dir: directory handed to the TensorBoard ``SummaryWriter``.
    """
    writer = SummaryWriter(log_dir)
    try:
        for epoch in range(epochs):
            total_loss = 0
            for center, context in dataloader:
                center = center.to(device)
                context = context.to(device)

                # Draw the negative samples uniformly over the vocabulary.
                negative = torch.randint(0, vocab_size, (center.size(0), negative_samples), device=device)

                optimizer.zero_grad()
                loss = model(center, context, negative)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            logging.info(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader)}")

            if (epoch + 1) % 5 == 0:
                model.eval()
                # The model exposes its embeddings through get_embeddings();
                # it has no `embeddings` attribute.
                embeddings = model.get_embeddings()
                writer.add_embedding(
                    torch.tensor(embeddings), metadata=id2title, tag=f"Epoch {epoch+1}"
                )
                model.train()
    finally:
        # Close the event file even if training is interrupted.
        writer.close()
    logging.info("TensorBoard embeddings logged.")

In [None]:
if __name__=="__main__":
    # End-to-end script: build the movie graph, sample random walks, then
    # train skip-gram embeddings and log them to TensorBoard.
    PATH = "data/ml-latest-small/"
    logging.info("Constructing graph")
    movies_graph, movies = construct_graph(PATH + "movies.csv", PATH + "ratings.csv")    # build the graph
    logging.info("Sampling walks")
    walks = random_walk(movies_graph,5,10,1,1)    # random walks
    # Contiguous integer ids for the graph nodes, plus the reverse mappings.
    id2nodes = list(movies_graph.nodes())
    nodes2id = {node: idx for idx, node in enumerate(id2nodes)}
    id2title = [movies[movies.movieId==idx].iloc[0].title for idx in id2nodes]

    # Parameters
    embedding_dim = 128
    window_size = 2
    negative_samples = 5
    epochs = 10
    batch_size = 128

    # Generate dataset
    dataset = SkipGramDataset(walks, window_size, nodes2id)
    dataloader = DataLoader(dataset, batch_size, shuffle=True)

    # Initialize model
    vocab_size = len(nodes2id)
    # The training loop moves every batch to `device`, so the model must be
    # there too; move it before the optimizer is built on its parameters.
    model = SkipGramModel(vocab_size, embedding_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Train model
    logging.info("Training loss with TensorBoard visualization")
    train_skipgram(model, dataloader, optimizer, epochs, negative_samples, vocab_size, id2title)
