In [1]:
# import warnings
# warnings.filterwarnings('ignore')

import pandas as pd
import csv
from collections import defaultdict
import numpy as np
import json
import torch
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
import random
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from math import log2
# Default path to data files
PATH = "../data/"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load user-item interaction data.
# Column names are supplied explicitly; only user/item/rating are kept
# (the timestamp column is dropped right after reading).
interaction_columns = ['user_id', 'item_id', 'rating', 'timestamp']
interaction_data = pd.read_csv(
    PATH + 'ml-100k/u.data',
    sep='\t',
    encoding="latin1",
    names=interaction_columns,
)
interaction_data = interaction_data[['user_id', 'item_id', 'rating']]
# Every observed rating is treated as an implicit positive interaction.
interaction_data['rating'] = 1
display(interaction_data.shape)
interaction_data.head(5)

(100000, 3)

Unnamed: 0,user_id,item_id,rating
0,196,242,1
1,186,302,1
2,22,377,1
3,244,51,1
4,166,346,1


In [3]:
# Load the test item IDs from the JSON file saved
# previously by the Knowledge Graph method
with open('../experiments/test_ids.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Collect the 'movieId' of every record, exactly as stored in the JSON
# (the sample displayed below shows string ids, not integers)
test_item_ids = [record['movieId'] for record in data]
n_test_ids = len(test_item_ids)
print(f"items on test set: {n_test_ids}")
display(test_item_ids[:5])

items on test set: 30


['159', '458', '679', '128', '658']

In [4]:
# Split the interactions into train and test sets so that the test items
# can later be evaluated as cold-start items.
# The membership mask is computed once and reused for both halves.
is_test_item = interaction_data['item_id'].astype(str).isin(test_item_ids)
train_interactions = interaction_data[~is_test_item].reset_index(drop=True)
test_interactions = interaction_data[is_test_item].reset_index(drop=True)

In [5]:
# Cast ids to strings so that comparisons against the test ids are type-safe
for id_column in ('user_id', 'item_id'):
    train_interactions[id_column] = train_interactions[id_column].astype(str)
test_item_ids = list(map(str, test_item_ids))  # defined earlier; normalised to str here

# Sorted lists (string order) of the users seen in training and of the items
# that remain in the training graph
unique_user_ids = sorted(train_interactions['user_id'].unique().tolist())
train_item_ids = sorted(train_interactions['item_id'].unique().tolist())  # test items are excluded

# Build the id -> index maps
user2idx = dict(zip(unique_user_ids, range(len(unique_user_ids))))
item2idx = dict(zip(train_item_ids, range(len(train_item_ids))))

# Keep the counts around (optional, but useful)
n_users, n_train_items = len(user2idx), len(item2idx)
print(f"n_users={n_users}, n_train_items={n_train_items}")

n_users=943, n_train_items=1652


In [6]:
# Map the raw ids to their local (per node type) indices
train_df = train_interactions.copy()
train_df['u_idx'] = train_df['user_id'].map(user2idx)
train_df['i_idx_local'] = train_df['item_id'].map(item2idx)  # training items only

# Drop rows without a mapping (safety net) and restore integer dtype
train_df = train_df.dropna(subset=['u_idx','i_idx_local']).astype({'u_idx':int, 'i_idx_local':int})

# Global node indices: items are shifted by n_users so they never overlap with users
u_nodes = train_df['u_idx'].values                      # already 0..n_users-1
i_nodes = train_df['i_idx_local'].values + n_users     # items -> n_users .. n_users + n_train_items - 1

# Duplicate every edge for an undirected graph: user->item and item->user
src = np.concatenate([u_nodes, i_nodes])
dst = np.concatenate([i_nodes, u_nodes])

# Stack into ONE (2, E) ndarray before handing it to torch: building a tensor
# from a Python list of ndarrays is slow and raises a UserWarning.
edge_index_train = torch.tensor(np.stack([src, dst]), dtype=torch.long)

# Sanity checks and persistence
print("edge_index_train shape:", edge_index_train.shape)
print("num_edges (directed count):", edge_index_train.shape[1])
print("num_unique_nodes referenced:", int(torch.unique(edge_index_train).numel()))
# saved for later cells
torch.save(edge_index_train, "edge_index_train.pt")

edge_index_train shape: torch.Size([2, 191922])
num_edges (directed count): 191922
num_unique_nodes referenced: 2595


  edge_index_train = torch.tensor([src, dst], dtype=torch.long)


In [7]:
# Accumulator for user features: user id -> {feature name -> value}
user_data = defaultdict(dict)


def load_feature(file_path, feature_name):
    """Read one user-feature CSV into the module-level ``user_data``.

    Each row must provide a ``userId`` column and a ``feature_name`` column;
    the value is stored (as the string read from the file) under
    ``user_data[userId][feature_name]``. Later rows for the same user
    overwrite earlier ones.
    """
    with open(file_path, newline='', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            uid, val = record['userId'], record[feature_name]
            user_data[uid][feature_name] = val

# Load each user feature file into user_data
for file_name, column in (
    ('ageRel.csv', 'age'),
    ('genderRel.csv', 'gender'),
    ('occupationRel.csv', 'occupation'),
    ('residesRel.csv', 'zipcode'),
):
    load_feature(PATH + file_name, column)

# Build user features as a dictionary (user_id -> list of "k:v" tokens);
# a feature missing for a user yields an empty value after the colon.
user_feature_names = ('age', 'gender', 'occupation', 'zipcode')
user_features_raw = {
    str(userId): [f'{name}:{attrs.get(name, "")}' for name in user_feature_names]
    for userId, attrs in user_data.items()
}

# Display the first 5 user feature entries
for item in list(user_features_raw.items())[:5]:
    print(item)

('1', ['age:24', 'gender:M', 'occupation:technician', 'zipcode:85'])
('2', ['age:53', 'gender:F', 'occupation:other', 'zipcode:94'])
('3', ['age:23', 'gender:M', 'occupation:writer', 'zipcode:32'])
('4', ['age:24', 'gender:M', 'occupation:technician', 'zipcode:43'])
('5', ['age:33', 'gender:F', 'occupation:other', 'zipcode:15'])


In [8]:
# Accumulator for item features: item id -> {feature name -> value or list}
item_data = defaultdict(lambda: defaultdict(list))


# NOTE: this redefines ``load_feature`` and shadows the user-feature version
# defined earlier in the notebook; from here on the name refers to the item loader.
def load_feature(file_path, feature_name):
    """Read one item-feature CSV into the module-level ``item_data``.

    Rows are keyed by their ``movieId`` column. ``genreDesc`` values are
    appended to the item's ``'genre'`` list (an item may have several genres);
    any other feature is stored as a single value under its own name.
    """
    with open(file_path, newline='', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            movie_id, val = record['movieId'], record[feature_name]
            if feature_name != 'genreDesc':
                item_data[movie_id][feature_name] = val
                continue
            item_data[movie_id]['genre'].append(val)

# Load each item feature file into item_data
for file_name, column in (('releaseRel.csv', 'releaseDate'), ('genreRel.csv', 'genreDesc')):
    load_feature(PATH + file_name, column)

# Build item features as a dictionary (item_id -> list of "k:v" tokens):
# one releaseDate token followed by one genre token per genre.
item_features_raw = {}
for itemId, attrs in item_data.items():
    tokens = [f'releaseDate:{attrs.get("releaseDate","")}']
    tokens.extend(f'genre:{genre}' for genre in attrs.get('genre', []))
    item_features_raw[str(itemId)] = tokens

# Display the first 5 item feature entries
for item in list(item_features_raw.items())[:5]:
    print(item)

('2', ['releaseDate:Jan-1995', 'genre:Action', 'genre:Adventure', 'genre:Thriller'])
('4', ['releaseDate:Jan-1995', 'genre:Action', 'genre:Comedy', 'genre:Drama'])
('17', ['releaseDate:Feb-1996', 'genre:Action', 'genre:Comedy', 'genre:Crime', 'genre:Horror', 'genre:Thriller'])
('21', ['releaseDate:Feb-1996', 'genre:Action', 'genre:Adventure', 'genre:Comedy', 'genre:Musical', 'genre:Thriller'])
('22', ['releaseDate:Feb-1996', 'genre:Action', 'genre:Drama', 'genre:War'])


In [9]:
# Helper that parses lists of "k:v" tokens
def parse_kv_list(kv_list):
    """Group ``"key:value"`` tokens by key.

    Entries that are not strings, or that contain no ``":"``, are skipped.
    Only the first ``":"`` splits a token, so values may contain colons.

    Returns:
        dict mapping each key to its value when the key occurred exactly once,
        or to the list of its values (in input order) when it occurred
        several times.
    """
    grouped = {}
    for token in kv_list:
        if not isinstance(token, str) or ":" not in token:
            continue
        key, _, val = token.partition(":")
        grouped.setdefault(key, []).append(val)
    # Simplify: single value -> plain string, several values -> list
    return {
        key: (vals[0] if len(vals) == 1 else vals)
        for key, vals in grouped.items()
    }

# Keys are strings; their order is the one fixed when the index maps were built
user_keys = list(user2idx.keys())
train_item_keys = list(item2idx.keys())  # items that are part of the training graph


def _parsed_rows(keys, raw_features):
    """Parse the "k:v" tokens of every key (order preserved) into row dicts with an 'id' field."""
    rows = []
    for key in keys:
        # the raw feature maps are keyed by string ids
        row = parse_kv_list(raw_features.get(str(key), []))
        row['id'] = key
        rows.append(row)
    return rows


# One row per user / per training item, indexed by id; missing features become ""
users_parsed = _parsed_rows(user_keys, user_features_raw)
df_users = pd.DataFrame(users_parsed).set_index('id').fillna("")

items_parsed = _parsed_rows(train_item_keys, item_features_raw)
df_items = pd.DataFrame(items_parsed).set_index('id').fillna("")

In [10]:
# USERS
# age numeric (fallback 0), gender string, occupation string, zipcode string.
# df_users was filled with "" for missing values, so a plain .astype(float)
# would raise on a user without an age; coerce to NaN and apply the 0 fallback.
if 'age' in df_users.columns:
    user_age = (
        pd.to_numeric(df_users['age'], errors='coerce')
        .fillna(0.0)
        .astype(float)
        .values.reshape(-1, 1)
    )
else:
    user_age = np.zeros((len(df_users), 1))
user_gender = df_users['gender'].astype(str).values.reshape(-1,1) if 'gender' in df_users.columns else np.array([[""]] * len(df_users))
user_occupation = df_users['occupation'].astype(str).values.reshape(-1,1) if 'occupation' in df_users.columns else np.array([[""]] * len(df_users))
user_zipcode = df_users['zipcode'].astype(str).values.reshape(-1,1) if 'zipcode' in df_users.columns else np.array([[""]] * len(df_users))

# ITEMS
# releaseDate is kept as the original string (e.g., "Jan-1995")
# genres: always a list for each item
item_release_str = []
item_genres_list = []
for iid in df_items.index:
    item_row = df_items.loc[iid]  # single lookup per item
    rd = item_row.get('releaseDate', "")
    # keep the original string (or an empty string if missing)
    item_release_str.append(rd if isinstance(rd, str) else "")
    # genres: the parsed value may be a list or a single string
    genres = item_row.get('genre', [])
    if isinstance(genres, str):
        genres = [genres] if genres else []
    item_genres_list.append(genres)

# Categorical user inputs, each shaped (n_users, 1); user_age is (n_users, 1) too
user_cat_inputs = {
    'gender': user_gender,
    'occupation': user_occupation,
    'zipcode': user_zipcode
}

# item_release_str is a list of strings, one per training item
# item_genres_list is a list of lists, one per training item

# Show shapes and samples for a quick check
print("n_users:", len(df_users))
print("user_age shape:", user_age.shape)
print("user_gender sample:", user_gender[:3].ravel())
print("user_occupation sample:", user_occupation[:3].ravel())
print("user_zipcode sample:", user_zipcode[:3].ravel())

print("n_train_items:", len(df_items))
print("item_release sample:", item_release_str[:3])
print("item_genres sample:", item_genres_list[:3])


n_users: 943
user_age shape: (943, 1)
user_gender sample: ['M' 'M' 'M']
user_occupation sample: ['technician' 'lawyer' 'executive']
user_zipcode sample: ['85' '90' '90']
n_train_items: 1652
item_release sample: ['Jan-1995', 'Feb-1997', 'Jan-1994']
item_genres sample: [['Animation', 'Childrens', 'Comedy'], ['Crime', 'Drama', 'Thriller'], ['Comedy', 'Western']]


In [11]:
# USERS
# ohe_user: a single one-hot encoder over gender, occupation and zipcode
ohe_user = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
user_cat_input = np.hstack(
    [user_cat_inputs[name] for name in ('gender', 'occupation', 'zipcode')]
)  # shape (n_users, 3)
user_cat_ohe = ohe_user.fit_transform(user_cat_input)  # (n_users, D_user_cat)

# Standardise age; the fitted scaler is kept so it can be saved below
sc_user_age = StandardScaler()
user_age_scaled = sc_user_age.fit_transform(user_age)  # (n_users, 1)

# ITEMS
# releaseDate is treated as a categorical token -> OneHotEncoder
ohe_release = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
item_release_arr = np.array(item_release_str).reshape(-1,1)  # (n_items, 1)
item_release_ohe = ohe_release.fit_transform(item_release_arr)  # (n_items, D_release)

# genres as a multi-hot matrix
mlb_genres = MultiLabelBinarizer(sparse_output=False)
genres_mx = mlb_genres.fit_transform(item_genres_list)  # (n_items, n_genres)

# Persist the fitted encoders
for fitted_encoder, encoder_file in (
    (ohe_user, "ohe_user.joblib"),
    (sc_user_age, "sc_user_age.joblib"),
    (ohe_release, "ohe_release.joblib"),
    (mlb_genres, "mlb_genres.joblib"),
):
    joblib.dump(fitted_encoder, encoder_file)

# Shapes for a quick check
print("user_cat_ohe shape:", user_cat_ohe.shape)
print("user_age_scaled shape:", user_age_scaled.shape)
print("item_release_ohe shape:", item_release_ohe.shape)
print("genres_mx shape:", genres_mx.shape)


user_cat_ohe shape: (943, 134)
user_age_scaled shape: (943, 1)
item_release_ohe shape: (1652, 109)
genres_mx shape: (1652, 18)


In [12]:
# Build the final feature matrices, stack them (users first, then training items)
# and save the artifacts.
# Expected widths of the two sub-blocks
user_feat_dim = user_age_scaled.shape[1] + user_cat_ohe.shape[1]
item_feat_dim = item_release_ohe.shape[1] + genres_mx.shape[1]

print("user sub-dim:", user_feat_dim)
print("item sub-dim:", item_feat_dim)

# Separate per-type feature matrices
user_features = np.hstack([user_age_scaled, user_cat_ohe]).astype(float)    # (n_users, D_u)
item_features = np.hstack([item_release_ohe, genres_mx]).astype(float)     # (n_items_train, D_i)

n_users, D_user = user_features.shape
n_items_train, D_item = item_features.shape

# Shared feature space: user columns first, item columns second
D = D_user + D_item
print("Final feature dimension D =", D)

# user_full = [user_features | zeros(n_users, D_item)]
user_pad = np.zeros((n_users, D_item), dtype=float)
user_full = np.concatenate([user_features, user_pad], axis=1)   # (n_users, D)

# item_full = [zeros(n_items_train, D_user) | item_features]
item_pad = np.zeros((n_items_train, D_user), dtype=float)
item_full = np.concatenate([item_pad, item_features], axis=1)   # (n_items_train, D)

# Stack rows: users first, then training items
X_all = np.concatenate([user_full, item_full], axis=0).astype(float)   # (n_users + n_items_train, D)
X_all_tensor = torch.tensor(np.nan_to_num(X_all), dtype=torch.float)

# Save the artifacts
torch.save(X_all_tensor, "X_all_train.pt")
np.save("user_features_train_full.npy", user_full)
np.save("item_features_train_full.npy", item_full)
dims_payload = {"n_users": n_users, "n_train_items": n_items_train, "feature_dim": D, "D_user": D_user, "D_item": D_item}
with open("dims.json", "w", encoding="utf-8") as f:
    json.dump(dims_payload, f)

print("Saved X_all_train.pt  shape:", X_all_tensor.shape)
print("user_full shape:", user_full.shape)
print("item_full shape:", item_full.shape)

user sub-dim: 135
item sub-dim: 127
Final feature dimension D = 262
Saved X_all_train.pt  shape: torch.Size([2595, 262])
user_full shape: (943, 262)
item_full shape: (1652, 262)


In [13]:
# Turn test items into vectors compatible with X_all

# Reload the item encoders and the dimensions saved earlier
ohe_release, mlb_genres = (
    joblib.load(encoder_file)
    for encoder_file in ("ohe_release.joblib", "mlb_genres.joblib")
)

with open("dims.json", "r", encoding="utf-8") as f:
    dims = json.load(f)
D_user, D_item, D = (int(dims[key]) for key in ("D_user", "D_item", "feature_dim"))

# Utility that parses a list ["k:v", ...] into a dict
def parse_kv_list_to_dict(kv_list):
    """Group ``"key:value"`` tokens into a plain dict of lists.

    Non-string entries and strings without ``":"`` are ignored; only the
    first ``":"`` splits a token. Every value of the returned dict is a list
    (in input order), even when a key occurred only once.
    """
    grouped = defaultdict(list)
    for token in kv_list:
        if isinstance(token, str) and ":" in token:
            key, _, val = token.partition(":")
            grouped[key].append(val)
    return dict(grouped)

# Turn a single item (list of "k:v" tokens) into its item-part vector of shape (D_item,)
def transform_single_test_item_itempart(kv_list):
    """Encode one item's ``"k:v"`` tokens as the item slice of a node feature vector.

    The result is the one-hot release-date block followed by the multi-hot
    genre block, i.e. the same column order used to build ``item_features``.
    Relies on the module-level ``ohe_release``, ``mlb_genres`` and ``D_item``.

    Args:
        kv_list: iterable of ``"key:value"`` strings; only the ``releaseDate``
            and ``genre`` keys are read.

    Returns:
        1-D float ``np.ndarray`` of length ``D_item``. A missing release date
        yields an all-zero release block.
    """
    import warnings

    parsed = parse_kv_list_to_dict(kv_list)

    # Width of the release block, read from the fitted encoder instead of
    # transforming a dummy category just to measure its output.
    n_release = len(ohe_release.get_feature_names_out())
    rd_vec = np.zeros((1, n_release))
    release_values = parsed.get('releaseDate')
    rd = release_values[0] if release_values else ""
    if rd:
        try:
            rd_vec = ohe_release.transform(np.array([[rd]]))  # shape (1, D_release)
        except ValueError:
            # The encoder rejected the token (e.g. it was fitted with
            # handle_unknown='error'): fall back to zeros.
            rd_vec = np.zeros((1, n_release))

    # genres -> shape (1, n_genres); parse_kv_list_to_dict always yields lists
    genres = parsed.get('genre', [])
    try:
        genres_vec = mlb_genres.transform([genres])
    except (ValueError, TypeError):
        genres_vec = np.zeros((1, len(mlb_genres.classes_)))

    # Concatenate the item parts in exactly the order used for item_features
    item_part = np.hstack([rd_vec, genres_vec]).astype(float).ravel()

    # Force the length to D_item (pad or trim), but do not hide the mismatch:
    # it means the loaded encoders and dims.json are out of sync.
    width = item_part.shape[0]
    if width != D_item:
        warnings.warn(
            f"item feature width {width} does not match D_item={D_item}; padding/trimming",
            RuntimeWarning,
            stacklevel=2,
        )
        if width < D_item:
            item_part = np.hstack([item_part, np.zeros((D_item - width,), dtype=float)])
        else:
            item_part = item_part[:D_item]
    return item_part

# Turn a batch of items (list of kv_lists) into a full feature matrix (n_items, D)
def transform_test_items_batch(kv_lists, save_path=None):
    """Encode several items as full node feature rows ``[zeros(D_user) | item part]``.

    Relies on the module-level ``D_user``, ``D_item`` and ``D`` and on
    ``transform_single_test_item_itempart``.

    Args:
        kv_lists: iterable with one list of ``"k:v"`` tokens per item.
        save_path: optional path; when truthy the matrix is written with ``np.save``.

    Returns:
        Float ``np.ndarray`` of shape ``(n_items, D)``; an empty input yields
        a ``(0, D)`` matrix instead of raising inside ``np.vstack``.
    """
    import warnings

    item_rows = [transform_single_test_item_itempart(kv) for kv in kv_lists]
    if item_rows:
        item_parts = np.vstack(item_rows)  # shape (n_items, D_item)
    else:
        item_parts = np.zeros((0, D_item), dtype=float)

    # Full vectors: the user columns are all zero for item nodes
    user_zeros = np.zeros((item_parts.shape[0], D_user), dtype=float)
    full = np.hstack([user_zeros, item_parts])  # shape (n_items, D)

    # Sanity: force the exact width D, but report when a correction was needed
    width = full.shape[1]
    if width != D:
        warnings.warn(
            f"feature width {width} does not match D={D}; padding/trimming",
            RuntimeWarning,
            stacklevel=2,
        )
        if width < D:
            pad = np.zeros((full.shape[0], D - width), dtype=float)
            full = np.hstack([full, pad])
        else:
            full = full[:, :D]
    if save_path:
        np.save(save_path, full)
    return full

In [None]:
# Train the GraphSAGE model
# Load the artifacts written by the earlier cells
edge_index = torch.load("edge_index_train.pt")                # shape [2, E]
X_all = torch.load("X_all_train.pt")                          # shape [n_nodes, D]
with open("dims.json","r",encoding="utf-8") as f:
    dims = json.load(f)
n_users, n_train_items, D = (
    int(dims[key]) for key in ("n_users", "n_train_items", "feature_dim")
)

# Device and seeds (torch, numpy and the stdlib RNG are all seeded with 42)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    seed_fn(42)

# Positive pairs (u_idx, i_idx_global), rebuilt from edge_index by keeping
# only the user->item direction of every edge
ei = edge_index.cpu().numpy()
src, dst = ei[0], ei[1]
# user->item edges have src < n_users and dst >= n_users
mask = (src < n_users) & (dst >= n_users)
pos_u, pos_i = src[mask], dst[mask]
# the item index is already global (n_users + local_item_idx)
pos_pairs = list(zip(pos_u.tolist(), pos_i.tolist()))

print("num positive pairs:", len(pos_pairs))

# Negative sampling helper: candidates are the global item indices
# n_users .. n_users + n_train_items - 1
all_item_globals = np.arange(n_users, n_users + n_train_items)


def sample_negatives(batch_users, k=1):
    """Draw ``k`` random global item indices for every user in ``batch_users``.

    Items are drawn with replacement from ``all_item_globals`` through the
    global NumPy RNG, one draw call per user. The users themselves are not
    inspected, so a sampled item may be one the user actually interacted with.

    Returns:
        ``np.ndarray`` of shape ``(len(batch_users), k)`` for a non-empty batch.
    """
    return np.array([
        np.random.choice(all_item_globals, size=k, replace=True)
        for _ in batch_users
    ])

# GraphSAGE encoder
class GraphSAGEEncoder(torch.nn.Module):
    """Stack of ``SAGEConv`` layers that outputs L2-normalised node embeddings.

    With ``num_layers == 1`` a single ``in_dim -> out_dim`` layer is built and
    ``hidden_dim`` is unused. Otherwise the stack is
    ``in_dim -> hidden_dim -> ... -> hidden_dim -> out_dim``; values of
    ``num_layers`` below 2 (other than 1) still produce this two-layer minimum.
    """

    def __init__(self, in_dim, hidden_dim=128, out_dim=64, num_layers=2):
        super().__init__()
        if num_layers == 1:
            layer_dims = [in_dim, out_dim]
        else:
            n_hidden = max(num_layers - 1, 1)
            layer_dims = [in_dim] + [hidden_dim] * n_hidden + [out_dim]
        # One convolution per consecutive pair of dimensions, built in order
        self.convs = torch.nn.ModuleList(
            [SAGEConv(d_in, d_out) for d_in, d_out in zip(layer_dims[:-1], layer_dims[1:])]
        )
        self.act = torch.nn.ReLU()

    def forward(self, x, edge_index):
        """Apply every layer (ReLU after all but the last) and L2-normalise the rows."""
        last = len(self.convs) - 1
        for depth, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if depth != last:
                x = self.act(x)
        return F.normalize(x, p=2, dim=-1)

# Hyperparameters
in_dim = X_all.shape[1]
hidden_dim = 64
out_dim = 32
num_layers = 1
lr = 5e-4
weight_decay = 1e-6
epochs = 30
batch_size = 512  # tune to dataset/memory; any value >= len(pos_pairs) gives one full batch per epoch

# Instantiate the model and the optimizer
encoder = GraphSAGEEncoder(in_dim, hidden_dim, out_dim, num_layers).to(DEVICE)
optimizer = torch.optim.Adam(encoder.parameters(), lr=lr, weight_decay=weight_decay)

# Move the tensors to DEVICE
X_all = X_all.to(DEVICE)
edge_index = edge_index.to(DEVICE)


def _train_step(batch):
    """Run one optimisation step on a list of (user, positive item) pairs; return the loss value.

    One random negative item is sampled per pair, all node embeddings are
    recomputed on the full training graph, and the pairwise loss
    ``-log(sigmoid(pos_score - neg_score))`` is minimised.
    """
    batch_users = [pair[0] for pair in batch]
    batch_pos_items = [pair[1] for pair in batch]
    neg_items = sample_negatives(batch_users, k=1)[:, 0].tolist()
    z = encoder(X_all, edge_index)  # embeddings for every node
    u_emb = z[torch.tensor(batch_users, device=DEVICE)]
    pos_emb = z[torch.tensor(batch_pos_items, device=DEVICE)]
    neg_emb = z[torch.tensor(neg_items, device=DEVICE)]
    pos_scores = (u_emb * pos_emb).sum(dim=-1)
    neg_scores = (u_emb * neg_emb).sum(dim=-1)
    loss = -torch.log(torch.sigmoid(pos_scores - neg_scores) + 1e-15).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()


# Training. A single mini-batch loop covers both regimes: when batch_size is at
# least len(pos_pairs) the loop body runs once per epoch on the whole set, so the
# former copy-pasted "full batch" branch is no longer needed.
n_pairs = len(pos_pairs)
for epoch in range(1, epochs+1):
    encoder.train()
    random.shuffle(pos_pairs)
    weighted_loss = 0.0
    for start in range(0, n_pairs, batch_size):
        batch = pos_pairs[start:start+batch_size]
        weighted_loss += _train_step(batch) * len(batch)
    # per-pair average; max() avoids a division by zero when there are no pairs
    total_loss = weighted_loss / max(n_pairs, 1)

    print(f"Epoch {epoch:02d}/{epochs}  loss={total_loss:.4f}")

# Save the encoder weights and the embeddings
torch.save(encoder.state_dict(), "graphsage_encoder.pt")
encoder.eval()
with torch.no_grad():
    z_all = encoder(X_all, edge_index).cpu().numpy()  # shape (n_nodes, out_dim)
user_embeddings = z_all[:n_users]
item_embeddings_train = z_all[n_users:]  # rows follow the item2idx order of the training items

np.save("user_embeddings.npy", user_embeddings)
np.save("item_embeddings_train.npy", item_embeddings_train)
print("Saved encoder and embeddings. user_embeddings shape:", user_embeddings.shape, "item_embeddings_train shape:", item_embeddings_train.shape)

num positive pairs: 95961
Epoch 01/30  loss=0.5659
Epoch 02/30  loss=0.4773
Epoch 03/30  loss=0.4544
Epoch 04/30  loss=0.4440
Epoch 05/30  loss=0.4421
Epoch 06/30  loss=0.4398
Epoch 07/30  loss=0.4378
Epoch 08/30  loss=0.4381
Epoch 09/30  loss=0.4372
Epoch 10/30  loss=0.4361
Epoch 11/30  loss=0.4361
Epoch 12/30  loss=0.4342
Epoch 13/30  loss=0.4340
Epoch 14/30  loss=0.4346
Epoch 15/30  loss=0.4336
Epoch 16/30  loss=0.4339
Epoch 17/30  loss=0.4338
Epoch 18/30  loss=0.4334
Epoch 19/30  loss=0.4335
Epoch 20/30  loss=0.4337
Epoch 21/30  loss=0.4330
Epoch 22/30  loss=0.4342
Epoch 23/30  loss=0.4327
Epoch 24/30  loss=0.4327
Epoch 25/30  loss=0.4322
Epoch 26/30  loss=0.4322
Epoch 27/30  loss=0.4337
Epoch 28/30  loss=0.4344
Epoch 29/30  loss=0.4329
Epoch 30/30  loss=0.4336
Saved encoder and embeddings. user_embeddings shape: (943, 32) item_embeddings_train shape: (1652, 32)


In [15]:
# Embedding inference for the test items:
# append the test items as isolated nodes and embed them with the trained encoder

# Load artifacts
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_all_train = torch.load("X_all_train.pt")           # (n_nodes_train, D)
edge_index = torch.load("edge_index_train.pt")       # (2, E)
with open("dims.json","r",encoding="utf-8") as f:
    dims = json.load(f)
n_nodes_train = X_all_train.shape[0]
D = int(dims["feature_dim"])

# Rebuild the encoder with the SAME hyperparameters used in the training cell
# (hidden_dim / out_dim / num_layers) instead of a hard-coded copy of them;
# otherwise load_state_dict fails as soon as the training configuration changes.
in_dim = X_all_train.shape[1]
encoder = GraphSAGEEncoder(in_dim, hidden_dim=hidden_dim, out_dim=out_dim, num_layers=num_layers).to(DEVICE)
encoder.load_state_dict(torch.load("graphsage_encoder.pt", map_location=DEVICE))
encoder.eval()

# Token lists for the test items, aligned with `test_item_ids`.
# NOTE(review): an already-defined `test_kv_lists` is reused as-is, so a stale
# value from an earlier run of the kernel would silently be picked up here.
try:
    test_kv_lists
except NameError:
    test_kv_lists = [ item_features_raw.get(str(iid), []) for iid in test_item_ids ]

# Full feature matrix (n_test, D) via transform_test_items_batch
# (defined in the test-item transformation cell above)
X_test_full = transform_test_items_batch(test_kv_lists, save_path=None)  # numpy (n_test, D)
assert X_test_full.shape[1] == D, "Dimensão incompatível: ajuste encoders/padding"

# Append the test rows without touching edge_index: the new nodes have no edges
X_all_extended = np.vstack([ X_all_train.cpu().numpy(), X_test_full ])   # (n_nodes_train + n_test, D)
X_all_ext_t = torch.tensor(X_all_extended, dtype=torch.float).to(DEVICE)

# Forward pass (edge_index is still the training graph)
edge_index = edge_index.to(DEVICE)
with torch.no_grad():
    embeddings_all = encoder(X_all_ext_t, edge_index)   # (n_nodes_train + n_test, out_dim)
emb_np = embeddings_all.cpu().numpy()

# Test item embeddings are the rows appended after the training nodes
n_test = X_test_full.shape[0]
test_global_start = n_nodes_train
test_global_indices = np.arange(test_global_start, test_global_start + n_test)
test_embeddings = emb_np[test_global_indices, :]

# Save the embeddings and the id -> row mapping
np.save("test_item_embeddings.npy", test_embeddings)

mapping = { str(item_id): int(row) for row, item_id in enumerate(test_item_ids) }
with open("test_item_id2idx.json","w",encoding="utf-8") as f:
    json.dump(mapping,f)

print("Done. test_embeddings shape:", test_embeddings.shape)
print("Saved mapping test_item_id2idx.json (id -> row index in test_item_embeddings.npy).")

Done. test_embeddings shape: (30, 32)
Saved mapping test_item_id2idx.json (id -> row index in test_item_embeddings.npy).


In [16]:
# Load artifacts and build the per-item ground-truth user sets

# Load embeddings and maps
user_embeddings = np.load("user_embeddings.npy")            # shape (n_users, emb_dim)
item_embeddings_test = np.load("test_item_embeddings.npy")  # shape (n_test, emb_dim)
with open("test_item_id2idx.json","r",encoding="utf-8") as f:
    test_id2row = json.load(f)     # map item_id -> row index in item_embeddings_test

# Normalise id types to str so they match the keys of test_id2row / user2idx
test_interactions['user_id'] = test_interactions['user_id'].astype(str)
test_interactions['item_id'] = test_interactions['item_id'].astype(str)

# Ground truth: dict item_id -> set(user_idx).
# Iterating the two columns directly avoids the per-row Series that iterrows()
# builds; the row index of the item is not needed here, so it is not looked up.
gt_item2users = {}
for iid, uid in zip(test_interactions['item_id'], test_interactions['user_id']):
    if iid in test_id2row and uid in user2idx:
        gt_item2users.setdefault(iid, set()).add(int(user2idx[uid]))

# Keep only the test items that have at least one positive (i.e. can be evaluated)
evaluated_items = [iid for iid, s in gt_item2users.items() if len(s) > 0]
print("n_test_items_total:", len(test_id2row))
print("n_test_items_with_positives:", len(evaluated_items))


n_test_items_total: 30
n_test_items_with_positives: 30


In [17]:
# Normalise embeddings so that a dot product equals the cosine similarity
def l2_normalize_rows(X):
    """Return ``X`` with every row divided by its L2 norm.

    Rows whose norm is exactly zero are divided by 1 instead, so they stay
    all-zero rather than producing NaNs.
    """
    row_norms = np.linalg.norm(X, axis=1, keepdims=True)
    safe_norms = np.where(row_norms == 0, 1.0, row_norms)
    return X / safe_norms

U = l2_normalize_rows(user_embeddings)         # (n_users, d)
I = l2_normalize_rows(item_embeddings_test)     # (n_test, d)

# I @ U.T -> (n_test, n_users) similarity matrix (dense; fine at ML-100k size)
sims = I.dot(U.T)

K_max = 50


def _top_users_desc(scores, k):
    """Indices of the (at most) ``k`` highest scores, ordered by descending score."""
    if scores.shape[0] <= k:
        return np.argsort(-scores)
    # partial selection first, then sort only the k selected candidates
    candidates = np.argpartition(-scores, k - 1)[:k]
    return candidates[np.argsort(-scores[candidates])]


# For every test item (row index): array of user indices, length <= K_max
n_test = I.shape[0]
topk_users_per_test = {
    test_i: _top_users_desc(sims[test_i], K_max) for test_i in range(n_test)
}

print("Computed top-50 users for each test item (by row index).")

Computed top-50 users for each test item (by row index).


In [18]:
# Compute the mean Precision@K and NDCG@K, for K=10, 20 and 50, over the test nodes

def precision_at_k(recommended, ground_truth_set, k):
    """Fraction of the first ``k`` recommendations found in ``ground_truth_set``.

    The denominator is always ``k``, even when fewer than ``k``
    recommendations are available.
    """
    n_hits = sum(candidate in ground_truth_set for candidate in recommended[:k])
    return n_hits / k

def dcg_at_k(recommended, ground_truth_set, k):
    """Discounted cumulative gain of the first ``k`` recommendations (binary relevance).

    A hit at 1-based position ``p`` contributes ``1 / log2(p + 1)``.
    """
    total = 0.0
    # `discount_arg` is position + 1: the first position is discounted by log2(2)
    for discount_arg, candidate in enumerate(recommended[:k], start=2):
        if candidate in ground_truth_set:
            total += 1.0 / log2(discount_arg)
    return total

def idcg_at_k(ground_truth_set, k):
    """Ideal DCG@k: ``min(len(ground_truth_set), k)`` hits placed at the top positions.

    Returns 1.0 when there is nothing to place, so that callers can divide
    by the result safely.
    """
    n_rel = min(len(ground_truth_set), k)
    if n_rel <= 0:
        return 1.0  # avoids a division by zero for items without positives
    return sum(1.0 / log2(position + 2) for position in range(n_rel))

# Cut-offs at which the metrics are reported
Ks = [10, 20, 50]
sum_prec = dict.fromkeys(Ks, 0.0)
sum_ndcg = dict.fromkeys(Ks, 0.0)
count_items = 0

# Accumulate the per-item metrics over every evaluable test item
for iid in evaluated_items:
    row_idx = test_id2row[iid]
    recommended = topk_users_per_test[row_idx].tolist()
    gt_set = gt_item2users[iid]
    if not gt_set:
        continue  # items without positives are skipped
    count_items += 1
    for k in Ks:
        sum_prec[k] += precision_at_k(recommended, gt_set, k)
        sum_ndcg[k] += dcg_at_k(recommended, gt_set, k) / idcg_at_k(gt_set, k)

# Averages over the evaluated items
avg_prec = {k: sum_prec[k] / count_items for k in Ks}
avg_ndcg = {k: sum_ndcg[k] / count_items for k in Ks}

print("n_evaluated_items:", count_items)
for k in Ks:
    print(f"Precision@{k}: {avg_prec[k]:.4f}    NDCG@{k}: {avg_ndcg[k]:.4f}")

n_evaluated_items: 30
Precision@10: 0.3100    NDCG@10: 0.3349
Precision@20: 0.2683    NDCG@20: 0.2959
Precision@50: 0.2487    NDCG@50: 0.2679


Configuração de hiperparâmetros:
in_dim = X_all.shape[1]
hidden_dim = 128
out_dim = 64
num_layers = 2
lr = 1e-3
weight_decay = 1e-5
epochs = 30
batch_size = 1024

Resultados:
n_evaluated_items: 30
Precision@10: 0.1833    NDCG@10: 0.1910
Precision@20: 0.2000    NDCG@20: 0.1997
Precision@50: 0.2153    NDCG@50: 0.2120

Configuração de hiperparâmetros:
in_dim = X_all.shape[1]
hidden_dim = 128
out_dim = 64
num_layers = 2
lr = 1e-3
weight_decay = 1e-5
epochs = 30
batch_size = 1024

Resultados:
n_evaluated_items: 30
Precision@10: 0.2067    NDCG@10: 0.2160
Precision@20: 0.1767    NDCG@20: 0.1913
Precision@50: 0.1620    NDCG@50: 0.1734