In [1]:
# import warnings
# warnings.filterwarnings('ignore')

import pandas as pd
import csv
from collections import defaultdict
import numpy as np
import json
import torch
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
import random
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from math import log2
import os
import random
from datetime import datetime
import glob
# Default path to data files
PATH = "../../data/"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load user-item interaction data
interaction_data = pd.read_csv(
    PATH + 'ml-100k/u.data',
    sep='\t',
    encoding="latin1",
    names=['user_id', 'item_id', 'rating', 'timestamp']
    )[['user_id', 'item_id', 'rating']]
interaction_data['rating'] = 1
display(interaction_data.shape)
interaction_data.head(5)

(100000, 3)

Unnamed: 0,user_id,item_id,rating
0,196,242,1
1,186,302,1
2,22,377,1
3,244,51,1
4,166,346,1


In [3]:
# Load test item IDs from the json file saved
# previously from Knowledge Graph Method
with open('../../experiments/test_ids.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Extract test item IDs as integers
test_item_ids = [item['movieId'] for item in data]
print(f"items on test set: {len(test_item_ids)}")
display(test_item_ids[:5])

items on test set: 30


['159', '458', '679', '128', '658']

In [4]:
# Split interaction data into train and test sets
# to allow later cold start evaluation 
train_interactions = interaction_data[
    ~interaction_data['item_id'].astype(str).isin(test_item_ids)].reset_index(drop=True)
test_interactions = interaction_data[
    interaction_data['item_id'].astype(str).isin(test_item_ids)].reset_index(drop=True)

In [5]:
# garantir que ids sejam strings para comparação segura
train_interactions['user_id'] = train_interactions['user_id'].astype(str)
train_interactions['item_id'] = train_interactions['item_id'].astype(str)
test_item_ids = [str(x) for x in test_item_ids]  # já definido antes

# criar lista ordenada de users observados no treino e items que permaneceram no grafo de treino
unique_user_ids = sorted(train_interactions['user_id'].unique().tolist())
train_item_ids = sorted(train_interactions['item_id'].unique().tolist())  # exclui itens de teste

# construir mapas id -> índice
user2idx = {u: i for i, u in enumerate(unique_user_ids)}
item2idx = {v: i for i, v in enumerate(train_item_ids)}

# contar e salvar (opcional, mas útil)
n_users = len(user2idx)
n_train_items = len(item2idx)
print(f"n_users={n_users}, n_train_items={n_train_items}")

n_users=943, n_train_items=1652


In [6]:
# Mapear colunas para índices locais
train_df = train_interactions.copy()
train_df['u_idx'] = train_df['user_id'].map(user2idx)
train_df['i_idx_local'] = train_df['item_id'].map(item2idx)  # items de treino apenas

# Remover linhas sem mapeamento (segurança)
train_df = train_df.dropna(subset=['u_idx','i_idx_local']).astype({'u_idx':int, 'i_idx_local':int})

# Construir índices globais: shift nos items para evitar sobreposição com users
u_nodes = train_df['u_idx'].values                      # já 0..n_users-1
i_nodes = train_df['i_idx_local'].values + n_users     # items -> n_users .. n_users + n_train_items - 1

# Duplicar para grafo não direcionado: user->item e item->user
src = np.concatenate([u_nodes, i_nodes])
dst = np.concatenate([i_nodes, u_nodes])

edge_index_train = torch.tensor([src, dst], dtype=torch.long)

# Sanity checks e salvar
print("edge_index_train shape:", edge_index_train.shape)
print("num_edges (directed count):", edge_index_train.shape[1])
print("num_unique_nodes referenced:", int(torch.unique(edge_index_train).numel()))
# salvar para uso posterior
torch.save(edge_index_train, "edge_index_train.pt")

edge_index_train shape: torch.Size([2, 191922])
num_edges (directed count): 191922


  edge_index_train = torch.tensor([src, dst], dtype=torch.long)


num_unique_nodes referenced: 2595


In [7]:
# Instantiate a defaultdict to hold user features
user_data = defaultdict(dict)

# Read data and build user features dictionary
def load_feature(file_path, feature_name):
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            userId = row['userId']
            value = row[feature_name]
            user_data[userId][feature_name] = value

# Load each feature file
load_feature(PATH + 'ageRel.csv', 'age')
load_feature(PATH + 'genderRel.csv', 'gender')
load_feature(PATH + 'occupationRel.csv', 'occupation')
load_feature(PATH + 'residesRel.csv', 'zipcode')

# Build user features as a dictionary (user_id -> list of "k:v" features)
user_features_raw = {
    str(userId): [
        f'age:{data.get("age","")}',
        f'gender:{data.get("gender","")}',
        f'occupation:{data.get("occupation","")}',
        f'zipcode:{data.get("zipcode","")}'
    ]
    for userId, data in user_data.items()
}

# Display first 5 user features
for item in list(user_features_raw.items())[:5]:
    print(item)

('1', ['age:24', 'gender:M', 'occupation:technician', 'zipcode:85'])
('2', ['age:53', 'gender:F', 'occupation:other', 'zipcode:94'])
('3', ['age:23', 'gender:M', 'occupation:writer', 'zipcode:32'])
('4', ['age:24', 'gender:M', 'occupation:technician', 'zipcode:43'])
('5', ['age:33', 'gender:F', 'occupation:other', 'zipcode:15'])


In [8]:
# Instantiate a defaultdict to hold item features
item_data = defaultdict(lambda: defaultdict(list))

# Read data and build item features dictionary
# Modified version to handle multiple genres
def load_feature(file_path, feature_name):
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            itemId = row['movieId']
            value = row[feature_name]
            if feature_name == 'genreDesc':
                item_data[itemId]['genre'].append(value)
            else:
                item_data[itemId][feature_name] = value

# Load each feature file
load_feature(PATH + 'releaseRel.csv', 'releaseDate')
load_feature(PATH + 'genreRel.csv', 'genreDesc')

# Build item features as a dictionary (item_id -> list of "k:v" features)
item_features_raw = {
    str(itemId): (
        [f'releaseDate:{data.get("releaseDate","")}'] +
        [f'genre:{genre}' for genre in data.get('genre', [])]
    )
    for itemId, data in item_data.items()
}

# Display first 5 item features
for item in list(item_features_raw.items())[:5]:
    print(item)

('2', ['releaseDate:Jan-1995', 'genre:Action', 'genre:Adventure', 'genre:Thriller'])
('4', ['releaseDate:Jan-1995', 'genre:Action', 'genre:Comedy', 'genre:Drama'])
('17', ['releaseDate:Feb-1996', 'genre:Action', 'genre:Comedy', 'genre:Crime', 'genre:Horror', 'genre:Thriller'])
('21', ['releaseDate:Feb-1996', 'genre:Action', 'genre:Adventure', 'genre:Comedy', 'genre:Musical', 'genre:Thriller'])
('22', ['releaseDate:Feb-1996', 'genre:Action', 'genre:Drama', 'genre:War'])


In [9]:
# helper para parse "k:v" lists
def parse_kv_list(kv_list):
    d = defaultdict(list)
    for kv in kv_list:
        if isinstance(kv, str) and ":" in kv:
            k,v = kv.split(":",1)
            d[k].append(v)
    # return simplified: single-value -> string, multi -> list
    out = {}
    for k,vals in d.items():
        out[k] = vals[0] if len(vals)==1 else vals
    return out

# garantir chaves strings
user_keys = list(user2idx.keys())        # ordem já definida antes
train_item_keys = list(item2idx.keys())  # itens que estão no grafo de treino

# criar parsed lists (preserve order)
users_parsed = []
for uid in user_keys:
    kvs = user_features_raw.get(str(uid), [])   # ajuste: metadata map may use string keys
    parsed = parse_kv_list(kvs)
    parsed['id'] = uid
    users_parsed.append(parsed)
df_users = pd.DataFrame(users_parsed).set_index('id').fillna("")

items_parsed = []
for iid in train_item_keys:
    kvs = item_features_raw.get(str(iid), [])
    parsed = parse_kv_list(kvs)
    parsed['id'] = iid
    items_parsed.append(parsed)
df_items = pd.DataFrame(items_parsed).set_index('id').fillna("")

In [10]:
# USERS
# age numeric (fallback 0), gender string, occupation string, zipcode string
user_age = df_users['age'].astype(float).values.reshape(-1,1) if 'age' in df_users.columns else np.zeros((len(df_users),1))
user_gender = df_users['gender'].astype(str).values.reshape(-1,1) if 'gender' in df_users.columns else np.array([[""]] * len(df_users))
user_occupation = df_users['occupation'].astype(str).values.reshape(-1,1) if 'occupation' in df_users.columns else np.array([[""]] * len(df_users))
user_zipcode = df_users['zipcode'].astype(str).values.reshape(-1,1) if 'zipcode' in df_users.columns else np.array([[""]] * len(df_users))

# ITEMS
# releaseDate kept as original string (e.g., "Jan-1995")
# genres: ensure list for each item
item_release_str = []
item_genres_list = []
for iid in df_items.index:
    rd = df_items.loc[iid].get('releaseDate', "")
    # keep original string (or empty string if missing)
    item_release_str.append(rd if isinstance(rd, str) else "")
    # genres: parsed could be list or single string
    genres = df_items.loc[iid].get('genre', [])
    if isinstance(genres, str):
        genres = [genres] if genres else []
    item_genres_list.append(genres)

# Converter para arrays numpy apropriados
# user_age already shaped (n_users,1)
# user_gender, user_occupation, user_zipcode shaped (n_users,1)
user_cat_inputs = {
    'gender': user_gender,
    'occupation': user_occupation,
    'zipcode': user_zipcode
}

# item_release_str is a list of strings length n_items
# item_genres_list is a list of lists length n_items

# Mostre formas e amostras para verificação rápida
print("n_users:", len(df_users))
print("user_age shape:", user_age.shape)
print("user_gender sample:", user_gender[:3].ravel())
print("user_occupation sample:", user_occupation[:3].ravel())
print("user_zipcode sample:", user_zipcode[:3].ravel())

print("n_train_items:", len(df_items))
print("item_release sample:", item_release_str[:3])
print("item_genres sample:", item_genres_list[:3])


n_users: 943
user_age shape: (943, 1)
user_gender sample: ['M' 'M' 'M']
user_occupation sample: ['technician' 'lawyer' 'executive']
user_zipcode sample: ['85' '90' '90']
n_train_items: 1652
item_release sample: ['Jan-1995', 'Feb-1997', 'Jan-1994']
item_genres sample: [['Animation', 'Childrens', 'Comedy'], ['Crime', 'Drama', 'Thriller'], ['Comedy', 'Western']]


In [11]:
# USERS
# ohe_user: gender, occupation, zipcode combinados
ohe_user = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
user_cat_input = np.hstack([user_cat_inputs['gender'], user_cat_inputs['occupation'], user_cat_inputs['zipcode']])  # shape (n_users, 3)
user_cat_ohe = ohe_user.fit_transform(user_cat_input)  # (n_users, D_user_cat)

# age scaler
sc_user_age = StandardScaler().fit(user_age)  # user_age shape (n_users,1)
user_age_scaled = sc_user_age.transform(user_age)  # (n_users,1)

# ITEMS
# releaseDate as categorical token -> OneHotEncoder
ohe_release = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
item_release_arr = np.array(item_release_str).reshape(-1,1)  # (n_items,1)
item_release_ohe = ohe_release.fit_transform(item_release_arr)  # (n_items, D_release)

# genres multi-hot
mlb_genres = MultiLabelBinarizer(sparse_output=False)
genres_mx = mlb_genres.fit_transform(item_genres_list)  # (n_items, n_genres)

# salvar encoders
joblib.dump(ohe_user, "ohe_user.joblib")
joblib.dump(sc_user_age, "sc_user_age.joblib")
joblib.dump(ohe_release, "ohe_release.joblib")
joblib.dump(mlb_genres, "mlb_genres.joblib")

# prints para verificação
print("user_cat_ohe shape:", user_cat_ohe.shape)
print("user_age_scaled shape:", user_age_scaled.shape)
print("item_release_ohe shape:", item_release_ohe.shape)
print("genres_mx shape:", genres_mx.shape)


user_cat_ohe shape: (943, 134)
user_age_scaled shape: (943, 1)
item_release_ohe shape: (1652, 109)
genres_mx shape: (1652, 18)


In [12]:
# Construir matrizes finais e concatenar (users then train items), salvar artefatos
# verificar shapes
user_feat_dim = user_age_scaled.shape[1] + user_cat_ohe.shape[1]   # ou user_features.shape[1] se já tiver
item_feat_dim = item_release_ohe.shape[1] + genres_mx.shape[1]    # ou item_features.shape[1]

print("user sub-dim:", user_feat_dim)
print("item sub-dim:", item_feat_dim)

# construir matrizes finais separadas (recalcular para evitar confusão)
user_features = np.hstack([user_age_scaled, user_cat_ohe]).astype(float)    # shape (n_users, D_u)
item_features = np.hstack([item_release_ohe, genres_mx]).astype(float)     # shape (n_items_train, D_i)

n_users = user_features.shape[0]
n_items_train = item_features.shape[0]

# Dimensão comum: concatenação das sub-dimensões (users first, items second)
D_user = user_features.shape[1]
D_item = item_features.shape[1]
D = D_user + D_item
print("Final feature dimension D =", D)

# construir user_full: [user_features , zeros(n_users, D_item)]
user_pad = np.zeros((n_users, D_item), dtype=float)
user_full = np.hstack([user_features, user_pad])   # shape (n_users, D)

# construir item_full: [zeros(n_items_train, D_user) , item_features]
item_pad = np.zeros((n_items_train, D_user), dtype=float)
item_full = np.hstack([item_pad, item_features])   # shape (n_items_train, D)

# agora empilhar em X_all mantendo ordem users then train items
X_all = np.vstack([user_full, item_full]).astype(float)   # shape (n_users + n_items_train, D)
X_all_tensor = torch.tensor(np.nan_to_num(X_all), dtype=torch.float)

# salvar
torch.save(X_all_tensor, "X_all_train.pt")
np.save("user_features_train_full.npy", user_full)
np.save("item_features_train_full.npy", item_full)
with open("dims.json", "w", encoding="utf-8") as f:
    json.dump({"n_users": n_users, "n_train_items": n_items_train, "feature_dim": D, "D_user": D_user, "D_item": D_item}, f)

print("Saved X_all_train.pt  shape:", X_all_tensor.shape)
print("user_full shape:", user_full.shape)
print("item_full shape:", item_full.shape)

user sub-dim: 135
item sub-dim: 127
Final feature dimension D = 262
Saved X_all_train.pt  shape: torch.Size([2595, 262])
user_full shape: (943, 262)
item_full shape: (1652, 262)


In [13]:
# Transformar itens de teste em vetores compatíveis com X_all

# carregar encoders e dims previamente salvos
ohe_release = joblib.load("ohe_release.joblib")
mlb_genres = joblib.load("mlb_genres.joblib")

with open("dims.json", "r", encoding="utf-8") as f:
    dims = json.load(f)
D_user = int(dims["D_user"])
D_item = int(dims["D_item"])
D = int(dims["feature_dim"])

# utilitário para parse de lista ["k:v", ...] -> dict
def parse_kv_list_to_dict(kv_list):
    d = {}
    for kv in kv_list:
        if isinstance(kv, str) and ":" in kv:
            k,v = kv.split(":",1)
            d.setdefault(k, []).append(v)
    return d

# transformar um único item (lista de tokens "k:v") em vetor item-part (D_item,)
def transform_single_test_item_itempart(kv_list):
    parsed = parse_kv_list_to_dict(kv_list)
    # releaseDate token (string) -> shape (1, )
    rd = parsed.get('releaseDate', [""])[0] if parsed.get('releaseDate') else ""
    if rd:
        try:
            rd_arr = np.array([rd]).reshape(-1,1)
            rd_vec = ohe_release.transform(rd_arr)  # shape (1, D_release)
        except Exception:
            # fallback: unseen or bad -> zeros
            rd_vec = np.zeros((1, ohe_release.transform(np.array([ohe_release.categories_[0][0]]).reshape(-1,1)).shape[1]))
    else:
        rd_vec = np.zeros((1, ohe_release.transform(np.array([ohe_release.categories_[0][0]]).reshape(-1,1)).shape[1]))
    # genres list -> shape (1, n_genres)
    genres = parsed.get('genre', [])
    if isinstance(genres, str):
        genres = [genres]
    try:
        genres_vec = mlb_genres.transform([genres])  # (1, n_genres)
    except Exception:
        genres_vec = np.zeros((1, len(mlb_genres.classes_)))
    # concatenar partes do item (ordenadas exatamente como item_features)
    item_part = np.hstack([rd_vec, genres_vec]).astype(float).ravel()
    # garantir dimensão D_item (pad ou trim se necessário)
    if item_part.shape[0] < D_item:
        pad = np.zeros((D_item - item_part.shape[0],), dtype=float)
        item_part = np.hstack([item_part, pad])
    elif item_part.shape[0] > D_item:
        item_part = item_part[:D_item]
    return item_part

# transformar um lote de items (lista de kv_lists) e retornar matriz full (n_items, D)
def transform_test_items_batch(kv_lists, save_path=None):
    item_parts = [transform_single_test_item_itempart(kv) for kv in kv_lists]
    item_parts = np.vstack(item_parts)  # shape (n_items, D_item)
    # construir full vectors: [zeros(D_user) , item_part]
    user_zeros = np.zeros((item_parts.shape[0], D_user), dtype=float)
    full = np.hstack([user_zeros, item_parts])  # shape (n_items, D)
    # sanity: ensure correct width
    if full.shape[1] != D:
        # try to pad or trim to exact D
        if full.shape[1] < D:
            pad = np.zeros((full.shape[0], D - full.shape[1]), dtype=float)
            full = np.hstack([full, pad])
        else:
            full = full[:, :D]
    if save_path:
        np.save(save_path, full)
    return full

In [14]:
# Execução de R runs independentes do treino GraphSAGE

R = 5
OUT_DIR = "runs_graphsage"
os.makedirs(OUT_DIR, exist_ok=True)

# Hiperparâmetros
hidden_dim = 128
out_dim = 64
num_layers = 2
lr = 5e-4
weight_decay = 1e-6
epochs = 30
batch_size = 1024

# Dispositivos
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Função utilitária para fixar seeds por run
def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # determinismo (pode impactar performance; comente se desejar velocidade)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

edge_index = torch.load("edge_index_train.pt")                # shape [2, E]
X_all = torch.load("X_all_train.pt")                          # shape [n_nodes, D]
with open("dims.json","r",encoding="utf-8") as f:
    dims = json.load(f)
n_users = int(dims["n_users"])
n_train_items = int(dims["n_train_items"])
D = int(dims["feature_dim"])

# Reconstruir pos_pairs a partir de edge_index: filtrar arestas user->item
ei = edge_index.cpu().numpy()
src = ei[0]; dst = ei[1]
mask = (src < n_users) & (dst >= n_users)
pos_u = src[mask]
pos_i = dst[mask]
pos_pairs = list(zip(pos_u.tolist(), pos_i.tolist()))
print("num positive pairs:", len(pos_pairs))

# negative sampling helper
all_item_globals = np.arange(n_users, n_users + n_train_items)

def sample_negatives(batch_users, k=1):
    negs = []
    for u in batch_users:
        choices = np.random.choice(all_item_globals, size=k, replace=True)
        negs.append(choices)
    return np.array(negs)

# GraphSAGE encoder model
class GraphSAGEEncoder(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim=128, out_dim=64, num_layers=2):
        super().__init__()
        self.convs = torch.nn.ModuleList()
        if num_layers == 1:
            self.convs.append(SAGEConv(in_dim, out_dim))
        else:
            self.convs.append(SAGEConv(in_dim, hidden_dim))
            for _ in range(num_layers-2):
                self.convs.append(SAGEConv(hidden_dim, hidden_dim))
            self.convs.append(SAGEConv(hidden_dim, out_dim))
        self.act = torch.nn.ReLU()

    def forward(self, x, edge_index):
        for conv in self.convs[:-1]:
            x = conv(x, edge_index)
            x = self.act(x)
        x = self.convs[-1](x, edge_index)
        return F.normalize(x, p=2, dim=-1)

# Determina dimensão de entrada
in_dim = X_all.shape[1]

# Mover tensores constantes para DEVICE
X_all = X_all.to(DEVICE)
edge_index = edge_index.to(DEVICE)

# --- Loop de R runs ---
runs_summary = []
for run in range(1, R+1):
    # semente diferente por run
    seed = 42 + run
    seed_everything(seed)
    np.random.seed(seed) 

    print(f"\n=== Run {run}/{R}  seed={seed} ===")
    run_ts = datetime.now().strftime("%Y%m%dT%H%M%SZ")
    run_folder = os.path.join(OUT_DIR, f"run{run}_{run_ts}")
    os.makedirs(run_folder, exist_ok=True)

    # (Re)instanciar modelo e optimizer
    encoder = GraphSAGEEncoder(in_dim, hidden_dim, out_dim, num_layers).to(DEVICE)
    optimizer = torch.optim.Adam(encoder.parameters(), lr=lr, weight_decay=weight_decay)

    # treino
    epoch_losses = []
    for epoch in range(1, epochs+1):
        encoder.train()
        random.shuffle(pos_pairs)
        total_loss = 0.0

        # processa tudo de uma vez se dataset pequeno
        if batch_size >= len(pos_pairs):
            batch = pos_pairs
            batch_users = [p[0] for p in batch]
            batch_pos_items = [p[1] for p in batch]
            neg_items = sample_negatives(batch_users, k=1)[:,0].tolist()
            z = encoder(X_all, edge_index)
            u_emb = z[torch.tensor(batch_users, device=DEVICE)]
            pos_emb = z[torch.tensor(batch_pos_items, device=DEVICE)]
            neg_emb = z[torch.tensor(neg_items, device=DEVICE)]
            pos_scores = (u_emb * pos_emb).sum(dim=-1)
            neg_scores = (u_emb * neg_emb).sum(dim=-1)
            loss = -torch.log(torch.sigmoid(pos_scores - neg_scores) + 1e-15).mean()
            optimizer.zero_grad(); loss.backward(); optimizer.step()
            total_loss = loss.item()
        else:
            # mini-batch loop
            for i in range(0, len(pos_pairs), batch_size):
                batch = pos_pairs[i:i+batch_size]
                batch_users = [p[0] for p in batch]
                batch_pos_items = [p[1] for p in batch]
                neg_items = sample_negatives(batch_users, k=1)[:,0].tolist()
                z = encoder(X_all, edge_index)
                u_emb = z[torch.tensor(batch_users, device=DEVICE)]
                pos_emb = z[torch.tensor(batch_pos_items, device=DEVICE)]
                neg_emb = z[torch.tensor(neg_items, device=DEVICE)]
                pos_scores = (u_emb * pos_emb).sum(dim=-1)
                neg_scores = (u_emb * neg_emb).sum(dim=-1)
                loss = -torch.log(torch.sigmoid(pos_scores - neg_scores) + 1e-15).mean()
                optimizer.zero_grad(); loss.backward(); optimizer.step()
                total_loss += loss.item() * len(batch)
            total_loss = total_loss / len(pos_pairs)

        epoch_losses.append(float(total_loss))
        print(f"Run {run}  Epoch {epoch:02d}/{epochs}  loss={total_loss:.6f}")

    # salvar encoder (pesos) do run
    encoder_path = os.path.join(run_folder, f"graphsage_encoder_run{run}.pt")
    torch.save(encoder.state_dict(), encoder_path)

    # gerar e salvar embeddings (em CPU numpy)
    encoder.eval()
    with torch.no_grad():
        z_all = encoder(X_all, edge_index).cpu().numpy()  # shape (n_nodes, out_dim)
    user_embeddings = z_all[:n_users]
    item_embeddings_train = z_all[n_users:n_users + n_train_items]

    user_emb_path = os.path.join(run_folder, f"user_embeddings_run{run}.npy")
    item_emb_path = os.path.join(run_folder, f"item_embeddings_train_run{run}.npy")
    np.save(user_emb_path, user_embeddings)
    np.save(item_emb_path, item_embeddings_train)

    print(f"Saved encoder and embeddings for run {run}:")
    print(" - encoder:", encoder_path)
    print(" - user_emb:", user_emb_path, "shape:", user_embeddings.shape)
    print(" - item_emb (train):", item_emb_path, "shape:", item_embeddings_train.shape)

    # salvar resumo da run em JSON
    run_summary = {
        "run": run,
        "seed": seed,
        "timestamp": run_ts,
        "epochs": epochs,
        "batch_size": batch_size,
        "hidden_dim": hidden_dim,
        "out_dim": out_dim,
        "num_layers": num_layers,
        "lr": lr,
        "weight_decay": weight_decay,
        "num_positive_pairs": len(pos_pairs),
        "encoder_path": encoder_path,
        "user_emb_path": user_emb_path,
        "item_emb_path": item_emb_path,
        "epoch_losses": epoch_losses
    }
    json_path = os.path.join(run_folder, f"run_summary_{run}.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(run_summary, f, indent=2)
    runs_summary.append(run_summary)

# salvar resumo agregado de todas as runs
agg_path = os.path.join(OUT_DIR, f"all_runs_summary_{datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')}.json")
with open(agg_path, "w", encoding="utf-8") as f:
    json.dump({"n_runs": R, "runs": runs_summary}, f, indent=2)
print(f"\nAll runs finished. Aggregate summary saved to {agg_path}")


num positive pairs: 95961

=== Run 1/5  seed=43 ===
Run 1  Epoch 01/30  loss=0.530310
Run 1  Epoch 02/30  loss=0.445304
Run 1  Epoch 03/30  loss=0.427868
Run 1  Epoch 04/30  loss=0.420934
Run 1  Epoch 05/30  loss=0.418531
Run 1  Epoch 06/30  loss=0.418798
Run 1  Epoch 07/30  loss=0.415724
Run 1  Epoch 08/30  loss=0.415452
Run 1  Epoch 09/30  loss=0.414447
Run 1  Epoch 10/30  loss=0.414588
Run 1  Epoch 11/30  loss=0.413485
Run 1  Epoch 12/30  loss=0.415710
Run 1  Epoch 13/30  loss=0.413755
Run 1  Epoch 14/30  loss=0.412386
Run 1  Epoch 15/30  loss=0.413173
Run 1  Epoch 16/30  loss=0.413661
Run 1  Epoch 17/30  loss=0.412152
Run 1  Epoch 18/30  loss=0.413040
Run 1  Epoch 19/30  loss=0.411629
Run 1  Epoch 20/30  loss=0.411589
Run 1  Epoch 21/30  loss=0.412465
Run 1  Epoch 22/30  loss=0.411617
Run 1  Epoch 23/30  loss=0.412352
Run 1  Epoch 24/30  loss=0.411355
Run 1  Epoch 25/30  loss=0.411851
Run 1  Epoch 26/30  loss=0.412865
Run 1  Epoch 27/30  loss=0.411513
Run 1  Epoch 28/30  loss=0.412

In [16]:
# Inferência de embeddings para itens de teste
# inscrever itens de teste (isolados) e obter embeddings com encoder treinado

# Configurações
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
STATE_DIR = "runs_graphsage" 
STATE_PATTERN = os.path.join(STATE_DIR, "**", "graphsage_encoder_run*.pt")
OUT_DIR = "test_embeddings_runs"
os.makedirs(OUT_DIR, exist_ok=True)

# Artefatos de entrada
X_all_train = torch.load("X_all_train.pt")           # (n_nodes_train, D)
edge_index = torch.load("edge_index_train.pt")       # (2, E)
with open("dims.json", "r", encoding="utf-8") as f:
    dims = json.load(f)
D = int(dims["feature_dim"])
n_nodes_train = X_all_train.shape[0]

# Preparar X_test_full usando a função que você já tem
# Assegure que `test_item_ids` e `transform_test_items_batch` estão definidos no escopo
try:
    test_item_ids
except NameError:
    raise RuntimeError("Variável `test_item_ids` não encontrada. Defina a lista de ids de itens de teste antes de rodar.")

try:
    transform_test_items_batch
except NameError:
    raise RuntimeError("Função `transform_test_items_batch` não encontrada. Importe/defina-a antes de rodar.")

# Construir X_test_full (numpy) e checar dimensão
X_test_full = transform_test_items_batch([ item_features_raw.get(str(iid), []) for iid in test_item_ids ], save_path=None)
if X_test_full.shape[1] != D:
    raise RuntimeError(f"Dimensão de features dos itens de teste ({X_test_full.shape[1]}) difere de D ({D}). Ajuste transform_test_items_batch/feature padding.")

n_test = X_test_full.shape[0]

# Preparar array estendido X_all_ext_t (torch tensor no DEVICE)
X_all_extended = np.vstack([ X_all_train.cpu().numpy(), X_test_full ])   # (n_nodes_train + n_test, D)
X_all_ext_t = torch.tensor(X_all_extended, dtype=torch.float, device=DEVICE)

# edge_index usado permanece o do treino; enviar para device
edge_index = edge_index.to(DEVICE)

# Obter lista de arquivos de estado do encoder
state_paths = sorted(glob.glob(STATE_PATTERN, recursive=True))
if len(state_paths) == 0:
    raise RuntimeError(f"Nenhum arquivo de estado localizado com o padrão: {STATE_PATTERN}. Ajuste STATE_DIR/STATE_PATTERN.")

print(f"Found {len(state_paths)} state files. Generating embeddings for {n_test} test items each.")

# Se quiser controlar ordem de run (opcional): extrair número do nome do arquivo
def extract_run_number(path):
    base = os.path.basename(path)
    import re
    m = re.search(r"run(\d+)", base)
    return int(m.group(1)) if m else None

state_paths_sorted = sorted(state_paths, key=lambda p: (extract_run_number(p) or 0, p))

# Loop sobre estados, carregar e gerar embeddings de teste
runs_info = []
for idx, sp in enumerate(state_paths_sorted, start=1):
    run_num = extract_run_number(sp) or idx
    ts = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
    print(f"\n[{idx}/{len(state_paths_sorted)}] Run {run_num}: loading state -> {sp}")

    # Instanciar encoder com os mesmos hiperparâmetros usados no treino
    # Ajuste hidden_dim/out_dim/num_layers se diferentes do seu treino original
    encoder = GraphSAGEEncoder(in_dim=X_all_train.shape[1], hidden_dim=128, out_dim=64, num_layers=2).to(DEVICE)
    state = torch.load(sp, map_location=DEVICE)
    encoder.load_state_dict(state)
    encoder.eval()

    # Forward para todos os nós estendidos (usa edge_index do treino)
    with torch.no_grad():
        emb_all = encoder(X_all_ext_t, edge_index)   # tensor (n_nodes_train + n_test, out_dim)

    emb_all_np = emb_all.cpu().numpy()
    test_global_start = n_nodes_train
    test_global_indices = np.arange(test_global_start, test_global_start + n_test)
    test_embeddings = emb_all_np[test_global_indices, :]

    # Salvar embeddings por run
    out_emb_path = os.path.join(OUT_DIR, f"test_item_embeddings_run{run_num}.npy")
    np.save(out_emb_path, test_embeddings)

    # Salvar mapping id->index (salva apenas uma vez por segurança; sobrescreve se já existir)
    mapping = { str(test_item_ids[i]): int(i) for i in range(len(test_item_ids)) }
    mapping_path = os.path.join(OUT_DIR, "test_item_id2idx.json")
    with open(mapping_path, "w", encoding="utf-8") as f:
        json.dump(mapping, f, indent=2, ensure_ascii=False)

    # Salvar metadados da run
    run_meta = {
        "run_index_in_list": idx,
        "run_number_extracted": run_num,
        "state_path": sp,
        "embeddings_path": out_emb_path,
        "n_test_items": n_test,
        "timestamp_utc": ts
    }
    meta_path = os.path.join(OUT_DIR, f"run{run_num}_meta.json")
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(run_meta, f, indent=2, ensure_ascii=False)

    print(f" Saved: {out_emb_path}  meta: {meta_path}")
    runs_info.append(run_meta)

# Salvar resumo agregado
agg_path = os.path.join(OUT_DIR, f"all_runs_test_embeddings_summary_{datetime.now().strftime('%Y%m%dT%H%M%SZ')}.json")
with open(agg_path, "w", encoding="utf-8") as f:
    json.dump({"n_states": len(state_paths_sorted), "runs": runs_info}, f, indent=2, ensure_ascii=False)

print(f"\nFinished: saved embeddings for {len(runs_info)} runs in {OUT_DIR}")


Found 5 state files. Generating embeddings for 30 test items each.

[1/5] Run 1: loading state -> runs_graphsage\run1_20251023T141444Z\graphsage_encoder_run1.pt
 Saved: test_embeddings_runs\test_item_embeddings_run1.npy  meta: test_embeddings_runs\run1_meta.json

[2/5] Run 2: loading state -> runs_graphsage\run2_20251023T142813Z\graphsage_encoder_run2.pt
 Saved: test_embeddings_runs\test_item_embeddings_run2.npy  meta: test_embeddings_runs\run2_meta.json

[3/5] Run 3: loading state -> runs_graphsage\run3_20251023T144217Z\graphsage_encoder_run3.pt
 Saved: test_embeddings_runs\test_item_embeddings_run3.npy  meta: test_embeddings_runs\run3_meta.json

[4/5] Run 4: loading state -> runs_graphsage\run4_20251023T145539Z\graphsage_encoder_run4.pt
 Saved: test_embeddings_runs\test_item_embeddings_run4.npy  meta: test_embeddings_runs\run4_meta.json

[5/5] Run 5: loading state -> runs_graphsage\run5_20251023T150858Z\graphsage_encoder_run5.pt
 Saved: test_embeddings_runs\test_item_embeddings_run5.

In [20]:
# Carregar artefatos, construir ground truth sets por item e buscar
# top-50 users por similaridade para cada item de teste, em cada run

import os, glob, json, numpy as np
from datetime import datetime

# Configurações
EMB_DIR = "test_embeddings_runs"
PATTERN = os.path.join(EMB_DIR, "test_item_embeddings_run*.npy")
OUT_DIR = os.path.join(EMB_DIR, "topk_per_run")
os.makedirs(OUT_DIR, exist_ok=True)

K_max = 50
TEST_ID2ROW_PATH = os.path.join(EMB_DIR, "test_item_id2idx.json")

# Verificações iniciais
state_files = sorted(glob.glob(PATTERN))
if len(state_files) == 0:
    raise RuntimeError(f"Nenhum arquivo de embeddings encontrado com o padrão: {PATTERN}")

if not os.path.isfile(TEST_ID2ROW_PATH):
    raise RuntimeError(f"Arquivo de mapeamento test_item_id2idx não encontrado: {TEST_ID2ROW_PATH}")

# Carregar mapping test_id2row
with open(TEST_ID2ROW_PATH, "r", encoding="utf-8") as f:
    test_id2row = json.load(f)          # map item_id -> row_index

# Normalização L2 das linhas (segura para zero-vectors)
def l2_normalize_rows(X):
    X = X.astype(np.float64)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return (X / norms)

runs_summary = []
for emb_path in state_files:
    base = os.path.basename(emb_path)
    # ex: test_item_embeddings_run3.npy -> run3
    run_label = base.replace("test_item_embeddings_", "").replace(".npy","")
    out_prefix = os.path.join(OUT_DIR, run_label)

    print(f"\nProcessing {base} -> label {run_label}")

    # localizar user_embeddings correspondente ao mesmo run
    # possíveis padrões: user_embeddings_run{r}.npy ou user_embeddings_run{r}_*.npy
    candidate_patterns = [
        os.path.join("runs_graphsage", "**", f"user_embeddings_{run_label}.npy"),
        os.path.join("runs_graphsage", "**", f"user_embeddings_{run_label}*.npy"),
        os.path.join("runs_graphsage", "**", f"user_embeddings_run{run_label.replace('run','')}.npy"),
        os.path.join("runs_graphsage", "**", f"user_embeddings_run*.npy"),
    ]
    user_emb_candidate = None
    for pat in candidate_patterns:
        cands = sorted(glob.glob(pat, recursive=True))
        if cands:
            # prefer file in same run folder as emb_path if present
            same_dir = [p for p in cands if os.path.dirname(p) == os.path.dirname(emb_path)]
            if same_dir:
                user_emb_candidate = same_dir[0]
            else:
                user_emb_candidate = cands[0]
            break

    if user_emb_candidate is None:
        raise RuntimeError(f"Não foi possível localizar user_embeddings correspondente para {run_label}. Verifique runs_graphsage/.")

    print(f" Using user embeddings: {user_emb_candidate}")
    U = np.load(user_emb_candidate)              # shape (n_users, d)
    U_norm = l2_normalize_rows(U)
    n_users = U_norm.shape[0]

    # carregar embeddings de itens de teste do run atual
    I = np.load(emb_path)                 # shape (n_test, d)
    if I.ndim != 2:
        raise RuntimeError(f"Embeddings file {emb_path} tem dimensão inválida: {I.shape}")

    # normalizar
    I_norm = l2_normalize_rows(I)

    n_test = I_norm.shape[0]
    if I_norm.shape[1] != U_norm.shape[1]:
        raise RuntimeError(f"Dimensionalidade incompatível: item emb dim {I_norm.shape[1]} != user emb dim {U_norm.shape[1]}")

    # preparar arrays de saída
    topk_indices = np.zeros((n_test, min(K_max, n_users)), dtype=np.int32)
    topk_scores = np.zeros((n_test, min(K_max, n_users)), dtype=np.float32)

    # computar similaridades (matriz completa ou por batches)
    mem_cost = n_test * n_users
    threshold_full_mat = 200_000_000
    if mem_cost <= threshold_full_mat:
        sims = I_norm.dot(U_norm.T)  # (n_test, n_users)
        for i in range(n_test):
            row = sims[i]
            k = min(K_max, n_users)
            if n_users <= k:
                idx_sorted = np.argsort(-row)
            else:
                idx_part = np.argpartition(-row, k-1)[:k]
                idx_sorted = idx_part[np.argsort(-row[idx_part])]
            topk_indices[i, :len(idx_sorted)] = idx_sorted
            topk_scores[i, :len(idx_sorted)] = row[idx_sorted]
        del sims
    else:
        batch_size_items = max(1, int(1e6 // n_users))
        for start in range(0, n_test, batch_size_items):
            end = min(n_test, start + batch_size_items)
            batch = I_norm[start:end]              # (b, d)
            sims_batch = batch.dot(U_norm.T)       # (b, n_users)
            for bi in range(end - start):
                row = sims_batch[bi]
                k = min(K_max, n_users)
                if n_users <= k:
                    idx_sorted = np.argsort(-row)
                else:
                    idx_part = np.argpartition(-row, k-1)[:k]
                    idx_sorted = idx_part[np.argsort(-row[idx_part])]
                topk_indices[start + bi, :len(idx_sorted)] = idx_sorted
                topk_scores[start + bi, :len(idx_sorted)] = row[idx_sorted]
            del sims_batch

    # Salvar resultados por run
    topk_idx_path = out_prefix + "_topk_user_indices.npy"   # shape (n_test, K')
    topk_sc_path = out_prefix + "_topk_scores.npy"
    np.save(topk_idx_path, topk_indices)
    np.save(topk_sc_path, topk_scores)

    # Salvar também em JSON compacto
    json_list = [ list(map(int, topk_indices[i])) for i in range(n_test) ]
    json_path = out_prefix + "_topk_user_indices.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(json_list, f, ensure_ascii=False)

    run_info = {
        "emb_path": emb_path,
        "run_label": run_label,
        "user_emb_path": user_emb_candidate,
        "n_test_items": int(n_test),
        "n_users": int(n_users),
        "k_requested": int(K_max),
        "topk_idx_npy": topk_idx_path,
        "topk_scores_npy": topk_sc_path,
        "topk_idx_json": json_path,
        "timestamp": datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
    }
    runs_summary.append(run_info)
    print(f" Saved top-K indices to {topk_idx_path} and JSON {json_path}")

# salvar resumo agregado
summary_path = os.path.join(OUT_DIR, f"all_runs_topk_summary_{datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')}.json")
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump({"n_runs": len(runs_summary), "runs": runs_summary}, f, ensure_ascii=False, indent=2)

print(f"\nFinished processing {len(runs_summary)} run(s). Summary: {summary_path}")



Processing test_item_embeddings_run1.npy -> label run1
 Using user embeddings: runs_graphsage\run1_20251023T141444Z\user_embeddings_run1.npy
 Saved top-K indices to test_embeddings_runs\topk_per_run\run1_topk_user_indices.npy and JSON test_embeddings_runs\topk_per_run\run1_topk_user_indices.json

Processing test_item_embeddings_run2.npy -> label run2
 Using user embeddings: runs_graphsage\run2_20251023T142813Z\user_embeddings_run2.npy
 Saved top-K indices to test_embeddings_runs\topk_per_run\run2_topk_user_indices.npy and JSON test_embeddings_runs\topk_per_run\run2_topk_user_indices.json

Processing test_item_embeddings_run3.npy -> label run3
 Using user embeddings: runs_graphsage\run3_20251023T144217Z\user_embeddings_run3.npy
 Saved top-K indices to test_embeddings_runs\topk_per_run\run3_topk_user_indices.npy and JSON test_embeddings_runs\topk_per_run\run3_topk_user_indices.json

Processing test_item_embeddings_run4.npy -> label run4
 Using user embeddings: runs_graphsage\run4_202510

In [21]:
# Reconstruir gt_item2users a partir dos artefatos existentes

# Ajuste os nomes de arquivo/variáveis se necessário
# Supõe:
# - test_interactions: DataFrame com colunas 'user_id' e 'item_id' (strings ou numéricos)
# - user2idx: dict mapping user_id (string) -> user_index (int)
# - test_item_id2idx.json ou test_item_id2idx já carregado como test_id2row

# Carregar mapping test_id2row se ainda não existir
try:
    test_id2row
except NameError:
    # tenta carregar do JSON produzido anteriormente
    import os
    if os.path.isfile("test_item_id2idx.json"):
        with open("test_item_id2idx.json","r",encoding="utf-8") as f:
            test_id2row = json.load(f)
    else:
        raise RuntimeError("test_id2row não encontrado e arquivo test_item_id2idx.json também não existe")

# Garantir tipos string nas chaves para consistência
test_id2row = { str(k): int(v) for k,v in test_id2row.items() }

# Garantir user2idx existe
try:
    user2idx
except NameError:
    raise RuntimeError("Variável user2idx não encontrada no ambiente. Defina user2idx (mapping user_id -> user_index) antes.")

# Normalizar tipos no DataFrame test_interactions
test_interactions['user_id'] = test_interactions['user_id'].astype(str)
test_interactions['item_id'] = test_interactions['item_id'].astype(str)

# Construir gt_item2users: item_id (string) -> set of user indices (ints)
gt_item2users = {}
for _, row in test_interactions.iterrows():
    iid = str(row['item_id'])
    uid = str(row['user_id'])
    if iid in test_id2row and uid in user2idx:
        uidx = int(user2idx[uid])
        gt_item2users.setdefault(iid, set()).add(uidx)

# Opcional: garantir que todos os test_item_ids estejam presentes como chaves (mesmo que vazio)
try:
    test_item_ids
except NameError:
    # construir list a partir do mapping de ids
    test_item_ids = sorted(list(test_id2row.keys()), key=lambda x: int(x) if x.isdigit() else x)

for iid in test_item_ids:
    gt_item2users.setdefault(str(iid), set())

# Construir evaluated_items (itens com pelo menos um positivo)
evaluated_items = [iid for iid, s in gt_item2users.items() if len(s) > 0]

print("Reconstructed gt_item2users.")
print("n_test_items_total (mapping):", len(test_id2row))
print("n_test_items_with_positives (evaluated_items):", len(evaluated_items))


Reconstructed gt_item2users.
n_test_items_total (mapping): 30
n_test_items_with_positives (evaluated_items): 30


In [22]:
# Agregar Precision@K e NDCG@K por item entre runs

# Configurações
TOPK_RUNS_DIR = "test_embeddings_runs/topk_per_run"   # local onde topk por run foram salvos
PATTERN_JSON = os.path.join(TOPK_RUNS_DIR, "*_topk_user_indices.json")
# alternativa: carregar .npy se JSONs não existirem (arquivo *_topk_user_indices.npy)
PATTERN_NPY = os.path.join(TOPK_RUNS_DIR, "*_topk_user_indices.npy")

Ks = [10, 20, 50]
# ordem dos 30 itens — assegure que test_item_ids esteja definido e contenha 30 ids strings
try:
    test_item_ids
except NameError:
    raise RuntimeError("Variável `test_item_ids` não encontrada. Defina-a como a lista (30) de ids de itens de teste na ordem desejada.")

n_items_expected = len(test_item_ids)
if n_items_expected != 30:
    print(f"Warning: test_item_ids tem {n_items_expected} itens (esperado 30). O código continuará usando essa lista.")

# Carregar arquivos disponíveis (prefere JSON, senão NPZ/NPY)
files_json = sorted(glob.glob(PATTERN_JSON))
files_npy = sorted(glob.glob(PATTERN_NPY))

if len(files_json) == 0 and len(files_npy) == 0:
    raise RuntimeError(f"Nenhum arquivo de top-K encontrado em {TOPK_RUNS_DIR} com padrões JSON/NPY.")

run_files = []
use_json = False
if len(files_json) > 0:
    run_files = files_json
    use_json = True
else:
    run_files = files_npy
    use_json = False

n_runs = len(run_files)
print(f"Found {n_runs} run files. Using JSON format: {use_json}")

# Funções de métricas
def precision_at_k_from_list(recommended, gt_set, k):
    rec_k = recommended[:k]
    hits = sum(1 for u in rec_k if u in gt_set)
    return hits / k

def dcg_at_k_from_list(recommended, gt_set, k):
    rec_k = recommended[:k]
    dcg = 0.0
    for i, u in enumerate(rec_k):
        rel = 1.0 if u in gt_set else 0.0
        dcg += rel / log2(i + 2)
    return dcg

def idcg_at_k_from_gt(gt_set, k):
    n_rel = min(len(gt_set), k)
    if n_rel == 0:
        return 0.0
    return sum(1.0 / log2(i + 2) for i in range(n_rel))

# Inicializar arrays para acumular métricas por run por item
# shape: (n_runs, n_items)
n_items = n_items_expected
prec_by_run = {k: np.zeros((n_runs, n_items), dtype=float) for k in Ks}
ndcg_by_run = {k: np.zeros((n_runs, n_items), dtype=float) for k in Ks}

# Iterar runs e preencher matrizes
for r_idx, fpath in enumerate(run_files):
    print(f"Processing run {r_idx+1}/{n_runs}: {os.path.basename(fpath)}")
    if use_json:
        with open(fpath, "r", encoding="utf-8") as f:
            topk_list_per_item = json.load(f)   # espera lista (n_test) de listas de user indices
        # topk_list_per_item indexed by row index 0..n_test-1
    else:
        arr = np.load(fpath)   # shape (n_test, K')
        # converter para lista de lists
        topk_list_per_item = [ list(map(int, arr[i])) for i in range(arr.shape[0]) ]

    # Checagem de consistência
    n_test_runs = len(topk_list_per_item)
    if n_test_runs < n_items:
        print(f"Warning: run file possui apenas {n_test_runs} itens, menor que n_items={n_items}. Preenchendo faltantes com listas vazias.")
        # expandir com listas vazias
        topk_list_per_item += [[] for _ in range(n_items - n_test_runs)]
    elif n_test_runs > n_items:
        # possível que o arquivo tenha todos os itens, mas test_item_ids refere 30; apenas use os primeiros n_items
        pass

    # Para cada item na ordem test_item_ids, obter sua row index e as recomendações
    for j, iid in enumerate(test_item_ids):
        # obter row index no arquivo topk (assume mapping test_item_id2idx usado antes)
        try:
            row_idx = int(test_id2row[str(iid)])
        except Exception:
            # se não encontrou mapping, assumimos que a order em topk_list_per_item segue test_item_ids
            row_idx = j

        if row_idx >= len(topk_list_per_item):
            recommended = []
        else:
            recommended = topk_list_per_item[row_idx]
            # garantir que todos os elementos sejam ints
            recommended = [int(x) for x in recommended]

        gt_set = gt_item2users.get(str(iid), set())
        # Se gt_set vazio, definimos métricas como NaN e iremos ignorar na média final
        if len(gt_set) == 0:
            for k in Ks:
                prec_by_run[k][r_idx, j] = np.nan
                ndcg_by_run[k][r_idx, j] = np.nan
            continue

        # calcular métricas para cada K
        for k in Ks:
            # se recommended menor que k, pad com valores que não estão no gt (ex: -1)
            if len(recommended) < k:
                rec_padded = recommended + [-1] * (k - len(recommended))
            else:
                rec_padded = recommended
            prec = precision_at_k_from_list(rec_padded, gt_set, k)
            dcg = dcg_at_k_from_list(rec_padded, gt_set, k)
            idcg = idcg_at_k_from_gt(gt_set, k)
            ndcg = (dcg / idcg) if idcg > 0 else 0.0
            prec_by_run[k][r_idx, j] = prec
            ndcg_by_run[k][r_idx, j] = ndcg

# Agora calcular média por item entre runs, ignorando NaNs (runs onde item tinha 0 positivos)
avg_prec_per_item = {k: [] for k in Ks}
avg_ndcg_per_item = {k: [] for k in Ks}
for j in range(n_items):
    for k in Ks:
        col = prec_by_run[k][:, j]
        # ignorar NaNs
        valid = col[~np.isnan(col)]
        if valid.size == 0:
            avg = None
        else:
            avg = float(np.mean(valid))
        avg_prec_per_item[k].append(avg)

        col_ndcg = ndcg_by_run[k][:, j]
        valid_ndcg = col_ndcg[~np.isnan(col_ndcg)]
        if valid_ndcg.size == 0:
            avgn = None
        else:
            avgn = float(np.mean(valid_ndcg))
        avg_ndcg_per_item[k].append(avgn)

# Construir listas solicitadas (6 listas com 30 elementos cada, na ordem test_item_ids)
precision_at_10 = avg_prec_per_item[10]
precision_at_20 = avg_prec_per_item[20]
precision_at_50 = avg_prec_per_item[50]
ndcg_at_10 = avg_ndcg_per_item[10]
ndcg_at_20 = avg_ndcg_per_item[20]
ndcg_at_50 = avg_ndcg_per_item[50]

# Checagem de tamanhos
assert len(precision_at_10) == n_items
assert len(ndcg_at_50) == n_items

# Salvar em JSON
out_result = {
    "test_item_ids_order": test_item_ids,
    "precision_at_10": precision_at_10,
    "precision_at_20": precision_at_20,
    "precision_at_50": precision_at_50,
    "ndcg_at_10": ndcg_at_10,
    "ndcg_at_20": ndcg_at_20,
    "ndcg_at_50": ndcg_at_50
}

OUT_JSON = os.path.join(TOPK_RUNS_DIR, "aggregated_precision_ndcg_per_item_across_runs.json")
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(out_result, f, indent=2, ensure_ascii=False)

print(f"\nSaved aggregated metrics per item across {n_runs} runs to: {OUT_JSON}")
print(f"Each list contains {len(test_item_ids)} elements (None indicates item had no positives in all runs).")


Found 5 run files. Using JSON format: True
Processing run 1/5: run1_topk_user_indices.json
Processing run 2/5: run2_topk_user_indices.json
Processing run 3/5: run3_topk_user_indices.json
Processing run 4/5: run4_topk_user_indices.json
Processing run 5/5: run5_topk_user_indices.json

Saved aggregated metrics per item across 5 runs to: test_embeddings_runs/topk_per_run\aggregated_precision_ndcg_per_item_across_runs.json
Each list contains 30 elements (None indicates item had no positives in all runs).


In [23]:
# caminho do arquivo
IN_PATH = os.path.join("test_embeddings_runs", "topk_per_run", "aggregated_precision_ndcg_per_item_across_runs.json")
OUT_PATH = os.path.join("test_embeddings_runs", "topk_per_run", "summary_mean_precision_ndcg.json")

if not os.path.isfile(IN_PATH):
    raise RuntimeError(f"Arquivo não encontrado: {IN_PATH}")

with open(IN_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# nomes esperados no JSON: precision_at_10, precision_at_20, precision_at_50, ndcg_at_10, ndcg_at_20, ndcg_at_50
keys = ["precision_at_10", "precision_at_20", "precision_at_50", "ndcg_at_10", "ndcg_at_20", "ndcg_at_50"]

result_means = {}
for k in keys:
    lst = data.get(k)
    if lst is None:
        result_means[k] = None
        continue
    # converter valores None para np.nan e calcular média ignorando np.nan
    arr = np.array([np.nan if v is None else float(v) for v in lst], dtype=float)
    # contar elementos válidos
    valid_mask = ~np.isnan(arr)
    valid_count = int(valid_mask.sum())
    if valid_count == 0:
        mean_val = None
    else:
        mean_val = float(np.nanmean(arr))
    result_means[k] = {"mean": mean_val, "n_valid_items": valid_count, "n_total_items": int(len(arr))}

# imprimir resumo
print("Resumo das médias (ignora itens sem valor):")
for k, v in result_means.items():
    print(f" - {k}: mean={v['mean']}, valid_items={v['n_valid_items']}/{v['n_total_items']}")

Resumo das médias (ignora itens sem valor):
 - precision_at_10: mean=0.17733333333333332, valid_items=30/30
 - precision_at_20: mean=0.16166666666666665, valid_items=30/30
 - precision_at_50: mean=0.14906666666666663, valid_items=30/30
 - ndcg_at_10: mean=0.20107359965768093, valid_items=30/30
 - ndcg_at_20: mean=0.18156381742239444, valid_items=30/30
 - ndcg_at_50: mean=0.1630389547706024, valid_items=30/30
