In [9]:
import os
import pandas as pd
import numpy as np

# Configure paths and split params
DATA_DIR = './data/'  # change if needed
VAL_RATIO = 0.2         # 20% of each user's interactions go to validation (by time)
MIN_VAL = 1             # at least one validation interaction per user when possible
MIN_RATING = 4.0    # ratings >= MIN_rating are considered positive
print('Using DATA_DIR =', DATA_DIR)
print('Pandas:', pd.__version__, '| NumPy:', np.__version__)

Using DATA_DIR = ./data/
Pandas: 2.3.0 | NumPy: 2.2.4


In [10]:

from src.datasets import load_dataset_with_splits, build_id_mappings, get_seen_dict

movies_df, ratings_train, ratings_val, ratings_test = load_dataset_with_splits(DATA_DIR)
uid2idx, idx2uid, iid2idx, idx2iid = build_id_mappings(ratings_train, movies_df)
print('n_users:', len(uid2idx), '| n_items:', len(iid2idx))


ratings_df = ratings_train.copy()

n_users: 610 | n_items: 9742


In [26]:
pos = ratings_df.loc[ratings_df['rating'] >= float(MIN_RATING), ['userId','movieId']]
pos = pos[pos['userId'].isin(uid2idx) & pos['movieId'].isin(iid2idx)].reset_index(drop=True)
pos_pairs = pos[['userId','movieId']].to_numpy(np.int64)
print('positives:', pos_pairs.shape)
print(pos_pairs[:5])
print(pos_pairs[-5:])

positives: (39517, 2)
[[   1  804]
 [   1 1210]
 [   1 2018]
 [   1 2628]
 [   1 2826]]
[[   610  54910]
 [   610  92309]
 [   610  56336]
 [   610 130840]
 [   610 136602]]


In [30]:
u_idx = pos['userId'].map(uid2idx).to_numpy(np.int64, copy=False)
i_idx = pos['movieId'].map(iid2idx).to_numpy(np.int64, copy=False)
order = np.argsort(u_idx); u_sorted = u_idx[order]; i_sorted = i_idx[order]
uniq_u, starts = np.unique(u_sorted, return_index=True); ends = np.r_[starts[1:], len(u_sorted)]
print('unique users:', len(uniq_u))
print('first 5 users:', uniq_u[:5])
print(starts[:5], ends[:5])

unique users: 607
first 5 users: [0 1 2 3 4]
[  0 162 178 194 301] [162 178 194 301 319]


In [None]:

all_items = np.arange(len(iid2idx), dtype=np.int64)
unseen = {int(u): np.setdiff1d(all_items, np.unique(i_sorted[s:e]), assume_unique=False)
          for u, s, e in zip(uniq_u, starts, ends)}




unseen items for first 5 users:
  user 0: 9580 unseen items, first 5: [1 3 4 6 7]
  user 1: 9726 unseen items, first 5: [0 1 2 3 4]
  user 2: 9726 unseen items, first 5: [0 1 2 3 4]
  user 3: 9635 unseen items, first 5: [0 1 2 3 4]
  user 4: 9724 unseen items, first 5: [1 2 3 4 5]


In [34]:
rng = np.random.default_rng()
print('users with positives:', len(unseen))
if len(uniq_u):
    u0 = int(uniq_u[0])
    print('example user idx:', u0, '| unseen count:', unseen[u0].size, '| first unseen:', unseen[u0][:10])

users with positives: 607
example user idx: 0 | unseen count: 9580 | first unseen: [ 1  3  4  6  7  8  9 10 11 12]


In [35]:
if len(uniq_u):
    u0 = int(uniq_u[0]); pool = unseen[u0]
    if pool.size:
        sample = rng.choice(pool, size=int(NEG_PER_POS), replace=True)
        print('Sample negatives for user', u0, ':', sample)
    else:
        print('User has no unseen items to sample')
else:
    print('No users to sample from')

NameError: name 'NEG_PER_POS' is not defined