In [35]:
import os
import pandas as pd
import numpy as np

# Configure paths and split params
DATA_DIR = './data/'  # change if needed
VAL_RATIO = 0.2         # 20% of each user's interactions go to validation (by time)
MIN_VAL = 1             # at least one validation interaction per user when possible

print('Using DATA_DIR =', DATA_DIR)
print('Pandas:', pd.__version__, '| NumPy:', np.__version__)

Using DATA_DIR = ./data/
Pandas: 2.3.0 | NumPy: 2.2.4


In [None]:

movies = pd.read_csv(os.path.join(DATA_DIR, 'movies.csv'))
ratings = pd.read_csv(os.path.join(DATA_DIR, 'ratings.csv'))


Loaded movies: (9742, 3) | ratings: (100836, 4) | synthetic = False


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [37]:
# Standardize movies (hard-coded schema)
required_movies = ['movieId','title','genres']
missing = [c for c in required_movies if c not in movies.columns]
if missing:
    raise ValueError(f'movies.csv missing required columns: {missing}')
movies = movies.copy()
movies['movieId'] = movies['movieId'].astype(int)
movies['title']   = movies['title'].fillna('')
movies['genres']  = movies['genres'].fillna('')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [38]:
# Standardize ratings (hard-coded schema)
required_ratings = ['userId','movieId','rating','timestamp']
missing = [c for c in required_ratings if c not in ratings.columns]
if missing:
    raise ValueError(f'ratings.csv missing required columns: {missing}')
ratings = ratings.copy()
ratings['userId']    = ratings['userId'].astype(int)
ratings['movieId']   = ratings['movieId'].astype(int)
ratings['rating']    = pd.to_numeric(ratings['rating'], errors='coerce').fillna(0.0)
ratings['timestamp'] = pd.to_numeric(ratings['timestamp'], errors='coerce')

# Impute NaN timestamps: per-user median, then global median
ratings['timestamp'] = ratings.groupby('userId')['timestamp'].transform(lambda s: s.fillna(s.median()))\
                                   .fillna(ratings['timestamp'].median())
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [39]:
g.sort_values('timestamp', kind='mergesort')

Unnamed: 0,userId,movieId,rating,timestamp
99554,610,318,3.0,1479541963
99699,610,2959,5.0,1479541966
99649,610,1573,3.5,1479541990
100010,610,7163,1.5,1479541995
99739,610,3623,3.0,1479542001
...,...,...,...,...
100612,610,101739,3.5,1495959269
99540,610,70,4.0,1495959282
99556,610,328,3.5,1495959299
99681,610,2459,3.5,1495959405


In [40]:
# Per-user chronological split using 'timestamp'
parts_train, parts_val = [], []
for u, g in ratings.groupby('userId', sort=False):
    g = g.sort_values('timestamp', kind='mergesort')
    n = len(g)
    if n <= 1:
        parts_train.append(g)
        parts_val.append(g.iloc[0:0])
        continue
    n_val = max(MIN_VAL, int(round(n * VAL_RATIO)))
    n_val = min(n_val, n - 1)
    parts_val.append(g.tail(n_val).copy())
    parts_train.append(g.iloc[: n - n_val].copy())

ratings_train = pd.concat(parts_train, ignore_index=True) if parts_train else ratings.iloc[0:0].copy()
ratings_val   = pd.concat(parts_val,   ignore_index=True) if parts_val   else ratings.iloc[0:0].copy()
print('Train shape:', ratings_train.shape, '| Val shape:', ratings_val.shape)

Train shape: (80672, 4) | Val shape: (20164, 4)


In [41]:
print('Train head:')
display(ratings_train.head())
print('Validation head:')
display(ratings_val.head())

Train head:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,804,4.0,964980499
1,1,1210,5.0,964980499
2,1,2018,5.0,964980523
3,1,2628,4.0,964980523
4,1,2826,4.0,964980523


Validation head:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1219,2.0,964983393
1,1,1348,4.0,964983393
2,1,2644,4.0,964983393
3,1,2654,5.0,964983393
4,1,1258,3.0,964983414


In [42]:
# Build ID mappings
user_ids = sorted(pd.unique(ratings_train['userId']).tolist())
item_ids = sorted(pd.unique(movies['movieId']).tolist())
uid2idx = {u:i for i,u in enumerate(user_ids)}
idx2uid = {i:u for u,i in uid2idx.items()}
iid2idx = {m:i for i,m in enumerate(item_ids)}
idx2iid = {i:m for m,i in iid2idx.items()}
print('Users:', len(uid2idx), '| Items:', len(iid2idx))
list(uid2idx.items())[:5], list(iid2idx.items())[:5]

Users: 610 | Items: 9742


([(1, 0), (2, 1), (3, 2), (4, 3), (5, 4)],
 [(1, 0), (2, 1), (3, 2), (4, 3), (5, 4)])

In [43]:
# Build seen dict from TRAIN
seen = {}
for u, g in ratings_train.groupby('userId'):
    seen[int(u)] = set(int(x) for x in g['movieId'].tolist())
print('Seen entries (up to 3 users):')
list(seen.items())[:3]

Seen entries (up to 3 users):


[(1,
  {1,
   3,
   6,
   50,
   70,
   101,
   110,
   216,
   223,
   231,
   235,
   260,
   296,
   316,
   333,
   349,
   356,
   362,
   367,
   423,
   441,
   457,
   480,
   500,
   543,
   552,
   590,
   592,
   596,
   608,
   648,
   661,
   673,
   733,
   736,
   804,
   919,
   923,
   940,
   954,
   1009,
   1023,
   1024,
   1025,
   1029,
   1030,
   1031,
   1032,
   1042,
   1049,
   1060,
   1073,
   1080,
   1089,
   1097,
   1127,
   1136,
   1196,
   1197,
   1198,
   1208,
   1210,
   1213,
   1214,
   1220,
   1222,
   1256,
   1275,
   1282,
   1291,
   1377,
   1396,
   1408,
   1473,
   1500,
   1517,
   1552,
   1573,
   1580,
   1587,
   1617,
   1620,
   1676,
   1732,
   1777,
   1793,
   1804,
   1805,
   1920,
   1927,
   1954,
   1967,
   2000,
   2005,
   2018,
   2028,
   2033,
   2046,
   2048,
   2054,
   2058,
   2078,
   2090,
   2093,
   2094,
   2096,
   2099,
   2105,
   2115,
   2116,
   2137,
   2139,
   2141,
   2143,
   2161,
   2174,

In [44]:
# Save split artifacts next to your input CSVs
train_path = os.path.join(DATA_DIR, 'ratings_train.csv')
val_path   = os.path.join(DATA_DIR, 'ratings_val_autosplit.csv')
ratings_train.to_csv(train_path, index=False)
ratings_val.to_csv(val_path, index=False)
print('Saved:', train_path)
print('Saved:', val_path)

Saved: ./data/ratings_train.csv
Saved: ./data/ratings_val_autosplit.csv
