In [151]:
import os
import json
import pickle
import numpy as np
import pandas as pd
from datetime import datetime, timezone

In [152]:
from sqlalchemy import create_engine, text, bindparam
from sqlalchemy.dialects.postgresql import UUID, ARRAY
from dotenv import load_dotenv
from surprise import Dataset, Reader, SVD, SVDpp, accuracy, NMF, KNNBaseline, BaselineOnly, CoClustering

from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from collections import defaultdict

In [153]:
load_dotenv()

True

In [154]:
# --- Environment & DB connection ------------------------------------------------
DATABASE_USERNAME = os.environ.get('DATABASE_USERNAME')
DATABASE_PASSWORD = os.environ.get('DATABASE_PASSWORD')
DATABASE_HOST = os.environ.get('DATABASE_HOST')
DATABASE_PORT = os.environ.get('DATABASE_PORT')
DATABASE_NAME = os.environ.get('DATABASE_NAME')

In [155]:
# Build Postgres DSN and create a SQLAlchemy engine
PG_DSN = f"postgresql+psycopg2://{DATABASE_USERNAME}:{DATABASE_PASSWORD}@{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_NAME}"
engine = create_engine(PG_DSN, pool_pre_ping=True)

In [156]:
# --- Aggregation parameters (keep in sync with your MV if you use one) ----------
HALF_LIFE_DAYS = 90          # time-decay half-life in days (fresh events weigh more)
VIEW_DEBOUNCE_SECONDS = 30   # debounce window for 'view' events from same user on same yacht
LOOKBACK_DAYS = 365 * 2      # history window for events (e.g., last 2 years)
DAY_CAP = 20.0               # per-(user,yacht,day) cap to prevent single session flooding

-----------------------------
--- DATA PREPARATION --------
-----------------------------

In [157]:
# --- 1) Load base tables from csv file --------------------------------------------------------
# Users: only need id and role to keep 'lessee' events
users_data_path="./data/users_final.csv"
events_data_path="./data/events_final.csv"
yachts_data_path="./data/yachts_final.csv"

df_users_full = pd.read_csv(users_data_path)
print(df_users_full.shape)
df_users_full.head()

(504, 16)


Unnamed: 0,id,email,password,country,role,sailingExp,budgetMin,budgetMax,avatarUrl,verified,verificationToken,token,createdAt,recommendations,updatedAt,hasSkipperLicense
0,e09716f2-8145-4c66-9717-66e6ae388def,beginner@test.test,$2b$10$Al3QvkOHSWAYQjFkAPtxuul4xIfcvMR9k5FT3cb...,Italy,lessee,beginner,2000.0,5000.0,https://s.gravatar.com/avatar/742ee2fd7f3d4e87...,True,LJ-RRP8E_qSs40-9WV4F8,eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6I...,2025-11-25 20:36:50.398 +0200,"{818b9279-c1ce-43e9-affd-0d28c9b00685,fdc88620...",2025-11-26 19:06:28.596 +0200,False
1,ff210a03-d01e-49a5-9050-284d1d94490a,lauraford@example.org,0e0df0b5c9d7db4aaf3d7ee64ccf82f089fc37de34f301...,France,lesser,pro,29000.0,39000.0,,False,,,2025-01-14 00:45:43.724 +0200,,2025-01-14 00:45:43.724 +0200,False
2,0a7b247c-424d-4035-914f-fd6f509c84a7,gmiller@example.org,6cd49d308d04231bc81304f9b60ea729309f1719ae7d20...,Spain,lessee,intermediate,37000.0,53000.0,,False,,,2025-04-13 22:15:33.754 +0300,"{224c9f46-e515-4d79-86d2-0651c0c922b6,621dc4e5...",2025-04-13 22:15:33.754 +0300,False
3,0aa32bd5-5c17-421f-8f3c-bd1d7a3424cf,eugeneklein@example.org,c6f2b705941bc55df65e33050435a403fab33e2b366414...,Indonesia,lessee,beginner,7000.0,8000.0,,False,,,2025-06-04 12:40:32.599 +0300,"{e3873b72-c452-4a9a-9fdc-96f42e91fe69,c3051a74...",2025-06-04 12:40:32.599 +0300,False
4,2d2a44bf-b250-40f2-ac2c-87ead7d854d1,romerocheryl@example.com,2d74c6b539a340be113a4a4329ce2a96136218602c7236...,Italy,lessee,beginner,11000.0,14000.0,,False,,,2025-07-26 22:43:27.899 +0300,"{42520952-0745-441c-9d8b-410b923a9513,156a6501...",2025-07-26 22:43:27.899 +0300,False


In [158]:
# select only necessary columns
df_users=df_users_full[["id","role"]]
print(df_users.shape)
df_users.head()

(504, 2)


Unnamed: 0,id,role
0,e09716f2-8145-4c66-9717-66e6ae388def,lessee
1,ff210a03-d01e-49a5-9050-284d1d94490a,lesser
2,0a7b247c-424d-4035-914f-fd6f509c84a7,lessee
3,0aa32bd5-5c17-421f-8f3c-bd1d7a3424cf,lessee
4,2d2a44bf-b250-40f2-ac2c-87ead7d854d1,lessee


In [159]:
df_users.dtypes

id      object
role    object
dtype: object

In [160]:
df_users = df_users.astype({
    'id': 'string',
    'role': 'string',
})
df_users.dtypes

id      string[python]
role    string[python]
dtype: object

In [161]:
# Yachts: (owner_id used to drop self-interactions)
df_yachts_full = pd.read_csv(yachts_data_path)
print(df_yachts_full.shape)
df_yachts_full.head()

(2543, 22)


Unnamed: 0,name,type,guests,cabins,crew,length,year,model,description,rating,country,photos,userId,createdAt,summerLowSeasonPrice,summerHighSeasonPrice,winterLowSeasonPrice,winterHighSeasonPrice,baseMarina,id,similarYachts,updatedAt
0,ZEN,Motor Yachts,9,4,5,26.8,2024,SL88,"The 26.75m/87'9"" 'Zen' motor yacht built by th...",4.8,Greece,{https://pub-59edec60055841149d71125f2e73e658....,ff210a03-d01e-49a5-9050-284d1d94490a,2025-11-17 16:58:48.579 +0200,6357.0,7214.0,6357.0,6357.0,Mykonos,554bb459-f188-4a8e-84ae-15f098c3c973,"{0d0a9813-5068-42a3-aac1-f5509f0cc85e,b387960b...",2025-11-17 16:58:48.579 +0200
1,GHOST,Open Yachts,8,4,2,26.5,2008,S87,"The 26.5m/86'11"" open yacht 'Ghost' by shipyar...",3.0,Australia,{https://pub-59edec60055841149d71125f2e73e658....,ff210a03-d01e-49a5-9050-284d1d94490a,2025-11-17 16:58:48.579 +0200,5571.0,5571.0,5571.0,5571.0,Sydney,5f4c72c7-1ba7-4068-a246-3d80a8236ef5,"{032e19a2-3c3c-41f3-a142-2014e44d2be3,30559b36...",2025-11-17 16:58:48.579 +0200
2,ZENIT,Catamarans,8,4,4,23.8,2024,Seventy 8,"The 23.8m/78'1"" catamaran yacht 'Zenit' is an ...",4.6,France,{https://pub-59edec60055841149d71125f2e73e658....,ff210a03-d01e-49a5-9050-284d1d94490a,2025-11-17 16:58:48.579 +0200,11357.0,13000.0,11357.0,11357.0,Calvi,bb4d2c21-7e79-4fc8-ada3-123e20ad532e,"{0bec0c68-a5a9-4d12-94a6-25fafc805ee5,b15afeaa...",2025-11-17 16:58:48.579 +0200
3,ZULU,Expedition Yachts,10,5,7,36.1,2023,Explorer 100,"The 36.1m/118'5"" 'Zulu' expedition yacht built...",5.0,Italy,{https://pub-59edec60055841149d71125f2e73e658....,ff210a03-d01e-49a5-9050-284d1d94490a,2025-11-17 16:58:48.579 +0200,19357.0,21857.0,19357.0,19357.0,Amalfi Coast,ea2f09d2-6e73-4cf9-9883-f7dd88626e2f,"{71ac58b5-fde5-40cf-9145-1c7e8328a475,3e52be21...",2025-11-17 16:58:48.579 +0200
4,ROMEA,Motor Yachts,12,6,25,81.8,2021,Custom,Built by the German shipyard Abeking & Rasmuss...,5.0,UAE,{https://pub-59edec60055841149d71125f2e73e658....,ff210a03-d01e-49a5-9050-284d1d94490a,2025-11-17 16:58:48.579 +0200,179143.0,179143.0,179143.0,179143.0,Abu Dhabi,fab9aa0b-fdbc-4d6e-8ee7-505ca1cc43dc,"{adfd1f9d-1bd6-441c-af75-dbfa13f3a903,621dc4e5...",2025-11-17 16:58:48.579 +0200


In [162]:
# select only necessary columns
df_yachts=df_yachts_full[["id","userId"]]
df_yachts=df_yachts.rename(columns={"id": "yacht_id", "userId": "owner_id"}, errors="raise")
print(df_yachts.shape)
df_yachts.head()

(2543, 2)


Unnamed: 0,yacht_id,owner_id
0,554bb459-f188-4a8e-84ae-15f098c3c973,ff210a03-d01e-49a5-9050-284d1d94490a
1,5f4c72c7-1ba7-4068-a246-3d80a8236ef5,ff210a03-d01e-49a5-9050-284d1d94490a
2,bb4d2c21-7e79-4fc8-ada3-123e20ad532e,ff210a03-d01e-49a5-9050-284d1d94490a
3,ea2f09d2-6e73-4cf9-9883-f7dd88626e2f,ff210a03-d01e-49a5-9050-284d1d94490a
4,fab9aa0b-fdbc-4d6e-8ee7-505ca1cc43dc,ff210a03-d01e-49a5-9050-284d1d94490a


In [163]:
df_yachts = df_yachts.astype({
    'yacht_id': 'string',
    'owner_id': 'string',
})
df_users.dtypes

id      string[python]
role    string[python]
dtype: object

In [164]:
# Events within the lookback window 
df_events = pd.read_csv(events_data_path)

# Optional guard: if there are no events, stop early to avoid errors downstream
if df_events.empty:
    raise ValueError("No events found within the lookback window. Adjust LOOKBACK_DAYS or check data.")
    
df_events.drop(["updatedAt"], axis=1, inplace=True)
df_events=df_events.rename(columns={ "userId": "user_id", "yachtId": "yacht_id",  "createdAt": "ts"}, errors="raise")   
print(df_events.shape)
df_events.head()

(12506, 6)


Unnamed: 0,id,type,weight,user_id,yacht_id,ts
0,0803b322-3b50-4220-b084-656e11bf1a6e,view,2,72c29aaa-7dbf-4784-a865-1dff7e3d2c24,089ae8eb-5006-44d4-b5dc-4315d5394dc8,2025-01-06 11:18:45.923 +0200
1,24516486-e7c5-4105-a3fd-c2ce9f3239e8,wishlist,4,11fa7ac1-d530-412c-bc89-4f80108fd41b,7dde5929-c5fc-480c-9522-8610382d6f4d,2025-01-08 08:33:11.807 +0200
2,ef94e809-e694-4cd4-bc01-28b7fcc64bee,view,2,47364914-3d59-485d-bdc0-3c596bd42de9,c10f9c96-ac1f-4d0a-af31-2e5917ba9727,2025-01-08 18:43:06.462 +0200
3,61cffb22-1677-4f27-9c58-dab4e9b6f9b5,view,2,ac10e962-3055-4937-9f73-f04f68c1262f,b8cae658-af7d-42e8-9e40-a594033636f2,2025-01-09 00:19:25.335 +0200
4,be3d28cb-a5c1-45be-913e-eb3247646cbe,wishlist,4,72c29aaa-7dbf-4784-a865-1dff7e3d2c24,c7e4d758-5d23-444c-ae46-7d95ed52594a,2025-01-10 08:10:06.171 +0200


In [165]:
df_events.dtypes

id          object
type        object
weight       int64
user_id     object
yacht_id    object
ts          object
dtype: object

In [166]:
df_events['ts'] = pd.to_datetime(df_events['ts'], utc=True).dt.tz_convert(None)
df_events = df_events.astype({
    # 'ts': 'datetime64[ns]',
    'user_id': 'string',
    'yacht_id': 'string',
    'type': 'string',
})
df_events.dtypes

id                  object
type        string[python]
weight               int64
user_id     string[python]
yacht_id    string[python]
ts          datetime64[ns]
dtype: object

In [167]:
df_book=df_events[df_events["type"] == "book"]
print(df_book.shape)
df_book.head()

(288, 6)


Unnamed: 0,id,type,weight,user_id,yacht_id,ts
119,b25d61d9-6b7e-4bc2-9efc-4eb08e8ae253,book,10,0dd3c2ac-0a42-43b0-8a0b-c27c0b909c4a,8d3242c8-e196-46d4-8a76-51c712bf971d,2025-02-16 06:46:27.274
186,7950a038-7b8e-4232-9372-d994badf99cc,book,10,a087147d-06dc-474f-b393-7a3950ed61b8,b789d024-5320-4cec-bc3d-cebb5239c814,2025-02-28 00:14:14.413
206,a140e8ab-3ee8-4d24-b071-112b4b279014,book,10,66447476-cb59-4cbb-b689-aeedc35e874f,19dc6742-2e92-4e51-b4ea-af9b75026aff,2025-03-02 00:16:03.729
269,fbd9daef-76ed-441d-8b24-b9dec64f04ad,book,10,4344b076-8da5-489a-92ea-eb142692f81e,dcf54f5b-26c0-46c8-8453-a4af8c6d4383,2025-03-11 23:35:18.200
285,d1def29a-db22-48c3-8dfd-720e629cda36,book,10,9480f48b-1ca5-4ae2-ab3c-75159975ed6a,a77deb8a-c2cd-4201-bb64-cbbda6d3c6d3,2025-03-14 01:56:06.620


In [168]:
booked_yacht_ids = df_book["yacht_id"].unique().tolist()
len(booked_yacht_ids)

241

In [169]:
# --- 2) Keep only 'lessee' events ----------------------------------------------
# Join user role and filter to renter role (events from admins/owners are not demand signals)
df_events = df_events.merge(df_users, left_on="user_id", right_on="id", how="left", suffixes=("", "_u"))
df_events = df_events[df_events["role"] == "lessee"].drop(columns=["id_u", "role"])


# --- 3) Remove interactions with user's own yachts ------------------------------
# (Owners interacting with their listings would bias the model)
df_events = df_events.merge(df_yachts, on="yacht_id", how="left")
df_events = df_events[df_events["owner_id"].astype(str) != df_events["user_id"].astype(str)]

# --- 4) Debounce frequent 'view' events ----------------------------------------
# Sort and drop near-duplicate 'view' events per (user, yacht) inside the debounce window
df_events = df_events.sort_values(["user_id", "yacht_id", "type", "ts"])
mask = ~(
    (df_events["type"] == "view") &
    (df_events.groupby(["user_id", "yacht_id", "type"])["ts"]
        .diff()
        .dt.total_seconds()
        .between(0, VIEW_DEBOUNCE_SECONDS, inclusive="both"))
)
df_events = df_events[mask]

print(df_events.shape)
df_events.head()

(12503, 7)


Unnamed: 0,id,type,weight,user_id,yacht_id,ts,owner_id
9767,6c87fda6-cf4a-43fe-8f46-17907cba4f3b,view,2,004b11f0-234e-4c42-a573-60ce7a759547,093dee6a-74b8-4361-9cd4-38c1443228a6,2025-10-30 23:46:16.028,ff210a03-d01e-49a5-9050-284d1d94490a
7883,62cb73a4-1d95-470b-85db-3d43b5ab0738,view,2,004b11f0-234e-4c42-a573-60ce7a759547,1993b0ed-a89f-4bfc-9d8f-a2aabaa80a49,2025-10-11 18:02:18.115,ff210a03-d01e-49a5-9050-284d1d94490a
6572,9f0a1d57-2b6f-40de-afa6-ac598d2fbae2,view,2,004b11f0-234e-4c42-a573-60ce7a759547,45dd9892-3df7-49dd-a132-0102b37f39e3,2025-09-25 05:05:24.252,ff210a03-d01e-49a5-9050-284d1d94490a
6722,c617069c-bd55-45cf-9ebb-12f30f9c6493,view,2,004b11f0-234e-4c42-a573-60ce7a759547,47808093-e408-436d-8067-4db7f361bd12,2025-09-27 07:38:29.910,ff210a03-d01e-49a5-9050-284d1d94490a
7758,99fa7a41-3ca3-4116-8829-4fb13fadcca0,wishlist,4,004b11f0-234e-4c42-a573-60ce7a759547,4d1e5462-e34f-4f88-8b0f-0e993d4ca3c5,2025-10-10 09:50:09.891,ff210a03-d01e-49a5-9050-284d1d94490a


In [170]:
# Calculate cutoff date
cutoff_date = pd.Timestamp.now() - pd.Timedelta(days=LOOKBACK_DAYS)
df_events = df_events[df_events['ts'] >= cutoff_date].reset_index(drop=True)
print(df_events.shape)
df_events.head()

(12503, 7)


Unnamed: 0,id,type,weight,user_id,yacht_id,ts,owner_id
0,6c87fda6-cf4a-43fe-8f46-17907cba4f3b,view,2,004b11f0-234e-4c42-a573-60ce7a759547,093dee6a-74b8-4361-9cd4-38c1443228a6,2025-10-30 23:46:16.028,ff210a03-d01e-49a5-9050-284d1d94490a
1,62cb73a4-1d95-470b-85db-3d43b5ab0738,view,2,004b11f0-234e-4c42-a573-60ce7a759547,1993b0ed-a89f-4bfc-9d8f-a2aabaa80a49,2025-10-11 18:02:18.115,ff210a03-d01e-49a5-9050-284d1d94490a
2,9f0a1d57-2b6f-40de-afa6-ac598d2fbae2,view,2,004b11f0-234e-4c42-a573-60ce7a759547,45dd9892-3df7-49dd-a132-0102b37f39e3,2025-09-25 05:05:24.252,ff210a03-d01e-49a5-9050-284d1d94490a
3,c617069c-bd55-45cf-9ebb-12f30f9c6493,view,2,004b11f0-234e-4c42-a573-60ce7a759547,47808093-e408-436d-8067-4db7f361bd12,2025-09-27 07:38:29.910,ff210a03-d01e-49a5-9050-284d1d94490a
4,99fa7a41-3ca3-4116-8829-4fb13fadcca0,wishlist,4,004b11f0-234e-4c42-a573-60ce7a759547,4d1e5462-e34f-4f88-8b0f-0e993d4ca3c5,2025-10-10 09:50:09.891,ff210a03-d01e-49a5-9050-284d1d94490a


In [171]:
# --- 5) Time-decay weighting ----------------------------------------------------
# Compute exponentially decayed weight: eff_w = weight * exp(-days_ago / half_life)
now = pd.Timestamp.utcnow()
days_ago = (now - pd.to_datetime(df_events["ts"], utc=True)).dt.total_seconds() / 86400.0
print("days_ago",days_ago)
decay = np.exp(-days_ago / HALF_LIFE_DAYS)
# decay=1
df_events["eff_w"] = df_events["weight"].astype(float) * decay

# --- 6) Daily cap to limit flood from single sessions ---------------------------
# Aggregate by day, cap the daily contribution, then sum across days
df_events["d"] = pd.to_datetime(df_events["ts"], utc=True).dt.floor("D")
print(df_events.shape)
df_events.head()

days_ago 0         34.913601
1         54.152465
2         70.691978
3         68.585662
4         55.494227
            ...    
12498    227.702188
12499    110.245237
12500    195.916913
12501     20.227432
12502    227.725530
Name: ts, Length: 12503, dtype: float64
(12503, 9)


Unnamed: 0,id,type,weight,user_id,yacht_id,ts,owner_id,eff_w,d
0,6c87fda6-cf4a-43fe-8f46-17907cba4f3b,view,2,004b11f0-234e-4c42-a573-60ce7a759547,093dee6a-74b8-4361-9cd4-38c1443228a6,2025-10-30 23:46:16.028,ff210a03-d01e-49a5-9050-284d1d94490a,1.356921,2025-10-30 00:00:00+00:00
1,62cb73a4-1d95-470b-85db-3d43b5ab0738,view,2,004b11f0-234e-4c42-a573-60ce7a759547,1993b0ed-a89f-4bfc-9d8f-a2aabaa80a49,2025-10-11 18:02:18.115,ff210a03-d01e-49a5-9050-284d1d94490a,1.095765,2025-10-11 00:00:00+00:00
2,9f0a1d57-2b6f-40de-afa6-ac598d2fbae2,view,2,004b11f0-234e-4c42-a573-60ce7a759547,45dd9892-3df7-49dd-a132-0102b37f39e3,2025-09-25 05:05:24.252,ff210a03-d01e-49a5-9050-284d1d94490a,0.911814,2025-09-25 00:00:00+00:00
3,c617069c-bd55-45cf-9ebb-12f30f9c6493,view,2,004b11f0-234e-4c42-a573-60ce7a759547,47808093-e408-436d-8067-4db7f361bd12,2025-09-27 07:38:29.910,ff210a03-d01e-49a5-9050-284d1d94490a,0.933405,2025-09-27 00:00:00+00:00
4,99fa7a41-3ca3-4116-8829-4fb13fadcca0,wishlist,4,004b11f0-234e-4c42-a573-60ce7a759547,4d1e5462-e34f-4f88-8b0f-0e993d4ca3c5,2025-10-10 09:50:09.891,ff210a03-d01e-49a5-9050-284d1d94490a,2.159101,2025-10-10 00:00:00+00:00


In [172]:
daily = (
    df_events
    .groupby(["user_id", "yacht_id", "d"], as_index=False)["eff_w"]
    .sum()
)
daily["day_sum"] = daily["eff_w"].clip(upper=DAY_CAP)

daily.head()

Unnamed: 0,user_id,yacht_id,d,eff_w,day_sum
0,004b11f0-234e-4c42-a573-60ce7a759547,093dee6a-74b8-4361-9cd4-38c1443228a6,2025-10-30 00:00:00+00:00,1.356921,1.356921
1,004b11f0-234e-4c42-a573-60ce7a759547,1993b0ed-a89f-4bfc-9d8f-a2aabaa80a49,2025-10-11 00:00:00+00:00,1.095765,1.095765
2,004b11f0-234e-4c42-a573-60ce7a759547,45dd9892-3df7-49dd-a132-0102b37f39e3,2025-09-25 00:00:00+00:00,0.911814,0.911814
3,004b11f0-234e-4c42-a573-60ce7a759547,47808093-e408-436d-8067-4db7f361bd12,2025-09-27 00:00:00+00:00,0.933405,0.933405
4,004b11f0-234e-4c42-a573-60ce7a759547,4d1e5462-e34f-4f88-8b0f-0e993d4ca3c5,2025-10-10 00:00:00+00:00,2.159101,2.159101


In [173]:
daily.describe()

Unnamed: 0,eff_w,day_sum
count,12203.0,12203.0
mean,1.346728,1.344964
std,1.290756,1.2499
min,0.049714,0.049714
25%,0.549056,0.549056
50%,1.059073,1.059073
75%,1.571386,1.571386
max,41.527353,20.0


In [174]:
scores = (
    daily
    .groupby(["user_id", "yacht_id"], as_index=False)
    .agg(score=("day_sum", "sum"), latest_ts=("d", "max"))
)
print(scores.shape)
scores.head()


(7447, 4)


Unnamed: 0,user_id,yacht_id,score,latest_ts
0,004b11f0-234e-4c42-a573-60ce7a759547,093dee6a-74b8-4361-9cd4-38c1443228a6,1.356921,2025-10-30 00:00:00+00:00
1,004b11f0-234e-4c42-a573-60ce7a759547,1993b0ed-a89f-4bfc-9d8f-a2aabaa80a49,1.095765,2025-10-11 00:00:00+00:00
2,004b11f0-234e-4c42-a573-60ce7a759547,45dd9892-3df7-49dd-a132-0102b37f39e3,0.911814,2025-09-25 00:00:00+00:00
3,004b11f0-234e-4c42-a573-60ce7a759547,47808093-e408-436d-8067-4db7f361bd12,0.933405,2025-09-27 00:00:00+00:00
4,004b11f0-234e-4c42-a573-60ce7a759547,4d1e5462-e34f-4f88-8b0f-0e993d4ca3c5,2.159101,2025-10-10 00:00:00+00:00


In [175]:
# --- 7) Per-user min-max scaling to [1..5] -------------------------------------
# Convert implicit scores into pseudo-ratings for Surprise (stable range per user)
def scale_1_5(g: pd.DataFrame) -> pd.DataFrame:
    mn, mx = g["score"].min(), g["score"].max()
    if mx - mn < 1e-9:
        # If user has uniform scores, assign a neutral-but-positive rating
        g["rating"] = 4.0
    else:
        g["rating"] = 1 + 4 * (g["score"] - mn) / (mx - mn)
    return g

def ensure_str_ids(df):
    df["user_id"] = df["user_id"].astype(str)
    df["yacht_id"] = df["yacht_id"].astype(str)
    return df

In [176]:
# --- 8) Ratings dataframe for surprise algos -------------------------------------
ratings = scores.groupby("user_id", group_keys=False).apply(scale_1_5)
ratings = ratings[["user_id", "yacht_id", "rating", "latest_ts"]].reset_index(drop=True)
ratings = ensure_str_ids(ratings)
print("Number of ratings:\n", ratings.shape)
# print("Sample ratings:\n", ratings.head())
ratings.head()

Number of ratings:
 (7447, 4)


  ratings = scores.groupby("user_id", group_keys=False).apply(scale_1_5)


Unnamed: 0,user_id,yacht_id,rating,latest_ts
0,004b11f0-234e-4c42-a573-60ce7a759547,093dee6a-74b8-4361-9cd4-38c1443228a6,1.424693,2025-10-30 00:00:00+00:00
1,004b11f0-234e-4c42-a573-60ce7a759547,1993b0ed-a89f-4bfc-9d8f-a2aabaa80a49,1.175515,2025-10-11 00:00:00+00:00
2,004b11f0-234e-4c42-a573-60ce7a759547,45dd9892-3df7-49dd-a132-0102b37f39e3,1.0,2025-09-25 00:00:00+00:00
3,004b11f0-234e-4c42-a573-60ce7a759547,47808093-e408-436d-8067-4db7f361bd12,1.020601,2025-09-27 00:00:00+00:00
4,004b11f0-234e-4c42-a573-60ce7a759547,4d1e5462-e34f-4f88-8b0f-0e993d4ca3c5,2.190083,2025-10-10 00:00:00+00:00


In [177]:
ratings.describe()

Unnamed: 0,rating
count,7447.0
mean,2.021539
std,1.185399
min,1.0
25%,1.136373
50%,1.535496
75%,2.446151
max,5.0


In [178]:
# --- 8) Build Surprise dataset --------------------------------------------------
# Surprise expects explicit ratings with a known scale; we mapped to [1..5]
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[["user_id", "yacht_id", "rating"]], reader)

-----------------------------
--- TRAIN / TEST MODEL --------
-----------------------------

In [179]:
# Config
# ART_DIR = os.getenv("ART_DIR", "./artifacts_surprise")
# ALGO_NAME = os.getenv("ALGO", "SVD")           # 'SVD', 'SVDpp', 'NMF', 'KNNBaseline', 'BaselineOnly', 'CoClustering'
# FACTORS   = int(os.getenv("FACTORS", "64"))
# EPOCHS    = int(os.getenv("EPOCHS", "40"))
# LR_ALL    = float(os.getenv("LR_ALL", "0.005"))
# REG_ALL   = float(os.getenv("REG_ALL", "0.02"))
# CUTOFF    = os.getenv("CUTOFF_DATE", "")        # e.g. "2025-06-01" for time-based split
# TEST_SIZE = float(os.getenv("TEST_SIZE", "0.2"))# used if no CUTOFF
# K_TOP     = int(os.getenv("TOPK", "10"))        # for top-K metrics
# POS_TH    = float(os.getenv("POS_THRESHOLD", "4.0"))  # rating >= POS_TH is positive in test

# Model parameters
ART_DIR="./artifacts_surprise"
# 'SVD' or 'SVDpp'
ALGO_NAME="KNNBaseline" 
FACTORS=150
EPOCHS=20
LR_ALL=0.002
REG_ALL=0.02
# e.g. "2025-06-01" for time-based split
CUTOFF=False
# used if no CUTOFF
TEST_SIZE=0.2
# for top-K metrics
K_TOP=15
# rating >= POS_TH is positive in test
POS_TH=3.5

In [180]:
os.makedirs(ART_DIR, exist_ok=True)

In [181]:
# ==== 9) Build Surprise train/test
def build_train_test_from_ratings(ratings_df: pd.DataFrame, cutoff_date: str = "", test_size: float = 0.2):
    """Create Surprise trainset/testset either by time split or random split."""
    reader = Reader(rating_scale=(1, 5))
    if cutoff_date:
        T = pd.to_datetime(cutoff_date)
        train_df = ratings_df[ratings_df["latest_ts"] < T]
        test_df  = ratings_df[ratings_df["latest_ts"] >= T]
        if train_df.empty or test_df.empty:
            raise ValueError("Time-based split produced empty train or test; adjust CUTOFF_DATE.")
        train_data = Dataset.load_from_df(train_df[["user_id","yacht_id","rating"]], reader)
        trainset = train_data.build_full_trainset()
        testset  = list(test_df[["user_id","yacht_id","rating"]].itertuples(index=False, name=None))
        return trainset, testset, train_df, test_df
    else:
        data = Dataset.load_from_df(ratings_df[["user_id","yacht_id","rating"]], reader)
        trainset, testset = train_test_split(data, test_size=test_size, random_state=42)
        # For convenience, reconstruct DataFrames
        train_df = pd.DataFrame(trainset.build_testset(), columns=["user_id","yacht_id","rating"])
        test_df  = pd.DataFrame(testset, columns=["user_id","yacht_id","rating"])
        return trainset, testset, train_df, test_df

In [182]:
trainset, testset, train_df, test_df = build_train_test_from_ratings(ratings, CUTOFF, TEST_SIZE)
train_df = ensure_str_ids(train_df)
test_df = ensure_str_ids(test_df)
print(train_df.shape)
print(test_df.shape)

(5957, 3)
(1490, 3)


In [183]:
# ==== 10 Train model
# ==== 1️⃣ Функція створення моделі за ім'ям
def make_algo(name: str, FACTORS=50, EPOCHS=20, LR_ALL=0.002, REG_ALL=0.01):
    name = name.strip().lower()
    if name == "svdpp":
        return SVDpp(n_factors=FACTORS, n_epochs=EPOCHS, random_state=42)
    elif name == "nmf":
        return NMF(n_factors=FACTORS, n_epochs=EPOCHS, random_state=42)
    elif name == "knnbaseline":
        sim_options = {"name": "cosine", "user_based": True}
        return KNNBaseline(sim_options=sim_options)
    elif name == "baselineonly":
        return BaselineOnly()
    elif name == "coclustering":
        return CoClustering()
    # default — SVD
    return SVD(n_factors=FACTORS, n_epochs=EPOCHS, lr_all=LR_ALL, reg_all=REG_ALL, random_state=42)

In [184]:
algo = make_algo(ALGO_NAME)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x159b99d00>

In [185]:
# ==== 11 Evaluate: RMSE/MAE (pointwise)
preds = algo.test(testset)
rmse = accuracy.rmse(preds, verbose=False)
mae  = accuracy.mae(preds, verbose=False)
print(f"[METRIC] RMSE={rmse:.4f}  MAE={mae:.4f}")

[METRIC] RMSE=1.3276  MAE=1.0134


In [186]:
# ==== 12  Evaluate: Top-K ranking metrics
def precision_recall_ndcg_at_k(algo, train_df, test_df, K=10, pos_threshold=4.0):
    # positives in test
    test_pos = test_df[test_df["rating"] >= pos_threshold]
    user_pos = test_pos.groupby("user_id")["yacht_id"].apply(set).to_dict()
    # candidates = items seen in train universe
    item_set = set(train_df["yacht_id"].unique())
    # items seen in train per user
    train_seen = train_df.groupby("user_id")["yacht_id"].apply(set).to_dict()

    precs, recs, ndcgs = [], [], []

    for u, pos_items in user_pos.items():
        candidates = list(item_set - train_seen.get(u, set()))
        if not candidates:
            continue
        est = [(iid, algo.predict(u, iid).est) for iid in candidates]
        est.sort(key=lambda x: x[1], reverse=True)
        top = [iid for iid,_ in est[:K]]

        # precision/recall
        hits = len(set(top) & pos_items)
        precs.append(hits / K)
        recs.append(hits / max(1, len(pos_items)))

        # nDCG
        # relevance is 1 if item in pos_items else 0
        gains = [1.0 if iid in pos_items else 0.0 for iid in top]
        dcg = 0.0
        for i, g in enumerate(gains, start=1):
            dcg += g / np.log2(i + 1)
        # ideal DCG: first min(K, |pos|) are 1
        ideal_hits = min(K, len(pos_items))
        idcg = sum(1.0 / np.log2(i + 1) for i in range(1, ideal_hits + 1))
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)

    metrics = {
        "precision_at_k": float(np.mean(precs)) if precs else 0.0,
        "recall_at_k": float(np.mean(recs)) if recs else 0.0,
        "ndcg_at_k": float(np.mean(ndcgs)) if ndcgs else 0.0,
        "users_evaluated": int(len(precs)),
    }
    return metrics

In [187]:


# ==== 12) Evaluate model using RMSE/MAE and Precision@K / Recall@K / nDCG@K
def evaluate_model(algo, data, k=15, rating_threshold=4.0, cv=3, test_size=0.2, random_state=42):
    """
    Surprise model evaluation:
      1) RMSE/MAE using k-fold cross-validation (on the whole dataset)
      2) Precision@K / Recall@K / nDCG@K using a separate train/test split (via anti-testset)

    Parameters:
      algo:             instance of a Surprise model (e.g., SVDpp())
      data:             surprise.dataset.DatasetAutoFolds (Dataset.load_from_df(...))
      k:                K value for top-K metrics
      rating_threshold: ratings equal to or above this value are considered relevant
      cv:               number of folds for cross-validation
      test_size:        test set proportion for top-K evaluation
      random_state:     random seed for reproducibility

    Returns:
      pandas.DataFrame containing: RMSE, MAE, P@K, R@K, nDCG@K, users_evaluated
    """

    # ---------- 1) Cross-validation for RMSE/MAE ----------
    cv_res = cross_validate(
        algo.__class__(**getattr(algo, 'pp', {})) if False else algo,  # use the provided algo as is
        data,
        measures=["RMSE", "MAE"],
        cv=cv,
        verbose=False
    )
    rmse_cv = float(np.mean(cv_res["test_rmse"]))
    mae_cv  = float(np.mean(cv_res["test_mae"]))

    # ---------- 2) Top-K metrics on a separate train/test split ----------
    # Perform a custom split (independent from CV)
    trainset, testset = train_test_split(data, test_size=test_size, random_state=random_state)

    # Train the model
    algo.fit(trainset)

    # Predict for all “unknown” user–item pairs (anti-testset) to build recommendations
    anti_testset = trainset.build_anti_testset()
    preds_all = algo.test(anti_testset)

    # Build Top-K recommendations per user
    user_to_ranked = defaultdict(list)
    for uid, iid, true_r, est, _ in preds_all:
        user_to_ranked[uid].append((iid, est))
    for uid in user_to_ranked:
        user_to_ranked[uid].sort(key=lambda x: x[1], reverse=True)
        user_to_ranked[uid] = [iid for iid, _ in user_to_ranked[uid][:k]]

    # Collect ground-truth relevant items in the test set (rating ≥ threshold)
    user_to_rels = defaultdict(set)
    for uid, iid, true_r in testset:
        if true_r >= rating_threshold:
            user_to_rels[uid].add(iid)

    # Compute Precision@K, Recall@K, nDCG@K
    precisions, recalls, ndcgs = [], [], []
    users_evaluated = 0

    # Helper: compute DCG@K given hit positions (1-based)
    def dcg_at_k(hits_positions):
        return sum(1 / np.log2(p + 1) for p in hits_positions)

    for uid, recs in user_to_ranked.items():
        rels = user_to_rels.get(uid, set())
        if len(rels) == 0:
            # Skip users with no relevant items in the test set
            continue

        users_evaluated += 1
        hits = [1 if iid in rels else 0 for iid in recs]
        n_hits = sum(hits)

        # Precision@K and Recall@K
        precisions.append(n_hits / k)
        recalls.append(n_hits / len(rels))

        # nDCG@K
        hit_positions = [i + 1 for i, h in enumerate(hits) if h == 1]
        dcg = dcg_at_k(hit_positions)
        ideal_len = min(len(rels), k)
        idcg = dcg_at_k(list(range(1, ideal_len + 1))) if ideal_len > 0 else 1.0
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)

    p_at_k = float(np.mean(precisions)) if precisions else 0.0
    r_at_k = float(np.mean(recalls))    if recalls else 0.0
    ndcg_k = float(np.mean(ndcgs))      if ndcgs else 0.0

    # Return metrics as a DataFrame
    res = pd.DataFrame([{
        "model":      algo.__class__.__name__,
        "RMSE_CV":    round(rmse_cv, 4),
        "MAE_CV":     round(mae_cv, 4),
        f"P@{k}":     round(p_at_k, 4),
        f"R@{k}":     round(r_at_k, 4),
        f"nDCG@{k}":  round(ndcg_k, 4),
        # "users_evaluated": int(users_evaluated),
        # "rating_threshold": rating_threshold,
        # "K": k,
        # "cv": cv,
        # "test_size": test_size
    }])

    return res


# (optional) convenient runner for multiple models
def evaluate_many(algos, data, k=15, rating_threshold=4.0, cv=3, test_size=0.2, random_state=42):
    frames = []
    for algo in algos:
        res = evaluate_model(algo, data, k=k, rating_threshold=rating_threshold,
                             cv=cv, test_size=test_size, random_state=random_state)
        frames.append(res)
    return pd.concat(frames, ignore_index=True)



In [188]:
# ========================
# =======GridSearch=======


# ---------- 1) Algo class to name mapper
ALGO_CLASSES = {
    "SVD": SVD,
    "SVDpp": SVDpp,
    "NMF": NMF,
    "KNNBaseline": KNNBaseline,
    "BaselineOnly": BaselineOnly,
    "CoClustering": CoClustering,
}

# ---------- 2) Hyperparameter grid for each model
PARAM_GRIDS = {
    "SVD": {
        "n_factors": [50, 100, 150],
        "n_epochs": [20, 40],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.02, 0.1],
    },
    "SVDpp": {
        "n_factors": [50, 100],
        "n_epochs": [20, 40],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.02, 0.1],
    },
    "NMF": {
        "n_factors": [20, 50],
        "n_epochs": [50, 100],
        "reg_pu": [0.02, 0.06, 0.1],
        "reg_qi": [0.02, 0.06, 0.1],
    },
    "KNNBaseline": {
        "k": [20, 40, 80],
        "min_k": [1, 5],
        "sim_options": {
            "name": ["pearson_baseline", "cosine", "msd", "pearson"],
            "user_based": [False, True],  # item-based / user-based
        },
        "bsl_options": {
            "method": ["als"],
            "n_epochs": [5, 10],
            "reg_u": [8, 12],
            "reg_i": [4, 8],
        }
    },
    "BaselineOnly": {
        "bsl_options": {
            "method": ["als"],
            "n_epochs": [5, 10],
            "reg_u": [8, 12, 15],
            "reg_i": [4, 8, 12],
        }
    },
    "CoClustering": {
        "n_cltr_u": [3, 5, 10],
        "n_cltr_i": [3, 5, 10],
        "n_epochs": [20, 40],
    },
}

# ---------- 3) Utils for top-K metrics
def _topk_from_preds(preds, k=15):
    user_to_ranked = defaultdict(list)
    for uid, iid, true_r, est, _ in preds:
        user_to_ranked[uid].append((iid, est))
    for uid in user_to_ranked:
        user_to_ranked[uid].sort(key=lambda x: x[1], reverse=True)
        user_to_ranked[uid] = [iid for iid, _ in user_to_ranked[uid][:k]]
    return user_to_ranked

def _eval_topk(algo, data, k=15, rating_threshold=4.0, test_size=0.2, random_state=42):
    trainset, testset = train_test_split(data, test_size=test_size, random_state=random_state)
    algo.fit(trainset)

    anti = trainset.build_anti_testset()
    preds = algo.test(anti)
    topk = _topk_from_preds(preds, k=k)

    user_true = defaultdict(set)
    for uid, iid, r in testset:
        if r >= rating_threshold:
            user_true[uid].add(iid)

    precisions, recalls, ndcgs, users_eval = [], [], [], 0

    def dcg_at_k(pos):
        return sum(1 / np.log2(p + 1) for p in pos)

    for uid, recs in topk.items():
        rels = user_true.get(uid, set())
        if not rels:
            continue
        users_eval += 1
        hits = [1 if iid in rels else 0 for iid in recs]
        n_hits = sum(hits)

        precisions.append(n_hits / k)
        recalls.append(n_hits / len(rels))

        hit_pos = [i + 1 for i, h in enumerate(hits) if h == 1]
        dcg = dcg_at_k(hit_pos)
        idcg = dcg_at_k(range(1, min(len(rels), k) + 1))
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)

    return {
        "P@K": float(np.mean(precisions)) if precisions else 0.0,
        "R@K": float(np.mean(recalls)) if recalls else 0.0,
        "nDCG@K": float(np.mean(ndcgs)) if ndcgs else 0.0,
        "users_evaluated": int(users_eval),
    }

# ---------- 4) Main grid search function for a SINGLE model
def run_grid_search(name: str, data, measures=("rmse", "mae"), cv=3, n_jobs=-1, verbose=1):
    """Runs GridSearchCV for the given model `name` and returns (best_algo, gs),
    where best_algo is already trained on the full train set during the CV process."""
    if name not in ALGO_CLASSES:
        raise ValueError(f"Unknown algorithm '{name}'. Allowed: {list(ALGO_CLASSES.keys())}")

    AlgoClass = ALGO_CLASSES[name]
    param_grid = PARAM_GRIDS[name]

    gs = GridSearchCV(
        algo_class=AlgoClass,
        param_grid=param_grid,
        measures=measures,
        cv=cv,
        n_jobs=n_jobs,
        joblib_verbose=0
    )
    gs.fit(data)

    # By default, .best_estimator['rmse'] is the best model based on RMSE
    best_by = measures[0]
    best_algo = gs.best_estimator[best_by]

    if verbose:
        print(f"[{name}] Best {best_by.upper()}: {gs.best_score[best_by]:.4f}")
        print(f"[{name}] Best params ({best_by}): {gs.best_params[best_by]}")

        # Display other metrics if provided
        for m in measures[1:]:
            print(f"[{name}] Best {m.upper()}: {gs.best_score[m]:.4f}  (params: {gs.best_params[m]})")

    return best_algo, gs


# ---------- 5) Convenient wrapper: grid search + top-K metrics
def evaluate_with_grid(name: str, data, k=15, rating_threshold=4.0, cv=3, test_size=0.2, n_jobs=-1, random_state=42):
    # 5.1 Run GridSearchCV (RMSE/MAE CV)
    best_algo, gs = run_grid_search(name, data, measures=("rmse", "mae"), cv=cv, n_jobs=n_jobs, verbose=0)

    # Neatly print a short summary
    rmse_best = gs.best_score["rmse"]
    mae_best  = gs.best_score["mae"]
    params_rmse = gs.best_params["rmse"]

    # 5.2 Evaluate Top-K metrics on a separate split using the best model
    topk_res = _eval_topk(
        best_algo,
        data,
        k=k,
        rating_threshold=rating_threshold,
        test_size=test_size,
        random_state=random_state
    )


    row = {
        "model": name,
        "RMSE_CV(best)": round(rmse_best, 4),
        "MAE_CV(best)": round(mae_best, 4),
        f"P@{k}": round(topk_res["P@K"], 4),
        f"R@{k}": round(topk_res["R@K"], 4),
        f"nDCG@{k}": round(topk_res["nDCG@K"], 4),
        "users_evaluated": topk_res["users_evaluated"],
        "best_params_by_rmse": params_rmse,
    }
    return pd.DataFrame([row]), best_algo, gs


In [189]:
rank_metrics = precision_recall_ndcg_at_k(algo, train_df, test_df, K=K_TOP, pos_threshold=POS_TH)
print(f"Algo name {ALGO_NAME} best params", f"file processed: {users_data_path}")
print(f"[METRIC] RMSE={rmse:.4f}  MAE={mae:.4f}")
print(f"[METRIC] P@{K_TOP}={rank_metrics['precision_at_k']:.4f}  "
      f"R@{K_TOP}={rank_metrics['recall_at_k']:.4f}  "
      f"nDCG@{K_TOP}={rank_metrics['ndcg_at_k']:.4f}  "
      f"(users={rank_metrics['users_evaluated']})")

Algo name KNNBaseline best params file processed: ./data/users_final.csv
[METRIC] RMSE=1.3276  MAE=1.0134
[METRIC] P@15=0.0059  R@15=0.0791  nDCG@15=0.0408  (users=158)


In [190]:
from surprise import SVD, SVDpp, NMF, KNNBaseline, BaselineOnly, CoClustering

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[["user_id","yacht_id","rating"]], reader)

algos = [
    SVD(),
    SVDpp(),
    NMF(),
    KNNBaseline(),
    BaselineOnly(),
    CoClustering()
]

result_table = evaluate_many(algos, data, k=15, rating_threshold=3.5, cv=3, test_size=0.2)
print(result_table)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
          model  RMSE_CV  MAE_CV    P@15    R@15  nDCG@15
0           SVD   1.1969  0.9239  0.0017  0.0253   0.0103
1         SVDpp   1.2076  0.9281  0.0021  0.0316   0.0165
2           NMF   1.3836  0.9877  0.0013  0.0158   0.0045
3   KNNBaseline   1.3824  1.0359  0.0076  0.1076   0.0461
4  BaselineOnly   1.1801  0.9175  0.0021  0.0316   0.0099
5  CoClustering   1.4477  1.0403  0.0004  0.0063   0.0018


In [191]:
# 1) prepare Surprise data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[["user_id", "yacht_id", "rating"]], reader)

# 2) One model with Grid search + top-K metrics
table_knn, best_knn, gs_knn = evaluate_with_grid(
    "KNNBaseline", data, k=15, rating_threshold=4.0, cv=3, test_size=0.2, n_jobs=-1
)
print(table_knn)

# 3) Test on all models
models = ["SVD", "SVDpp", "NMF", "KNNBaseline", "BaselineOnly", "CoClustering"]
tables = []
for m in models:
    t, best_model, gs = evaluate_with_grid(m, data, k=15, rating_threshold=4.0, cv=3, test_size=0.2, n_jobs=-1)
    tables.append(t)
summary = pd.concat(tables, ignore_index=True)
print(summary)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
         model  ...                                best_params_by_rmse
0  KNNBaseline  ...  {'k': 20, 'min_k': 5, 'sim_options': {'name': ...

[1 rows x 8 columns]
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Comput

In [192]:
summary

Unnamed: 0,model,RMSE_CV(best),MAE_CV(best),P@15,R@15,nDCG@15,users_evaluated,best_params_by_rmse
0,SVD,1.1814,0.9158,0.0021,0.032,0.0108,125,"{'n_factors': 50, 'n_epochs': 20, 'lr_all': 0...."
1,SVDpp,1.1873,0.922,0.0016,0.024,0.0122,125,"{'n_factors': 50, 'n_epochs': 20, 'lr_all': 0...."
2,NMF,1.3647,0.9581,0.0005,0.008,0.0027,125,"{'n_factors': 50, 'n_epochs': 50, 'reg_pu': 0...."
3,KNNBaseline,1.1847,0.9193,0.0021,0.032,0.0097,125,"{'k': 20, 'min_k': 5, 'sim_options': {'name': ..."
4,BaselineOnly,1.1798,0.9168,0.0021,0.032,0.0095,125,"{'bsl_options': {'method': 'als', 'n_epochs': ..."
5,CoClustering,1.4425,1.0372,0.0,0.0,0.0,125,"{'n_cltr_u': 3, 'n_cltr_i': 3, 'n_epochs': 20}"


In [193]:
for _, row in summary.iterrows():
    print(f"\nModel: {row['model']}, {row['RMSE_CV(best)']}, {row['MAE_CV(best)']}, {row['P@15']}, {row['R@15']}, {row['nDCG@15']}")
    print(f"\nBest params: {row['best_params_by_rmse']}")


Model: SVD, 1.1814, 0.9158, 0.0021, 0.032, 0.0108

Best params: {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.02}

Model: SVDpp, 1.1873, 0.922, 0.0016, 0.024, 0.0122

Best params: {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.1}

Model: NMF, 1.3647, 0.9581, 0.0005, 0.008, 0.0027

Best params: {'n_factors': 50, 'n_epochs': 50, 'reg_pu': 0.1, 'reg_qi': 0.06}

Model: KNNBaseline, 1.1847, 0.9193, 0.0021, 0.032, 0.0097

Best params: {'k': 20, 'min_k': 5, 'sim_options': {'name': 'pearson', 'user_based': False}, 'bsl_options': {'method': 'als', 'n_epochs': 10, 'reg_u': 12, 'reg_i': 8}}

Model: BaselineOnly, 1.1798, 0.9168, 0.0021, 0.032, 0.0095

Best params: {'bsl_options': {'method': 'als', 'n_epochs': 5, 'reg_u': 15, 'reg_i': 8}}

Model: CoClustering, 1.4425, 1.0372, 0.0, 0.0, 0.0

Best params: {'n_cltr_u': 3, 'n_cltr_i': 3, 'n_epochs': 20}


In [194]:
import itertools
import numpy as np
import pandas as pd
from collections import defaultdict

from surprise import SVD, SVDpp, NMF, KNNBaseline, BaselineOnly, CoClustering
from surprise.model_selection import KFold

# ===== 1) Utils for Top-K Grid Search
def _topk_from_preds(preds, k=15):
    user_to_ranked = defaultdict(list)
    for uid, iid, true_r, est, _ in preds:
        user_to_ranked[uid].append((iid, est))
    for uid in user_to_ranked:
        user_to_ranked[uid].sort(key=lambda x: x[1], reverse=True)
        user_to_ranked[uid] = [iid for iid, _ in user_to_ranked[uid][:k]]
    return user_to_ranked

def _eval_topk_on_fold(algo, trainset, testset, k=15, rating_threshold=4.0):
    """Evaluation of P@K, R@K, and nDCG@K on a single fold (the trainset is already built, and the testset is a list of triplets)."""
    algo.fit(trainset)
    anti = trainset.build_anti_testset()
    preds = algo.test(anti)
    topk = _topk_from_preds(preds, k=k)

    # Ground truth from test fold
    user_true = defaultdict(set)
    for uid, iid, r in testset:
        if r >= rating_threshold:
            user_true[uid].add(iid)

    def dcg_at_k(pos_list):
        return sum(1 / np.log2(p + 1) for p in pos_list)

    precisions, recalls, ndcgs, users_eval = [], [], [], 0
    for uid, recs in topk.items():
        rels = user_true.get(uid, set())
        if not rels:
            continue
        users_eval += 1
        hits = [1 if iid in rels else 0 for iid in recs]
        n_hits = sum(hits)

        precisions.append(n_hits / k)
        recalls.append(n_hits / len(rels))

        hit_pos = [i + 1 for i, h in enumerate(hits) if h == 1]
        dcg = dcg_at_k(hit_pos)
        idcg = dcg_at_k(range(1, min(len(rels), k) + 1))
        ndcgs.append((dcg / idcg) if idcg > 0 else 0.0)

    # average in this fold users
    return {
        "P@K": float(np.mean(precisions)) if precisions else 0.0,
        "R@K": float(np.mean(recalls)) if recalls else 0.0,
        "nDCG@K": float(np.mean(ndcgs)) if ndcgs else 0.0,
        "users_evaluated": int(users_eval),
    }

# ===== 2) Combination generator (supports nested dicts like in sim_options, bsl_options)
def _product_dict(d):
    """
    Accepts a dict where the values are lists or nested dicts with lists.
    Returns an iterator of all possible parameter combinations (including nested ones).
    """
    # Separate simple and nested keys.
    simple_items = {k: v for k, v in d.items() if not isinstance(v, dict)}
    nested_items = {k: v for k, v in d.items() if isinstance(v, dict)}

    # Simple keys: assume each value is a list of possible values.
    simple_keys = list(simple_items.keys())
    simple_vals = [simple_items[k] if isinstance(simple_items[k], list) else [simple_items[k]] for k in simple_keys]
    simple_product = list(itertools.product(*simple_vals)) if simple_keys else [()]

    # Nested keys: recursively generate combinations.
    nested_keys = list(nested_items.keys())
    nested_grids = []
    for k in nested_keys:
        # In nested_items[k], all values must be lists, so generate their combinations.
        sub_keys = list(nested_items[k].keys())
        sub_vals = [nested_items[k][sk] for sk in sub_keys]
        for combo in itertools.product(*sub_vals):
            nested_grids.append({k: dict(zip(sub_keys, combo))})
    if not nested_keys:
        nested_grids = [dict()]

    # Finally, combine the simple and nested parts.
    for sp in simple_product:
        base = dict(zip(simple_keys, sp)) if simple_keys else {}
        for ng in nested_grids:
            out = base.copy()
            out.update(ng)
            yield out

# ===== 3) Custom Grid Search with Top-K metrics
ALGO_CLASSES = {
    "SVD": SVD,
    "SVDpp": SVDpp,
    "NMF": NMF,
    "KNNBaseline": KNNBaseline,
    "BaselineOnly": BaselineOnly,
    "CoClustering": CoClustering,
}

def gridsearch_topk(
    name: str,
    data,
    param_grid: dict,
    target: str = "nDCG@K",  # або "P@K", "R@K"
    k: int = 15,
    rating_threshold: float = 4.0,
    cv: int = 3,
    random_state: int = 42,
    verbose: int = 1,
):
    """
    Iterates over the parameter grid of the Surprise model and selects the configuration with the best average Top-K metric across K-folds.
    """
    if name not in ALGO_CLASSES:
        raise ValueError(f"Unknown algo '{name}'. Options: {list(ALGO_CLASSES.keys())}")

    AlgoClass = ALGO_CLASSES[name]
    kf = KFold(n_splits=cv, random_state=random_state, shuffle=True)

    best_score = -np.inf
    best_params = None
    history = []

    for params in _product_dict(param_grid):
        fold_scores = []
        for trainset, testset in kf.split(data):
            algo = AlgoClass(**params)
            scores = _eval_topk_on_fold(algo, trainset, testset, k=k, rating_threshold=rating_threshold)
            fold_scores.append(scores)

        # average in folds
        mean_scores = {m: float(np.mean([fs[m] for fs in fold_scores])) for m in fold_scores[0].keys()}
        history.append({"params": params, **mean_scores})

        score = mean_scores[target]
        if score > best_score:
            best_score = score
            best_params = params
            if verbose:
                print(f"[{name}] New best {target}={best_score:.4f} with params={best_params}")


    return {
        "model": name,
        "target": target,
        "best_score": round(best_score, 4),
        "best_params": best_params,
        "cv_history": pd.DataFrame(history).sort_values(target, ascending=False).reset_index(drop=True),
    }


In [195]:
# algo parameters for grid search
grid_knn = {
    "k": [40, 80],
    "min_k": [1, 5],
    "sim_options": {
        "name": ["pearson_baseline", "cosine"],
        "user_based": [False, True],
    },
    "bsl_options": {
        "method": ["als"],
        "n_epochs": [5, 10],
        "reg_u": [8, 12],
        "reg_i": [4, 8],
    }
}

# data — this is Surprise Dataset (Dataset.load_from_df(...))
res = gridsearch_topk(
    name="KNNBaseline",
    data=data,
    param_grid=grid_knn,
    target="R@K",       # or "P@K" / "R@K" / "nDCG@K"
    k=15,
    rating_threshold=4.0,
    cv=3,
    random_state=42,
    verbose=1
)

print("Best score R@K:", res["best_score"])
print("Best params:", res["best_params"])
# history table:
print(res["cv_history"].head())


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
[KNNBaseline] New best R@K=0.0872 with params={'k': 40, 'min_k': 1, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
[KNNBaseline] New best R@K=0.0889 with params={'k': 40, 'min_k': 1, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}}
Estimating biases using als...


In [196]:
# Create and train KNNBaseline algo with the best params
best_knn_base = KNNBaseline(**res["best_params"])
full_train = data.build_full_trainset()
best_knn_base.fit(full_train)

# Evaluate: RMSE/MAE (pointwise)
best_knn_base.fit(trainset)
best_knn_base_preds = best_knn_base.test(testset)
best_knn_base_rmse = accuracy.rmse(best_knn_base_preds, verbose=False)
best_knn_base_mae  = accuracy.mae(best_knn_base_preds, verbose=False)

best_knn_base_metrics = precision_recall_ndcg_at_k(best_knn_base, train_df, test_df, K=K_TOP, pos_threshold=POS_TH)
print("=======================================================================================")
print(f"Algo name KNNBaseline", "Best R@15:", res["best_score"])
print("Best params:", res["best_params"])
print(f"Best params [METRIC] RMSE={best_knn_base_rmse:.4f}  MAE={best_knn_base_mae:.4f}")
print(f"Best params [METRIC] P@{K_TOP}={best_knn_base_metrics['precision_at_k']:.4f}  "
      f"R@{K_TOP}={best_knn_base_metrics['recall_at_k']:.4f}  "
      f"nDCG@{K_TOP}={best_knn_base_metrics['ndcg_at_k']:.4f}  "
      f"(users={best_knn_base_metrics['users_evaluated']})")
print("=======================================================================================")

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Algo name KNNBaseline Best R@15: 0.1502
Best params: {'k': 40, 'min_k': 1, 'sim_options': {'name': 'cosine', 'user_based': True}}
Best params [METRIC] RMSE=1.4144  MAE=1.0726
Best params [METRIC] P@15=0.0076  R@15=0.1044  nDCG@15=0.0483  (users=158)


In [197]:
grid_svd = {
    "n_factors": [50, 100, 150],
    "n_epochs":  [20, 40],
    "lr_all":    [0.002, 0.005],
    "reg_all":   [0.02, 0.1],
    # "biased":  [True, False],  # optional
}

res_svd = gridsearch_topk(
    name="SVD",
    data=data,
    param_grid=grid_svd,
    target="R@K",  # or "P@K" / "R@K" / "nDCG@K"
    k=15,
    rating_threshold=4.0,
    cv=3,
    random_state=42,
    verbose=1
)

print("Best nDCG@15:", res_svd["best_score"])
print("Best params:", res_svd["best_params"])
# review history:
res_svd["cv_history"].head()

[SVD] New best R@K=0.0406 with params={'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.02}
[SVD] New best R@K=0.0426 with params={'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.1}
[SVD] New best R@K=0.0479 with params={'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}
[SVD] New best R@K=0.0488 with params={'n_factors': 50, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.02}
[SVD] New best R@K=0.0530 with params={'n_factors': 100, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.1}
Best nDCG@15: 0.053
Best params: {'n_factors': 100, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.1}


Unnamed: 0,params,P@K,R@K,nDCG@K,users_evaluated
0,"{'n_factors': 100, 'n_epochs': 40, 'lr_all': 0...",0.003786,0.05299,0.021687,222.0
1,"{'n_factors': 50, 'n_epochs': 40, 'lr_all': 0....",0.003299,0.048804,0.018706,222.0
2,"{'n_factors': 150, 'n_epochs': 40, 'lr_all': 0...",0.00327,0.048141,0.018957,222.0
3,"{'n_factors': 50, 'n_epochs': 20, 'lr_all': 0....",0.003394,0.04788,0.019425,222.0
4,"{'n_factors': 150, 'n_epochs': 40, 'lr_all': 0...",0.003121,0.045448,0.019022,222.0


In [198]:
# Create and train SVD algo with the best params
best_svd = SVD(**res_svd["best_params"])
full_train = data.build_full_trainset()
best_svd.fit(full_train)

# Evaluate: RMSE/MAE (pointwise)
best_svd.fit(trainset)
best_svd_preds = best_svd.test(testset)
best_svd_rmse = accuracy.rmse(best_svd_preds, verbose=False)
best_svd_mae  = accuracy.mae(best_svd_preds, verbose=False)
print(f"[METRIC] RMSE={best_svd_rmse:.4f}  MAE={best_svd_mae:.4f}")

best_svd_rank_metrics = precision_recall_ndcg_at_k(best_svd, train_df, test_df, K=K_TOP, pos_threshold=POS_TH)
print("=======================================================================================")
print(f"Algo name SVD  best params", "Best R@15:", res_svd["best_score"])
print("Best params:", res_svd["best_params"])
print(f"Best params [METRIC] RMSE={best_svd_rmse:.4f}  MAE={best_svd_mae:.4f}")
print(f"Best params [METRIC] P@{K_TOP}={best_svd_rank_metrics['precision_at_k']:.4f}  "
      f"R@{K_TOP}={best_svd_rank_metrics['recall_at_k']:.4f}  "
      f"nDCG@{K_TOP}={best_svd_rank_metrics['ndcg_at_k']:.4f}  "
      f"(users={best_svd_rank_metrics['users_evaluated']})")
print("=======================================================================================")

[METRIC] RMSE=1.2045  MAE=0.9290
Algo name SVD  best params Best R@15: 0.053
Best params: {'n_factors': 100, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.1}
Best params [METRIC] RMSE=1.2045  MAE=0.9290
Best params [METRIC] P@15=0.0025  R@15=0.0380  nDCG@15=0.0128  (users=158)


In [199]:
=================== FINISH TESTING ================================

SyntaxError: invalid syntax (2379835986.py, line 1)