In [856]:
import os
import json
import pickle
import numpy as np
import pandas as pd
from datetime import datetime, timezone

In [857]:
from sqlalchemy import create_engine, text, bindparam
from sqlalchemy.dialects.postgresql import UUID, ARRAY
from dotenv import load_dotenv
from surprise import Dataset, Reader, SVD, SVDpp, accuracy
from surprise.model_selection import train_test_split

In [858]:
load_dotenv()

True

In [859]:
# --- Environment & DB connection ------------------------------------------------
DATABASE_USERNAME = os.environ.get('DATABASE_USERNAME')
DATABASE_PASSWORD = os.environ.get('DATABASE_PASSWORD')
DATABASE_HOST = os.environ.get('DATABASE_HOST')
DATABASE_PORT = os.environ.get('DATABASE_PORT')
DATABASE_NAME = os.environ.get('DATABASE_NAME')

In [860]:
# Build Postgres DSN and create a SQLAlchemy engine
PG_DSN = f"postgresql+psycopg2://{DATABASE_USERNAME}:{DATABASE_PASSWORD}@{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_NAME}"
engine = create_engine(PG_DSN, pool_pre_ping=True)

In [861]:
# --- Aggregation parameters (keep in sync with your MV if you use one) ----------
HALF_LIFE_DAYS = 90          # time-decay half-life in days (fresh events weigh more)
VIEW_DEBOUNCE_SECONDS = 30   # debounce window for 'view' events from same user on same yacht
LOOKBACK_DAYS = 365 * 2      # history window for events (e.g., last 2 years)
DAY_CAP = 20.0               # per-(user,yacht,day) cap to prevent single session flooding

-----------------------------
--- DATA PREPARATION --------
-----------------------------

In [862]:
# --- 1) Load base tables from csv file --------------------------------------------------------
# Users: only need id and role to keep 'lessee' events
users_data_path="./data/users.csv"
events_data_path="./data/events.csv"
yachts_data_path="./data/yachts.csv"

df_users_full = pd.read_csv(users_data_path)
print(df_users_full.shape)
df_users_full.head()

(501, 11)


Unnamed: 0,id,email,password_hash,country,role,sailingExp,budgetMin,budgetMax,has_skipper_licence,createdAt,updatedAt
0,76c4daf8-bccc-4d53-b36f-453b7f970c42,npowell@example.net,b71ea608d518a792a0b84b247f41cd3d7c66a0005266c2...,France,lessee,beginner,153000,223000,False,2025-07-06 16:04:13.883506,2025-07-06 16:04:13.883506
1,ff210a03-d01e-49a5-9050-284d1d94490a,david91@example.net,9e606ff129c6f451d6c66bedbbd76c8e87b208edf57505...,Spain,lesser,pro,331000,361000,True,2025-08-09 11:12:41.202476,2025-08-09 11:12:41.202476
2,9ec5d781-0848-49fa-bb5c-cc554d78a156,jackiechase@example.com,fbc06bd3205e292650fce8d5850d4c5bb848b3bef14f86...,UAE,lessee,beginner,303000,339000,True,2025-06-14 03:11:58.232374,2025-06-14 03:11:58.232374
3,d509045d-3486-4d4e-81d5-2751d26b1f44,drichardson@example.net,afcaa21d016037d9bc4fad753374fc749d05a97d64c3db...,France,lessee,beginner,319000,406000,True,2025-02-01 08:38:32.151975,2025-02-01 08:38:32.151975
4,ec3bee5f-46c3-456a-b9fe-96cdc13807e9,karen22@example.org,bc2df28ca7d1dceccdd52b5dbab09aae04c39ba81d3722...,UAE,lessee,intermediate,159000,160000,True,2025-09-08 10:43:56.885602,2025-09-08 10:43:56.885602


In [863]:
# select only necessary columns
df_users=df_users_full[["id","role"]]
print(df_users.shape)
df_users.head()

(501, 2)


Unnamed: 0,id,role
0,76c4daf8-bccc-4d53-b36f-453b7f970c42,lessee
1,ff210a03-d01e-49a5-9050-284d1d94490a,lesser
2,9ec5d781-0848-49fa-bb5c-cc554d78a156,lessee
3,d509045d-3486-4d4e-81d5-2751d26b1f44,lessee
4,ec3bee5f-46c3-456a-b9fe-96cdc13807e9,lessee


In [864]:
df_users.dtypes

id      object
role    object
dtype: object

In [865]:
df_users = df_users.astype({
    'id': 'string',
    'role': 'string',
})
df_users.dtypes

id      string[python]
role    string[python]
dtype: object

In [866]:
# Yachts: (owner_id used to drop self-interactions)
df_yachts_full = pd.read_csv(yachts_data_path)
print(df_yachts_full.shape)
df_yachts_full.head()

(2544, 22)


Unnamed: 0,name,type,guests,cabins,crew,length,year,model,summerLowSeasonPrice,summerHighSeasonPrice,winterLowSeasonPrice,winterHighSeasonPrice,description,rating,baseMarina,country,photos,userId,createdAt,updatedAt,id,similarYachts
0,AOIBH,Catamarans,8,4,3,17.26,2014,Custom,3286.0,4071.0,3286.0,3286.0,"The 17.26m/56'8"" catamaran yacht 'Aoibh' by th...",4.0,Corsica,France,{https://pub-59edec60055841149d71125f2e73e658....,ff210a03-d01e-49a5-9050-284d1d94490a,2025-10-25 16:58:48.579 +0300,2025-10-25 16:58:48.579 +0300,9149396e-5737-4ae6-a29e-9129aafe9059,"{074e1237-62ec-43fc-9245-98da440b38f2,51196bda..."
1,CHARLATAN,Sailing Yachts,6,3,3,26.7,2019,Jongert 2700M,4571.0,5571.0,4571.0,4571.0,"The 26.7m/87'7"" sail yacht 'Charlatan' by ship...",4.6,Ibiza,Spain,{https://pub-59edec60055841149d71125f2e73e658....,ff210a03-d01e-49a5-9050-284d1d94490a,2025-10-25 16:58:48.579 +0300,2025-10-25 16:58:48.579 +0300,5d09a41f-4f28-4396-afcd-0d308c25efec,"{908ffa86-06af-4635-8c0c-156ea5c604d1,be3dbcd8..."
2,ABELY,Motor Yachts,10,4,5,32.99,2024,Commuter 108,7500.0,9143.0,7500.0,9143.0,"The 32.5m/106'8"" motor yacht 'Abely' (ex. Abel...",4.5,Ibiza,Spain,{https://pub-59edec60055841149d71125f2e73e658....,ff210a03-d01e-49a5-9050-284d1d94490a,2025-10-25 16:58:48.579 +0300,2025-10-25 16:58:48.579 +0300,bdfa3bb0-9a39-47b8-ab00-09930bfb7d2b,"{20389ed1-9672-4318-97b8-4ff67898dff2,2a26d89c..."
3,AEGEAN CLIPPER,Gulet Yachts,22,11,5,41.0,2019,Custom,3643.0,4429.0,3643.0,4429.0,"The 41m/134'6"" 'Aegean Clipper' gulet yacht bu...",4.1,Amalfi Coast,Italy,"{""https://pub-59edec60055841149d71125f2e73e658...",ff210a03-d01e-49a5-9050-284d1d94490a,2025-10-25 16:58:48.579 +0300,2025-10-25 16:58:48.579 +0300,b8cdc6f5-72d3-4ab7-87a8-862675b40a34,"{34055a6e-189d-493d-90e2-db5b6526f8a9,689e143b..."
4,AERO,Motor Yachts,10,5,7,39.6,2023,CRN 128,22000.0,24286.0,22786.0,26071.0,"The 39.6m/129'11"" motor yacht 'Aero' by the It...",4.8,Amalfi Coast,Italy,{https://pub-59edec60055841149d71125f2e73e658....,ff210a03-d01e-49a5-9050-284d1d94490a,2025-10-25 16:58:48.579 +0300,2025-10-25 16:58:48.579 +0300,e268e2ec-31c7-4c74-8a31-a6ca46a9de88,"{9a504274-f2c9-4b59-b921-94e5cd480745,c036e89a..."


In [867]:
# select only necessary columns
df_yachts=df_yachts_full[["id","userId"]]
df_yachts=df_yachts.rename(columns={"id": "yacht_id", "userId": "owner_id"}, errors="raise")
print(df_yachts.shape)
df_yachts.head()

(2544, 2)


Unnamed: 0,yacht_id,owner_id
0,9149396e-5737-4ae6-a29e-9129aafe9059,ff210a03-d01e-49a5-9050-284d1d94490a
1,5d09a41f-4f28-4396-afcd-0d308c25efec,ff210a03-d01e-49a5-9050-284d1d94490a
2,bdfa3bb0-9a39-47b8-ab00-09930bfb7d2b,ff210a03-d01e-49a5-9050-284d1d94490a
3,b8cdc6f5-72d3-4ab7-87a8-862675b40a34,ff210a03-d01e-49a5-9050-284d1d94490a
4,e268e2ec-31c7-4c74-8a31-a6ca46a9de88,ff210a03-d01e-49a5-9050-284d1d94490a


In [868]:
df_yachts = df_yachts.astype({
    'yacht_id': 'string',
    'owner_id': 'string',
})
df_users.dtypes

id      string[python]
role    string[python]
dtype: object

In [869]:
# Events within the lookback window 
df_events = pd.read_csv(events_data_path)

# Optional guard: if there are no events, stop early to avoid errors downstream
if df_events.empty:
    raise ValueError("No events found within the lookback window. Adjust LOOKBACK_DAYS or check data.")
    
df_events.drop(["updatedAt"], axis=1, inplace=True)
df_events=df_events.rename(columns={ "userId": "user_id", "yachtId": "yacht_id",  "createdAt": "ts"}, errors="raise")   
print(df_events.shape)
df_events.head()

(7498, 6)


Unnamed: 0,id,user_id,yacht_id,type,weight,ts
0,5987d489-aa3c-4d0a-9383-58ac6c04c594,3d2ed4da-6d82-4f44-a7c7-e8dcdb6332d9,eb6ec706-f623-447c-a09c-3163d8756a16,view,2,2025-01-03 08:54:58.142869
1,b0d83b1c-d7f4-4f20-9399-9546d56d09bf,048dfc3f-1f97-4e3c-b253-d285df84cfa3,da047c3e-e015-4eab-be1d-ee6f82ac69d8,view,2,2025-01-13 05:26:38.734072
2,f804c3fc-c250-4d9d-84b6-74e58b08a9e9,93eed134-3787-49c5-813c-42d3815f64c4,c2dbcb9c-708f-473e-8d1e-a25409c94613,wishlist,4,2025-01-14 05:12:31.460362
3,f2ff3f9b-64de-4233-8962-85cd9cd72df3,17c49940-92a5-4da4-8a1e-101089988f7a,7d5f1f33-b4ce-474c-bcf7-ceb25b249085,view,2,2025-01-14 08:50:46.464679
4,bb03d9af-8d6e-45e2-a898-b004fa5ac07c,3d2ed4da-6d82-4f44-a7c7-e8dcdb6332d9,f4ca2c49-50a0-4afd-b88a-cadf35f95354,view,2,2025-01-14 11:33:54.319845


In [870]:
df_events.dtypes

id          object
user_id     object
yacht_id    object
type        object
weight       int64
ts          object
dtype: object

In [871]:
df_events = df_events.astype({
    'ts': 'datetime64[ns]',
    'user_id': 'string',
    'yacht_id': 'string',
    'type': 'string',
})
df_events.dtypes

id                  object
user_id     string[python]
yacht_id    string[python]
type        string[python]
weight               int64
ts          datetime64[ns]
dtype: object

In [872]:
df_book=df_events[df_events["type"] == "book"]
print(df_book.shape)
df_book.head()

(62, 6)


Unnamed: 0,id,user_id,yacht_id,type,weight,ts
383,83e6f3c5-ea91-440c-b949-9c4c32ec571c,93eed134-3787-49c5-813c-42d3815f64c4,c2dbcb9c-708f-473e-8d1e-a25409c94613,book,10,2025-03-30 23:44:33.965397
447,13ddc8dc-07e6-4a35-8c55-b2d0347acbbe,3d2ccbe6-709a-4819-b7a3-005cdf0b97f6,1245afe4-52ec-4e34-b9b3-d54ec3f8df98,book,10,2025-04-07 11:13:47.853931
529,2adc479c-791a-48cf-bd35-7ffe592d7a7c,17c49940-92a5-4da4-8a1e-101089988f7a,3cc2b398-5285-4c31-b977-29aafb178876,book,10,2025-04-15 15:06:56.573610
587,8bed84c5-399e-4132-8d62-09772bf34087,82822e92-be7d-48c1-a43f-8cb4dbda0547,f5a91437-dfc0-4aee-989f-a3c107691f5a,book,10,2025-04-20 20:03:12.958206
681,b36c378d-0a74-4de0-8c7f-aa3470e964e3,79e5f0a2-99ef-43eb-afa2-f49b797c44ef,eeac1abb-a3d0-4a7f-ad9d-d0eb45fbab9e,book,10,2025-04-28 19:24:29.779999


In [873]:
booked_yacht_ids = df_book["yacht_id"].unique().tolist()
len(booked_yacht_ids)

42

In [874]:
# --- 2) Keep only 'lessee' events ----------------------------------------------
# Join user role and filter to renter role (events from admins/owners are not demand signals)
df_events = df_events.merge(df_users, left_on="user_id", right_on="id", how="left", suffixes=("", "_u"))
df_events = df_events[df_events["role"] == "lessee"].drop(columns=["id_u", "role"])


# --- 3) Remove interactions with user's own yachts ------------------------------
# (Owners interacting with their listings would bias the model)
df_events = df_events.merge(df_yachts, on="yacht_id", how="left")
df_events = df_events[df_events["owner_id"].astype(str) != df_events["user_id"].astype(str)]

# --- 4) Debounce frequent 'view' events ----------------------------------------
# Sort and drop near-duplicate 'view' events per (user, yacht) inside the debounce window
df_events = df_events.sort_values(["user_id", "yacht_id", "type", "ts"])
mask = ~(
    (df_events["type"] == "view") &
    (df_events.groupby(["user_id", "yacht_id", "type"])["ts"]
        .diff()
        .dt.total_seconds()
        .between(0, VIEW_DEBOUNCE_SECONDS, inclusive="both"))
)
df_events = df_events[mask]

print(df_events.shape)
df_events.head()

(7497, 7)


Unnamed: 0,id,user_id,yacht_id,type,weight,ts,owner_id
1595,ab3e4482-7b5d-44cb-9d5f-ba60adeeb106,00326c79-0812-4be1-b146-1cf8768ff49b,836980da-940a-4caa-a7f6-c11d7682d329,view,2,2025-06-24 15:06:24.192429,ff210a03-d01e-49a5-9050-284d1d94490a
2712,b487152a-22be-49b7-b123-f382c6534bce,00326c79-0812-4be1-b146-1cf8768ff49b,836980da-940a-4caa-a7f6-c11d7682d329,view,2,2025-08-05 09:10:16.014100,ff210a03-d01e-49a5-9050-284d1d94490a
5071,fb579e07-0139-4fa5-b302-361bbfa64d23,00326c79-0812-4be1-b146-1cf8768ff49b,c2dbcb9c-708f-473e-8d1e-a25409c94613,view,2,2025-10-01 16:05:14.780353,ff210a03-d01e-49a5-9050-284d1d94490a
3070,17e04f1e-7fd9-413b-9876-ad9d3b8d5fe5,00326c79-0812-4be1-b146-1cf8768ff49b,c98bbcf8-8282-465f-b9ea-eb8ad17b8873,view,2,2025-08-17 01:09:13.546338,ff210a03-d01e-49a5-9050-284d1d94490a
6000,4eb89390-3d36-4155-b3ac-bc5b6ae6d9be,00326c79-0812-4be1-b146-1cf8768ff49b,c98bbcf8-8282-465f-b9ea-eb8ad17b8873,wishlist,4,2025-10-16 08:32:40.635341,ff210a03-d01e-49a5-9050-284d1d94490a


In [875]:
# Calculate cutoff date
cutoff_date = pd.Timestamp.now() - pd.Timedelta(days=LOOKBACK_DAYS)
df_events = df_events[df_events['ts'] >= cutoff_date].reset_index(drop=True)
print(df_events.shape)
df_events.head()

(7497, 7)


Unnamed: 0,id,user_id,yacht_id,type,weight,ts,owner_id
0,ab3e4482-7b5d-44cb-9d5f-ba60adeeb106,00326c79-0812-4be1-b146-1cf8768ff49b,836980da-940a-4caa-a7f6-c11d7682d329,view,2,2025-06-24 15:06:24.192429,ff210a03-d01e-49a5-9050-284d1d94490a
1,b487152a-22be-49b7-b123-f382c6534bce,00326c79-0812-4be1-b146-1cf8768ff49b,836980da-940a-4caa-a7f6-c11d7682d329,view,2,2025-08-05 09:10:16.014100,ff210a03-d01e-49a5-9050-284d1d94490a
2,fb579e07-0139-4fa5-b302-361bbfa64d23,00326c79-0812-4be1-b146-1cf8768ff49b,c2dbcb9c-708f-473e-8d1e-a25409c94613,view,2,2025-10-01 16:05:14.780353,ff210a03-d01e-49a5-9050-284d1d94490a
3,17e04f1e-7fd9-413b-9876-ad9d3b8d5fe5,00326c79-0812-4be1-b146-1cf8768ff49b,c98bbcf8-8282-465f-b9ea-eb8ad17b8873,view,2,2025-08-17 01:09:13.546338,ff210a03-d01e-49a5-9050-284d1d94490a
4,4eb89390-3d36-4155-b3ac-bc5b6ae6d9be,00326c79-0812-4be1-b146-1cf8768ff49b,c98bbcf8-8282-465f-b9ea-eb8ad17b8873,wishlist,4,2025-10-16 08:32:40.635341,ff210a03-d01e-49a5-9050-284d1d94490a


In [876]:
# --- 5) Time-decay weighting ----------------------------------------------------
# Compute exponentially decayed weight: eff_w = weight * exp(-days_ago / half_life)
now = pd.Timestamp.utcnow()
days_ago = (now - pd.to_datetime(df_events["ts"], utc=True)).dt.total_seconds() / 86400.0
print("days_ago",days_ago)
decay = np.exp(-days_ago / HALF_LIFE_DAYS)
# decay=1
df_events["eff_w"] = df_events["weight"].astype(float) * decay

# --- 6) Daily cap to limit flood from single sessions ---------------------------
# Aggregate by day, cap the daily contribution, then sum across days
df_events["d"] = pd.to_datetime(df_events["ts"], utc=True).dt.floor("D")
print(df_events.shape)
df_events.head()

days_ago 0       142.260201
1       100.507518
2        43.219337
3        88.841574
4        28.533622
           ...    
7492     88.311790
7493     86.722079
7494     68.348991
7495     51.165920
7496     39.604990
Name: ts, Length: 7497, dtype: float64
(7497, 9)


Unnamed: 0,id,user_id,yacht_id,type,weight,ts,owner_id,eff_w,d
0,ab3e4482-7b5d-44cb-9d5f-ba60adeeb106,00326c79-0812-4be1-b146-1cf8768ff49b,836980da-940a-4caa-a7f6-c11d7682d329,view,2,2025-06-24 15:06:24.192429,ff210a03-d01e-49a5-9050-284d1d94490a,0.411675,2025-06-24 00:00:00+00:00
1,b487152a-22be-49b7-b123-f382c6534bce,00326c79-0812-4be1-b146-1cf8768ff49b,836980da-940a-4caa-a7f6-c11d7682d329,view,2,2025-08-05 09:10:16.014100,ff210a03-d01e-49a5-9050-284d1d94490a,0.654684,2025-08-05 00:00:00+00:00
2,fb579e07-0139-4fa5-b302-361bbfa64d23,00326c79-0812-4be1-b146-1cf8768ff49b,c2dbcb9c-708f-473e-8d1e-a25409c94613,view,2,2025-10-01 16:05:14.780353,ff210a03-d01e-49a5-9050-284d1d94490a,1.237301,2025-10-01 00:00:00+00:00
3,17e04f1e-7fd9-413b-9876-ad9d3b8d5fe5,00326c79-0812-4be1-b146-1cf8768ff49b,c98bbcf8-8282-465f-b9ea-eb8ad17b8873,view,2,2025-08-17 01:09:13.546338,ff210a03-d01e-49a5-9050-284d1d94490a,0.74529,2025-08-17 00:00:00+00:00
4,4eb89390-3d36-4155-b3ac-bc5b6ae6d9be,00326c79-0812-4be1-b146-1cf8768ff49b,c98bbcf8-8282-465f-b9ea-eb8ad17b8873,wishlist,4,2025-10-16 08:32:40.635341,ff210a03-d01e-49a5-9050-284d1d94490a,2.913206,2025-10-16 00:00:00+00:00


In [877]:
daily = (
    df_events
    .groupby(["user_id", "yacht_id", "d"], as_index=False)["eff_w"]
    .sum()
)
daily["day_sum"] = daily["eff_w"].clip(upper=DAY_CAP)

daily.head()

Unnamed: 0,user_id,yacht_id,d,eff_w,day_sum
0,00326c79-0812-4be1-b146-1cf8768ff49b,836980da-940a-4caa-a7f6-c11d7682d329,2025-06-24 00:00:00+00:00,0.411675,0.411675
1,00326c79-0812-4be1-b146-1cf8768ff49b,836980da-940a-4caa-a7f6-c11d7682d329,2025-08-05 00:00:00+00:00,0.654684,0.654684
2,00326c79-0812-4be1-b146-1cf8768ff49b,c2dbcb9c-708f-473e-8d1e-a25409c94613,2025-10-01 00:00:00+00:00,1.237301,1.237301
3,00326c79-0812-4be1-b146-1cf8768ff49b,c98bbcf8-8282-465f-b9ea-eb8ad17b8873,2025-08-17 00:00:00+00:00,0.74529,0.74529
4,00326c79-0812-4be1-b146-1cf8768ff49b,c98bbcf8-8282-465f-b9ea-eb8ad17b8873,2025-10-16 00:00:00+00:00,2.913206,2.913206


In [878]:
daily.describe()

Unnamed: 0,eff_w,day_sum
count,7282.0,7282.0
mean,1.236486,1.236486
std,1.083642,1.083642
min,0.060719,0.060719
25%,0.52803,0.52803
50%,1.035759,1.035759
75%,1.535764,1.535764
max,13.596617,13.596617


In [879]:
scores = (
    daily
    .groupby(["user_id", "yacht_id"], as_index=False)
    .agg(score=("day_sum", "sum"), latest_ts=("d", "max"))
)
print(scores.shape)
scores.head()


(3327, 4)


Unnamed: 0,user_id,yacht_id,score,latest_ts
0,00326c79-0812-4be1-b146-1cf8768ff49b,836980da-940a-4caa-a7f6-c11d7682d329,1.066358,2025-08-05 00:00:00+00:00
1,00326c79-0812-4be1-b146-1cf8768ff49b,c2dbcb9c-708f-473e-8d1e-a25409c94613,1.237301,2025-10-01 00:00:00+00:00
2,00326c79-0812-4be1-b146-1cf8768ff49b,c98bbcf8-8282-465f-b9ea-eb8ad17b8873,3.658496,2025-10-16 00:00:00+00:00
3,00326c79-0812-4be1-b146-1cf8768ff49b,da047c3e-e015-4eab-be1d-ee6f82ac69d8,4.86792,2025-10-10 00:00:00+00:00
4,005aed5a-522c-4bf4-b0e9-52d1371c2939,04641868-5c90-4a98-a7b5-eacc96d52b1f,0.673034,2025-08-07 00:00:00+00:00


In [880]:
# --- 7) Per-user min-max scaling to [1..5] -------------------------------------
# Convert implicit scores into pseudo-ratings for Surprise (stable range per user)
def scale_1_5(g: pd.DataFrame) -> pd.DataFrame:
    mn, mx = g["score"].min(), g["score"].max()
    if mx - mn < 1e-9:
        # If user has uniform scores, assign a neutral-but-positive rating
        g["rating"] = 4.0
    else:
        g["rating"] = 1 + 4 * (g["score"] - mn) / (mx - mn)
    return g

def ensure_str_ids(df):
    df["user_id"] = df["user_id"].astype(str)
    df["yacht_id"] = df["yacht_id"].astype(str)
    return df

In [881]:
# --- 8) Ratings dataframe for surprise algos -------------------------------------
ratings = scores.groupby("user_id", group_keys=False).apply(scale_1_5)
ratings = ratings[["user_id", "yacht_id", "rating", "latest_ts"]].reset_index(drop=True)
ratings = ensure_str_ids(ratings)
print("Number of ratings:\n", ratings.shape)
# print("Sample ratings:\n", ratings.head())
ratings.head()

Number of ratings:
 (3327, 4)


  ratings = scores.groupby("user_id", group_keys=False).apply(scale_1_5)


Unnamed: 0,user_id,yacht_id,rating,latest_ts
0,00326c79-0812-4be1-b146-1cf8768ff49b,836980da-940a-4caa-a7f6-c11d7682d329,1.0,2025-08-05 00:00:00+00:00
1,00326c79-0812-4be1-b146-1cf8768ff49b,c2dbcb9c-708f-473e-8d1e-a25409c94613,1.179866,2025-10-01 00:00:00+00:00
2,00326c79-0812-4be1-b146-1cf8768ff49b,c98bbcf8-8282-465f-b9ea-eb8ad17b8873,3.727445,2025-10-16 00:00:00+00:00
3,00326c79-0812-4be1-b146-1cf8768ff49b,da047c3e-e015-4eab-be1d-ee6f82ac69d8,5.0,2025-10-10 00:00:00+00:00
4,005aed5a-522c-4bf4-b0e9-52d1371c2939,04641868-5c90-4a98-a7b5-eacc96d52b1f,1.060038,2025-08-07 00:00:00+00:00


In [882]:
ratings.describe()

Unnamed: 0,rating
count,3327.0
mean,2.415122
std,1.422227
min,1.0
25%,1.160358
50%,1.890114
75%,3.475618
max,5.0


In [883]:
# --- 8) Build Surprise dataset --------------------------------------------------
# Surprise expects explicit ratings with a known scale; we mapped to [1..5]
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[["user_id", "yacht_id", "rating"]], reader)

-----------------------------
--- TRAIN / TEST MODEL --------
-----------------------------

In [884]:
# Config
# ART_DIR = os.getenv("ART_DIR", "./artifacts_surprise")
# ALGO_NAME = os.getenv("ALGO", "SVD")           # 'SVD', 'SVDpp', 'NMF', 'KNNBaseline', 'BaselineOnly', 'CoClustering'
# FACTORS   = int(os.getenv("FACTORS", "64"))
# EPOCHS    = int(os.getenv("EPOCHS", "40"))
# LR_ALL    = float(os.getenv("LR_ALL", "0.005"))
# REG_ALL   = float(os.getenv("REG_ALL", "0.02"))
# CUTOFF    = os.getenv("CUTOFF_DATE", "")        # e.g. "2025-06-01" for time-based split
# TEST_SIZE = float(os.getenv("TEST_SIZE", "0.2"))# used if no CUTOFF
# K_TOP     = int(os.getenv("TOPK", "10"))        # for top-K metrics
# POS_TH    = float(os.getenv("POS_THRESHOLD", "4.0"))  # rating >= POS_TH is positive in test

# Model parameters
ART_DIR="./artifacts_surprise"
# 'SVD' or 'SVDpp'
ALGO_NAME="KNNBaseline" 
FACTORS=150
EPOCHS=20
LR_ALL=0.002
REG_ALL=0.02
# e.g. "2025-06-01" for time-based split
CUTOFF=False
# used if no CUTOFF
TEST_SIZE=0.2
# for top-K metrics
K_TOP=15
# rating >= POS_TH is positive in test
POS_TH=3.5

In [885]:
os.makedirs(ART_DIR, exist_ok=True)

In [886]:
# ==== 9) Build Surprise train/test
def build_train_test_from_ratings(ratings_df: pd.DataFrame, cutoff_date: str = "", test_size: float = 0.2):
    """Create Surprise trainset/testset either by time split or random split."""
    reader = Reader(rating_scale=(1, 5))
    if cutoff_date:
        T = pd.to_datetime(cutoff_date)
        train_df = ratings_df[ratings_df["latest_ts"] < T]
        test_df  = ratings_df[ratings_df["latest_ts"] >= T]
        if train_df.empty or test_df.empty:
            raise ValueError("Time-based split produced empty train or test; adjust CUTOFF_DATE.")
        train_data = Dataset.load_from_df(train_df[["user_id","yacht_id","rating"]], reader)
        trainset = train_data.build_full_trainset()
        testset  = list(test_df[["user_id","yacht_id","rating"]].itertuples(index=False, name=None))
        return trainset, testset, train_df, test_df
    else:
        data = Dataset.load_from_df(ratings_df[["user_id","yacht_id","rating"]], reader)
        trainset, testset = train_test_split(data, test_size=test_size, random_state=42)
        # For convenience, reconstruct DataFrames
        train_df = pd.DataFrame(trainset.build_testset(), columns=["user_id","yacht_id","rating"])
        test_df  = pd.DataFrame(testset, columns=["user_id","yacht_id","rating"])
        return trainset, testset, train_df, test_df

In [887]:
trainset, testset, train_df, test_df = build_train_test_from_ratings(ratings, CUTOFF, TEST_SIZE)
train_df = ensure_str_ids(train_df)
test_df = ensure_str_ids(test_df)
print(train_df.shape)
print(test_df.shape)

(2661, 3)
(666, 3)


In [888]:
# ==== 10 Train model
# ==== 1️⃣ Функція створення моделі за ім'ям
def make_algo(name: str, FACTORS=100, EPOCHS=20, LR_ALL=0.005, REG_ALL=0.02):
    name = name.strip().lower()
    if name == "svdpp":
        return SVDpp(n_factors=FACTORS, n_epochs=EPOCHS, random_state=42)
    elif name == "nmf":
        return NMF(n_factors=FACTORS, n_epochs=EPOCHS, random_state=42)
    elif name == "knnbaseline":
        sim_options = {"name": "pearson_baseline", "user_based": True}
        return KNNBaseline(sim_options=sim_options)
    elif name == "baselineonly":
        return BaselineOnly()
    elif name == "coclustering":
        return CoClustering()
    # default — SVD
    return SVD(n_factors=FACTORS, n_epochs=EPOCHS, lr_all=LR_ALL, reg_all=REG_ALL, random_state=42)

In [889]:
algo = make_algo(ALGO_NAME)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x16fa8c380>

In [890]:
# ==== 11 Evaluate: RMSE/MAE (pointwise)
preds = algo.test(testset)
rmse = accuracy.rmse(preds, verbose=False)
mae  = accuracy.mae(preds, verbose=False)
print(f"[METRIC] RMSE={rmse:.4f}  MAE={mae:.4f}")

[METRIC] RMSE=1.5630  MAE=1.2792


In [891]:
# ==== 12  Evaluate: Top-K ranking metrics
def precision_recall_ndcg_at_k(algo, train_df, test_df, K=10, pos_threshold=4.0):
    # positives in test
    test_pos = test_df[test_df["rating"] >= pos_threshold]
    user_pos = test_pos.groupby("user_id")["yacht_id"].apply(set).to_dict()
    # candidates = items seen in train universe
    item_set = set(train_df["yacht_id"].unique())
    # items seen in train per user
    train_seen = train_df.groupby("user_id")["yacht_id"].apply(set).to_dict()

    precs, recs, ndcgs = [], [], []

    for u, pos_items in user_pos.items():
        candidates = list(item_set - train_seen.get(u, set()))
        if not candidates:
            continue
        est = [(iid, algo.predict(u, iid).est) for iid in candidates]
        est.sort(key=lambda x: x[1], reverse=True)
        top = [iid for iid,_ in est[:K]]

        # precision/recall
        hits = len(set(top) & pos_items)
        precs.append(hits / K)
        recs.append(hits / max(1, len(pos_items)))

        # nDCG
        # relevance is 1 if item in pos_items else 0
        gains = [1.0 if iid in pos_items else 0.0 for iid in top]
        dcg = 0.0
        for i, g in enumerate(gains, start=1):
            dcg += g / np.log2(i + 1)
        # ideal DCG: first min(K, |pos|) are 1
        ideal_hits = min(K, len(pos_items))
        idcg = sum(1.0 / np.log2(i + 1) for i in range(1, ideal_hits + 1))
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)

    metrics = {
        "precision_at_k": float(np.mean(precs)) if precs else 0.0,
        "recall_at_k": float(np.mean(recs)) if recs else 0.0,
        "ndcg_at_k": float(np.mean(ndcgs)) if ndcgs else 0.0,
        "users_evaluated": int(len(precs)),
    }
    return metrics

In [892]:
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from collections import defaultdict
import numpy as np
import pandas as pd

# ==== 12) Evaluate model using RMSE/MAE and Precision@K / Recall@K / nDCG@K
def evaluate_model(algo, data, k=15, rating_threshold=4.0, cv=3, test_size=0.2, random_state=42):
    """
    Surprise model evaluation:
      1) RMSE/MAE using k-fold cross-validation (on the whole dataset)
      2) Precision@K / Recall@K / nDCG@K using a separate train/test split (via anti-testset)

    Parameters:
      algo:             instance of a Surprise model (e.g., SVDpp())
      data:             surprise.dataset.DatasetAutoFolds (Dataset.load_from_df(...))
      k:                K value for top-K metrics
      rating_threshold: ratings equal to or above this value are considered relevant
      cv:               number of folds for cross-validation
      test_size:        test set proportion for top-K evaluation
      random_state:     random seed for reproducibility

    Returns:
      pandas.DataFrame containing: RMSE, MAE, P@K, R@K, nDCG@K, users_evaluated
    """

    # ---------- 1) Cross-validation for RMSE/MAE ----------
    cv_res = cross_validate(
        algo.__class__(**getattr(algo, 'pp', {})) if False else algo,  # use the provided algo as is
        data,
        measures=["RMSE", "MAE"],
        cv=cv,
        verbose=False
    )
    rmse_cv = float(np.mean(cv_res["test_rmse"]))
    mae_cv  = float(np.mean(cv_res["test_mae"]))

    # ---------- 2) Top-K metrics on a separate train/test split ----------
    # Perform a custom split (independent from CV)
    trainset, testset = train_test_split(data, test_size=test_size, random_state=random_state)

    # Train the model
    algo.fit(trainset)

    # Predict for all “unknown” user–item pairs (anti-testset) to build recommendations
    anti_testset = trainset.build_anti_testset()
    preds_all = algo.test(anti_testset)

    # Build Top-K recommendations per user
    user_to_ranked = defaultdict(list)
    for uid, iid, true_r, est, _ in preds_all:
        user_to_ranked[uid].append((iid, est))
    for uid in user_to_ranked:
        user_to_ranked[uid].sort(key=lambda x: x[1], reverse=True)
        user_to_ranked[uid] = [iid for iid, _ in user_to_ranked[uid][:k]]

    # Collect ground-truth relevant items in the test set (rating ≥ threshold)
    user_to_rels = defaultdict(set)
    for uid, iid, true_r in testset:
        if true_r >= rating_threshold:
            user_to_rels[uid].add(iid)

    # Compute Precision@K, Recall@K, nDCG@K
    precisions, recalls, ndcgs = [], [], []
    users_evaluated = 0

    # Helper: compute DCG@K given hit positions (1-based)
    def dcg_at_k(hits_positions):
        return sum(1 / np.log2(p + 1) for p in hits_positions)

    for uid, recs in user_to_ranked.items():
        rels = user_to_rels.get(uid, set())
        if len(rels) == 0:
            # Skip users with no relevant items in the test set
            continue

        users_evaluated += 1
        hits = [1 if iid in rels else 0 for iid in recs]
        n_hits = sum(hits)

        # Precision@K and Recall@K
        precisions.append(n_hits / k)
        recalls.append(n_hits / len(rels))

        # nDCG@K
        hit_positions = [i + 1 for i, h in enumerate(hits) if h == 1]
        dcg = dcg_at_k(hit_positions)
        ideal_len = min(len(rels), k)
        idcg = dcg_at_k(list(range(1, ideal_len + 1))) if ideal_len > 0 else 1.0
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)

    p_at_k = float(np.mean(precisions)) if precisions else 0.0
    r_at_k = float(np.mean(recalls))    if recalls else 0.0
    ndcg_k = float(np.mean(ndcgs))      if ndcgs else 0.0

    # Return metrics as a DataFrame
    res = pd.DataFrame([{
        "model":      algo.__class__.__name__,
        "RMSE_CV":    round(rmse_cv, 4),
        "MAE_CV":     round(mae_cv, 4),
        f"P@{k}":     round(p_at_k, 4),
        f"R@{k}":     round(r_at_k, 4),
        f"nDCG@{k}":  round(ndcg_k, 4),
        # "users_evaluated": int(users_evaluated),
        # "rating_threshold": rating_threshold,
        # "K": k,
        # "cv": cv,
        # "test_size": test_size
    }])

    return res


# (optional) convenient runner for multiple models
def evaluate_many(algos, data, k=15, rating_threshold=4.0, cv=3, test_size=0.2, random_state=42):
    frames = []
    for algo in algos:
        res = evaluate_model(algo, data, k=k, rating_threshold=rating_threshold,
                             cv=cv, test_size=test_size, random_state=random_state)
        frames.append(res)
    return pd.concat(frames, ignore_index=True)



In [893]:
# ========================
# =======GridSearch=======
import numpy as np
import pandas as pd
from collections import defaultdict

from surprise import Dataset, Reader
from surprise import SVD, SVDpp, NMF, KNNBaseline, BaselineOnly, CoClustering
from surprise import accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

# ---------- 1) Algo class to name mapper
ALGO_CLASSES = {
    "SVD": SVD,
    "SVDpp": SVDpp,
    "NMF": NMF,
    "KNNBaseline": KNNBaseline,
    "BaselineOnly": BaselineOnly,
    "CoClustering": CoClustering,
}

# ---------- 2) Hyperparameter grid for each model
PARAM_GRIDS = {
    "SVD": {
        "n_factors": [50, 100, 150],
        "n_epochs": [20, 40],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.02, 0.1],
    },
    "SVDpp": {
        "n_factors": [50, 100],
        "n_epochs": [20, 40],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.02, 0.1],
    },
    "NMF": {
        "n_factors": [20, 50],
        "n_epochs": [50, 100],
        "reg_pu": [0.02, 0.06, 0.1],
        "reg_qi": [0.02, 0.06, 0.1],
    },
    "KNNBaseline": {
        "k": [20, 40, 80],
        "min_k": [1, 5],
        "sim_options": {
            "name": ["pearson_baseline", "cosine", "msd", "pearson"],
            "user_based": [False, True],  # item-based / user-based
        },
        "bsl_options": {
            "method": ["als"],
            "n_epochs": [5, 10],
            "reg_u": [8, 12],
            "reg_i": [4, 8],
        }
    },
    "BaselineOnly": {
        "bsl_options": {
            "method": ["als"],
            "n_epochs": [5, 10],
            "reg_u": [8, 12, 15],
            "reg_i": [4, 8, 12],
        }
    },
    "CoClustering": {
        "n_cltr_u": [3, 5, 10],
        "n_cltr_i": [3, 5, 10],
        "n_epochs": [20, 40],
    },
}

# ---------- 3) Utils for top-K metrics
def _topk_from_preds(preds, k=15):
    user_to_ranked = defaultdict(list)
    for uid, iid, true_r, est, _ in preds:
        user_to_ranked[uid].append((iid, est))
    for uid in user_to_ranked:
        user_to_ranked[uid].sort(key=lambda x: x[1], reverse=True)
        user_to_ranked[uid] = [iid for iid, _ in user_to_ranked[uid][:k]]
    return user_to_ranked

def _eval_topk(algo, data, k=15, rating_threshold=4.0, test_size=0.2, random_state=42):
    trainset, testset = train_test_split(data, test_size=test_size, random_state=random_state)
    algo.fit(trainset)

    anti = trainset.build_anti_testset()
    preds = algo.test(anti)
    topk = _topk_from_preds(preds, k=k)

    user_true = defaultdict(set)
    for uid, iid, r in testset:
        if r >= rating_threshold:
            user_true[uid].add(iid)

    precisions, recalls, ndcgs, users_eval = [], [], [], 0

    def dcg_at_k(pos):
        return sum(1 / np.log2(p + 1) for p in pos)

    for uid, recs in topk.items():
        rels = user_true.get(uid, set())
        if not rels:
            continue
        users_eval += 1
        hits = [1 if iid in rels else 0 for iid in recs]
        n_hits = sum(hits)

        precisions.append(n_hits / k)
        recalls.append(n_hits / len(rels))

        hit_pos = [i + 1 for i, h in enumerate(hits) if h == 1]
        dcg = dcg_at_k(hit_pos)
        idcg = dcg_at_k(range(1, min(len(rels), k) + 1))
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)

    return {
        "P@K": float(np.mean(precisions)) if precisions else 0.0,
        "R@K": float(np.mean(recalls)) if recalls else 0.0,
        "nDCG@K": float(np.mean(ndcgs)) if ndcgs else 0.0,
        "users_evaluated": int(users_eval),
    }

# ---------- 4) Main grid search function for a SINGLE model
def run_grid_search(name: str, data, measures=("rmse", "mae"), cv=3, n_jobs=-1, verbose=1):
    """Runs GridSearchCV for the given model `name` and returns (best_algo, gs),
    where best_algo is already trained on the full train set during the CV process."""
    if name not in ALGO_CLASSES:
        raise ValueError(f"Unknown algorithm '{name}'. Allowed: {list(ALGO_CLASSES.keys())}")

    AlgoClass = ALGO_CLASSES[name]
    param_grid = PARAM_GRIDS[name]

    gs = GridSearchCV(
        algo_class=AlgoClass,
        param_grid=param_grid,
        measures=measures,
        cv=cv,
        n_jobs=n_jobs,
        joblib_verbose=0
    )
    gs.fit(data)

    # By default, .best_estimator['rmse'] is the best model based on RMSE
    best_by = measures[0]
    best_algo = gs.best_estimator[best_by]

    if verbose:
        print(f"[{name}] Best {best_by.upper()}: {gs.best_score[best_by]:.4f}")
        print(f"[{name}] Best params ({best_by}): {gs.best_params[best_by]}")

        # Display other metrics if provided
        for m in measures[1:]:
            print(f"[{name}] Best {m.upper()}: {gs.best_score[m]:.4f}  (params: {gs.best_params[m]})")

    return best_algo, gs


# ---------- 5) Convenient wrapper: grid search + top-K metrics
def evaluate_with_grid(name: str, data, k=15, rating_threshold=4.0, cv=3, test_size=0.2, n_jobs=-1, random_state=42):
    # 5.1 Run GridSearchCV (RMSE/MAE CV)
    best_algo, gs = run_grid_search(name, data, measures=("rmse", "mae"), cv=cv, n_jobs=n_jobs, verbose=0)

    # Neatly print a short summary
    rmse_best = gs.best_score["rmse"]
    mae_best  = gs.best_score["mae"]
    params_rmse = gs.best_params["rmse"]

    # 5.2 Evaluate Top-K metrics on a separate split using the best model
    topk_res = _eval_topk(
        best_algo,
        data,
        k=k,
        rating_threshold=rating_threshold,
        test_size=test_size,
        random_state=random_state
    )


    row = {
        "model": name,
        "RMSE_CV(best)": round(rmse_best, 4),
        "MAE_CV(best)": round(mae_best, 4),
        f"P@{k}": round(topk_res["P@K"], 4),
        f"R@{k}": round(topk_res["R@K"], 4),
        f"nDCG@{k}": round(topk_res["nDCG@K"], 4),
        "users_evaluated": topk_res["users_evaluated"],
        "best_params_by_rmse": params_rmse,
    }
    return pd.DataFrame([row]), best_algo, gs


In [894]:
rank_metrics = precision_recall_ndcg_at_k(algo, train_df, test_df, K=K_TOP, pos_threshold=POS_TH)
print(f"Algo name {ALGO_NAME} best params", f"file processed: {users_data_path}")
print(f"[METRIC] RMSE={rmse:.4f}  MAE={mae:.4f}")
print(f"[METRIC] P@{K_TOP}={rank_metrics['precision_at_k']:.4f}  "
      f"R@{K_TOP}={rank_metrics['recall_at_k']:.4f}  "
      f"nDCG@{K_TOP}={rank_metrics['ndcg_at_k']:.4f}  "
      f"(users={rank_metrics['users_evaluated']})")

Algo name KNNBaseline best params file processed: ./data/generated_users_first.csv
[METRIC] RMSE=1.5630  MAE=1.2792
[METRIC] P@15=0.0254  R@15=0.3401  nDCG@15=0.2135  (users=147)


In [895]:
from surprise import SVD, SVDpp, NMF, KNNBaseline, BaselineOnly, CoClustering

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[["user_id","yacht_id","rating"]], reader)

algos = [
    SVD(),
    SVDpp(),
    NMF(),
    KNNBaseline(),
    BaselineOnly(),
    CoClustering()
]

result_table = evaluate_many(algos, data, k=15, rating_threshold=3.5, cv=3, test_size=0.2)
print(result_table)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
          model  RMSE_CV  MAE_CV    P@15    R@15  nDCG@15
0           SVD   1.4512  1.2253  0.0110  0.1488   0.0697
1         SVDpp   1.4682  1.2261  0.0110  0.1476   0.0909
2           NMF   1.7369  1.3572  0.0014  0.0179   0.0074
3   KNNBaseline   1.5083  1.2541  0.0238  0.3226   0.1451
4  BaselineOnly   1.4255  1.2140  0.0086  0.1119   0.0594
5  CoClustering   1.7700  1.3882  0.0010  0.0143   0.0039


In [896]:
# 1) prepare Surprise data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[["user_id", "yacht_id", "rating"]], reader)

# 2) One model with Grid search + top-K metrics
table_knn, best_knn, gs_knn = evaluate_with_grid(
    "KNNBaseline", data, k=15, rating_threshold=4.0, cv=3, test_size=0.2, n_jobs=-1
)
print(table_knn)

# 3) Test on all models
models = ["SVD", "SVDpp", "NMF", "KNNBaseline", "BaselineOnly", "CoClustering"]
tables = []
for m in models:
    t, best_model, gs = evaluate_with_grid(m, data, k=15, rating_threshold=4.0, cv=3, test_size=0.2, n_jobs=-1)
    tables.append(t)
summary = pd.concat(tables, ignore_index=True)
print(summary)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
         model  ...                                best_params_by_rmse
0  KNNBaseline  ...  {'k': 20, 'min_k': 5, 'sim_options': {'name': ...

[1 rows x 8 columns]
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Comput

In [897]:
summary

Unnamed: 0,model,RMSE_CV(best),MAE_CV(best),P@15,R@15,nDCG@15,users_evaluated,best_params_by_rmse
0,SVD,1.4224,1.2127,0.0127,0.1777,0.0815,121,"{'n_factors': 100, 'n_epochs': 20, 'lr_all': 0..."
1,SVDpp,1.4192,1.2127,0.011,0.157,0.1067,121,"{'n_factors': 100, 'n_epochs': 20, 'lr_all': 0..."
2,NMF,1.6866,1.2975,0.0,0.0,0.0,121,"{'n_factors': 50, 'n_epochs': 50, 'reg_pu': 0...."
3,KNNBaseline,1.434,1.2057,0.0149,0.1942,0.1343,121,"{'k': 20, 'min_k': 5, 'sim_options': {'name': ..."
4,BaselineOnly,1.4309,1.2166,0.0121,0.1653,0.075,121,"{'bsl_options': {'method': 'als', 'n_epochs': ..."
5,CoClustering,1.7602,1.378,0.0017,0.0207,0.0095,121,"{'n_cltr_u': 3, 'n_cltr_i': 3, 'n_epochs': 40}"


In [898]:
for _, row in summary.iterrows():
    print(f"\nModel: {row['model']}, {row['RMSE_CV(best)']}, {row['MAE_CV(best)']}, {row['P@15']}, {row['R@15']}, {row['nDCG@15']}")
    print(f"\nBest params: {row['best_params_by_rmse']}")


Model: SVD, 1.4224, 1.2127, 0.0127, 0.1777, 0.0815

Best params: {'n_factors': 100, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.02}

Model: SVDpp, 1.4192, 1.2127, 0.011, 0.157, 0.1067

Best params: {'n_factors': 100, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.1}

Model: NMF, 1.6866, 1.2975, 0.0, 0.0, 0.0

Best params: {'n_factors': 50, 'n_epochs': 50, 'reg_pu': 0.1, 'reg_qi': 0.06}

Model: KNNBaseline, 1.434, 1.2057, 0.0149, 0.1942, 0.1343

Best params: {'k': 20, 'min_k': 5, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}, 'bsl_options': {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 8}}

Model: BaselineOnly, 1.4309, 1.2166, 0.0121, 0.1653, 0.075

Best params: {'bsl_options': {'method': 'als', 'n_epochs': 5, 'reg_u': 15, 'reg_i': 12}}

Model: CoClustering, 1.7602, 1.378, 0.0017, 0.0207, 0.0095

Best params: {'n_cltr_u': 3, 'n_cltr_i': 3, 'n_epochs': 40}


In [905]:
import itertools
import numpy as np
import pandas as pd
from collections import defaultdict

from surprise import SVD, SVDpp, NMF, KNNBaseline, BaselineOnly, CoClustering
from surprise.model_selection import KFold

# ===== 1) Utils for Top-K Grid Search
def _topk_from_preds(preds, k=15):
    user_to_ranked = defaultdict(list)
    for uid, iid, true_r, est, _ in preds:
        user_to_ranked[uid].append((iid, est))
    for uid in user_to_ranked:
        user_to_ranked[uid].sort(key=lambda x: x[1], reverse=True)
        user_to_ranked[uid] = [iid for iid, _ in user_to_ranked[uid][:k]]
    return user_to_ranked

def _eval_topk_on_fold(algo, trainset, testset, k=15, rating_threshold=4.0):
    """Evaluation of P@K, R@K, and nDCG@K on a single fold (the trainset is already built, and the testset is a list of triplets)."""
    algo.fit(trainset)
    anti = trainset.build_anti_testset()
    preds = algo.test(anti)
    topk = _topk_from_preds(preds, k=k)

    # Ground truth from test fold
    user_true = defaultdict(set)
    for uid, iid, r in testset:
        if r >= rating_threshold:
            user_true[uid].add(iid)

    def dcg_at_k(pos_list):
        return sum(1 / np.log2(p + 1) for p in pos_list)

    precisions, recalls, ndcgs, users_eval = [], [], [], 0
    for uid, recs in topk.items():
        rels = user_true.get(uid, set())
        if not rels:
            continue
        users_eval += 1
        hits = [1 if iid in rels else 0 for iid in recs]
        n_hits = sum(hits)

        precisions.append(n_hits / k)
        recalls.append(n_hits / len(rels))

        hit_pos = [i + 1 for i, h in enumerate(hits) if h == 1]
        dcg = dcg_at_k(hit_pos)
        idcg = dcg_at_k(range(1, min(len(rels), k) + 1))
        ndcgs.append((dcg / idcg) if idcg > 0 else 0.0)

    # average in this fold users
    return {
        "P@K": float(np.mean(precisions)) if precisions else 0.0,
        "R@K": float(np.mean(recalls)) if recalls else 0.0,
        "nDCG@K": float(np.mean(ndcgs)) if ndcgs else 0.0,
        "users_evaluated": int(users_eval),
    }

# ===== 2) Combination generator (supports nested dicts like in sim_options, bsl_options)
def _product_dict(d):
    """
    Accepts a dict where the values are lists or nested dicts with lists.
    Returns an iterator of all possible parameter combinations (including nested ones).
    """
    # Separate simple and nested keys.
    simple_items = {k: v for k, v in d.items() if not isinstance(v, dict)}
    nested_items = {k: v for k, v in d.items() if isinstance(v, dict)}

    # Simple keys: assume each value is a list of possible values.
    simple_keys = list(simple_items.keys())
    simple_vals = [simple_items[k] if isinstance(simple_items[k], list) else [simple_items[k]] for k in simple_keys]
    simple_product = list(itertools.product(*simple_vals)) if simple_keys else [()]

    # Nested keys: recursively generate combinations.
    nested_keys = list(nested_items.keys())
    nested_grids = []
    for k in nested_keys:
        # In nested_items[k], all values must be lists, so generate their combinations.
        sub_keys = list(nested_items[k].keys())
        sub_vals = [nested_items[k][sk] for sk in sub_keys]
        for combo in itertools.product(*sub_vals):
            nested_grids.append({k: dict(zip(sub_keys, combo))})
    if not nested_keys:
        nested_grids = [dict()]

    # Finally, combine the simple and nested parts.
    for sp in simple_product:
        base = dict(zip(simple_keys, sp)) if simple_keys else {}
        for ng in nested_grids:
            out = base.copy()
            out.update(ng)
            yield out

# ===== 3) Custom Grid Search with Top-K metrics
ALGO_CLASSES = {
    "SVD": SVD,
    "SVDpp": SVDpp,
    "NMF": NMF,
    "KNNBaseline": KNNBaseline,
    "BaselineOnly": BaselineOnly,
    "CoClustering": CoClustering,
}

def gridsearch_topk(
    name: str,
    data,
    param_grid: dict,
    target: str = "nDCG@K",  # або "P@K", "R@K"
    k: int = 15,
    rating_threshold: float = 4.0,
    cv: int = 3,
    random_state: int = 42,
    verbose: int = 1,
):
    """
    Iterates over the parameter grid of the Surprise model and selects the configuration with the best average Top-K metric across K-folds.
    """
    if name not in ALGO_CLASSES:
        raise ValueError(f"Unknown algo '{name}'. Options: {list(ALGO_CLASSES.keys())}")

    AlgoClass = ALGO_CLASSES[name]
    kf = KFold(n_splits=cv, random_state=random_state, shuffle=True)

    best_score = -np.inf
    best_params = None
    history = []

    for params in _product_dict(param_grid):
        fold_scores = []
        for trainset, testset in kf.split(data):
            algo = AlgoClass(**params)
            scores = _eval_topk_on_fold(algo, trainset, testset, k=k, rating_threshold=rating_threshold)
            fold_scores.append(scores)

        # average in folds
        mean_scores = {m: float(np.mean([fs[m] for fs in fold_scores])) for m in fold_scores[0].keys()}
        history.append({"params": params, **mean_scores})

        score = mean_scores[target]
        if score > best_score:
            best_score = score
            best_params = params
            if verbose:
                print(f"[{name}] New best {target}={best_score:.4f} with params={best_params}")


    return {
        "model": name,
        "target": target,
        "best_score": round(best_score, 4),
        "best_params": best_params,
        "cv_history": pd.DataFrame(history).sort_values(target, ascending=False).reset_index(drop=True),
    }


In [900]:
# algo parameters for grid search
grid_knn = {
    "k": [40, 80],
    "min_k": [1, 5],
    "sim_options": {
        "name": ["pearson_baseline", "cosine"],
        "user_based": [False, True],
    },
    "bsl_options": {
        "method": ["als"],
        "n_epochs": [5, 10],
        "reg_u": [8, 12],
        "reg_i": [4, 8],
    }
}

# data — this is Surprise Dataset (Dataset.load_from_df(...))
res = gridsearch_topk(
    name="KNNBaseline",
    data=data,
    param_grid=grid_knn,
    target="nDCG@K",       # or "P@K" чи "R@K"
    k=15,
    rating_threshold=4.0,
    cv=3,
    random_state=42,
    verbose=1
)

print("Best score:", res["best_score"])
print("Best params:", res["best_params"])
# history table:
print(res["cv_history"].head())


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
[KNNBaseline] New best nDCG@K=0.1148 with params={'k': 40, 'min_k': 1, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
[KNNBaseline] New best nDCG@K=0.1833 with params={'k': 40, 'min_k': 1, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}}
Estimating biases using a

In [901]:
# Create and train KNNBaseline algo with the best params
best_knn_base = KNNBaseline(**res["best_params"])
full_train = data.build_full_trainset()
best_knn_base.fit(full_train)

# Evaluate: RMSE/MAE (pointwise)
best_knn_base.fit(trainset)
best_knn_base_preds = best_knn_base.test(testset)
best_knn_base_rmse = accuracy.rmse(best_knn_base_preds, verbose=False)
best_knn_base_mae  = accuracy.mae(best_knn_base_preds, verbose=False)

best_knn_base_metrics = precision_recall_ndcg_at_k(best_knn_base, train_df, test_df, K=K_TOP, pos_threshold=POS_TH)
print(f"Algo name KNNBaseline best params", f"file processed: {users_data_path}")
print(f"Best params [METRIC] RMSE={best_knn_base_rmse:.4f}  MAE={best_knn_base_mae:.4f}")
print(f"Best params [METRIC] P@{K_TOP}={best_knn_base_metrics['precision_at_k']:.4f}  "
      f"R@{K_TOP}={best_knn_base_metrics['recall_at_k']:.4f}  "
      f"nDCG@{K_TOP}={best_knn_base_metrics['ndcg_at_k']:.4f}  "
      f"(users={best_knn_base_metrics['users_evaluated']})")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Algo name KNNBaseline best params file processed: ./data/generated_users_first.csv
Best params [METRIC] RMSE=1.5630  MAE=1.2792
Best params [METRIC] P@15=0.0254  R@15=0.3401  nDCG@15=0.2135  (users=147)


In [902]:
grid_svd = {
    "n_factors": [50, 100, 150],
    "n_epochs":  [20, 40],
    "lr_all":    [0.002, 0.005],
    "reg_all":   [0.02, 0.1],
    # "biased":  [True, False],  # optional
}

res_svd = gridsearch_topk(
    name="SVD",
    data=data,
    param_grid=grid_svd,
    target="nDCG@K",  # or "P@K" / "R@K"
    k=15,
    rating_threshold=4.0,
    cv=3,
    random_state=42,
    verbose=1
)

print("Best nDCG@15:", res_svd["best_score"])
print("Best params:", res_svd["best_params"])
# review history:
res_svd["cv_history"].head()

[SVD] New best nDCG@K=0.1065 with params={'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.02}
Best nDCG@15: 0.1065
Best params: {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.02}


Unnamed: 0,params,P@K,R@K,nDCG@K,users_evaluated
0,"{'n_factors': 50, 'n_epochs': 20, 'lr_all': 0....",0.015176,0.213091,0.106505,180.0
1,"{'n_factors': 50, 'n_epochs': 20, 'lr_all': 0....",0.015311,0.211413,0.103338,180.0
2,"{'n_factors': 100, 'n_epochs': 20, 'lr_all': 0...",0.014444,0.200048,0.102235,180.0
3,"{'n_factors': 50, 'n_epochs': 40, 'lr_all': 0....",0.014008,0.192678,0.09332,180.0
4,"{'n_factors': 150, 'n_epochs': 20, 'lr_all': 0...",0.013117,0.183961,0.090478,180.0


In [903]:
# Create and train SVD algo with the best params
best_svd = SVD(**res_svd["best_params"])
full_train = data.build_full_trainset()
best_svd.fit(full_train)

# Evaluate: RMSE/MAE (pointwise)
best_svd.fit(trainset)
best_svd_preds = best_svd.test(testset)
best_svd_rmse = accuracy.rmse(best_svd_preds, verbose=False)
best_svd_mae  = accuracy.mae(best_svd_preds, verbose=False)
print(f"[METRIC] RMSE={best_svd_rmse:.4f}  MAE={best_svd_mae:.4f}")

best_svd_rank_metrics = precision_recall_ndcg_at_k(best_svd, train_df, test_df, K=K_TOP, pos_threshold=POS_TH)
print(f"Algo name SVD  best params", f"file processed: {users_data_path}")
print(f"Best params [METRIC] RMSE={best_svd_rmse:.4f}  MAE={best_svd_mae:.4f}")
print(f"Best params [METRIC] P@{K_TOP}={best_svd_rank_metrics['precision_at_k']:.4f}  "
      f"R@{K_TOP}={best_svd_rank_metrics['recall_at_k']:.4f}  "
      f"nDCG@{K_TOP}={best_svd_rank_metrics['ndcg_at_k']:.4f}  "
      f"(users={best_svd_rank_metrics['users_evaluated']})")

[METRIC] RMSE=1.4525  MAE=1.2432
Algo name SVD best params file processed: ./data/generated_users_first.csv
Best params [METRIC] RMSE=1.4525  MAE=1.2432
Best params [METRIC] P@15=0.0186  R@15=0.2562  nDCG@15=0.1424  (users=147)


In [None]:
=================== FINISH TESTING ================================