In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os, json
import numpy as np
import polars as pl

BASE = "/content/drive/MyDrive/recsys_final_20m"
ART  = f"{BASE}/artifacts"

# Load full catalog item factors + mapping
item_factors = np.load(f"{ART}/als_model/item_factors.npy").astype(np.float32)
with open(f"{ART}/als_model/movie_ids.json", "r", encoding="utf-8") as f:
    movie_ids = json.load(f)

print("✅ item_factors:", item_factors.shape)
print("✅ movie_ids:", len(movie_ids))


Mounted at /content/drive
✅ item_factors: (26744, 128)
✅ movie_ids: 26744


In [None]:
# ============================
# EXPORT WEB PACKAGE FOR NEXT.JS
# ============================

WEB = f"{ART}/web"
os.makedirs(WEB, exist_ok=True)

# 1) item_factors -> raw float32 buffer (Node-friendly)
rows, cols = item_factors.shape
item_factors.tofile(f"{WEB}/item_factors.f32")

with open(f"{WEB}/item_factors_shape.json", "w", encoding="utf-8") as f:
    json.dump({"rows": rows, "cols": cols}, f)

# 2) index -> movieId mapping
with open(f"{WEB}/movie_ids.json", "w", encoding="utf-8") as f:
    json.dump(movie_ids, f)

# 3) movieId -> tmdbId mapping (full)
links = pl.read_parquet(f"{BASE}/parquet/links.parquet").select(["movieId", "tmdbId"]).drop_nulls()
movieId_to_tmdb = {int(r["movieId"]): int(r["tmdbId"]) for r in links.to_dicts()}

with open(f"{WEB}/movieId_to_tmdb.json", "w", encoding="utf-8") as f:
    json.dump(movieId_to_tmdb, f)

# 4) app JSONs (copy into web folder)
for name in ["onboarding_250_diverse.json", "home_popular_80.json", "recs_top100.json"]:
    src = f"{ART}/{name}"
    assert os.path.exists(src), f"Missing required file: {src}"
    with open(src, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(f"{WEB}/{name}", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ Web package exported to:", WEB)
print("Files:", sorted(os.listdir(WEB)))
print("item_factors:", rows, cols, "| movieId_to_tmdb:", len(movieId_to_tmdb))


✅ Web package exported to: /content/drive/MyDrive/recsys_final_20m/artifacts/web
Files: ['home_popular_80.json', 'item_factors.f32', 'item_factors_shape.json', 'movieId_to_tmdb.json', 'movie_ids.json', 'onboarding_250_diverse.json', 'recs_top100.json']
item_factors: 26744 128 | movieId_to_tmdb: 27026


In [None]:
# ============================
# DOWNLOAD WEB PACKAGE AS ZIP
# ============================

import shutil
from google.colab import files

ZIP_PATH = "/content/recsys_web_package.zip"
WEB_DIR = "/content/drive/MyDrive/recsys_final_20m/artifacts/web"

# Create zip
shutil.make_archive(ZIP_PATH.replace(".zip", ""), "zip", WEB_DIR)

print("✅ ZIP created at:", ZIP_PATH)

# Trigger browser download
files.download(ZIP_PATH)


✅ ZIP created at: /content/recsys_web_package.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import requests, time, json, os

TMDB_KEY = os.environ.get("TMDB_API_KEY")  # or hardcode if needed
TMDB_IMG = "https://image.tmdb.org/t/p"

TMDB_CACHE_PATH = "/content/drive/MyDrive/recsys_final_20m/artifacts/tmdb_cache_full.json"

if os.path.exists(TMDB_CACHE_PATH):
    with open(TMDB_CACHE_PATH, "r") as f:
        TMDB_CACHE = json.load(f)
else:
    TMDB_CACHE = {}

def save_tmdb_cache():
    with open(TMDB_CACHE_PATH, "w") as f:
        json.dump(TMDB_CACHE, f)

def tmdb_movie_full(tmdb_id):
    if not tmdb_id:
        return None
    tid = str(tmdb_id)
    if tid in TMDB_CACHE:
        return TMDB_CACHE[tid]

    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
    params = {"api_key": TMDB_KEY, "append_to_response": "credits"}
    r = requests.get(url, params=params)
    if r.status_code != 200:
        return None

    d = r.json()
    director = None
    for c in d.get("credits", {}).get("crew", []):
        if c.get("job") == "Director":
            director = c.get("name")
            break

    payload = {
        "title": d.get("title"),
        "overview": d.get("overview"),
        "year": int(d["release_date"][:4]) if d.get("release_date") else None,
        "runtime": d.get("runtime"),
        "original_language": d.get("original_language"),
        "genres": [g["name"] for g in d.get("genres", [])],
        "director": director,
        "poster_url": f"{TMDB_IMG}/w342{d['poster_path']}" if d.get("poster_path") else None,
        "backdrop_url": f"{TMDB_IMG}/w780{d['backdrop_path']}" if d.get("backdrop_path") else None,
    }

    TMDB_CACHE[tid] = payload
    time.sleep(0.25)  # be nice to TMDB
    return payload


In [9]:
import polars as pl, re, json

BASE = "/content/drive/MyDrive/recsys_final_20m"
ART  = f"{BASE}/artifacts"

ratings = pl.read_parquet(f"{BASE}/parquet/ratings.parquet").select(["movieId","rating"])
movies  = pl.read_parquet(f"{BASE}/parquet/movies.parquet").select(["movieId","title","genres"])
links   = pl.read_parquet(f"{BASE}/parquet/links.parquet").select(["movieId","tmdbId"])

# --- Aggregate rating stats ---
stats = ratings.group_by("movieId").agg([
    pl.len().alias("n_ratings"),
    pl.mean("rating").alias("avg_rating")
])

# Global mean rating
C = stats.select(pl.mean("avg_rating")).item()

# Mainstream bias knobs:
# - raise m (the Bayesian prior strength) so low-count items are penalized more
# - also rank partially by log(n_ratings)
m = stats.select(pl.quantile("n_ratings", 0.92)).item()  # mainstream leaning
MIN_RATINGS = int(stats.select(pl.quantile("n_ratings", 0.70)).item())  # filter out too-obscure

cand = (
    stats.filter(pl.col("n_ratings") >= MIN_RATINGS)
    .with_columns(
        (
            (pl.col("n_ratings")/(pl.col("n_ratings")+m))*pl.col("avg_rating") +
            (m/(pl.col("n_ratings")+m))*C
        ).alias("weighted")
    )
    .join(movies, on="movieId", how="left")
    .join(links, on="movieId", how="left")
)

# --- Helpers for year/decade + primary genre ---
year_re = re.compile(r"\((\d{4})\)\s*$")

def get_year(title):
    if not title: return None
    mm = year_re.search(title)
    return int(mm.group(1)) if mm else None

def get_decade(y):
    return (y // 10) * 10 if y else None

def primary_genre(genres):
    if not genres: return "Unknown"
    return genres.split("|")[0] if "|" in genres else genres

# --- Build ranked pool with mainstream bias ---
# score_mainstream = weighted + alpha * log10(n_ratings)
# (alpha tuned small so it nudges toward well-known titles)
alpha = 0.12

rows = cand.with_columns(
    (pl.col("weighted") + alpha * (pl.col("n_ratings").cast(pl.Float64).log10())).alias("score_mainstream")
).sort(["score_mainstream","n_ratings"], descending=True).head(12000).to_dicts()

# --- Selection policy: 1975+ only + diversity caps ---
MIN_YEAR = 1975
MAX_YEAR = 2026

MAX_PER_PRIMARY_GENRE = 28
MAX_PER_DECADE = 55

genre_count = {}
decade_count = {}
selected = []
seen = set()

for r in rows:
    mid = int(r["movieId"])
    if mid in seen:
        continue

    title = r.get("title")
    y = get_year(title)
    if y is None or y < MIN_YEAR or y > MAX_YEAR:
        continue

    g = primary_genre(r.get("genres"))
    d = get_decade(y)

    if genre_count.get(g, 0) >= MAX_PER_PRIMARY_GENRE:
        continue
    if decade_count.get(d, 0) >= MAX_PER_DECADE:
        continue

    selected.append(r)
    seen.add(mid)
    genre_count[g] = genre_count.get(g, 0) + 1
    decade_count[d] = decade_count.get(d, 0) + 1

    if len(selected) >= 250:
        break

print("✅ Selected:", len(selected))
print("Top primary genres:", sorted(genre_count.items(), key=lambda x: -x[1])[:10])
print("Decades:", sorted(decade_count.items()))

# --- Enrich using your existing TMDB helper from Notebook 01 ---
# Requires tmdb_movie_full() and save_tmdb_cache() to already exist in the runtime.
# If you're in Notebook 03 and don't have them loaded, run your TMDB helper cell first.
out = []
missing_poster = 0
for r in selected:
    tmdb_id = None if r.get("tmdbId") is None else int(r["tmdbId"])
    payload = tmdb_movie_full(tmdb_id) if tmdb_id else None
    if not (payload and payload.get("poster_url")):
        missing_poster += 1

    out.append({
        "movieId": int(r["movieId"]),
        "tmdbId": tmdb_id,
        "title": r.get("title"),
        "genres_ml": r.get("genres"),
        "poster_url": payload.get("poster_url") if payload else None,
        "backdrop_url": payload.get("backdrop_url") if payload else None,
        "overview": payload.get("overview") if payload else None,
        "year": payload.get("year") if payload else get_year(r.get("title")),
        "director": payload.get("director") if payload else None,
        "runtime": payload.get("runtime") if payload else None,
        "original_language": payload.get("original_language") if payload else None,
        "score_weighted": float(r["weighted"]),
        "score_mainstream": float(r["score_mainstream"]),
        "n_ratings": int(r["n_ratings"]),
        "avg_rating": float(r["avg_rating"]),
    })

save_tmdb_cache()

OUT_PATH = f"{ART}/onboarding_250_diverse.json"  # overwrite so the website keeps working
with open(OUT_PATH, "w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)

print("✅ Saved:", OUT_PATH)
print("Posters missing:", missing_poster, "out of 250")
print("Example:", out[0])


✅ Selected: 250
Top primary genres: [('Crime', 28), ('Drama', 28), ('Action', 28), ('Comedy', 28), ('Adventure', 28), ('Documentary', 28), ('Animation', 23), ('Horror', 14), ('Mystery', 10), ('Children', 10)]
Decades: [(1970, 41), (1980, 55), (1990, 55), (2000, 55), (2010, 44)]
✅ Saved: /content/drive/MyDrive/recsys_final_20m/artifacts/onboarding_250_diverse.json
Posters missing: 250 out of 250
Example: {'movieId': 318, 'tmdbId': 278, 'title': 'Shawshank Redemption, The (1994)', 'genres_ml': 'Crime|Drama', 'poster_url': None, 'backdrop_url': None, 'overview': None, 'year': 1994, 'director': None, 'runtime': None, 'original_language': None, 'score_weighted': 4.40976098472384, 'score_mainstream': 4.985983739884522, 'n_ratings': 63366, 'avg_rating': 4.446990489959717}


In [7]:
import os, requests

print("TMDB_KEY present?", bool(os.environ.get("TMDB_API_KEY")))
print("TMDB_KEY first 4 chars:", (os.environ.get("TMDB_API_KEY") or "")[:4])

test_tmdb_id = 278  # Shawshank
r = requests.get(
    f"https://api.themoviedb.org/3/movie/{test_tmdb_id}",
    params={"api_key": os.environ.get("TMDB_API_KEY"), "language": "en-US"},
    timeout=20
)

print("Status:", r.status_code)
print("Body snippet:", r.text[:120])


TMDB_KEY present? True
TMDB_KEY first 4 chars: PAST
Status: 401
Body snippet: {"status_code":7,"status_message":"Invalid API key: You must be granted a valid key.","success":false}



In [6]:
import os
os.environ["TMDB_API_KEY"] = "PASTE_YOUR_KEY_HERE"


In [11]:
import os
os.environ["TMDB_API_KEY"] = "bbb2d116d23f8969efb219444eeec069"
print("✅ TMDB key set. First 4 chars:", os.environ["TMDB_API_KEY"][:4])


✅ TMDB key set. First 4 chars: bbb2


In [12]:
import requests, os
r = requests.get(
    "https://api.themoviedb.org/3/movie/278",
    params={"api_key": os.environ["TMDB_API_KEY"], "language": "en-US"},
    timeout=20
)
print("Status:", r.status_code)
print("Title:", r.json().get("title") if r.status_code==200 else r.text[:120])


Status: 200
Title: The Shawshank Redemption


In [13]:
import json, os

BASE = "/content/drive/MyDrive/recsys_final_20m"
ART  = f"{BASE}/artifacts"
PATH = f"{ART}/onboarding_250_diverse.json"

with open(PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

missing_before = sum(1 for x in data if not x.get("poster_url"))

updated = 0
for x in data:
    tmdb_id = x.get("tmdbId")
    if not tmdb_id:
        continue
    payload = tmdb_movie_full(int(tmdb_id))
    if not payload:
        continue

    x["poster_url"] = payload.get("poster_url")
    x["backdrop_url"] = payload.get("backdrop_url")
    x["overview"] = payload.get("overview")
    x["director"] = payload.get("director")
    x["runtime"] = payload.get("runtime")
    x["original_language"] = payload.get("original_language")
    # keep x["year"] as-is unless missing
    if x.get("year") is None:
        x["year"] = payload.get("year")
    updated += 1

save_tmdb_cache()

with open(PATH, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

missing_after = sum(1 for x in data if not x.get("poster_url"))

print("✅ Patched:", PATH)
print("Missing posters before:", missing_before)
print("Missing posters after:", missing_after)
print("Updated entries:", updated)
print("Example:", data[0])


✅ Patched: /content/drive/MyDrive/recsys_final_20m/artifacts/onboarding_250_diverse.json
Missing posters before: 250
Missing posters after: 250
Updated entries: 0
Example: {'movieId': 318, 'tmdbId': 278, 'title': 'Shawshank Redemption, The (1994)', 'genres_ml': 'Crime|Drama', 'poster_url': None, 'backdrop_url': None, 'overview': None, 'year': 1994, 'director': None, 'runtime': None, 'original_language': None, 'score_weighted': 4.40976098472384, 'score_mainstream': 4.985983739884522, 'n_ratings': 63366, 'avg_rating': 4.446990489959717}


In [14]:
import os, requests

print("TMDB_API_KEY first 6:", (os.environ.get("TMDB_API_KEY") or "")[:6])

r = requests.get(
    "https://api.themoviedb.org/3/movie/278",
    params={"api_key": os.environ.get("TMDB_API_KEY"), "language": "en-US"},
    timeout=20
)

print("Status:", r.status_code)
if r.status_code == 200:
    print("Title:", r.json().get("title"))
else:
    print("Body:", r.text[:150])


TMDB_API_KEY first 6: bbb2d1
Status: 200
Title: The Shawshank Redemption


In [15]:
import json, os, time, requests

BASE = "/content/drive/MyDrive/recsys_final_20m"
ART  = f"{BASE}/artifacts"
ONB  = f"{ART}/onboarding_250_diverse.json"
CACHE_PATH = f"{ART}/tmdb_cache_full.json"

TMDB_KEY = os.environ.get("TMDB_API_KEY")
assert TMDB_KEY, "TMDB_API_KEY missing in this runtime."

# Load onboarding
with open(ONB, "r", encoding="utf-8") as f:
    data = json.load(f)

# Load cache (if exists)
if os.path.exists(CACHE_PATH):
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        cache = json.load(f)
else:
    cache = {}

# Purge "poisoned" entries (cached None) for the tmdbIds we need
needed = [str(int(x["tmdbId"])) for x in data if x.get("tmdbId") is not None]
poisoned = sum(1 for k in needed if k in cache and cache[k] is None)

for k in needed:
    if k in cache and cache[k] is None:
        del cache[k]

print("✅ Purged cached-None entries:", poisoned)

TMDB_IMG = "https://image.tmdb.org/t/p"

def fetch_tmdb_full(tmdb_id: int):
    """Fresh fetch (no poisoned cache), returns minimal payload needed for UI."""
    key = str(int(tmdb_id))
    if key in cache and cache[key] is not None:
        return cache[key]

    url = f"https://api.themoviedb.org/3/movie/{int(tmdb_id)}"
    r = requests.get(url, params={"api_key": TMDB_KEY, "append_to_response": "credits"}, timeout=20)
    if r.status_code != 200:
        cache[key] = None
        return None

    d = r.json()

    director = None
    for c in d.get("credits", {}).get("crew", []):
        if c.get("job") == "Director":
            director = c.get("name")
            break

    payload = {
        "title": d.get("title"),
        "overview": d.get("overview"),
        "year": int(d["release_date"][:4]) if d.get("release_date") else None,
        "runtime": d.get("runtime"),
        "original_language": d.get("original_language"),
        "genres": [g["name"] for g in d.get("genres", [])],
        "director": director,
        "poster_url": f"{TMDB_IMG}/w342{d['poster_path']}" if d.get("poster_path") else None,
        "backdrop_url": f"{TMDB_IMG}/w780{d['backdrop_path']}" if d.get("backdrop_path") else None,
    }

    cache[key] = payload
    time.sleep(0.20)  # be nice to TMDB
    return payload

missing_before = sum(1 for x in data if not x.get("poster_url"))
updated = 0

for x in data:
    tmdb_id = x.get("tmdbId")
    if not tmdb_id:
        continue

    payload = fetch_tmdb_full(int(tmdb_id))
    if not payload:
        continue

    x["poster_url"] = payload.get("poster_url")
    x["backdrop_url"] = payload.get("backdrop_url")
    x["overview"] = payload.get("overview")
    x["director"] = payload.get("director")
    x["runtime"] = payload.get("runtime")
    x["original_language"] = payload.get("original_language")
    if x.get("year") is None:
        x["year"] = payload.get("year")

    updated += 1

# Save cache + onboarding back
with open(CACHE_PATH, "w", encoding="utf-8") as f:
    json.dump(cache, f, ensure_ascii=False)

with open(ONB, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

missing_after = sum(1 for x in data if not x.get("poster_url"))

print("✅ Patched onboarding:", ONB)
print("Missing posters before:", missing_before)
print("Missing posters after:", missing_after)
print("Updated entries:", updated)
print("Example:", data[0])


✅ Purged cached-None entries: 0
✅ Patched onboarding: /content/drive/MyDrive/recsys_final_20m/artifacts/onboarding_250_diverse.json
Missing posters before: 250
Missing posters after: 3
Updated entries: 247
Example: {'movieId': 318, 'tmdbId': 278, 'title': 'Shawshank Redemption, The (1994)', 'genres_ml': 'Crime|Drama', 'poster_url': 'https://image.tmdb.org/t/p/w342/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg', 'backdrop_url': 'https://image.tmdb.org/t/p/w780/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg', 'overview': 'Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.', 'year': 1994, 'director': 'Frank Darabont', 'runtime': 142, 'original_language': 'en', 'score_weighted': 4.4

In [16]:
from google.colab import files
files.download("/content/drive/MyDrive/recsys_final_20m/artifacts/onboarding_250_diverse.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>