In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
BASE = "/content/drive/MyDrive/recsys_final_20m"
for d in [
    f"{BASE}/raw",
    f"{BASE}/parquet",
    f"{BASE}/tmdb_cache",
    f"{BASE}/artifacts",
]:
    os.makedirs(d, exist_ok=True)

print("‚úÖ Project base:", BASE)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Project base: /content/drive/MyDrive/recsys_final_20m


In [4]:
!pip -q install polars pyarrow pandas numpy scipy implicit requests scikit-learn


[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m70.3/70.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for implicit (pyproject.toml) ... [?25l[?25hdone


In [5]:
import requests, zipfile, io, os

ml_url = "https://files.grouplens.org/datasets/movielens/ml-20m.zip"
target_dir = f"{BASE}/raw"

r = requests.get(ml_url)
r.raise_for_status()

z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall(target_dir)

ml_root = os.path.join(target_dir, "ml-20m")
print("‚úÖ Extracted files:", sorted(os.listdir(ml_root)))


‚úÖ Extracted files: ['README.txt', 'genome-scores.csv', 'genome-tags.csv', 'links.csv', 'movies.csv', 'ratings.csv', 'tags.csv']


In [6]:
import polars as pl
import os

ml_root = os.path.join(f"{BASE}/raw", "ml-20m")
pq_dir = f"{BASE}/parquet"

ratings = pl.read_csv(
    f"{ml_root}/ratings.csv",
    dtypes={"userId": pl.Int32, "movieId": pl.Int32, "rating": pl.Float32, "timestamp": pl.Int64},
)
movies = pl.read_csv(
    f"{ml_root}/movies.csv",
    dtypes={"movieId": pl.Int32, "title": pl.Utf8, "genres": pl.Utf8},
)
links = pl.read_csv(
    f"{ml_root}/links.csv",
    dtypes={"movieId": pl.Int32, "imdbId": pl.Int32, "tmdbId": pl.Int32},
)

ratings.write_parquet(f"{pq_dir}/ratings.parquet")
movies.write_parquet(f"{pq_dir}/movies.parquet")
links.write_parquet(f"{pq_dir}/links.parquet")

print("‚úÖ Parquet saved")
print("ratings:", ratings.shape, "movies:", movies.shape, "links:", links.shape)


  ratings = pl.read_csv(
  movies = pl.read_csv(
  links = pl.read_csv(


‚úÖ Parquet saved
ratings: (20000263, 4) movies: (27278, 3) links: (27278, 3)


In [7]:
import os, json, requests
from google.colab import userdata

# --- TMDB setup ---
TMDB_KEY = userdata.get("TMDB_API_KEY")
assert TMDB_KEY, "‚ùå Add TMDB_API_KEY in Colab Secrets."

BASE = "/content/drive/MyDrive/recsys_final_20m"
CACHE_PATH = f"{BASE}/tmdb_cache/tmdb_cache_full.json"

if os.path.exists(CACHE_PATH):
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        tmdb_cache = json.load(f)
else:
    tmdb_cache = {}

TMDB_BASE = "https://api.themoviedb.org/3"

def poster_url(path, size="w342"):
    return f"https://image.tmdb.org/t/p/{size}{path}" if path else None

def backdrop_url(path, size="w780"):
    return f"https://image.tmdb.org/t/p/{size}{path}" if path else None

def tmdb_movie_full(tmdb_id: int):
    """
    Fetch full TMDB movie payload:
    - details
    - director(s)
    - top cast
    - runtime, language, genres, etc.
    Cached to Drive.
    """
    if tmdb_id is None:
        return None

    key = str(int(tmdb_id))
    if key in tmdb_cache:
        return tmdb_cache[key]

    # ---- 1) Movie details ----
    details_url = f"{TMDB_BASE}/movie/{int(tmdb_id)}"
    details_resp = requests.get(
        details_url,
        params={"api_key": TMDB_KEY, "language": "en-US"},
        timeout=20
    )
    if details_resp.status_code != 200:
        tmdb_cache[key] = None
        return None

    d = details_resp.json()

    # ---- 2) Credits (directors + cast) ----
    credits_url = f"{TMDB_BASE}/movie/{int(tmdb_id)}/credits"
    credits_resp = requests.get(
        credits_url,
        params={"api_key": TMDB_KEY},
        timeout=20
    )
    credits = credits_resp.json() if credits_resp.status_code == 200 else {}

    # Directors
    directors = [
        c["name"]
        for c in credits.get("crew", [])
        if c.get("job") == "Director"
    ]

    # Top cast (first 5 billed)
    top_cast = [
        c["name"]
        for c in credits.get("cast", [])[:5]
    ]

    payload = {
        "tmdb_id": d.get("id"),
        "title": d.get("title"),
        "original_title": d.get("original_title"),
        "overview": d.get("overview"),
        "tagline": d.get("tagline"),
        "release_date": d.get("release_date"),
        "year": int(d["release_date"][:4]) if d.get("release_date") else None,

        # Core details
        "runtime": d.get("runtime"),  # minutes
        "original_language": d.get("original_language"),
        "spoken_languages": [l["english_name"] for l in d.get("spoken_languages", [])],
        "production_countries": [c["name"] for c in d.get("production_countries", [])],

        # Genres & people
        "genres": [g["name"] for g in d.get("genres", [])],
        "directors": directors,
        "director": directors[0] if directors else None,
        "top_cast": top_cast,

        # Media
        "poster_path": d.get("poster_path"),
        "poster_url": poster_url(d.get("poster_path")),
        "backdrop_path": d.get("backdrop_path"),
        "backdrop_url": backdrop_url(d.get("backdrop_path")),

        # Popularity & ratings
        "vote_average": d.get("vote_average"),
        "vote_count": d.get("vote_count"),
        "popularity": d.get("popularity"),

        # Misc
        "adult": d.get("adult"),
        "status": d.get("status"),
        "imdb_id": d.get("imdb_id"),
    }

    tmdb_cache[key] = payload
    return payload

def save_tmdb_cache():
    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(tmdb_cache, f, ensure_ascii=False, indent=2)

print("‚úÖ TMDB full cache loaded:", len(tmdb_cache))


‚úÖ TMDB full cache loaded: 80


In [8]:
import polars as pl
import json

ratings = pl.read_parquet(f"{BASE}/parquet/ratings.parquet")
movies  = pl.read_parquet(f"{BASE}/parquet/movies.parquet")
links   = pl.read_parquet(f"{BASE}/parquet/links.parquet")

stats = (
    ratings.group_by("movieId")
    .agg([
        pl.len().alias("n_ratings"),
        pl.mean("rating").alias("avg_rating")
    ])
)

C = stats.select(pl.mean("avg_rating")).item()
m = stats.select(pl.quantile("n_ratings", 0.80)).item()

popular = (
    stats.with_columns(
        (
            (pl.col("n_ratings")/(pl.col("n_ratings")+m))*pl.col("avg_rating") +
            (m/(pl.col("n_ratings")+m))*C
        ).alias("weighted")
    )
    .sort(["weighted","n_ratings"], descending=True)
    .head(80)
    .join(movies, on="movieId", how="left")
    .join(links, on="movieId", how="left")
)

rows = popular.to_dicts()
out = []
for r in rows:
    payload = tmdb_movie_full(r.get("tmdbId"))
    out.append({
        "movieId": int(r["movieId"]),
        "title": r.get("title"),
        "genres": r.get("genres"),
        "tmdbId": None if r.get("tmdbId") is None else int(r["tmdbId"]),
        "poster": poster_url(payload.get("poster_path")) if payload else None,
        "overview": payload.get("overview") if payload else None,
        "release_date": payload.get("release_date") if payload else None,
        "score": float(r["weighted"]),
        "n_ratings": int(r["n_ratings"]),
        "avg_rating": float(r["avg_rating"]),
    })

save_tmdb_cache()

out_path = f"{BASE}/artifacts/home_popular_80.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)

print("‚úÖ Wrote homepage JSON:", out_path)
print("Example item:", out[0])


‚úÖ Wrote homepage JSON: /content/drive/MyDrive/recsys_final_20m/artifacts/home_popular_80.json
Example item: {'movieId': 318, 'title': 'Shawshank Redemption, The (1994)', 'genres': 'Crime|Drama', 'tmdbId': 278, 'poster': 'https://image.tmdb.org/t/p/w342/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg', 'overview': 'Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.', 'release_date': '1994-09-23', 'score': 4.439855703205573, 'n_ratings': 63366, 'avg_rating': 4.446990489959717}


In [9]:
import polars as pl

BASE = "/content/drive/MyDrive/recsys_final_20m"

ratings = pl.read_parquet(f"{BASE}/parquet/ratings.parquet").select(["userId","movieId","rating"])
movies  = pl.read_parquet(f"{BASE}/parquet/movies.parquet")
links   = pl.read_parquet(f"{BASE}/parquet/links.parquet")

# Keep users with at least 20 ratings (adjustable)
user_counts = ratings.group_by("userId").agg(pl.len().alias("n"))
active_users = user_counts.filter(pl.col("n") >= 20).select("userId")

ratings_f = ratings.join(active_users, on="userId", how="inner")

print("‚úÖ Original ratings:", ratings.shape)
print("‚úÖ Filtered ratings:", ratings_f.shape)
print("‚úÖ Active users:", active_users.shape)


‚úÖ Original ratings: (20000263, 3)
‚úÖ Filtered ratings: (20000263, 3)
‚úÖ Active users: (138493, 1)


In [10]:
print("‚úÖ Original ratings:", ratings.shape)
print("‚úÖ Filtered ratings:", ratings_f.shape)
print("‚úÖ Active users:", active_users.shape)


‚úÖ Original ratings: (20000263, 3)
‚úÖ Filtered ratings: (20000263, 3)
‚úÖ Active users: (138493, 1)


In [None]:
# ‚úÖ CELL 8 (FULL, CORRECTED): Build sparse matrix + train ALS (implicit)

import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

# ratings_f should come from Cell 7 (filtered ratings)
# user_list, item_list, user_id_to_idx, item_id_to_idx will be built here

# 1) Build id -> index maps
unique_users = ratings_f.select("userId").unique().sort("userId")
unique_items = ratings_f.select("movieId").unique().sort("movieId")

user_list = unique_users["userId"].to_list()   # index -> userId
item_list = unique_items["movieId"].to_list()  # index -> movieId

user_id_to_idx = {u: i for i, u in enumerate(user_list)}
item_id_to_idx = {m: i for i, m in enumerate(item_list)}

# 2) Convert ratings to numpy arrays (for sparse matrix construction)
r_pd = ratings_f.to_pandas()
u = r_pd["userId"].map(user_id_to_idx).to_numpy()
i = r_pd["movieId"].map(item_id_to_idx).to_numpy()
v = r_pd["rating"].astype(np.float32).to_numpy()

# 3) Build sparse user-item matrix (users x items)
mat = coo_matrix(
    (v, (u, i)),
    shape=(len(user_id_to_idx), len(item_id_to_idx))
).tocsr()

print("‚úÖ mat shape (users x items):", mat.shape, "| nonzeros:", mat.nnz)

# 4) Train ALS
model = AlternatingLeastSquares(
    factors=128,
    regularization=0.12,
    iterations=25,
)

# IMPORTANT: fit on user-item matrix for your implicit version
model.fit(mat)

print("‚úÖ ALS trained")
print("‚úÖ user_factors:", model.user_factors.shape, "(should match users)")
print("‚úÖ item_factors:", model.item_factors.shape, "(should match items)")


‚úÖ mat shape (users x items): (138493, 26744) | nonzeros: 20000263


  0%|          | 0/25 [00:00<?, ?it/s]

‚úÖ ALS trained
‚úÖ user_factors: (138493, 128) (should match users)
‚úÖ item_factors: (26744, 128) (should match items)


In [None]:
import numpy as np, json, os

ART = f"{BASE}/artifacts/als_model"
os.makedirs(ART, exist_ok=True)

# save factors
np.save(f"{ART}/user_factors.npy", model.user_factors)
np.save(f"{ART}/item_factors.npy", model.item_factors)

# save id maps
with open(f"{ART}/user_ids.json", "w", encoding="utf-8") as f:
    json.dump(user_list, f)   # index -> userId

with open(f"{ART}/movie_ids.json", "w", encoding="utf-8") as f:
    json.dump(item_list, f)   # index -> movieId

# save hyperparams for the report
meta = {
    "factors": 128,
    "regularization": 0.12,
    "iterations": 25,
    "notes": "ALS (implicit) trained on ML-20M (filtered or full depending on prior step)."
}
with open(f"{ART}/meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("‚úÖ Saved ALS artifacts to:", ART)
print("user_factors:", model.user_factors.shape, "item_factors:", model.item_factors.shape)


‚úÖ Saved ALS artifacts to: /content/drive/MyDrive/recsys_final_20m/artifacts/als_model
user_factors: (138493, 128) item_factors: (26744, 128)


In [None]:
import random
import polars as pl
import numpy as np

# item_user should already exist from training cell:
# item_user = mat.T
# Make sure it's CSR
item_user_csr = item_user.tocsr()

# pick a random active userId
some_user_id = random.choice(user_list)
uidx = user_id_to_idx[some_user_id]

# recommend: pass item_user matrix (NOT mat[uidx])
rec_ids, rec_scores = model.recommend(
    userid=uidx,
    user_items=item_user_csr,
    N=100,
    filter_already_liked_items=True
)

# map item-index -> movieId
idx_to_movie = {idx: mid for mid, idx in item_id_to_idx.items()}
rec_movie_ids = [idx_to_movie[int(i)] for i in rec_ids]
rec_scores    = [float(s) for s in rec_scores]

rec_df = pl.DataFrame({"movieId": rec_movie_ids, "score": rec_scores})
rec_df = rec_df.join(movies, on="movieId", how="left").join(links, on="movieId", how="left")

print("‚úÖ Example userId:", some_user_id, "| user_index:", uidx)
rec_df.head(10)


ValueError: user_items must contain 1 row for every user in userids

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os, glob
BASE = "/content/drive/MyDrive/recsys_final_20m"
ART = f"{BASE}/artifacts/als_model"
print("exists:", os.path.exists(ART))
print(sorted([os.path.basename(p) for p in glob.glob(ART+"/*")]))


exists: True
['item_factors.npy', 'meta.json', 'movie_ids.json', 'user_factors.npy', 'user_ids.json']


In [3]:
import numpy as np, json, os
BASE="/content/drive/MyDrive/recsys_final_20m"
ART=f"{BASE}/artifacts/als_model"

item_factors = np.load(f"{ART}/item_factors.npy")
with open(f"{ART}/movie_ids.json","r",encoding="utf-8") as f:
    movie_ids = json.load(f)

print("item_factors rows:", item_factors.shape[0])
print("movie_ids:", len(movie_ids))
print("Example movieId:", movie_ids[0])


item_factors rows: 26744
movie_ids: 26744
Example movieId: 1


In [11]:
import polars as pl, re, json

BASE = "/content/drive/MyDrive/recsys_final_20m"

ratings = pl.read_parquet(f"{BASE}/parquet/ratings.parquet").select(["movieId","rating"])
movies  = pl.read_parquet(f"{BASE}/parquet/movies.parquet")
links   = pl.read_parquet(f"{BASE}/parquet/links.parquet")

# --- 1) Compute weighted score (same as before) ---
stats = (
    ratings.group_by("movieId")
    .agg([pl.len().alias("n_ratings"), pl.mean("rating").alias("avg_rating")])
)

C = stats.select(pl.mean("avg_rating")).item()
m = stats.select(pl.quantile("n_ratings", 0.80)).item()

cand = (
    stats.with_columns(
        (
            (pl.col("n_ratings")/(pl.col("n_ratings")+m))*pl.col("avg_rating") +
            (m/(pl.col("n_ratings")+m))*C
        ).alias("weighted")
    )
    .join(movies, on="movieId", how="left")
    .join(links, on="movieId", how="left")
    .sort(["weighted","n_ratings"], descending=True)
    .head(2000)  # take a larger candidate pool, then diversify
)

rows = cand.to_dicts()

# --- 2) Helpers: primary genre + decade from title year ---
year_re = re.compile(r"\((\d{4})\)\s*$")

def get_year(title):
    if not title:
        return None
    m = year_re.search(title)
    return int(m.group(1)) if m else None

def get_decade(y):
    if y is None:
        return None
    return (y // 10) * 10

def primary_genre(genres):
    if not genres:
        return "Unknown"
    return genres.split("|")[0] if "|" in genres else genres

# --- 3) Diversity caps (tunable, safe defaults) ---
MAX_PER_PRIMARY_GENRE = 28   # prevents 120 dramas
MAX_PER_DECADE = 55          # prevents 150 from one decade
MAX_UNKNOWN_YEAR = 15

genre_count = {}
decade_count = {}
unknown_year = 0

selected = []
selected_ids = set()

for r in rows:
    mid = int(r["movieId"])
    if mid in selected_ids:
        continue

    g = primary_genre(r.get("genres"))
    y = get_year(r.get("title"))
    d = get_decade(y)

    # enforce caps
    if genre_count.get(g, 0) >= MAX_PER_PRIMARY_GENRE:
        continue

    if d is None:
        if unknown_year >= MAX_UNKNOWN_YEAR:
            continue
    else:
        if decade_count.get(d, 0) >= MAX_PER_DECADE:
            continue

    # accept
    selected.append(r)
    selected_ids.add(mid)
    genre_count[g] = genre_count.get(g, 0) + 1
    if d is None:
        unknown_year += 1
    else:
        decade_count[d] = decade_count.get(d, 0) + 1

    if len(selected) >= 250:
        break

print("‚úÖ Selected:", len(selected))
print("Top primary genres:", sorted(genre_count.items(), key=lambda x: -x[1])[:10])
print("Decades:", sorted(decade_count.items()))

# --- 4) Enrich ONLY for these 250 using your tmdb_movie_full cache ---
out = []
for r in selected:
    payload = tmdb_movie_full(r.get("tmdbId"))
    out.append({
        "movieId": int(r["movieId"]),
        "tmdbId": None if r.get("tmdbId") is None else int(r["tmdbId"]),
        "title": r.get("title"),
        "genres_ml": r.get("genres"),
        "poster_url": payload.get("poster_url") if payload else None,
        "backdrop_url": payload.get("backdrop_url") if payload else None,
        "overview": payload.get("overview") if payload else None,
        "year": payload.get("year") if payload else get_year(r.get("title")),
        "director": payload.get("director") if payload else None,
        "runtime": payload.get("runtime") if payload else None,
        "original_language": payload.get("original_language") if payload else None,
        "score_weighted": float(r["weighted"]),
        "n_ratings": int(r["n_ratings"]),
        "avg_rating": float(r["avg_rating"]),
    })

save_tmdb_cache()

OUT_PATH = f"{BASE}/artifacts/onboarding_250_diverse.json"
with open(OUT_PATH, "w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)

print("‚úÖ Saved:", OUT_PATH)


‚úÖ Selected: 250
Top primary genres: [('Crime', 28), ('Drama', 28), ('Action', 28), ('Comedy', 28), ('Adventure', 28), ('Animation', 28), ('Documentary', 28), ('Children', 11), ('Mystery', 10), ('Horror', 9)]
Decades: [(1920, 3), (1930, 10), (1940, 21), (1950, 25), (1960, 26), (1970, 20), (1980, 36), (1990, 55), (2000, 43), (2010, 11)]
‚úÖ Saved: /content/drive/MyDrive/recsys_final_20m/artifacts/onboarding_250_diverse.json


In [12]:
# ============================
# NOTEBOOK 01 FINAL CHECKLIST
# ============================

from google.colab import drive
drive.mount('/content/drive')

import os, json, numpy as np

BASE = "/content/drive/MyDrive/recsys_final_20m"
ART = f"{BASE}/artifacts"

print("\nüîç Checking base path...")
assert os.path.exists(BASE), "‚ùå Base directory missing"
print("‚úÖ Base directory exists")

# ---- 1) ALS artifacts ----
ALS_DIR = f"{ART}/als_model"
print("\nüîç Checking ALS artifacts...")

required_als_files = [
    "item_factors.npy",
    "movie_ids.json",
    "meta.json",
    "user_factors.npy",   # may not be used later, but should exist
    "user_ids.json"
]

for f in required_als_files:
    path = f"{ALS_DIR}/{f}"
    assert os.path.exists(path), f"‚ùå Missing ALS file: {f}"
    print(f"‚úÖ Found {f}")

# Load and sanity-check ALS dimensions
item_factors = np.load(f"{ALS_DIR}/item_factors.npy")
with open(f"{ALS_DIR}/movie_ids.json", "r", encoding="utf-8") as f:
    movie_ids = json.load(f)

print("\nüìê ALS dimension checks:")
print("item_factors shape:", item_factors.shape)
print("movie_ids length:", len(movie_ids))
assert item_factors.shape[0] == len(movie_ids), \
    "‚ùå item_factors rows != movie_ids length"

print("‚úÖ ALS catalog consistency OK")

# ---- 2) Onboarding files ----
print("\nüîç Checking onboarding files...")

onboarding_files = [
    "onboarding_250_diverse.json"
]

for f in onboarding_files:
    path = f"{ART}/{f}"
    assert os.path.exists(path), f"‚ùå Missing onboarding file: {f}"
    print(f"‚úÖ Found {f}")

with open(f"{ART}/onboarding_250_diverse.json", "r", encoding="utf-8") as f:
    onboarding = json.load(f)

print("onboarding_250_diverse.json items:", len(onboarding))
assert len(onboarding) == 250, "‚ùå Onboarding file does not contain 250 movies"
print("‚úÖ Onboarding pool size OK")

# ---- 3) Homepage file (optional but recommended) ----
print("\nüîç Checking homepage data...")

home_path = f"{ART}/home_popular_80.json"
if os.path.exists(home_path):
    with open(home_path, "r", encoding="utf-8") as f:
        home = json.load(f)
    print("‚úÖ home_popular_80.json found | items:", len(home))
else:
    print("‚ö†Ô∏è home_popular_80.json NOT found (optional, not fatal)")

# ---- 4) TMDB cache ----
print("\nüîç Checking TMDB cache...")

tmdb_cache_path = f"{BASE}/tmdb_cache/tmdb_cache_full.json"
assert os.path.exists(tmdb_cache_path), "‚ùå TMDB cache missing"

with open(tmdb_cache_path, "r", encoding="utf-8") as f:
    tmdb_cache = json.load(f)

print("TMDB cache entries:", len(tmdb_cache))
assert len(tmdb_cache) >= 200, \
    "‚ö†Ô∏è TMDB cache seems small (not fatal, but unexpected)"

print("‚úÖ TMDB cache present")

# ---- FINAL STATUS ----
print("\n==============================")
print("üéâ NOTEBOOK 01 IS COMPLETE üéâ")
print("You will NOT need to rerun it.")
print("==============================")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

üîç Checking base path...
‚úÖ Base directory exists

üîç Checking ALS artifacts...
‚úÖ Found item_factors.npy
‚úÖ Found movie_ids.json
‚úÖ Found meta.json
‚úÖ Found user_factors.npy
‚úÖ Found user_ids.json

üìê ALS dimension checks:
item_factors shape: (26744, 128)
movie_ids length: 26744
‚úÖ ALS catalog consistency OK

üîç Checking onboarding files...
‚úÖ Found onboarding_250_diverse.json
onboarding_250_diverse.json items: 250
‚úÖ Onboarding pool size OK

üîç Checking homepage data...
‚úÖ home_popular_80.json found | items: 80

üîç Checking TMDB cache...
TMDB cache entries: 249
‚úÖ TMDB cache present

üéâ NOTEBOOK 01 IS COMPLETE üéâ
You will NOT need to rerun it.
