In [1]:
# ============================
# NOTEBOOK 02 — SETUP
# ============================

from google.colab import drive
# Mount Google Drive at /content/drive; the BASE path defined below lives inside this mount.
drive.mount('/content/drive')

import json
import numpy as np
import os

# Project root on the mounted Drive and the artifacts folder beneath it.
BASE = "/content/drive/MyDrive/recsys_final_20m"
ART = f"{BASE}/artifacts"

# ---- ALS item factors: one row per catalogue movie ----
factors_path = f"{ART}/als_model/item_factors.npy"
item_factors = np.load(factors_path)

# movie_ids.json lists the MovieLens id for each row of item_factors, in row order.
movie_ids_path = f"{ART}/als_model/movie_ids.json"
with open(movie_ids_path, "r", encoding="utf-8") as f:
    item_movie_ids = list(map(int, json.load(f)))

print("✅ item_factors shape:", item_factors.shape)
print("✅ movie_ids length:", len(item_movie_ids))

# Rows and ids must line up one-to-one, otherwise every later index lookup is wrong.
assert item_factors.shape[0] == len(item_movie_ids), "Mismatch between item factors and movie IDs"

# ---- Onboarding pool: the movies a new user is asked to rate ----
onboarding_path = f"{ART}/onboarding_250_diverse.json"
with open(onboarding_path, "r", encoding="utf-8") as f:
    onboarding_movies = json.load(f)

print("✅ onboarding movies:", len(onboarding_movies))
print("Example onboarding keys:", onboarding_movies[0].keys())


Mounted at /content/drive
✅ item_factors shape: (26744, 128)
✅ movie_ids length: 26744
✅ onboarding movies: 250
Example onboarding keys: dict_keys(['movieId', 'tmdbId', 'title', 'genres_ml', 'poster_url', 'backdrop_url', 'overview', 'year', 'director', 'runtime', 'original_language', 'score_weighted', 'n_ratings', 'avg_rating'])


In [2]:
# ============================
# NOTEBOOK 02 — STEP 1: USER VECTOR
# ============================

import numpy as np

# Build mapping: MovieLens movieId -> row index in item_factors
movie_id_to_index = {mid: idx for idx, mid in enumerate(item_movie_ids)}

# Simulate a new user rating some onboarding movies (you can edit these)
# Rating scale: 1..5
user_ratings = {
    onboarding_movies[0]["movieId"]: 5,
    onboarding_movies[3]["movieId"]: 4,
    onboarding_movies[7]["movieId"]: 2,
    onboarding_movies[12]["movieId"]: 5,
    onboarding_movies[20]["movieId"]: 1,
    onboarding_movies[25]["movieId"]: 4,
    onboarding_movies[40]["movieId"]: 3,
    onboarding_movies[60]["movieId"]: 5,
    onboarding_movies[90]["movieId"]: 2,
    onboarding_movies[120]["movieId"]: 4,
    onboarding_movies[160]["movieId"]: 5,
    onboarding_movies[200]["movieId"]: 1,
}

# Normalise ids to int, exactly like the factor lookup below does, so the
# "already rated" filter in the scoring step compares like with like even if
# the onboarding JSON ever carries ids as strings or floats.
rated_movie_ids = {int(mid) for mid in user_ratings}

K = item_factors.shape[1]
user_vector = np.zeros(K, dtype=np.float32)

# Ratings are centred on this neutral value, giving weights in [-2, +2]:
# liked movies pull the vector towards their factors, disliked ones push away.
NEUTRAL_RATING = 3.0
weight_sum = 0.0
used = 0

for mid, rating in user_ratings.items():
    idx = movie_id_to_index.get(int(mid))
    if idx is None:
        # Movie is not part of the ALS catalogue; it cannot contribute.
        continue

    w = float(rating) - NEUTRAL_RATING
    if w == 0:
        # A neutral rating carries no direction, so it is skipped entirely.
        continue

    user_vector += w * item_factors[idx]
    weight_sum += abs(w)
    used += 1

if weight_sum > 0:
    # Normalise by total absolute weight so the scale does not grow with the
    # number of ratings supplied.
    user_vector /= weight_sum
else:
    # Without any usable rating the vector stays all-zero and every catalogue
    # score would be 0, making the ranking arbitrary; say so loudly.
    print("⚠️ No usable ratings: user vector is all zeros, recommendations will be arbitrary.")

print("✅ Rated movies:", len(rated_movie_ids))
print("✅ Used in vector:", used)
print("✅ User vector norm:", float(np.linalg.norm(user_vector)))


✅ Rated movies: 12
✅ Used in vector: 11
✅ User vector norm: 0.3355165123939514


In [3]:
# ============================
# NOTEBOOK 02 — STEP 2: SCORE + TOP-100 (EXCLUDING RATED)
# ============================

import numpy as np

# 1) Score every movie in the catalog: one dot product per item row.
scores = item_factors @ user_vector  # shape: (n_items,)

# 2) Take a large candidate pool, then filter out rated movies.
TOP_K = 5000   # size of the candidate pool taken before filtering
N_RECS = 100   # number of recommendations to keep

# np.argpartition requires kth < len(array); clamp the pool so a catalogue
# with TOP_K or fewer items does not raise ValueError.
n_items = scores.shape[0]
pool_size = min(TOP_K, n_items)
if pool_size < n_items:
    top_idx = np.argpartition(-scores, pool_size)[:pool_size]
else:
    # The whole catalogue fits in the pool; no partitioning needed.
    top_idx = np.arange(n_items)

# argpartition does not order the pool, so sort it by descending score.
top_idx = top_idx[np.argsort(-scores[top_idx])]

recs = []
for idx in top_idx:
    mid = item_movie_ids[int(idx)]
    if mid in rated_movie_ids:
        # Never recommend something the user has already rated.
        continue
    recs.append({
        "movieId": int(mid),
        "score": float(scores[int(idx)])
    })
    if len(recs) >= N_RECS:
        break

print("✅ Scores computed:", scores.shape)
print("✅ Recommendations returned:", len(recs))
print("Top 5 recs (movieId, score):", [(r["movieId"], round(r["score"], 4)) for r in recs[:5]])


✅ Scores computed: (26744,)
✅ Recommendations returned: 100
Top 5 recs (movieId, score): [(296, 0.1509), (593, 0.14), (50, 0.1353), (904, 0.1206), (150, 0.12)]


In [4]:
# ============================
# NOTEBOOK 02 — STEP 3: ENRICH + EXPORT JSON
# ============================

import json
import os

# Destination of the exported recommendation list.
OUT = f"{ART}/recs_top100.json"

# movieId -> tmdbId lookup built from the onboarding pool only, so it can
# resolve just the movies that appear in that pool.
movieId_to_tmdb = dict(
    (movie["movieId"], movie.get("tmdbId")) for movie in onboarding_movies
)

def enrich_recommendations(recs):
    """Attach TMDB metadata to a list of scored recommendations.

    Parameters
    ----------
    recs : iterable of dict
        Each item must provide ``"movieId"`` and ``"score"``.

    Returns
    -------
    list of dict
        One record per input item, in input order, holding ``movieId``,
        ``tmdbId``, the score rounded to 6 decimals, and the UI metadata
        fields. Metadata fields are ``None`` when no TMDB id or payload is
        available.

    Notes
    -----
    Uses the module-level ``movieId_to_tmdb`` mapping and the
    ``tmdb_movie_full`` helper; both are looked up when the function is
    called, so they must be defined before the first call.
    """
    # UI / metadata fields copied from the TMDB payload, in output order.
    metadata_fields = (
        "title",
        "poster_url",
        "backdrop_url",
        "overview",
        "year",
        "director",
        "runtime",
        "original_language",
        "genres",
    )

    def _enrich_one(rec):
        movie_id = rec["movieId"]
        raw_score = rec["score"]

        tmdb_id = movieId_to_tmdb.get(movie_id)
        # Only hit the TMDB helper when an id is actually known.
        details = tmdb_movie_full(tmdb_id) if tmdb_id else None

        record = {
            "movieId": movie_id,
            "tmdbId": tmdb_id,
            "score": round(raw_score, 6),
        }
        for field in metadata_fields:
            record[field] = details.get(field) if details else None
        return record

    return [_enrich_one(rec) for rec in recs]

# Enrich the scored list. NOTE: enrich_recommendations calls tmdb_movie_full,
# which is defined in the TMDB setup cell further down; run that cell first or
# this line raises NameError.
final_recs = enrich_recommendations(recs)

# Serialise first, then write the finished text to the export file.
with open(OUT, "w", encoding="utf-8") as f:
    f.write(json.dumps(final_recs, ensure_ascii=False, indent=2))

print("✅ Saved recommendations:", OUT)
print("Example item:")
final_recs[0]


NameError: name 'tmdb_movie_full' is not defined

In [5]:
# ============================
# NOTEBOOK 02 — TMDB SETUP (DEFINE tmdb_movie_full)
# ============================

import json, requests
import polars as pl
from google.colab import userdata

# Endpoint root and on-Drive location of the TMDB response cache.
TMDB_BASE = "https://api.themoviedb.org/3"
TMDB_CACHE_PATH = f"{BASE}/tmdb_cache/tmdb_cache_full.json"

# The API key is read from Colab Secrets so it never appears in the notebook.
TMDB_KEY = userdata.get("TMDB_API_KEY")
assert TMDB_KEY, "❌ Add TMDB_API_KEY in Colab Secrets (same as Notebook 01)."

# Load the cache created in Notebook 01; it is keyed by the TMDB id as a string.
with open(TMDB_CACHE_PATH, "r", encoding="utf-8") as f:
    tmdb_cache = json.loads(f.read())

def poster_url(path, size="w342"):
    """Build the full TMDB image URL for a poster path.

    Returns ``None`` when ``path`` is empty or ``None``; otherwise the image
    base URL followed by ``size`` and ``path``.
    """
    if not path:
        return None
    return f"https://image.tmdb.org/t/p/{size}{path}"

def backdrop_url(path, size="w780"):
    """Build the full TMDB image URL for a backdrop path.

    Returns ``None`` when ``path`` is empty or ``None``; otherwise the image
    base URL followed by ``size`` and ``path``.
    """
    if not path:
        return None
    return f"https://image.tmdb.org/t/p/{size}{path}"

def tmdb_movie_full(tmdb_id: int):
    """Return TMDB details + credits for one movie, backed by ``tmdb_cache``.

    Parameters
    ----------
    tmdb_id : int or None
        TMDB movie id. ``None`` short-circuits to ``None``.

    Returns
    -------
    dict or None
        A payload with ``tmdb_id``, ``title``, ``overview``, ``release_date``,
        ``year``, ``runtime``, ``original_language``, ``genres``, ``director``,
        ``top_cast``, ``poster_url`` and ``backdrop_url``; ``None`` when the
        details request does not succeed.

    Caching policy
    --------------
    * A cached entry (including a cached ``None``) is returned without any
      network call.
    * A 404 on the details endpoint is cached as ``None``: the movie does not
      exist, so asking again is pointless.
    * Any other non-200 status (rate limiting, server errors, auth problems)
      returns ``None`` WITHOUT caching, so a transient failure is not written
      to the Drive cache and retried never.
    * A payload is cached only when the credits request also succeeded;
      otherwise it is still returned, but the next call will retry so that
      director/cast are not permanently missing.
    """
    if tmdb_id is None:
        return None
    key = str(int(tmdb_id))
    if key in tmdb_cache:
        return tmdb_cache[key]

    # details
    details_resp = requests.get(
        f"{TMDB_BASE}/movie/{int(tmdb_id)}",
        params={"api_key": TMDB_KEY, "language": "en-US"},
        timeout=20
    )
    if details_resp.status_code != 200:
        if details_resp.status_code == 404:
            # Definitive "no such movie": safe to remember.
            tmdb_cache[key] = None
        return None
    d = details_resp.json()

    # credits
    credits_resp = requests.get(
        f"{TMDB_BASE}/movie/{int(tmdb_id)}/credits",
        params={"api_key": TMDB_KEY},
        timeout=20
    )
    credits_ok = credits_resp.status_code == 200
    credits = credits_resp.json() if credits_ok else {}

    # `or []` guards against the API returning an explicit null for a list field.
    directors = [x["name"] for x in (credits.get("crew") or []) if x.get("job") == "Director"]
    top_cast = [x["name"] for x in (credits.get("cast") or [])[:5]]

    payload = {
        "tmdb_id": d.get("id"),
        "title": d.get("title"),
        "overview": d.get("overview"),
        "release_date": d.get("release_date"),
        # release_date is "YYYY-MM-DD"; the first four characters are the year.
        "year": int(d["release_date"][:4]) if d.get("release_date") else None,
        "runtime": d.get("runtime"),
        "original_language": d.get("original_language"),
        "genres": [g["name"] for g in (d.get("genres") or [])],
        "director": directors[0] if directors else None,
        "top_cast": top_cast,
        "poster_url": poster_url(d.get("poster_path")),
        "backdrop_url": backdrop_url(d.get("backdrop_path")),
    }

    if credits_ok:
        # Only complete payloads are remembered; partial ones get retried later.
        tmdb_cache[key] = payload
    return payload

def save_tmdb_cache():
    """Persist ``tmdb_cache`` to ``TMDB_CACHE_PATH`` without risking the old file.

    The JSON is written to a sibling ``.tmp`` file and moved over the real
    cache with ``os.replace`` only after the write has fully succeeded. If
    serialisation or writing fails, the previous cache file is left untouched,
    the temporary file is removed, and the exception propagates.
    """
    tmp_path = f"{TMDB_CACHE_PATH}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(tmdb_cache, f, ensure_ascii=False, indent=2)
        # Swap in the new file in one step; the old cache stays valid until here.
        os.replace(tmp_path, TMDB_CACHE_PATH)
    finally:
        # After a successful replace the temp file no longer exists, so this
        # only cleans up after a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Confirm the helpers are defined and report how many entries the loaded cache holds.
print("✅ TMDB helper loaded. Cache entries:", len(tmdb_cache))


✅ TMDB helper loaded. Cache entries: 249


In [6]:
# ============================
# NOTEBOOK 02 — movieId -> tmdbId mapping (FULL)
# ============================

# Full MovieLens -> TMDB id table; only the two id columns are kept.
links = pl.read_parquet(f"{BASE}/parquet/links.parquet").select(["movieId","tmdbId"])

# Rebinds movieId_to_tmdb to cover every row of links.parquet.
# A null tmdbId is kept as None rather than dropped, so callers can tell
# "known movie without a TMDB id" apart from "unknown movie".
movieId_to_tmdb = dict(
    (int(row["movieId"]), None if row["tmdbId"] is None else int(row["tmdbId"]))
    for row in links.to_dicts()
)

print("✅ Loaded movieId->tmdbId mapping:", len(movieId_to_tmdb))


✅ Loaded movieId->tmdbId mapping: 27278


In [7]:
# ============================
# NOTEBOOK 02 — STEP 3: ENRICH + EXPORT JSON
# ============================

import json

# Destination of the exported recommendation list.
OUT = f"{ART}/recs_top100.json"

# UI / metadata fields copied from the TMDB payload, in output order.
METADATA_FIELDS = (
    "title",
    "poster_url",
    "backdrop_url",
    "overview",
    "year",
    "director",
    "runtime",
    "original_language",
    "genres",
)

final_recs = []
missing_tmdb = 0

try:
    for r in recs:
        mid = int(r["movieId"])
        score = float(r["score"])

        tmdb_id = movieId_to_tmdb.get(mid)
        if tmdb_id is None:
            # No TMDB id known for this movie: export it without metadata.
            missing_tmdb += 1
            payload = None
        else:
            payload = tmdb_movie_full(tmdb_id)

        item = {
            "movieId": mid,
            "tmdbId": tmdb_id,
            "score": round(score, 6),
        }
        for field in METADATA_FIELDS:
            item[field] = payload.get(field) if payload else None
        final_recs.append(item)
finally:
    # Persist whatever was fetched even if the loop is interrupted (timeout,
    # network error, manual stop), so completed API calls are not repeated.
    save_tmdb_cache()

with open(OUT, "w", encoding="utf-8") as f:
    json.dump(final_recs, f, ensure_ascii=False, indent=2)

print("✅ Saved:", OUT)
print("Movies without tmdbId:", missing_tmdb, "out of", len(final_recs))
# Guard the preview so an empty recommendation list does not raise IndexError.
print("Example item:", final_recs[0] if final_recs else None)


✅ Saved: /content/drive/MyDrive/recsys_final_20m/artifacts/recs_top100.json
Movies without tmdbId: 0 out of 100
Example item: {'movieId': 296, 'tmdbId': 680, 'score': 0.150892, 'title': 'Pulp Fiction', 'poster_url': 'https://image.tmdb.org/t/p/w342/vQWk5YBFWF4bZaofAbv0tShwBvQ.jpg', 'backdrop_url': 'https://image.tmdb.org/t/p/w780/96hiUXEuYsu4tcnvlaY8tEMFM0m.jpg', 'overview': "A burger-loving hit man, his philosophical partner, a drug-addled gangster's moll and a washed-up boxer converge in this sprawling, comedic crime caper. Their adventures unfurl in three stories that ingeniously trip back and forth in time.", 'year': 1994, 'director': 'Quentin Tarantino', 'runtime': 154, 'original_language': 'en', 'genres': ['Thriller', 'Crime', 'Comedy']}
