In [3]:
# =========================
# Imports + Configuration
# =========================

import os
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Prefer repo paths, fallback to your uploaded paths
MOVIES_PATHS  = ["movies.csv",  "movies.csv"]
RATINGS_PATHS = ["ratings.csv", "ratings.csv"]

MOVIES_PATH  = next((p for p in MOVIES_PATHS if os.path.exists(p)), None)
RATINGS_PATH = next((p for p in RATINGS_PATHS if os.path.exists(p)), None)

if MOVIES_PATH is None or RATINGS_PATH is None:
    print("❌ Could not find movies.csv or ratings.csv.")
    print("✅ Put them in data/ or ensure /mnt/data/ paths exist.")
else:
    print("Using movies:", MOVIES_PATH)
    print("Using ratings:", RATINGS_PATH)

    

Using movies: movies.csv
Using ratings: ratings.csv


In [4]:
# =========================
# Task 1 — Load Data
# =========================

try:
    movies = pd.read_csv(MOVIES_PATH)
    ratings = pd.read_csv(RATINGS_PATH)
    print("✅ Loaded files successfully.")
except Exception:
    print("❌ Failed to load CSV files. Check file paths and CSV format.")

display(movies.head())
display(ratings.head())

print("Movies shape :", movies.shape)
print("Ratings shape:", ratings.shape)


✅ Loaded files successfully.


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Movies shape : (9742, 3)
Ratings shape: (100836, 4)


In [5]:
# =========================
# Task 1 — Explore (EDA)
# =========================

print("Movies columns :", list(movies.columns))
print("Ratings columns:", list(ratings.columns))

print("\nMissing values (movies):")
display(movies.isna().sum())

print("\nMissing values (ratings):")
display(ratings.isna().sum())

print("\nDuplicates:")
print("movies duplicates :", movies.duplicated().sum())
print("ratings duplicates:", ratings.duplicated().sum())

print("\nRating stats:")
display(ratings["rating"].describe())

print("\nUnique users:", ratings["userId"].nunique())
print("Unique movies in ratings:", ratings["movieId"].nunique())


Movies columns : ['movieId', 'title', 'genres']
Ratings columns: ['userId', 'movieId', 'rating', 'timestamp']

Missing values (movies):


movieId    0
title      0
genres     0
dtype: int64


Missing values (ratings):


userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


Duplicates:
movies duplicates : 0
ratings duplicates: 0

Rating stats:


count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64


Unique users: 610
Unique movies in ratings: 9724


In [6]:
# =========================
# Task 1 — Preprocess
# - Remove duplicates
# - Handle missing values
# - Remove rating outliers
# - Keep only ratings whose movieId exists in movies.csv
# =========================

movies_clean = movies.drop_duplicates().copy()
ratings_clean = ratings.drop_duplicates().copy()

# Fill missing
if "title" in movies_clean.columns:
    movies_clean["title"] = movies_clean["title"].fillna("Untitled").astype(str)
else:
    movies_clean["title"] = "Untitled"

if "genres" in movies_clean.columns:
    movies_clean["genres"] = movies_clean["genres"].fillna("Unknown").astype(str)
else:
    movies_clean["genres"] = "Unknown"

# Basic cleanup
movies_clean["title"] = movies_clean["title"].str.strip()
movies_clean["genres"] = movies_clean["genres"].str.strip()

# Clip outliers (MovieLens typically 0.5..5 or 1..5)
ratings_clean["rating"] = pd.to_numeric(ratings_clean["rating"], errors="coerce")
ratings_clean = ratings_clean.dropna(subset=["rating"])
ratings_clean = ratings_clean[(ratings_clean["rating"] >= 0.5) & (ratings_clean["rating"] <= 5.0)]

# Ensure timestamp exists
if "timestamp" not in ratings_clean.columns:
    ratings_clean["timestamp"] = np.nan

# Keep only movieIds that exist in movies.csv
valid_movie_ids = set(movies_clean["movieId"].unique())
ratings_clean = ratings_clean[ratings_clean["movieId"].isin(valid_movie_ids)].copy()

print("After preprocessing:")
print("Movies :", movies_clean.shape)
print("Ratings:", ratings_clean.shape)

print("Users:", ratings_clean["userId"].nunique(), "| Movies:", ratings_clean["movieId"].nunique())


After preprocessing:
Movies : (9742, 3)
Ratings: (100836, 4)
Users: 610 | Movies: 9724


In [7]:
# =========================
# Task 2 — Feature Engineering (Content-Based)
# Build item_text = title + genres
# =========================

movies_cb = movies_clean.copy()

movies_cb["genres_text"] = movies_cb["genres"].str.replace("|", " ", regex=False)
movies_cb["title_norm"] = movies_cb["title"].str.lower().str.replace(r"\s+", " ", regex=True).str.strip()

movies_cb["item_text"] = (movies_cb["title"].astype(str) + " " + movies_cb["genres_text"].astype(str)).str.strip()

display(movies_cb[["movieId","title","genres","item_text"]].head())


Unnamed: 0,movieId,title,genres,item_text
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure Animation Children ...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


In [8]:
# =========================
# Task 3 — Train/Test Split (Per-user holdout)
# For each user, keep the last interaction as test (if timestamp exists),
# otherwise fallback to random split.
# =========================

def per_user_holdout(df):
    d = df.copy()
    if d["timestamp"].notna().any():
        d = d.sort_values(["userId", "timestamp"])
        test = d.groupby("userId").tail(1)
        train = d.drop(test.index)
    else:
        train, test = train_test_split(d, test_size=0.2, random_state=RANDOM_SEED)
    return train, test

train_ratings, test_ratings = per_user_holdout(ratings_clean)

print("Train ratings:", train_ratings.shape)
print("Test ratings :", test_ratings.shape)
print("Train users:", train_ratings["userId"].nunique(), "| Test users:", test_ratings["userId"].nunique())


Train ratings: (100226, 4)
Test ratings : (610, 4)
Train users: 610 | Test users: 610


In [9]:
# =========================
# Task 4 — Train Content-Based Model
# (fit TF-IDF on all items, compute similarities on-demand)
# =========================

tfidf = TfidfVectorizer(stop_words="english")
X_items = tfidf.fit_transform(movies_cb["item_text"])

movieId_to_idx = {mid: i for i, mid in enumerate(movies_cb["movieId"].tolist())}
idx_to_movieId = {i: mid for mid, i in movieId_to_idx.items()}

def recommend_similar_movies(movie_id, top_k=10):
    if movie_id not in movieId_to_idx:
        return pd.DataFrame(columns=["movieId","title","genres","similarity"])

    idx = movieId_to_idx[movie_id]
    sims = cosine_similarity(X_items[idx], X_items).ravel()
    sims[idx] = -1  # exclude itself

    top_idx = np.argsort(-sims)[:top_k]
    recs = movies_cb.iloc[top_idx][["movieId","title","genres"]].copy()
    recs["similarity"] = sims[top_idx]
    return recs

# Demo
example_mid = int(movies_cb["movieId"].iloc[0])
display(movies_cb[movies_cb["movieId"] == example_mid][["movieId","title","genres"]])
display(recommend_similar_movies(example_mid, top_k=10))


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Unnamed: 0,movieId,title,genres,similarity
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,0.880446
7355,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,0.821047
3595,4929,"Toy, The (1982)",Comedy,0.538018
2539,3400,We're Back! A Dinosaur's Story (1993),Adventure|Animation|Children|Fantasy,0.456423
26,27,Now and Then (1995),Children|Drama,0.421651
4089,5843,Toy Soldiers (1991),Action|Drama,0.400561
1617,2161,"NeverEnding Story, The (1984)",Adventure|Children|Fantasy,0.385685
6194,45074,"Wild, The (2006)",Adventure|Animation|Children|Comedy|Fantasy,0.365351
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.35797
12,13,Balto (1995),Adventure|Animation|Children,0.357436


In [10]:
# =========================
# Task 2 — Feature Engineering (Collaborative Filtering)
# Encode userId and movieId into consecutive indices using TRAIN only
# =========================

user_ids = train_ratings["userId"].unique().tolist()
movie_ids = train_ratings["movieId"].unique().tolist()

user2idx = {u:i for i,u in enumerate(user_ids)}
movie2idx = {m:i for i,m in enumerate(movie_ids)}

def encode_ratings(df):
    d = df[df["userId"].isin(user2idx) & df["movieId"].isin(movie2idx)].copy()
    d["user_idx"] = d["userId"].map(user2idx)
    d["movie_idx"] = d["movieId"].map(movie2idx)
    return d

train_enc = encode_ratings(train_ratings)
test_enc  = encode_ratings(test_ratings)

num_users = len(user2idx)
num_movies = len(movie2idx)

print("Encoded users :", num_users)
print("Encoded movies:", num_movies)
print("Train encoded:", train_enc.shape, "| Test encoded:", test_enc.shape)


Encoded users : 610
Encoded movies: 9701
Train encoded: (100226, 6) | Test encoded: (587, 6)


In [11]:
# =========================
# Task 4 — Train Collaborative Filtering Model (Matrix Factorization)
# =========================

def build_mf_model(num_users, num_movies, emb_dim=32):
    user_in = keras.Input(shape=(1,), name="user")
    movie_in = keras.Input(shape=(1,), name="movie")

    user_emb = layers.Embedding(num_users, emb_dim, name="user_embedding")(user_in)
    movie_emb = layers.Embedding(num_movies, emb_dim, name="movie_embedding")(movie_in)

    user_vec = layers.Flatten()(user_emb)
    movie_vec = layers.Flatten()(movie_emb)

    dot = layers.Dot(axes=1)([user_vec, movie_vec])
    out = layers.Activation("linear")(dot)

    model = keras.Model([user_in, movie_in], out)
    model.compile(
        optimizer=keras.optimizers.Adam(1e-3),
        loss="mse",
        metrics=[keras.metrics.MeanAbsoluteError()]
    )
    return model

mf_model = build_mf_model(num_users, num_movies, emb_dim=32)
mf_model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 movie (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 user_embedding (Embedding)     (None, 1, 32)        19520       ['user[0][0]']                   
                                                                                                  
 movie_embedding (Embedding)    (None, 1, 32)        310432      ['movie[0][0]']                  
                                                                                              

In [12]:
# =========================
# Task 4 — Fit CF model
# =========================

X_train = train_enc[["user_idx","movie_idx"]].values
y_train = train_enc["rating"].astype(np.float32).values

history = mf_model.fit(
    [X_train[:,0], X_train[:,1]],
    y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=256,
    verbose=1
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
# =========================
# Task 5 — Evaluate CF model
# Metrics: RMSE, MAE
# =========================

X_test = test_enc[["user_idx","movie_idx"]].values
y_test = test_enc["rating"].astype(np.float32).values

preds = mf_model.predict([X_test[:,0], X_test[:,1]], verbose=0).reshape(-1)

rmse = np.sqrt(mean_squared_error(y_test, preds))
mae  = mean_absolute_error(y_test, preds)

print("CF Test RMSE:", rmse)
print("CF Test MAE :", mae)


CF Test RMSE: 1.4434437
CF Test MAE : 1.119808


In [15]:
# =========================
# Task 5 — Ranking evaluation (Top-K)
# We evaluate whether the user's held-out test movie appears in top-K recommendations.
# - HitRate@K: % users where test movie is in top-K
# - Precision@K: average 1/K when hit else 0 (simple proxy)
# =========================

def cf_topk_for_user(user_id, K=10):
    if user_id not in user2idx:
        return []

    uidx = user2idx[user_id]

    # movies already in train for this user
    seen = set(train_ratings.loc[train_ratings["userId"] == user_id, "movieId"].tolist())

    # candidate movies are movies in training universe minus seen
    candidates = [mid for mid in movie2idx.keys() if mid not in seen]
    if not candidates:
        return []

    cand_idx = np.array([movie2idx[mid] for mid in candidates], dtype=np.int32)
    user_vec = np.full(shape=(len(cand_idx),), fill_value=uidx, dtype=np.int32)

    scores = mf_model.predict([user_vec, cand_idx], verbose=0).reshape(-1)
    top = np.argsort(-scores)[:K]
    return [candidates[i] for i in top]

def evaluate_topk(K=10, max_users=20):
    # users present in both train and test
    common_users = list(set(train_ratings["userId"]).intersection(set(test_ratings["userId"])))
    if len(common_users) > max_users:
        common_users = common_users[:max_users]

    hits = 0
    precisions = []

    for u in common_users:
        # held-out test movie for user u (we used per-user holdout => usually 1 row)
        test_rows = test_ratings[test_ratings["userId"] == u]
        if test_rows.empty:
            continue

        true_mid = int(test_rows.iloc[0]["movieId"])
        recs = cf_topk_for_user(u, K=K)

        hit = 1 if true_mid in recs else 0
        hits += hit
        precisions.append(hit / K)

    hitrate = hits / len(common_users) if common_users else 0.0
    precision = float(np.mean(precisions)) if precisions else 0.0
    return hitrate, precision

for K in [5, 10, 20]:
    hr, pk = evaluate_topk(K=K, max_users=20)
    print(f"HitRate@{K}: {hr:.4f} | Precision@{K}: {pk:.4f}")


HitRate@5: 0.0000 | Precision@5: 0.0000
HitRate@10: 0.0000 | Precision@10: 0.0000
HitRate@20: 0.0000 | Precision@20: 0.0000


In [17]:
# =========================
# Task 5 — Simple Prediction Interface
# 1) Enter userId => CF Top-10 recommendations
# 2) Enter movie title => Content-based similar movies
# =========================

def cf_recommend_for_user(user_id, top_k=10):
    rec_ids = cf_topk_for_user(user_id, K=top_k)
    if not rec_ids:
        return pd.DataFrame(columns=["movieId","title","genres"])
    return movies_cb[movies_cb["movieId"].isin(rec_ids)][["movieId","title","genres"]]

def find_title_matches(query, top_n=5):
    q = query.lower().strip()
    hits = movies_cb[movies_cb["title_norm"].str.contains(re.escape(q), na=False)].copy()
    return hits[["movieId","title","genres"]].head(top_n)

# ---- Interface ----
try:
    mode = input("Type 'user' for userId recommendations OR 'title' for similar movies (or Enter to skip): ").strip().lower()

    if mode == "user":
        s = input("Enter userId: ").strip()
        if s:
            uid = int(s)
            print("\n--- CF Recommendations (Top 10) ---")
            display(cf_recommend_for_user(uid, top_k=10))
        else:
            print("No userId provided.")

    elif mode == "title":
        q = input("Enter movie title (example: Toy Story): ").strip()
        if q:
            print("\nTop title matches:")
            matches = find_title_matches(q, top_n=5)
            display(matches)

            if len(matches) > 0:
                chosen_id = int(matches.iloc[0]["movieId"])
                print(f"\nUsing movieId={chosen_id} for similar-movie recommendations.\n")
                display(recommend_similar_movies(chosen_id, top_k=10))
            else:
                print("No matches found.")
        else:
            print("No title provided.")

    else:
        print("Skipped.")
except Exception:
    print("Skipped (invalid input).")


Type 'user' for userId recommendations OR 'title' for similar movies (or Enter to skip): title
Enter movie title (example: Toy Story): sabrina

Top title matches:


Unnamed: 0,movieId,title,genres
6,7,Sabrina (1995),Comedy|Romance
697,915,Sabrina (1954),Comedy|Romance



Using movieId=7 for similar-movie recommendations.



Unnamed: 0,movieId,title,genres,similarity
697,915,Sabrina (1954),Comedy|Romance,0.73208
529,618,Two Much (1995),Comedy|Romance,0.534803
26,27,Now and Then (1995),Children|Drama,0.322843
165,195,Something to Talk About (1995),Comedy|Drama|Romance,0.316572
297,339,While You Were Sleeping (1995),Comedy|Romance,0.297872
157,186,Nine Months (1995),Comedy|Romance,0.286014
35,39,Clueless (1995),Comedy|Romance,0.27713
152,180,Mallrats (1995),Comedy|Romance,0.27713
140,168,First Knight (1995),Action|Drama|Romance,0.2717
186,218,Boys on the Side (1995),Comedy|Drama,0.26982
