In [1]:
# Setup and Imports
import os
import json
import math
import warnings
from typing import List

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): this silences every warning category for the rest of the kernel
# session, so library deprecation / chained-assignment warnings raised by later
# cells will not be shown. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Show up to 200 characters per cell when frames with long text columns are rendered.
pd.set_option("display.max_colwidth", 200)

print("Libraries imported.")


Libraries imported.


In [3]:

# Load the movie metadata CSV; the relative path is resolved against the
# notebook's current working directory.
movies_raw = pd.read_csv('tmdb_5000_movies.csv')
print(movies_raw.shape)
# Preview the first three rows as the cell's rendered output.
movies_raw.head(3)


(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 878, ""name"": ""Science Fiction""}]",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""space war""}, {""id"": 3388, ""name"": ""space colony""}, {""id"": 3679, ""name"": ""society""}, {""id"": 3801, ""name...",en,Avatar,"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289}, {""name"": ""Twentieth Century Fox Film Corporation"", ""id"": 306}, {""name"": ""Dune Entertainment"", ""id"": 444}, {""name"": ""Lightstorm Entertainment"", ""id""...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}, {""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}]",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""name"": ""Action""}]",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic island""}, {""id"": 1319, ""name"": ""east india trading company""}, {""id"": 2038, ""name"": ""love of one's life...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems.",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Jerry Bruckheimer Films"", ""id"": 130}, {""name"": ""Second Mate Productions"", ""id"": 19936}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 80, ""name"": ""Crime""}]",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name"": ""based on novel""}, {""id"": 4289, ""name"": ""secret agent""}, {""id"": 9663, ""name"": ""sequel""}, {""id"": 14555, ""name"": ""mi6""}, {""id"": 156095, ""name"": ""brit...",en,Spectre,"A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles political forces to keep the secret service alive, Bond peels back the layers of deceit ...",107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""name"": ""Danjaq"", ""id"": 10761}, {""name"": ""B24"", ""id"": 69434}]","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}, {""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""}, {""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}, {""iso_639_1"": ""it"", ""name"": ""Italiano""}, {""iso_639_1"": ""de"", ""na...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [4]:

# Preprocessing: build a single text field

def parse_genres(genre_str: str) -> List[str]:
    """Extract lowercase genre names from a JSON-encoded list of genre objects.

    Args:
        genre_str: JSON text such as ``'[{"id": 28, "name": "Action"}]'``.

    Returns:
        The lower-cased ``"name"`` of every dict entry, in input order. Entries
        that are not dicts are skipped and a dict without ``"name"`` contributes
        an empty string. Any failure while decoding or reading the entries
        (invalid JSON, non-string input, non-iterable payload, non-string name)
        results in an empty list.
    """
    names: List[str] = []
    try:
        for entry in json.loads(genre_str):
            if isinstance(entry, dict):
                names.append(entry.get("name", "").lower())
    except Exception:
        # Best effort: a single bad value discards the whole row's genres.
        return []
    return names

# Keep only the columns the recommenders use. The copy is taken *after* the
# column selection so that the assignments below write to an independent frame
# rather than to a selection derived from movies_raw (the pattern pandas flags
# with SettingWithCopyWarning), and so the unused raw columns are never copied.
movies = movies_raw[
    ["title", "overview", "genres", "popularity", "vote_average", "vote_count"]
].copy()

# Missing overviews become empty strings so the concatenation below never yields NaN.
movies["overview"] = movies["overview"].fillna("")

# Decode the JSON genre column into lowercase names, then flatten each list
# into a single space-separated string.
movies["genres_list"] = movies["genres"].apply(parse_genres)
movies["genres_text"] = movies["genres_list"].apply(" ".join)

# Combine overview + genres_text; strip() removes the stray separator when
# either side is empty.
movies["text"] = (movies["overview"] + " " + movies["genres_text"]).str.strip()

print(movies.shape)
movies.head(3)[["title", "genres_list", "text"]]



(4803, 9)


Unnamed: 0,title,genres_list,text
0,Avatar,"[action, adventure, fantasy, science fiction]","In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fan..."
1,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems. adventure fantasy a..."
2,Spectre,"[action, adventure, crime]","A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles political forces to keep the secret service alive, Bond peels back the layers of deceit ..."


In [5]:
# TF-IDF Model and Cosine Similarity

# Turn the combined text into TF-IDF features: English stop words are dropped
# and the vocabulary is capped at 50,000 terms.
vectorizer = TfidfVectorizer(
    max_features=50000,
    stop_words="english",
)
X = vectorizer.fit_transform(movies["text"])  # sparse matrix

# Pairwise cosine similarity between every pair of rows of X.
similarity_matrix = cosine_similarity(X)

# Report both shapes as a sanity check.
print("TF-IDF shape:", X.shape)
print("Similarity matrix shape:", similarity_matrix.shape)


TF-IDF shape: (4803, 20978)
Similarity matrix shape: (4803, 4803)


In [6]:
# Recommendation Functions

# Build mapping from normalised title to row position for quick lookup.
# Keys are lower-cased AND stripped, i.e. normalised exactly like the query in
# get_recommendations; without the strip, a title stored with surrounding
# whitespace could never be matched. Missing titles map to the "" key.
# When several movies share a normalised title, the last one in the frame wins.
index_by_title = {
    t.lower().strip(): i
    for i, t in enumerate(movies["title"].fillna("").tolist())
}

def get_recommendations(title: str, top_k: int = 5) -> pd.DataFrame:
    """Return the ``top_k`` movies most similar to ``title``.

    The title lookup is case-insensitive and ignores surrounding whitespace.
    The queried movie itself is never part of the result, and the shared
    ``similarity_matrix`` is left unmodified.

    Args:
        title: Movie title to look up in ``index_by_title``.
        top_k: Maximum number of recommendations. ``0`` yields an empty frame;
            values larger than the number of other movies return all of them.

    Returns:
        A new DataFrame (index reset to 0..n-1) with the columns ``title``,
        ``genres_list``, ``popularity``, ``vote_average``, ``vote_count`` and
        ``similarity``, ordered from most to least similar.

    Raises:
        ValueError: If ``title`` is empty or ``top_k`` is negative.
        KeyError: If the normalised title is not present in ``index_by_title``.
    """
    if not title:
        raise ValueError("Please provide a non-empty movie title.")
    if top_k < 0:
        raise ValueError("top_k must be non-negative.")
    key = title.lower().strip()
    if key not in index_by_title:
        raise KeyError(f"Title not found: {title}")
    idx = index_by_title[key]
    # Work on a copy: indexing a row of the matrix gives a view, so writing to
    # it directly would permanently overwrite the shared matrix's diagonal.
    sims = similarity_matrix[idx].copy()
    # Push the movie itself to the bottom of the ranking.
    sims[idx] = -1
    ranked = np.argsort(sims)[::-1]
    # Drop the queried movie explicitly so it cannot appear when top_k reaches
    # the catalogue size, and slice from the front so that top_k == 0 selects
    # nothing (a trailing ``[-0:]`` slice would select every movie).
    top_indices = ranked[ranked != idx][:top_k]
    results = movies.iloc[top_indices][["title", "genres_list", "popularity", "vote_average", "vote_count"]].copy()
    results["similarity"] = sims[top_indices]
    return results.reset_index(drop=True)

# Quick smoke test: request the five titles most similar to "Avatar"
get_recommendations("Avatar", top_k=5)


Unnamed: 0,title,genres_list,popularity,vote_average,vote_count,similarity
0,Apollo 18,"[horror, thriller, science fiction]",17.028252,5.0,356,0.196314
1,The Helix... Loaded,"[action, comedy, science fiction]",0.0206,4.8,2,0.177566
2,The Matrix,"[action, science fiction]",104.309993,7.9,8907,0.162486
3,The American,"[crime, drama, thriller]",18.632156,5.8,481,0.161776
4,The Inhabited Island,"[action, fantasy, science fiction, thriller]",2.785832,5.3,23,0.156595


In [7]:
# Popularity-based fallback recommender

# The fallback ranks movies by an IMDb-style weighted rating (WR), which pulls
# the score of sparsely-voted movies toward the dataset mean:
#
#     WR = v / (v + m) * R  +  m / (v + m) * C
#
# where:
# - R = the movie's own average rating (vote_average)
# - v = number of votes the movie received (vote_count)
# - m = vote-count threshold, taken here as the 80th percentile of vote_count
# - C = mean vote_average across the dataset, with ratings of 0 treated as missing

# Ratings equal to 0 are blanked out (NaN) so the mean skips them.
C = movies["vote_average"].where(movies["vote_average"] != 0).mean()
M = movies["vote_count"].quantile(q=0.80)

def weighted_rating(row):
    """Compute the weighted rating for one movie record.

    Blends the record's own ``vote_average`` with the module-level mean ``C``,
    weighting by its ``vote_count`` relative to the module-level threshold ``M``.

    Args:
        row: Any object supporting ``row["vote_count"]`` and
            ``row["vote_average"]`` lookups.

    Returns:
        ``v / (v + M) * R + M / (v + M) * C`` for the record's ``v`` and ``R``.
    """
    votes, rating = row["vote_count"], row["vote_average"]
    total = votes + M
    own_part = votes / total * rating
    prior_part = M / total * C
    return own_part + prior_part

# weighted_rating only performs column lookups and arithmetic, so handing it
# the whole frame scores every movie in one vectorised step instead of one
# Python-level call per row via DataFrame.apply(..., axis=1).
movies["weighted_score"] = weighted_rating(movies)

# Best weighted score first; popularity breaks ties between equal scores.
popular_top = movies.sort_values(["weighted_score", "popularity"], ascending=False)


def popularity_fallback(top_k: int = 5) -> pd.DataFrame:
    """Return the first ``top_k`` rows of the pre-sorted ``popular_top`` frame.

    Args:
        top_k: Number of rows to take from the head of ``popular_top``.

    Returns:
        A DataFrame with a fresh 0-based index and the columns ``title``,
        ``genres_list``, ``popularity``, ``vote_average``, ``vote_count`` and
        ``weighted_score``.
    """
    shown_columns = [
        "title",
        "genres_list",
        "popularity",
        "vote_average",
        "vote_count",
        "weighted_score",
    ]
    leaders = popular_top.head(top_k)
    return leaders[shown_columns].reset_index(drop=True)

# Example: the first five rows of popular_top (highest weighted_score first)
popularity_fallback(5)


Unnamed: 0,title,genres_list,popularity,vote_average,vote_count,weighted_score
0,The Shawshank Redemption,"[drama, crime]",136.747729,8.5,8205,8.256816
1,Fight Club,[drama],146.757391,8.3,9413,8.10361
2,The Godfather,"[drama, crime]",143.659698,8.4,5893,8.088722
3,Pulp Fiction,"[thriller, crime]",121.463076,8.3,8428,8.083
4,The Dark Knight,"[drama, action, crime, thriller]",187.322927,8.2,12002,8.050233


In [8]:
# Demo: Enter a title and get recommendations

def recommend_or_fallback(query: str, top_k: int = 5) -> pd.DataFrame:
    """Recommend titles similar to ``query``, or popular titles if it is unknown.

    Args:
        query: Movie title passed through to ``get_recommendations``.
        top_k: Number of rows requested from whichever recommender is used.

    Returns:
        The result of ``get_recommendations(query, top_k=top_k)``; when that
        call raises ``KeyError``, a notice is printed and the result of
        ``popularity_fallback(top_k)`` is returned instead. Any other exception
        propagates to the caller.
    """
    try:
        picks = get_recommendations(query, top_k=top_k)
    except KeyError:
        # Unknown title: tell the user and switch to the popularity ranking.
        print(f"'{query}' not found. Showing popular picks instead.")
        picks = popularity_fallback(top_k)
    return picks

example_title = "Avatar"  # change this and re-run the cell; titles missing from the index fall back to popular picks
recommend_or_fallback(example_title, top_k=5)


Unnamed: 0,title,genres_list,popularity,vote_average,vote_count,similarity
0,Apollo 18,"[horror, thriller, science fiction]",17.028252,5.0,356,0.196314
1,The Helix... Loaded,"[action, comedy, science fiction]",0.0206,4.8,2,0.177566
2,The Matrix,"[action, science fiction]",104.309993,7.9,8907,0.162486
3,The American,"[crime, drama, thriller]",18.632156,5.8,481,0.161776
4,The Inhabited Island,"[action, fantasy, science fiction, thriller]",2.785832,5.3,23,0.156595
