In [2]:
import pandas as pd
import numpy as np
import ast # Abstract Syntax Trees - to safely evaluate string-based lists

In [4]:
# Load the datasets

# Load the movies dataset (to map movie IDs to titles)
movies_path = 'data_files/tmdb_5000_movies.csv'
movies_df = pd.read_csv(movies_path)

# Load the ratings dataset
credits_path = 'data_files/tmdb_5000_credits.csv'
credits_df = pd.read_csv(credits_path)

In [6]:
credits_df.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [8]:
movies_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


##### Preprocess the "Content" Columns

In [14]:
# Helper function to parse the JSON-like strings
def extract_names(data_str):
    """
    Extracts 'name' values from a stringified list of dictionaries.
    e.g., '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}]' 
    -> ["Action", "Adventure"]
    """
    try:
        # ast.literal_eval safely evaluates the string as a Python list
        features = ast.literal_eval(data_str)
        if isinstance(features, list):
            return [i['name'] for i in features]
        return []
    except:
        return []

# Helper function to get the director's name from the 'crew' column
def get_director(data_str):
    """
    Finds the director's name from the crew list.
    """
    try:
        crew_list = ast.literal_eval(data_str)
        if isinstance(crew_list, list):
            for i in crew_list:
                if i['job'] == 'Director':
                    return [i['name']]
        return []
    except:
        return []

if not movies_df.empty:
    # Apply the helper functions to the respective columns
    print("\nProcessing 'genres' and 'keywords'...")
    movies_df['genres'] = movies_df['genres'].apply(extract_names)
    movies_df['keywords'] = movies_df['keywords'].apply(extract_names)
    
    # For 'cast', we'll just take the top 3 actors
    print("Processing 'cast'...")
    movies_df['cast'] = credits_df['cast'].apply(lambda x: extract_names(x)[:3])
    
    # For 'crew', we just want the director
    print("Processing 'crew' (extracting director)...")
    movies_df['crew'] = credits_df['crew'].apply(get_director)

    print("\nProcessed 'genres' for first movie:")
    print(movies_df.iloc[0]['genres'])


Processing 'genres' and 'keywords'...
Processing 'cast'...
Processing 'crew' (extracting director)...

Processed 'genres' for first movie:
[]


##### Create the "Tags" Column

In [24]:
def clean_and_combine(row):
    """
    Removes spaces and combines all features into a single string.
    """
    words = []
    
    # Add overview (split into words)
    words.extend(row['overview'].split())
    
    # Add genres (e.g., "ScienceFiction")
    words.extend([g.replace(" ", "") for g in row['genres']])
    
    # Add keywords
    words.extend([k.replace(" ", "") for k in row['keywords']])
    
    # Add cast
    words.extend([c.replace(" ", "") for c in row['cast']])
    
    # Add director
    words.extend([d.replace(" ", "") for d in row['crew']])
    
    return " ".join(words)


if not movies_df.empty:
    # Handle potential NaN values in 'overview' before processing
    movies_df['overview'] = movies_df['overview'].fillna('')
        
    print("\nCreating 'tags' column...")
    movies_df['tags'] = movies_df.apply(clean_and_combine, axis=1)
    
    # Create a new, final dataframe
    final_df = movies_df[['id', 'title', 'tags']]
    
    print("Example 'tags' for first movie (Avatar):")
    print(final_df.iloc[0]['tags'][:500] + "...") # Show first 500 chars


Creating 'tags' column...
Example 'tags' for first movie (Avatar):
In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. SamWorthington ZoeSaldana SigourneyWeaver JamesCameron...


##### Vectorize Tags and Calculate Similarity

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
if not final_df.empty:
    print("\nVectorizing text 'tags'...")
    # Initialize the CountVectorizer
    # max_features=5000: Use the top 5000 most frequent words
    # stop_words='english': Remove common English words (like 'the', 'a', 'in')
    cv = CountVectorizer(max_features=5000, stop_words='english')
    
    # Fit and transform the 'tags' column
    vectors = cv.fit_transform(final_df['tags']).toarray()
    
    print("Vector matrix shape:", vectors.shape)
    
    # --- Calculate Cosine Similarity ---
    print("Calculating cosine similarity matrix...")
    similarity_matrix = cosine_similarity(vectors)
    
    print("Similarity matrix shape:", similarity_matrix.shape)
    print("Similarity score between movie 0 and movie 1:", similarity_matrix[0][1])
else:
    print("Skipping vectorization as final_df is empty.")


Vectorizing text 'tags'...
Vector matrix shape: (4803, 5000)
Calculating cosine similarity matrix...
Similarity matrix shape: (4803, 4803)
Similarity score between movie 0 and movie 1: 0.0


##### Build the Recommendation Function

In [30]:
def recommend_movies(movie_title, n=10):
    """
    Recommends N similar movies based on a given movie title.
    """
    if final_df.empty or 'similarity_matrix' not in globals():
        print("Model has not been trained. Please run all steps.")
        return

    try:
        # 1. Find the index of the movie in the dataframe
        movie_index = final_df[final_df['title'] == movie_title].index[0]
    except IndexError:
        print(f"Error: Movie '{movie_title}' not found in the dataset.")
        return
    
    # 2. Get the similarity scores for that movie against all other movies
    # similarity_matrix[movie_index] gives us the row for our movie
    movie_similarities = similarity_matrix[movie_index]
    
    # 3. Get the indices and scores, and sort them
    # We use enumerate to keep track of the original index (which maps to the movie)
    similar_movies = list(enumerate(movie_similarities))
    
    # Sort by the similarity score (item[1]) in descending order
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    
    # 4. Get the top N movies
    # We start from index 1 to skip the movie itself (which has a score of 1.0)
    top_movies = sorted_similar_movies[1 : n+1]
    
    print(f"\n--- Top {n} Recommendations for '{movie_title}' ---")
    
    # 5. Print the titles
    for i, score in top_movies:
        recommended_title = final_df.iloc[i]['title']
        print(f"  - {recommended_title} (Similarity: {score:.3f})")

# --- Get Recommendations! ---
if not final_df.empty:
    recommend_movies('The Dark Knight Rises')
    recommend_movies('Avatar')


--- Top 10 Recommendations for 'The Dark Knight Rises' ---
  - Batman Forever (Similarity: 0.319)
  - Batman (Similarity: 0.295)
  - Batman Begins (Similarity: 0.286)
  - The Dark Knight (Similarity: 0.275)
  - Batman Returns (Similarity: 0.265)
  - The Thief and the Cobbler (Similarity: 0.214)
  - Slow Burn (Similarity: 0.204)
  - Batman: The Dark Knight Returns, Part 2 (Similarity: 0.192)
  - JFK (Similarity: 0.191)
  - Batman & Robin (Similarity: 0.189)

--- Top 10 Recommendations for 'Avatar' ---
  - Apollo 18 (Similarity: 0.240)
  - Beowulf (Similarity: 0.177)
  - The Book of Life (Similarity: 0.172)
  - Tears of the Sun (Similarity: 0.156)
  - The American (Similarity: 0.139)
  - Aliens (Similarity: 0.137)
  - Aliens vs Predator: Requiem (Similarity: 0.120)
  - Hanna (Similarity: 0.118)
  - The Host (Similarity: 0.115)
  - Just Visiting (Similarity: 0.114)


##### This project implements a content-based recommender system using the TMDB 5000 Movie Dataset. Unlike collaborative filtering, which uses user ratings, this model recommends movies based on their intrinsic properties.

The recommendation logic is built on Cosine Similarity. Key features for each movie—genres, overview, keywords, cast (top 3 actors), and crew (director)—were extracted, cleaned, and combined into a single "tags" document.

I used scikit-learn's CountVectorizer to transform these tags into a high-dimensional vector space (a matrix of 4,800+ movies x 5,000 features). The cosine_similarity metric was then computed between all movies. The final function takes a movie title as input and returns the top 10 most similar movies by querying this similarity matrix.