# Load and Inspect the Data

In [2]:
import pandas as pd

# Load the datasets.
# NOTE: the variable names do not match the files: `movies` holds the credits
# table (movie_id, title, cast, crew) and `ratings` holds the movie metadata.
movies = pd.read_csv('tmdb_5000_credits.csv')
ratings = pd.read_csv('tmdb_5000_movies.csv')

# Display first few rows of both datasets
print("Movies DataFrame:")
print(movies.head(), "\n")

print("Ratings DataFrame:")
print(ratings.head(), "\n")

# Check for null values
print("Null values in movies.csv:")
print(movies.isnull().sum(), "\n")

print("Null values in ratings.csv:")
print(ratings.isnull().sum(), "\n")

# Check basic info.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" to the output).
print("Movies Info:")
movies.info()
print()

print("Ratings Info:")
ratings.info()
print()


Movies DataFrame:
   movie_id                                     title  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   
2    206647                                   Spectre   
3     49026                     The Dark Knight Rises   
4     49529                               John Carter   

                                                cast  \
0  [{"cast_id": 242, "character": "Jake Sully", "...   
1  [{"cast_id": 4, "character": "Captain Jack Spa...   
2  [{"cast_id": 1, "character": "James Bond", "cr...   
3  [{"cast_id": 2, "character": "Bruce Wayne / Ba...   
4  [{"cast_id": 5, "character": "John Carter", "c...   

                                                crew  
0  [{"credit_id": "52fe48009251416c750aca23", "de...  
1  [{"credit_id": "52fe4232c3a36847f800b579", "de...  
2  [{"credit_id": "54805967c3a36829b5002c41", "de...  
3  [{"credit_id": "52fe4781c3a36847f81398c3", "de...  
4  [{"credit_id": "52fe479a

Dataset Summary
You have two datasets:

movies (loaded from tmdb_5000_credits.csv) → contains movie_id, title, cast, and crew.

ratings (loaded from tmdb_5000_movies.csv; despite the variable name, this is movie metadata, not ratings) → includes info like budget, genres, revenue, runtime, etc.

They share a common identifier:

movie_id in movies corresponds to id in ratings.


# 

# Merge the Two Datasets

We’ll now merge them on movie_id (from the movies DataFrame) and id (from the ratings DataFrame) to create a unified dataset for modeling.

In [3]:
# Join the two frames on the shared movie identifier (movie_id on the left,
# id on the right), then discard the columns the merge made redundant.
merged_df = (
    movies
    .merge(ratings, left_on='movie_id', right_on='id')
    .drop(columns=['id', 'original_title'])
)

# Report the size of the result and preview its first two rows
print("Merged DataFrame shape:", merged_df.shape)
print(merged_df.head(2))


Merged DataFrame shape: (4803, 22)
   movie_id                                   title_x  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   

                                                cast  \
0  [{"cast_id": 242, "character": "Jake Sully", "...   
1  [{"cast_id": 4, "character": "Captain Jack Spa...   

                                                crew     budget  \
0  [{"credit_id": "52fe48009251416c750aca23", "de...  237000000   
1  [{"credit_id": "52fe4232c3a36847f800b579", "de...  300000000   

                                              genres  \
0  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   

                                       homepage  \
0                   http://www.avatarmovie.com/   
1  http://disney.go.com/disneypictures/pirates/   

                                            keywords original_language  \
0  [{"id": 1463, "name": 

Sample Data from the First Two Rows
It displays details for two movies: Avatar and Pirates of the Caribbean: At World's End.

Column	Explanation
movie_id	Unique ID of the movie
title_x	Movie title from the first dataset
cast	A list of dictionaries with actor names and roles (in JSON-like format)
crew	A list of dictionaries with crew member details (directors, producers, etc.)
budget	Budget of the movie in USD
genres	List of genre dictionaries
homepage	Official homepage URL
keywords	List of keyword dictionaries
original_language	Language code (e.g., "en" for English)
overview	Movie summary
production_countries	List of countries involved in production
release_date	Date of release
revenue	Total revenue in USD
runtime	Duration in minutes
spoken_languages	List of spoken language dictionaries
status	Release status ("Released", "Post Production", etc.)
tagline	Marketing tagline
title_y	Movie title from the second dataset (a duplicate of title_x: both datasets have a title column, so the merge suffixed them with _x and _y)
vote_average	Average user rating
vote_count	Number of user votes 




# Data Cleaning & Feature Extraction

# Extract Relevant Fields

In [4]:
# Extract top 3 cast members
import ast
import json

def extract_top_cast(cast_str):
    """Return the names of the first three entries of a serialized cast list.

    Args:
        cast_str: Raw ``cast`` cell: text holding a list of dicts that each
            carry a ``name`` key. JSON is tried first; Python-literal text is
            accepted as a fallback.

    Returns:
        A list of up to three names in their original order, or an empty list
        when the value cannot be parsed (this includes non-string values such
        as NaN) or does not have the expected list-of-dicts shape.
    """
    try:
        try:
            # The column is JSON; ast.literal_eval alone rejects the JSON
            # tokens null/true/false and would silently drop such rows.
            cast = json.loads(cast_str)
        except ValueError:
            cast = ast.literal_eval(cast_str)
        return [actor['name'] for actor in cast[:3]]
    except Exception:
        # Best-effort by design: a malformed row yields no cast. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return []

# Derive the per-movie list of leading cast names from the raw cast column.
merged_df['top_cast'] = merged_df['cast'].map(extract_top_cast)


#  Extract director from crew
def extract_director(crew_str):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Args:
        crew_str: Raw ``crew`` cell: text holding a list of dicts with ``job``
            and ``name`` keys. JSON is tried first; Python-literal text is
            accepted as a fallback.

    Returns:
        The director's name, or ``None`` when no entry has the job
        ``'Director'`` or the value cannot be parsed (this includes non-string
        values such as NaN).
    """
    try:
        try:
            # The column is JSON; ast.literal_eval alone rejects the JSON
            # tokens null/true/false and would silently drop such rows.
            crew = json.loads(crew_str)
        except ValueError:
            crew = ast.literal_eval(crew_str)
        # .get() keeps one entry without a 'job' key from aborting the search.
        return next(
            (person.get('name') for person in crew if person.get('job') == 'Director'),
            None,
        )
    except Exception:
        # Best-effort by design: a malformed row yields no director. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return None

# Derive the per-movie director name (or None) from the raw crew column.
merged_df['director'] = merged_df['crew'].map(extract_director)

# Extract genre names
def extract_genres(genre_str):
    """Return the ``name`` of every entry in a serialized genre list.

    Args:
        genre_str: Raw ``genres`` cell: text holding a list of dicts that each
            carry a ``name`` key. JSON is tried first; Python-literal text is
            accepted as a fallback.

    Returns:
        A list of genre names in their original order, or an empty list when
        the value cannot be parsed (this includes non-string values such as
        NaN) or does not have the expected list-of-dicts shape.
    """
    try:
        try:
            # The column is JSON; ast.literal_eval alone rejects the JSON
            # tokens null/true/false and would silently drop such rows.
            genres = json.loads(genre_str)
        except ValueError:
            genres = ast.literal_eval(genre_str)
        return [genre['name'] for genre in genres]
    except Exception:
        # Best-effort by design: a malformed row yields no genres. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return []

# Derive the per-movie list of genre names from the raw genres column.
merged_df['genre_names'] = merged_df['genres'].map(extract_genres)

# Extract keywords
def extract_keywords(kw_str):
    """Return the ``name`` of every entry in a serialized keyword list.

    Args:
        kw_str: Raw ``keywords`` cell: text holding a list of dicts that each
            carry a ``name`` key. JSON is tried first; Python-literal text is
            accepted as a fallback.

    Returns:
        A list of keyword names in their original order, or an empty list when
        the value cannot be parsed (this includes non-string values such as
        NaN) or does not have the expected list-of-dicts shape.
    """
    try:
        try:
            # The column is JSON; ast.literal_eval alone rejects the JSON
            # tokens null/true/false and would silently drop such rows.
            keywords = json.loads(kw_str)
        except ValueError:
            keywords = ast.literal_eval(kw_str)
        return [kw['name'] for kw in keywords]
    except Exception:
        # Best-effort by design: a malformed row yields no keywords. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return []

# Derive the per-movie list of keyword names from the raw keywords column.
merged_df['keyword_list'] = merged_df['keywords'].map(extract_keywords)


In [5]:
# Preview the first five rows, including the newly derived columns.
merged_df.head(5)

Unnamed: 0,movie_id,title_x,cast,crew,budget,genres,homepage,keywords,original_language,overview,...,spoken_languages,status,tagline,title_y,vote_average,vote_count,top_cast,director,genre_names,keyword_list
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"In the 22nd century, a paraplegic Marine is di...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,"Captain Barbossa, long believed to be dead, ha...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,A cryptic message from Bond’s past sends him o...,...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,Following the death of District Attorney Harve...,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,"John Carter is a war-weary, former military ca...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."


# Add Poster URLs to the Dataset

In [11]:
def get_poster_url(movie_id, api_key=None):
    """Look up a movie's poster image URL through the TMDB movie endpoint.

    Args:
        movie_id: Movie identifier placed in the request path.
        api_key: TMDB API key. When omitted, the ``TMDB_API_KEY`` environment
            variable is used, falling back to the ``"YOUR_API_KEY"``
            placeholder, so the key no longer has to be pasted into the
            notebook source.

    Returns:
        The image base URL joined with the response's ``poster_path`` when the
        API answers with status 200 and a non-empty ``poster_path``; otherwise
        a placeholder image URL. Request failures are reported with ``print``
        instead of being raised.
    """
    import os
    import requests

    base_url = "https://image.tmdb.org/t/p/w500"
    fallback_url = "https://via.placeholder.com/500x750?text=No+Image"
    if api_key is None:
        api_key = os.environ.get("TMDB_API_KEY", "YOUR_API_KEY")

    try:
        response = requests.get(
            f"https://api.themoviedb.org/3/movie/{movie_id}",
            params={"api_key": api_key},  # let requests encode the query string
            timeout=5  # max 5 seconds wait
        )
        if response.status_code == 200:
            poster_path = response.json().get('poster_path')
            if poster_path:
                return base_url + poster_path
        else:
            print(f"TMDB API Error: Status Code {response.status_code}")
    except requests.exceptions.Timeout:
        print("⚠️ Request to TMDB API timed out.")
    except requests.exceptions.RequestException as e:
        # NOTE(review): the exception text may include the request URL, which
        # carries the API key - confirm before sharing saved outputs.
        print(f"⚠️ API request failed: {e}")
    except ValueError as e:
        # A 200 response whose body is not valid JSON; some requests versions
        # raise a plain ValueError here rather than a RequestException.
        print(f"⚠️ TMDB API returned an invalid response: {e}")

    return fallback_url



# Build a Simple Recommendation Function

# Content-Based Recommendation Systems Using CountVectorizer and TfidfVectorizer

In [23]:
import time
import sys
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Function to get movie poster URL (you already have this)
def get_poster_url(movie_id):
    """Build a poster URL string from ``movie_id`` without any network access.

    NOTE(review): defining this name again replaces any ``get_poster_url``
    already present in the kernel. The URL is assembled from the movie id
    rather than from a poster path returned by the API - confirm that it
    actually resolves to an image.

    Args:
        movie_id: Identifier interpolated into the URL.

    Returns:
        The string ``https://image.tmdb.org/t/p/w500/<movie_id>.jpg``.
    """
    return "https://image.tmdb.org/t/p/w500/{}.jpg".format(movie_id)

# --- Shared machinery for both recommenders ---
def _print_recommendations(movie_title, tags, vectorizer, label, top_n):
    """Vectorize ``tags``, rank all movies against ``movie_title`` and print the best.

    Args:
        movie_title: Title matched exactly against ``merged_df['title_x']``;
            the first matching row is used.
        tags: Series of per-movie text. It is stored in ``merged_df['tags']``,
            overwriting whatever a previous call left there.
        vectorizer: Unfitted text vectorizer exposing ``fit_transform``.
        label: Vectorizer name shown in the printed report lines.
        top_n: Number of recommendations to print.

    Raises:
        IndexError: If no row of ``merged_df`` has the requested title.
    """
    merged_df['tags'] = tags

    # Resolve the title to a row *position* before the expensive work, so an
    # unknown title fails fast and with a readable message. Positions (not
    # index labels) are what the similarity matrix and .iloc below expect.
    matches = np.flatnonzero((merged_df['title_x'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in merged_df['title_x']")
    idx = int(matches[0])

    # Vectorize and compute the full pairwise cosine similarity matrix
    vectors = vectorizer.fit_transform(merged_df['tags'].values.astype('U')).toarray()
    similarity = cosine_similarity(vectors)

    # Diagnostic only: time a single dense -> CSR conversion
    start_time = time.perf_counter()
    similarity_sparse = csr_matrix(similarity)
    conversion_time = time.perf_counter() - start_time
    print(f"Time taken to convert similarity to sparse matrix ({label}): {conversion_time:.6f} seconds")

    # Diagnostic only: compare raw buffer sizes (nbytes on both sides)
    dense_memory = similarity.nbytes
    sparse_memory = (
        similarity_sparse.data.nbytes
        + similarity_sparse.indptr.nbytes
        + similarity_sparse.indices.nbytes
    )
    print(f"Memory usage of dense matrix: {dense_memory / 1024:.2f} KB")
    print(f"Memory usage of sparse matrix: {sparse_memory / 1024:.2f} KB")

    # Rank every other movie. The queried row is excluded explicitly instead
    # of assuming it sorts first, which breaks when another movie ties with it
    # or when its own tag vector is empty.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(similarity[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    print(f"\nTop {top_n} Recommendations for **{movie_title}** ({label}):\n")
    for pos, _score in ranked:
        movie = merged_df.iloc[pos]
        title = movie['title_x']
        poster_url = get_poster_url(movie['movie_id'])
        print(f"🎬 {title}\n📸 Poster: {poster_url}\n")


# --- Recommend_v1: CountVectorizer ---
def recommend_v1(movie_title, top_n=5):
    """Print the movies most similar to ``movie_title`` using CountVectorizer.

    Tags are built from the genre names, top cast and director columns and
    written to ``merged_df['tags']``.

    Args:
        movie_title: Exact title to look up in ``merged_df['title_x']``.
        top_n: Number of recommendations to print (default 5).

    Raises:
        IndexError: If the title is not present.
    """
    tags = (
        merged_df['genre_names'].astype(str) + ' '
        + merged_df['top_cast'].astype(str) + ' '
        + merged_df['director'].astype(str)
    )
    _print_recommendations(
        movie_title,
        tags,
        CountVectorizer(max_features=5000, stop_words='english'),
        "CountVectorizer",
        top_n,
    )


# --- Recommend_v2: TF-IDF Vectorizer ---
def recommend_v2(movie_title, top_n=5):
    """Print the movies most similar to ``movie_title`` using TfidfVectorizer.

    Tags extend those of ``recommend_v1`` with the overview and original
    language columns, and are written to ``merged_df['tags']``.

    Args:
        movie_title: Exact title to look up in ``merged_df['title_x']``.
        top_n: Number of recommendations to print (default 5).

    Raises:
        IndexError: If the title is not present.
    """
    tags = (
        merged_df['genre_names'].astype(str) + ' '
        + merged_df['top_cast'].astype(str) + ' '
        + merged_df['director'].astype(str) + ' '
        + merged_df['overview'].astype(str) + ' '
        + merged_df['original_language'].astype(str)
    )
    _print_recommendations(
        movie_title,
        tags,
        TfidfVectorizer(max_features=5000, stop_words='english'),
        "TF-IDF",
        top_n,
    )


In [25]:
# Run both recommenders on the same title so their outputs can be compared:
# recommend_v1 is based on CountVectorizer, recommend_v2 on TF-IDF.
for recommender in (recommend_v1, recommend_v2):
    recommender("Inception")


Time taken to convert similarity to sparse matrix (CountVectorizer): 0.502945 seconds
Memory usage of dense matrix: 180225.20 KB
Memory usage of sparse matrix: 141785.24 KB

Top 5 Recommendations for **Inception** (CountVectorizer):

🎬 Looper
📸 Poster: https://image.tmdb.org/t/p/w500/59967.jpg

🎬 Fortress
📸 Poster: https://image.tmdb.org/t/p/w500/12088.jpg

🎬 G.I. Joe: Retaliation
📸 Poster: https://image.tmdb.org/t/p/w500/72559.jpg

🎬 Oblivion
📸 Poster: https://image.tmdb.org/t/p/w500/75612.jpg

🎬 Sky Captain and the World of Tomorrow
📸 Poster: https://image.tmdb.org/t/p/w500/5137.jpg

Time taken to convert similarity to sparse matrix (TF-IDF): 0.660780 seconds
Memory usage of dense matrix: 180225.20 KB
Memory usage of sparse matrix: 259775.21 KB

Top 5 Recommendations for **Inception** (TF-IDF):

🎬 Don Jon
📸 Poster: https://image.tmdb.org/t/p/w500/138697.jpg

🎬 Premium Rush
📸 Poster: https://image.tmdb.org/t/p/w500/49526.jpg

🎬 (500) Days of Summer
📸 Poster: https://image.tmdb.org/t/p