In [1]:
# PART 1 — DATA PREPROCESSING

# TASK 1 — Load & Understand Dataset

# Step 1: Import libraries
import pandas as pd

# Step 2: Load dataset
df = pd.read_csv("data/tmdb_5000_movies.csv")

# Step 3: Print dataset info
print("Dataset Shape:", df.shape)
print("\nColumns:", df.columns)
print("\nFirst 5 rows:")
display(df.head())

# Step 4: Select required columns
df = df[['title', 'overview']]

# Step 5: Handle missing values
df['overview'] = df['overview'].fillna("")

Dataset Shape: (4803, 20)

Columns: Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

First 5 rows:


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [2]:
# TASK 2 — Text Preprocessing

# Step 1: Import preprocessing tools
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Step 2: Create cleaning function
def clean_text(text):

    # convert to lowercase
    text = text.lower()

    # remove punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # remove stopwords
    words = text.split()
    words = [w for w in words if w not in stop_words]

    return " ".join(words)

# Step 3: Apply preprocessing
df['clean_text'] = df['overview'].apply(clean_text)

df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kirut\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,overview,clean_text
0,Avatar,"In the 22nd century, a paraplegic Marine is di...",nd century paraplegic marine dispatched moon p...
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",captain barbossa long believed dead come back ...
2,Spectre,A cryptic message from Bond’s past sends him o...,cryptic message bonds past sends trail uncover...
3,The Dark Knight Rises,Following the death of District Attorney Harve...,following death district attorney harvey dent ...
4,John Carter,"John Carter is a war-weary, former military ca...",john carter warweary former military captain w...


In [3]:
# PART 2 — TEXT VECTORIZATION

# TASK 3 — TF-IDF 

from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Initialize vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)  # unigram + bigram
)

# Step 2: Fit transform
tfidf_matrix = tfidf.fit_transform(df['clean_text'])

print("TF-IDF Shape:", tfidf_matrix.shape)

TF-IDF Shape: (4803, 5000)


In [4]:
# TASK 4 — Cosine Similarity

from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Compute similarity matrix
similarity_matrix = cosine_similarity(tfidf_matrix)

print("Similarity matrix created")

# Explanation:
# Cosine similarity measures angle between vectors,
# which works well for text because magnitude doesn't matter.

Similarity matrix created


In [5]:
# PART 3 — RECOMMENDATION LOGIC

# TASK 5 — Recommendation Function

def recommend(movie_name, top_n=5):

    # Step 1: Check movie exists
    if movie_name not in df['title'].values:
        return "Movie not found"

    # Step 2: Get index
    idx = df[df['title'] == movie_name].index[0]

    # Step 3: Get similarity scores
    scores = list(enumerate(similarity_matrix[idx]))

    # Step 4: Sort scores
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    # Step 5: Get top recommendations
    scores = scores[1:top_n+1]

    movie_indices = [i[0] for i in scores]

    return df['title'].iloc[movie_indices]

# Test
recommend("Avatar")
recommend("Batman Begins")
recommend("Titanic")

4638    Amidst the Devil's Wings
2143                  Ghost Ship
2289                  The Switch
57                        WALL·E
3212                    The Rose
Name: title, dtype: object