In [None]:
# PART 1 — DATA PREPROCESSING
# Task 1: Load & Understand Dataset

# Step 1: Import libraries
import pandas as pd

# Step 2: Load dataset
df = pd.read_csv("data/tmdb_5000_movies.csv")

# Step 3: Print dataset shape
print("Dataset Shape:", df.shape)

# Step 4: Print column names
print("\nColumns:\n", df.columns)

# Step 5: Display first 5 rows
print("\nFirst 5 Rows:")
print(df.head())

# Step 6: Identify text column used for recommendation
# We will use 'overview'
df = df[['title', 'overview']]


Dataset Shape: (4803, 20)

Columns:
 Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

First 5 Rows:
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.c

In [2]:
# Task 2: Text Preprocessing

import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Step 1: Handle missing values
df['overview'] = df['overview'].fillna('')

# Step 2: Convert text to lowercase
df['clean_text'] = df['overview'].str.lower()

# Step 3: Remove punctuation & special characters
df['clean_text'] = df['clean_text'].apply(
    lambda x: re.sub(r'[^a-zA-Z\s]', '', x)
)

# Step 4: Remove stopwords
def remove_stopwords(text):
    words = text.split()
    filtered = [w for w in words if w not in ENGLISH_STOP_WORDS]
    return " ".join(filtered)

df['clean_text'] = df['clean_text'].apply(remove_stopwords)

print(df.head())


                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                            overview  \
0  In the 22nd century, a paraplegic Marine is di...   
1  Captain Barbossa, long believed to be dead, ha...   
2  A cryptic message from Bond’s past sends him o...   
3  Following the death of District Attorney Harve...   
4  John Carter is a war-weary, former military ca...   

                                          clean_text  
0  nd century paraplegic marine dispatched moon p...  
1  captain barbossa long believed dead come life ...  
2  cryptic message bonds past sends trail uncover...  
3  following death district attorney harvey dent ...  
4  john carter warweary military captain whos ine...  


In [6]:
# PART 2 — Task 3: TF-IDF


from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Initialize vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

# Step 2: Convert text into vectors
tfidf_matrix = tfidf.fit_transform(df['clean_text'])

# Step 3: Display shape
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)


TF-IDF Matrix Shape: (4803, 5000)


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Create similarity matrix
similarity = cosine_similarity(tfidf_matrix)

print("Similarity matrix created!")
print(similarity.shape)




Similarity matrix created!
(4803, 4803)


In [12]:
# PART 3 — Recommendation Function
from sklearn.metrics.pairwise import cosine_similarity

def recommend(item_name, top_n=5):

    # Step 1: Find movie index
    idx = df[df['title'] == item_name].index[0]

    # Step 2: Compute similarity scores
    scores = list(enumerate(similarity[idx]))

    # Step 3: Sort scores
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    # Step 4: Get top recommendations
    scores = scores[1:top_n+1]

    # Step 5: Return movie titles
    recommended_movies = [df.iloc[i[0]].title for i in scores]

    return recommended_movies


# Testing with 3 items
print(recommend("Avatar"))
print(recommend("Batman Begins"))
print(recommend("Toy Story"))


['Apollo 18', 'The American', 'The Inhabited Island', 'Tears of the Sun', 'The Matrix']
['Batman Returns', "Gangster's Paradise: Jerusalema", 'Batman', 'The Dark Knight', 'Seven Psychopaths']
['Toy Story 2', 'Toy Story 3', 'For Your Consideration', "Losin' It", 'In the Shadow of the Moon']


In [14]:
import pickle

with open("movies.pkl", "wb") as f:
    pickle.dump(df, f)

with open("similarity.pkl", "wb") as f:
    pickle.dump(similarity, f)

print(" PKL files saved correctly!")


 PKL files saved correctly!
