In [None]:

# Movie Recommendation System (Content-Based Filtering)
# Author: Your Name
# Description: This notebook builds a content-based movie recommendation system using the TMDB 5000 dataset.

# Step 1: Import Required Libraries
import ast
import pickle

import nltk
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): nothing in the visible notebook tokenizes with punkt, and
# PorterStemmer does not need it -- confirm before removing this download.
nltk.download('punkt')

# Step 2: Load the Datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Step 3: Merge Datasets on the movie id
# Titles are not unique, so joining on 'title' cross-joins every pair of films
# that share a name and produces rows mixing one film's overview with another
# film's cast/crew. Join on the id key instead.
# Assumes the standard TMDB 5000 schema: `id` in the movies file and
# `movie_id` in the credits file -- verify if a different export is used.
# The credits copy of 'title' is dropped so the merged frame keeps a single,
# unsuffixed 'title' column.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

# Step 4: Select Useful Features
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Step 5: Handle Missing Values
# Re-number the rows after dropping: the similarity matrix built later is
# addressed by row position, so index labels must match positions 0..n-1.
movies = movies.dropna().reset_index(drop=True)

# Step 6: Define Conversion Functions for JSON Strings
def convert(text):
    """Return the 'name' of every entry in a stringified list of dicts.

    Args:
        text: String holding a Python/JSON-style list literal whose items are
            dicts with a 'name' key.

    Returns:
        list: The 'name' values, in their original order.
    """
    entries = ast.literal_eval(text)
    return [entry['name'] for entry in entries]

def convert3(text, limit=3):
    """Return the 'name' of the first ``limit`` entries of a stringified list.

    Args:
        text: String holding a Python/JSON-style list literal whose items are
            dicts with a 'name' key.
        limit: Maximum number of names to keep. Defaults to 3, which matches
            the previous hard-coded behaviour.

    Returns:
        list: Up to ``limit`` 'name' values, in their original order.
    """
    # Slice before extracting so entries past the limit are never visited.
    leading_entries = ast.literal_eval(text)[:limit]
    return [entry['name'] for entry in leading_entries]

def fetch_director(text):
    """Return the names of crew entries whose 'job' is exactly 'Director'.

    Args:
        text: String holding a Python/JSON-style list literal whose items are
            dicts with 'job' and 'name' keys.

    Returns:
        list: Names of all matching entries (empty if there are none).
    """
    crew_members = ast.literal_eval(text)
    return [member['name'] for member in crew_members if member['job'] == 'Director']

# Step 7: Apply the Conversion
# Map each stringified column to the parser that turns it into a list of names.
column_parsers = {
    'genres': convert,
    'keywords': convert,
    'cast': convert3,
    'crew': fetch_director,
}
for column, parser in column_parsers.items():
    movies[column] = movies[column].apply(parser)

# Step 8: Process the Overview Column
# Split the overview on whitespace so it is a list like the other columns.
movies['overview'] = movies['overview'].apply(lambda overview: overview.split())

# Step 9: Remove Spaces in Multi-Word Phrases
def _squash_spaces(names):
    """Remove every space inside each name so a multi-word name is one token."""
    return [name.replace(" ", "") for name in names]

for column in column_parsers:
    movies[column] = movies[column].apply(_squash_spaces)

# Step 10: Combine All into a Single 'tags' Column
# Adding the list-valued columns concatenates the lists row by row.
tag_tokens = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)
movies['tags'] = tag_tokens.apply(" ".join)

# Step 11: Perform Stemming to Improve Matching Accuracy
ps = PorterStemmer()


def stem(text):
    """Stem every whitespace-separated word of ``text`` and rejoin with spaces.

    Args:
        text: The string to process.

    Returns:
        str: The stemmed words joined by single spaces.
    """
    stemmed_words = map(ps.stem, text.split())
    return " ".join(stemmed_words)


movies['tags'] = movies['tags'].apply(stem)

# Step 12: Vectorize the Tags Using TF-IDF
# Keep the 5000 highest-frequency terms and drop English stop words.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vectors = tfidf.fit_transform(movies['tags']).toarray()

# Step 13: Compute Similarity Matrix
# similarity[i][j] is the cosine similarity between rows i and j of `vectors`,
# i.e. it is addressed by row position in `movies`.
similarity = cosine_similarity(vectors)

# Step 13.1: Save Preprocessed Data and Similarity Matrix
# Save the data so we don't need to reprocess every time.
# Use context managers so each file is flushed and closed even if dumping fails;
# a bare open(...) passed to pickle.dump leaves the handle open.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)


# Step 14: Define the Recommendation Function
def recommend(movie):
    """Print up to five titles most similar to ``movie``.

    The lookup is case-insensitive. If several rows share the title, the first
    one is used. Results are printed, not returned.

    Args:
        movie: Title of the movie to find recommendations for.

    Returns:
        None. Prints "Movie not found in the dataset." when no title matches.
    """
    query = movie.lower()
    title_matches = (movies['title'].str.lower() == query).to_numpy()

    # Work with row *positions*, not index labels: `similarity` is a plain
    # array addressed by position, and labels stop matching positions once
    # rows have been dropped from `movies`.
    match_positions = np.flatnonzero(title_matches)
    if match_positions.size == 0:
        print("Movie not found in the dataset.")
        return

    movie_pos = int(match_positions[0])
    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Exclude the query by position instead of assuming it is ranked first;
    # another row with an identical score could otherwise take its place.
    top_positions = [pos for pos, _ in ranked if pos != movie_pos][:5]

    print(f"Top 5 recommendations for '{movies['title'].iloc[movie_pos]}':")
    for pos in top_positions:
        print(movies['title'].iloc[pos])

# Step 15: Test the Recommender System
# Smoke test: recommend() lower-cases both the query and the stored titles,
# so the capitalisation used here does not matter.
recommend('Avatar')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 5 recommendations for 'Avatar':
Aliens
Falcon Rising
Battle: Los Angeles
Aliens vs Predator: Requiem
Apollo 18
