In [2]:
# Content-Based Movie Recommendation System

import pandas as pd
import numpy as np
import ast
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# Read the movie metadata table and the credits table from disk
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# Join the two tables: movies.id is matched against credits.movie_id
movies = pd.merge(movies, credits, left_on='id', right_on='movie_id')

# Remove columns that are not needed further down, including the duplicate
# key ('movie_id') and the suffixed duplicate title ('title_y') from the merge
movies = movies.drop(['homepage', 'status', 'tagline', 'movie_id', 'title_y'], axis=1)

# Helper functions to parse stringified data
def convert(text):
    """Extract the ``'name'`` value from every entry of a stringified list.

    Args:
        text: A string containing a Python-literal list of dicts that each
            carry a ``'name'`` key, e.g. ``'[{"id": 28, "name": "Action"}]'``.

    Returns:
        The ``'name'`` values in their original order, or ``[]`` when ``text``
        cannot be parsed or does not have the expected structure (for example
        a NaN cell or an entry without a ``'name'`` key).
    """
    try:
        return [entry['name'] for entry in ast.literal_eval(text)]
    except (
        ValueError,      # not a literal / non-string input such as NaN
        SyntaxError,     # text is not valid Python syntax
        TypeError,       # parsed value is not a list of subscriptable dicts
        KeyError,        # an entry has no 'name' key
        MemoryError,     # documented literal_eval failure modes on
        RecursionError,  # pathological input
    ):
        # Parsing stays best-effort, but unlike a bare ``except`` this no
        # longer swallows KeyboardInterrupt or SystemExit.
        return []

def get_top_cast(text, limit=3):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Args:
        text: A string containing a Python-literal list of dicts that each
            carry a ``'name'`` key.
        limit: Maximum number of leading entries to keep. Defaults to 3.

    Returns:
        Up to ``limit`` names in their original order, or ``[]`` when ``text``
        cannot be parsed or does not have the expected structure.
    """
    try:
        return [member['name'] for member in ast.literal_eval(text)[:limit]]
    except (
        ValueError,      # not a literal / non-string input such as NaN
        SyntaxError,     # text is not valid Python syntax
        TypeError,       # parsed value is not sliceable or entries are not dicts
        KeyError,        # an entry has no 'name' key
        MemoryError,     # documented literal_eval failure modes on
        RecursionError,  # pathological input
    ):
        # Parsing stays best-effort, but unlike a bare ``except`` this no
        # longer swallows KeyboardInterrupt or SystemExit.
        return []

def get_director(text):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Args:
        text: A string containing a Python-literal list of dicts that each
            carry ``'job'`` and ``'name'`` keys.

    Returns:
        A one-element list holding the first director's name, or ``[]`` when
        no entry has the job ``'Director'`` or when ``text`` cannot be parsed
        or does not have the expected structure.
    """
    try:
        for member in ast.literal_eval(text):
            if member['job'] == 'Director':
                return [member['name']]
    except (
        ValueError,      # not a literal / non-string input such as NaN
        SyntaxError,     # text is not valid Python syntax
        TypeError,       # parsed value is not iterable or entries are not dicts
        KeyError,        # an entry has no 'job' or 'name' key
        MemoryError,     # documented literal_eval failure modes on
        RecursionError,  # pathological input
    ):
        # Parsing stays best-effort, but unlike a bare ``except`` this no
        # longer swallows KeyboardInterrupt or SystemExit.
        pass
    return []

# Missing overviews become empty strings so the concatenation below never
# has to deal with NaN
movies['overview'] = movies['overview'].fillna('')

# Replace each stringified column with a plain list of names
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(get_top_cast)
movies['crew'] = movies['crew'].apply(get_director)

# Build one space-separated 'tags' string per movie from the overview text
# followed by the genre, keyword, cast and director names
movies['tags'] = (
    movies['overview'] + ' '
    + movies['genres'].apply(' '.join) + ' '
    + movies['keywords'].apply(' '.join) + ' '
    + movies['cast'].apply(' '.join) + ' '
    + movies['crew'].apply(' '.join)
)

# Normalize text
ps = PorterStemmer()

# Load the stop-word list a single time. The original re-read it for every
# word; a frozenset also makes each membership test O(1) instead of a list scan.
_STOP_WORDS = frozenset(stopwords.words('english'))

# Matches every character that is neither an ASCII letter nor whitespace
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def preprocess(text):
    """Lower-case, strip non-letters, drop stop words and stem the rest.

    Args:
        text: The raw text to normalize.

    Returns:
        The surviving words, Porter-stemmed and joined by single spaces.
        An input with no surviving words yields ``''``.
    """
    cleaned = _NON_LETTER_RE.sub('', text.lower())
    return ' '.join(
        ps.stem(word) for word in cleaned.split() if word not in _STOP_WORDS
    )

# Clean and stem every tag string
movies['tags'] = movies['tags'].apply(preprocess)

# Keep only the columns the model needs, exposing the title as 'title'
final_df = movies.loc[:, ['id', 'title_x', 'tags']].rename(
    {'title_x': 'title'}, axis='columns'
)

# Bag-of-words count vectors, vocabulary capped at 5000 terms
cv = CountVectorizer(stop_words='english', max_features=5000)
vectors = cv.fit_transform(final_df['tags']).toarray()

# Pairwise cosine similarity between all movie vectors
similarity = cosine_similarity(vectors)

# Recommendation function
def recommend(movie_title, top_n=5):
    """Print the titles most similar to ``movie_title``.

    The title lookup is case-insensitive and uses the first matching row.
    Reads the module-level ``final_df`` (its 'title' column) and the
    ``similarity`` matrix, whose rows are expected to follow the row order
    of ``final_df``.

    Args:
        movie_title: Title of the movie to find recommendations for.
        top_n: Number of similar movies to print. Defaults to 5; values
            below zero are treated as zero.

    Returns:
        None. Results, or a not-found message, are printed to stdout.
    """
    movie_title = movie_title.lower()

    # Lower-case the title column once and locate the movie by POSITION, so
    # the lookups into `similarity` and `.iloc` stay correct even if
    # final_df's index labels are not 0..n-1.
    lowered_titles = final_df['title'].str.lower().tolist()
    try:
        position = lowered_titles.index(movie_title)
    except ValueError:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return

    # Drop the query movie explicitly instead of assuming it sorts first:
    # another movie tied at the maximal score could otherwise push it into
    # the printed list.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity[position])
        if pos != position
    ]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    ranked = ranked[:max(top_n, 0)]

    print(f"\nTop {top_n} movies similar to '{final_df.iloc[position]['title']}' are:")
    for pos, _score in ranked:
        print(final_df.iloc[pos]['title'])

# Demo: print ten recommendations for a sample title
recommend(movie_title="Avengers: Age of Ultron", top_n=10)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kruth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Top 10 movies similar to 'Avengers: Age of Ultron' are:
The Avengers
Iron Man
Iron Man 2
Captain America: Civil War
Iron Man 3
The Wolverine
X-Men
Fantastic Four
Ant-Man
Guardians of the Galaxy
