In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [3]:
# Load the TMDB movie metadata and credits tables.
movies = pd.read_csv('tmdb_5000_movies.csv/tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_movies.csv/tmdb_5000_credits.csv')

# NOTE(review): joining on 'title' pairs every movie with every credits row
# that shares its title, so repeated titles produce extra rows -- confirm
# whether a unique id column is available to join on instead.
movies = movies.merge(credits, on='title')

# Keep only the columns used downstream and drop incomplete rows. The index is
# renumbered so that index labels equal row positions: the similarity matrix
# built later is positional, and title lookups read `.index[0]` to address it.
# Chaining (instead of `inplace=True` on a column selection) also avoids
# mutating a derived frame in place.
movies = (
    movies[['movie_id', 'genres', 'keywords', 'title', 'overview', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)

In [4]:
import ast


def convert(text):
    """Parse a stringified list of dicts and return every entry's 'name'.

    Parameters
    ----------
    text : str
        Python-literal text of a list of dicts, each carrying a 'name' key.

    Returns
    -------
    list
        The 'name' values in their original order.
    """
    names = []
    for entry in ast.literal_eval(text):
        names.append(entry['name'])
    return names


def convert3(text):
    """Parse a stringified list of dicts and return the first three 'name' values.

    Parameters
    ----------
    text : str
        Python-literal text of a list of dicts, each carrying a 'name' key.

    Returns
    -------
    list
        At most three names, in their original order.
    """
    parsed_entries = ast.literal_eval(text)
    # Collect every name first, then truncate to the leading three.
    all_names = [entry['name'] for entry in parsed_entries]
    leading = all_names[:3]
    return leading


def fetch_director(text):
    """Parse a stringified crew list and return the names whose job is 'Director'.

    Parameters
    ----------
    text : str
        Python-literal text of a list of dicts with 'job' and 'name' keys.

    Returns
    -------
    list
        Names of all entries whose 'job' equals 'Director', in original order.
    """
    directors = []
    for member in ast.literal_eval(text):
        if member['job'] != 'Director':
            continue
        directors.append(member['name'])
    return directors

In [5]:

# Parse each stringified column into a list of names.
column_parsers = {
    'genres': convert,
    'keywords': convert,
    'cast': convert3,
    'crew': fetch_director,
}
for column, parser in column_parsers.items():
    movies[column] = movies[column].apply(parser)

# Remove spaces inside each name so a multi-word name becomes a single token.
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

# Split the overview text into whitespace-separated words.
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# Concatenate the per-movie lists into one combined token list.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# Keep only the remaining columns plus the combined tags.
new = movies.drop(columns=['overview', 'genres', 'keywords', 'cast', 'crew'])

# Flatten each token list into a single space-separated string.
new['tags'] = new['tags'].apply(" ".join)

In [6]:

# Vectorize the tag strings with TF-IDF (vocabulary capped at 5000 features,
# English stop words removed).
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vector = tfidf.fit_transform(new['tags']).toarray()

# Pairwise cosine similarity between the TF-IDF vectors of all movies.
similarity = cosine_similarity(vector)

# Save the movie table and similarity matrix. The `with` blocks guarantee each
# file is flushed and closed even if pickling raises.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)


def recommend(movie, n=5):
    """Return the titles of the `n` movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new['title']``. If several rows share the
        title, the first one is used.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Titles ordered from most to least similar, excluding the query row.

    Raises
    ------
    IndexError
        If no row of ``new`` has the given title.
    """
    # `similarity` and `.iloc` are addressed by row position, and index labels
    # are not guaranteed to equal positions (e.g. after rows are dropped), so
    # resolve the title to a position rather than a label.
    matches = np.flatnonzero((new['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    position = int(matches[0])

    scores = similarity[position]
    # Stable descending sort: ties keep their original row order.
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
    # Exclude the query row explicitly; it is not always ranked first when
    # another row has an identical score.
    ranked = [pos for pos in ranked if pos != position]
    return [new.iloc[pos]['title'] for pos in ranked[:max(n, 0)]]

In [7]:
# Example usage: look up recommendations for one title with the default n
recommendations = recommend(movie='Batman Begins')
print(recommendations)

['The Dark Knight', 'The Dark Knight Rises', 'Batman', 'Batman', 'Batman Returns']


In [8]:
# Write the movie table and similarity matrix to disk (same targets as the
# save performed right after the similarity computation). The `with` blocks
# ensure each file handle is flushed and closed.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)