Importing dependencies

In [76]:
import numpy as np
import pandas as pd 
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer

loading dataset

In [77]:
movies_data = pd.read_csv('tmdb_5000_movies.csv')
credits_data = pd.read_csv('tmdb_5000_credits.csv')

In [78]:
# movies_data.head()
# credits_data.head()

Combining datasets and selecting appropriate features to use

In [79]:
movies = pd.merge(movies_data,credits_data,left_on=['id'],right_on=['movie_id'])
# movies.head()

In [80]:
movies = movies[['id','title_x','genres','keywords','overview','cast','crew']]
# movies.head()

In [81]:
movies.isna().sum()

id          0
title_x     0
genres      0
keywords    0
overview    3
cast        0
crew        0
dtype: int64

In [82]:
def transform(x):
    resp = []
    for i in ast.literal_eval(x):
        resp.append(i['name'])
    return resp

In [83]:
movies['genres'] = movies['genres'].apply(transform)

In [84]:
movies['keywords'] = movies['keywords'].apply(transform)
# movies['keywords'].head()

In [85]:
def transform_cast(x):
    count = 0
    resp = []
    for i in ast.literal_eval(x):
        resp.append(i['name'])
        count +=1
        if count ==3:
            break
    return resp

In [86]:
movies['cast'] = movies['cast'].apply(transform_cast)
# movies['cast'].head()

In [87]:
def transform_crew(x):
    resp = []
    for i in ast.literal_eval(x):
        if  i['job'] == 'Director':
            resp.append(i['name'])
            break
    return resp

In [88]:
movies['crew'] = movies['crew'].apply(transform_crew)
# movies['crew'].head()

In [89]:
movies['overview'] = movies['overview'].apply(lambda x: str(x).split())
# movies['overview'].head(2)

In [90]:
# combine multiple words ina name to single
movies['cast'] = movies['cast'].apply(lambda x: [i.lower().replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.lower().replace(" ","") for i in x])
movies['genres'] = movies['genres'].apply(lambda x: [i.lower().replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.lower().replace(" ","") for i in x])
# movies.head()

In [91]:
movies['tags'] = movies['cast'] + movies['crew'] + movies['genres'] + movies['keywords'] + movies['overview']
movies['title'] = movies['title_x']
# movies.head()

In [92]:
movies_df = movies[['id','title','tags']]
# movies_df.iloc[0,2]

In [93]:
ps = PorterStemmer()
movies_df['tags'] = movies_df['tags'].apply(lambda x: [ps.stem(i)for i in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['tags'] = movies_df['tags'].apply(lambda x: [ps.stem(i)for i in x])


In [94]:
movies_df['tags'] = movies_df['tags'].apply(lambda x:" ".join(x))
# movies_df['tags'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['tags'] = movies_df['tags'].apply(lambda x:" ".join(x))


In [95]:
cv = CountVectorizer(max_features=5000,stop_words='english')
vectors = cv.fit_transform(movies_df['tags']).toarray()
vectors.shape

(4803, 5000)

In [96]:
words = cv.get_feature_names_out()

In [97]:
from sklearn.metrics.pairwise import cosine_similarity

In [101]:
similarity = cosine_similarity(vectors)
similarity.shape

(4803, 4803)

In [99]:
def recommend_movie(movie):
    #TODO implement method to give movies similar to all movies with the same name ex,multiple movies with name batman are present
    movie_indx = movies_df[movies_df['title']==movie].index
    if movie_indx.size != 0:
        movie_indx = movie_indx[0]
        distances = list(enumerate(similarity[movie_indx]))
        movies_list = sorted(distances,reverse=True,key=lambda x:x[1])[1:6]

        for i in movies_list:
            print(movies_df.iloc[i[0]].title)
    else:
        print("Movie not present in DB")

In [100]:
recommend_movie('The Avengers')

Iron Man 3
Avengers: Age of Ultron
Captain America: Civil War
Captain America: The First Avenger
Iron Man


In [103]:
movies_dump = movies_df[['title','id']]
movies_dump.shape

(4803, 2)

In [104]:
import pickle

movie_file = open('movie_dump.pkl','wb')

pickle.dump(movies_dump,movie_file)
movie_file.close()

In [105]:
sim_file = open('similarity.pkl','wb')
pickle.dump(similarity,sim_file)
sim_file.close()