In [1]:
import numpy as np
import pandas as pd


In [2]:
movies = pd.read_csv('model/data/tmdb_5000_movies.csv')
credits = pd.read_csv('model/data/tmdb_5000_credits.csv') 

In [3]:
movies = movies.merge(credits,on='title')

In [4]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [5]:
import ast

In [6]:
# this function converts normal string to understandable python format
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 

In [7]:
movies.dropna(inplace=True)

In [8]:
movies['genres'] = movies['genres'].apply(convert)

In [9]:
movies['keywords'] = movies['keywords'].apply(convert)

In [10]:
movies['cast'] = movies['cast'].apply(convert)

In [11]:
# below lambda function picks top 3 cast from list of cast
movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

In [12]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 

In [13]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [14]:
def remove_white_space(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [15]:
movies['cast'] = movies['cast'].apply(remove_white_space)
movies['crew'] = movies['crew'].apply(remove_white_space)
movies['genres'] = movies['genres'].apply(remove_white_space)
movies['keywords'] = movies['keywords'].apply(remove_white_space)

In [16]:
# this lambda function converts plain string to list of string 
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [17]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [18]:
cleaned_movies = movies.drop(columns=['overview','genres','keywords','cast','crew'])

In [19]:
# this lambda function converts a list of tags into normal plain string
cleaned_movies['tags'] = cleaned_movies['tags'].apply(lambda x: " ".join(x))

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')  

In [21]:
vector = cv.fit_transform(cleaned_movies['tags']).toarray()

In [22]:
vector.shape

(4806, 5000)

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
similarity = cosine_similarity(vector)

In [25]:
def get_recommendation(movie):
    try:
        index = cleaned_movies[cleaned_movies['title'] == movie].index[0]
        distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
        for i in distances[1:6]:
            print(cleaned_movies.iloc[i[0]].title)
    except:
        print("Movie Not Found")
        
    

In [26]:
get_recommendation('Spectre')

Quantum of Solace
Never Say Never Again
Skyfall
Thunderball
From Russia with Love


In [27]:
import pickle

In [28]:
pickle.dump(cleaned_movies,open('model/movies.pkl','wb'))
pickle.dump(similarity,open('model/similarities.pkl','wb'))