In [1]:
import pandas as pd
import numpy as np
# Read the two TMDB 5000 CSV files (movies and credits) from the Dataset
# folder; the paths are relative to the notebook's working directory.
movie_df = pd.read_csv('Dataset/tmdb_5000_movies.csv')
moviecredits_df = pd.read_csv('Dataset/tmdb_5000_credits.csv')

In [2]:
# Relabel the four credits columns positionally so its key columns are named
# 'id' and 'title', matching the join keys used below.
moviecredits_df.columns = ['id', 'title', 'cast', 'crew']

# Attach the credits columns to the movies table, matching rows on both id and
# title (pandas' default inner join, so rows without a match are dropped).
# NOTE: this rebinds movie_df, so re-running the cell without reloading the
# CSVs merges the credits columns a second time.
movie_df = pd.merge(movie_df, moviecredits_df, on=['id', 'title'])

# Plot-Based Recommender

In [3]:
# Peek at the first few plot summaries that the recommender is built on.
movie_df.loc[:, 'overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [4]:
# Turn every overview into a Term Frequency-Inverse Document Frequency
# (TF-IDF) vector, one row per movie.
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer only sees text.
movie_df['overview'] = movie_df['overview'].fillna('')

# English stop words are excluded from the vocabulary.
Tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = Tfidf.fit_transform(movie_df['overview'])

# (number of movies, number of vocabulary terms)
tfidf_matrix.shape

(4803, 20978)

In [5]:
# Score the similarity of every pair of movies from their TF-IDF vectors.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity while skipping the
# extra normalisation step that cosine_similarity would perform.
from sklearn.metrics.pairwise import linear_kernel

# Square matrix: entry [i, j] is the similarity between movie i and movie j.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [6]:
# Reverse lookup: movie title -> row label in movie_df. The labels are used as
# row positions into cosine_sim, which relies on movie_df carrying the default
# 0..n-1 index produced by the merge.
indices = pd.Series(movie_df.index, index=movie_df['title'])
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# index instead, keeping the first row for each title, so that indices[title]
# always yields a single scalar.
indices = indices[~indices.index.duplicated(keep='first')]

In [7]:
# Sanity check on a single movie: rank all rows by their similarity to row 3,
# highest first, and show the row numbers of the ten best-scoring entries.
score = pd.Series(cosine_sim[3]).sort_values(ascending=False)
score.head(10).index

Int64Index([3, 65, 299, 428, 1359, 3854, 119, 2507, 9, 1181], dtype='int64')

In [8]:
def get_recommendations(x, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the movies most similar to the movie titled ``x``.

    Parameters
    ----------
    x : str
        Movie title, looked up in the notebook-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose rows follow the row order of
        ``movie_df``. Defaults to the notebook-level ``cosine_sim`` that
        existed when this cell was executed.
    top_n : int, optional
        Maximum number of recommendations to return. Defaults to 10, the
        amount that used to be hard-coded.

    Returns
    -------
    pandas.Series
        Titles taken from ``movie_df['title']``, ordered from most to least
        similar. The queried movie itself is never part of the result.

    Raises
    ------
    KeyError
        If ``x`` is not a label of ``indices``.
    """
    idx = indices[x]
    # When several movies share the title, the lookup yields a Series of rows
    # rather than a scalar; fall back to the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # The Series gets a default 0..n-1 index, so its labels are row positions.
    scores = pd.Series(cosine_sim[idx])
    # Remove the queried movie explicitly instead of assuming it sorts to the
    # top: with tied scores (identical overviews, or an empty overview whose
    # similarities are all zero) skipping "rank 0" could return the query
    # itself and discard a genuine match.
    scores = scores.drop(idx)
    # Stable sort so that tied scores keep their original row order.
    scores = scores.sort_values(ascending=False, kind='mergesort')
    return movie_df['title'].iloc[scores.index[:top_n]]

In [9]:
# Movies whose overviews are closest to that of 'The Avengers'.
get_recommendations(x='The Avengers')

7               Avengers: Age of Ultron
3144                            Plastic
1715                            Timecop
4124                 This Thing of Ours
3311              Thank You for Smoking
3033                      The Corruptor
588     Wall Street: Money Never Sleeps
2136         Team America: World Police
1468                       The Fountain
1286                        Snowpiercer
Name: title, dtype: object

In [10]:
# Movies whose overviews are closest to that of 'The Dark Knight Rises'.
get_recommendations(x='The Dark Knight Rises')

65                              The Dark Knight
299                              Batman Forever
428                              Batman Returns
1359                                     Batman
3854    Batman: The Dark Knight Returns, Part 2
119                               Batman Begins
2507                                  Slow Burn
9            Batman v Superman: Dawn of Justice
1181                                        JFK
210                              Batman & Robin
Name: title, dtype: object