In [1]:
import pandas as pd
import numpy as np
# Read the two TMDB 5000 CSV files: movie metadata and the cast/crew credits.
movie_df = pd.read_csv('Dataset/tmdb_5000_movies.csv')
moviecredits_df = pd.read_csv('Dataset/tmdb_5000_credits.csv')

In [2]:
# Relabel the four credits columns by position so the key column is named 'id',
# then join the credits onto the movie metadata using both id and title.
moviecredits_df.columns = ['id', 'title', 'cast', 'crew']
movie_df = pd.merge(movie_df, moviecredits_df, on=['id', 'title'])

# Plot Based Recommender

In [3]:
# Peek at the first five plot overviews.
movie_df['overview'].head(5)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [4]:
#Now we'll compute Term Frequency-Inverse Document Frequency (TF-IDF) vectors for each overview
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer only receives text.
movie_df['overview'] = movie_df['overview'].fillna('')

# Build one TF-IDF vector per overview, ignoring common English stop words.
Tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = Tfidf.fit_transform(movie_df['overview'])

# Rows are movies, columns are vocabulary terms.
tfidf_matrix.shape

(4803, 20978)

In [5]:
#Use cosine similarity as the numeric measure of how similar two movies are.
#TfidfVectorizer L2-normalises each row by default, so the plain dot product (linear_kernel) equals the cosine similarity and is faster than calling cosine_similarity.
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between every pair of overview vectors.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [6]:
# Map each movie title to its row position in movie_df.
# Series.drop_duplicates() compares the *values* (the row positions, which are
# always unique), so it never removed repeated titles; a repeated title then
# made indices[title] return a Series instead of a single position.
# Deduplicate on the title index instead, keeping the first occurrence.
indices = pd.Series(movie_df.index, index=movie_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [7]:
# Rank every movie by its similarity to the movie in row 3 (highest first)
# and show the row labels of the ten best-scoring entries.
score = pd.Series(cosine_sim[3]).sort_values(ascending=False)
score.head(10).index

Int64Index([3, 65, 299, 428, 1359, 3854, 119, 2507, 9, 1181], dtype='int64')

In [8]:
def get_recommendations(x,cosine_sim = cosine_sim):
    """Return the titles of the ten movies most similar to the movie titled ``x``.

    Parameters
    ----------
    x : str
        Movie title; looked up in the module-level ``indices`` Series.
    cosine_sim : array-like
        Pairwise similarity matrix whose row order matches ``movie_df``.
        Defaults to the ``cosine_sim`` matrix that exists when this cell runs.

    Returns
    -------
    pandas.Series
        Up to ten titles from ``movie_df``, most similar first.

    Raises
    ------
    KeyError
        If ``x`` is not a title in ``indices``.
    """
    idx = indices[x]
    # A title shared by several movies yields a Series of positions; use the
    # first one so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    scores = pd.Series(cosine_sim[idx])
    # Remove the query movie by its position rather than assuming it sorts
    # first: ties (or an all-zero row) can put a different movie at the top.
    scores = scores.drop(idx).sort_values(ascending=False)
    return movie_df['title'].iloc[scores.index[:10]]

In [9]:
# Plot-based recommendations for 'The Avengers' (default similarity matrix).
get_recommendations(x='The Avengers')

7               Avengers: Age of Ultron
3144                            Plastic
1715                            Timecop
4124                 This Thing of Ours
3311              Thank You for Smoking
3033                      The Corruptor
588     Wall Street: Money Never Sleeps
2136         Team America: World Police
1468                       The Fountain
1286                        Snowpiercer
Name: title, dtype: object

In [10]:
# Plot-based recommendations for 'The Dark Knight Rises' (default similarity matrix).
get_recommendations(x='The Dark Knight Rises')

65                              The Dark Knight
299                              Batman Forever
428                              Batman Returns
1359                                     Batman
3854    Batman: The Dark Knight Returns, Part 2
119                               Batman Begins
2507                                  Slow Burn
9            Batman v Superman: Dawn of Justice
1181                                        JFK
210                              Batman & Robin
Name: title, dtype: object

# Keyword Based Recommendation


In [11]:
from ast import literal_eval

# Columns whose cells are parsed from their string form into Python objects.
features = ['cast', 'crew', 'keywords', 'genres']

for feature in features:
    # Only parse raw strings: literal_eval raises on anything else, so without
    # this guard the cell fails when re-run (values already parsed) or when a
    # cell is missing (NaN). Non-string values are passed through unchanged.
    movie_df[feature] = movie_df[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )


In [12]:
#getting director for the movie
def get_director(x):
    """Return the name of the first crew entry whose job is "Director".

    ``x`` is an iterable of crew dicts carrying "job" and "name" keys.
    ``np.nan`` is returned when no entry is a director.
    """
    director_names = (member["name"] for member in x if member["job"] == "Director")
    return next(director_names, np.nan)

# Derive a 'director' column from each movie's crew list.
movie_df["director"] = movie_df["crew"].map(get_director)

In [13]:
#getting the top 3 names from the cast, keywords and genres
def get_list(x):
    """Return the 'name' of at most the first three entries of ``x``.

    ``x`` is expected to be a list of dicts that each have a 'name' key.
    Any non-list input (for example a missing value) yields an empty list.
    """
    if not isinstance(x, list):
        return []
    # Collect every name first, then keep only the leading three.
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

# Reduce each list-valued column to at most three names per movie.
features = ['cast', 'keywords', 'genres']

for feature in features:
    movie_df[feature] = movie_df[feature].map(get_list)

In [14]:
# Inspect the extracted metadata for the first five movies.
movie_df.loc[:, ['title', 'cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island]","[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent]","[Action, Adventure, Crime]"
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan,"[dc comics, crime fighter, terrorist]","[Action, Crime, Drama]"
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton,"[based on novel, mars, medallion]","[Action, Adventure, Science Fiction]"


In [15]:
#removing the space and converting into lowercase
def clean_data(x):
    """Lower-case ``x`` and strip every space from it.

    A list is cleaned element by element and returned as a new list, a string
    is cleaned directly, and any other value becomes the empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

# Normalise every metadata column so that multi-word names become one token.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    movie_df[feature] = movie_df[feature].map(clean_data)

In [16]:
#creating a single string containing all the information
def create_soup(x):
    """Concatenate a row's keywords, cast, director and genres into one string.

    The three list fields are each space-joined first; the four resulting
    pieces are then joined with single spaces in the order
    keywords, cast, director, genres.
    """
    pieces = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(pieces)

# Build the metadata soup row by row.
movie_df['soup'] = movie_df.apply(create_soup, axis='columns')

In [17]:
# Show the soup strings of the first five movies.
movie_df['soup'].head()

0    cultureclash future spacewar samworthington zo...
1    ocean drugabuse exoticisland johnnydepp orland...
2    spy basedonnovel secretagent danielcraig chris...
3    dccomics crimefighter terrorist christianbale ...
4    basedonnovel mars medallion taylorkitsch lynnc...
Name: soup, dtype: object

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Token counts over the metadata soup, ignoring common English stop words.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movie_df['soup'])

# Pairwise cosine similarity between the count vectors of all movies.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# reset_index() stores the old index in a new 'index' column, so running this
# cell again would keep adding columns; only reset when that column is absent.
if 'index' not in movie_df.columns:
    movie_df = movie_df.reset_index()

# Title -> row position lookup used by get_recommendations. Keep only the first
# row of a repeated title so a lookup always yields a single position.
indices = pd.Series(movie_df.index, index=movie_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [20]:
# Metadata-based recommendations for 'The Avengers'.
get_recommendations('The Avengers', cosine_sim=cosine_sim2)

7                   Avengers: Age of Ultron
26               Captain America: Civil War
169      Captain America: The First Avenger
174                     The Incredible Hulk
79                               Iron Man 2
85      Captain America: The Winter Soldier
182                                 Ant-Man
31                               Iron Man 3
68                                 Iron Man
1294                               Serenity
Name: title, dtype: object

In [21]:
# Metadata-based recommendations for 'The Dark Knight Rises'.
get_recommendations('The Dark Knight Rises', cosine_sim=cosine_sim2)

119                Batman Begins
65               The Dark Knight
4638    Amidst the Devil's Wings
1196                The Prestige
3073           Romeo Is Bleeding
3326              Black November
1503                      Takers
1986                      Faster
1253               Kiss of Death
3725                 The Sweeney
Name: title, dtype: object