In [188]:
#Author: Samridddha KC 
#Based on: https://www.datacamp.com/community/tutorials/recommender-systems-python
#Puporse: Getting Started on Building my own Reccomendation Systems 


'''Steps to build a basic reccomener System.
    Decide on the metric or score to rate movies on.
    Calculate the score for every movie.
    Sort the movies based on the score and output the top results.
'''

#Packages 
import pandas as pd 
import numpy as np
from ast import literal_eval 
from sklearn.metrics.pairwise import linear_kernel

In [190]:
#Visualize Data 
metadata=pd.read_csv('/Users/samriddhakc/Desktop/the-movies-dataset/movies_metadata.csv', low_memory=False)
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [191]:
#Formula for IMDB Rating for movies 
#Weighted Rating (WR) = ((v/(v+m)*R)+(m/(v+m)*C)
#v is the number of votes for the movie;
#m is the minimum votes required to be listed in the chart;
#R is the average rating of the movie; And
#C is the mean vote across the whole report


C=metadata['vote_average'].mean()
#m is a parameter that needs to be tuned.
#It is more like a threshold value to decide to either choose 
#or not choose a value 
m=metadata['vote_count'].quantile(0.90)
v_for_movies=metadata['vote_count']
#columns for the metadata 
metadata.columns.values 
metadata['imdb_id']
filter_movies=metadata.copy().loc[metadata['vote_count']>=m]
R=filter_movies['vote_average']
V=filter_movies['vote_count']

In [192]:
print("Before Filter",metadata.shape);
print("After filter",filter_movies.shape);

Before Filter (45466, 24)
After filter (4555, 24)


In [193]:
jcopy=metadata.copy().loc[1]

In [194]:
#Filter ratings 
#Calculate Weighted Ratings 
#One way to compute the Weighted Rating 
#Method 1 
#This method uses a filter to have a basic cutoff based on the vote_count which
#is a threshhold set at 90% percentile, Then a score is used by taking into rating 
#based on the voting ratio
WR=(V/(V+m).values)*R +(m/(V+m).values)*C
WR

0        7.640253
1        6.820293
4        5.660700
5        7.537201
8        5.556626
           ...   
45177    4.959104
45204    6.671272
45258    6.590372
45265    6.344369
45343    4.791783
Length: 4555, dtype: float64

In [195]:
#Functional Way.
def weighted_rating(x,m=m,C=C):
    v=x['vote_count'];
    R=x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)  

In [196]:
filter_movies['score'] = filter_movies.apply(weighted_rating, axis=1)

In [197]:
#Note this is consistent with Method 1 but this is more efficient because 
#score can be easily sotres as a new column of the filter_movie data
filter_movies['score']

0        7.640253
1        6.820293
4        5.660700
5        7.537201
8        5.556626
           ...   
45177    4.959104
45204    6.671272
45258    6.590372
45265    6.344369
45343    4.791783
Name: score, Length: 4555, dtype: float64

In [198]:
filter_movies.sort_values(by=['score'],ascending=False,inplace=True)

In [199]:
#Simple ranking based on the Imdb ratings
filter_movies[['title', 'vote_count', 'vote_average', 'score']].head(15)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


In [201]:
#Content-Based Reccomender in Python
#This filtering method is there to create a more aware reccomender 
#where the reccomendations are based on the content. 
#The idea for this type of filter is to use TF-IDF(Term Frequency-Inverse Document Frequency) to compare the Overviews by converting text to word vectors 
#The Idea behind TF-IDF is to increase the importance of repeating words, 
#and focus more on them to come up with bag of words 
filter_movies['overview'].head()

314      Framed in the 1940s for the double murder of h...
834      Spanning the years 1945 to 1955, a chronicle o...
10309    Raj is a rich, carefree, happy-go-lucky second...
12481    Batman raises the stakes in his war on crime. ...
2843     A ticking-time-bomb insomniac and a slippery s...
Name: overview, dtype: object

In [202]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Stop words are words like "the", "a", which are not of too much 
#and can be ignored 
tfidf=TfidfVectorizer(stop_words="english")
#an empty string cannot have a comparision with other stringss but NaN value might be thought of as a word 
metadata['overview']=metadata['overview'].fillna('');
tfidf_matrix=tfidf.fit_transform(metadata['overview'])

In [203]:
#This produces a word vector for every movie where each overview is represenrted by bag of words as an n dimensional vector
length=metadata['overview'].count()
index=np.arange(length)
df_words_vector=pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names(),index=index)

In [204]:
# The overview of movie represented as vectors based on the movie overivews 
df_words_vector.shape

(45466, 75827)

In [205]:
df_words_vector.head(3)

Unnamed: 0,00,000,000km,000th,001,006,007,008,009,0093,...,ようなもの,患者さんとその世界,水俣,海難1890,見鬼10,주식회사,찾기,첫사랑,ﬁrst,ﬁve
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [206]:
#Next Step: Calculate similarity between contents of the words/through vectors
#Use Cosine similarity for our purposes
#This works well because, for same/similar word vector in df_words_vector the consine is 
#1 or tends to 1. For completely different/opposite the vector coul be antiparallel and
#have a score of -1.
#Note, the similarity score will range from -1 to 1. 

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)




In [207]:
cosine_sim.shape
cosine_sim[0][0]=-1

In [208]:
#Sanity check to see, if the closest element to one of the word seems to be realistically working 
def similar_movies_content_based(movie_name):
    arr=np.where(metadata['title'].values==movie_name)
    arr_attr=arr[0]
    if (len(arr_attr)==0):
        print("No Movie avaiable on the list")
    else: 
        similar_indices=cosine_sim[arr_attr.tolist()[0]].argsort()[-6:][::-1]
        print("Movie "+metadata['title'].values[arr_attr.tolist()[0]])
        print("Similar Movies:")
        for i in similar_indices: 
            print(metadata['title'].values[i])


In [209]:
similar_movies_content_based("Toy Story")

Movie Toy Story
Similar Movies:
Toy Story 3
Toy Story 2
The 40 Year Old Virgin
Small Fry
Andy Hardy's Blonde Trouble
Hot Splash


In [210]:
#Sanity Check to Check Similar Movies. 
#similar_movies("Toy Story")
#Context Based filtering
#Put a Movie you want to and ger recommendations! 
similar_movies_content_based("The Dark Knight Rises")

Movie The Dark Knight Rises
Similar Movies:
The Dark Knight Rises
The Dark Knight
Batman Forever
Batman Returns
Batman: Under the Red Hood
Batman


In [211]:
#The flaw with the previous approach is that, while we get a decent comparison for movies based 
#on the plot, there can be comparision based on the genre, director, and many other factors. 
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [212]:
metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [213]:
credits=pd.read_csv('/Users/samriddhakc/Desktop/the-movies-dataset/credits.csv', low_memory=False)
keywords=pd.read_csv('/Users/samriddhakc/Desktop/the-movies-dataset/keywords.csv', low_memory=False)

In [214]:
credits.head(3)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602


In [215]:
keywords.head(3)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [216]:
#drop  things that have wrong ids 
metadata=metadata.drop([19730,29503,35587])
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id']=metadata['id'].astype('int')

In [217]:
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

In [218]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [219]:
metadata.loc[0,'cast']

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [220]:
#extract director, and cast from the data 
features=['cast','crew','keywords','genres']
for feature in features: 
    metadata[feature]=metadata[feature].apply(literal_eval)

In [221]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [166]:
metadata.loc[0,'cast'][:2]

[{'cast_id': 14,
  'character': 'Woody (voice)',
  'credit_id': '52fe4284c3a36847f8024f95',
  'gender': 2,
  'id': 31,
  'name': 'Tom Hanks',
  'order': 0,
  'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'},
 {'cast_id': 15,
  'character': 'Buzz Lightyear (voice)',
  'credit_id': '52fe4284c3a36847f8024f99',
  'gender': 2,
  'id': 12898,
  'name': 'Tim Allen',
  'order': 1,
  'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}]

In [167]:
def extract_director(x):
    for i in x: 
        if (i['job']=='Director'):
            return i['name']
    return np.nan

In [168]:
def get_list(x):
    
    if (isinstance(x,list)):
        names=[i['name'] for i in x]
        if (len(names)>3):
            names=names[:3]
        return names 

    return []
    

In [169]:
metadata['director']=metadata['crew'].apply(extract_director)
for feature in features:
    metadata[feature]=metadata[feature].apply(get_list)

In [170]:
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [171]:
#This helps to avoid the TFidvector from counting two names with the same first name
#but different last name as the same. 

def clean_data(x):
    if isinstance(x,list):
          return [str.lower(i.replace(" ", "")) for i in x]
    else: 
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [172]:
features = ['cast', 'keywords', 'director', 'genres','crew']
for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

In [173]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[animation, comedy, family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,"[tomhanks, timallen, donrickles]","[johnlasseter, josswhedon, andrewstanton]","[jealousy, toy, boy]",johnlasseter
1,False,,65000000,"[adventure, fantasy, family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[robinwilliams, jonathanhyde, kirstendunst]","[larryj.franco, jonathanhensleigh, jameshorner]","[boardgame, disappearance, basedonchildren'sbook]",joejohnston
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[romance, comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[waltermatthau, jacklemmon, ann-margret]","[howarddeutch, markstevenjohnson, markstevenjo...","[fishing, bestfriend, duringcreditsstinger]",howarddeutch


In [174]:
#note space is there to not mix up the words 
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [175]:
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [176]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,cast,crew,keywords,director,soup
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[animation, comedy, family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,"[tomhanks, timallen, donrickles]","[johnlasseter, josswhedon, andrewstanton]","[jealousy, toy, boy]",johnlasseter,jealousy toy boy tomhanks timallen donrickles ...
1,False,,65000000,"[adventure, fantasy, family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[robinwilliams, jonathanhyde, kirstendunst]","[larryj.franco, jonathanhensleigh, jameshorner]","[boardgame, disappearance, basedonchildren'sbook]",joejohnston,boardgame disappearance basedonchildren'sbook ...
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[romance, comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[waltermatthau, jacklemmon, ann-margret]","[howarddeutch, markstevenjohnson, markstevenjo...","[fishing, bestfriend, duringcreditsstinger]",howarddeutch,fishing bestfriend duringcreditsstinger walter...


In [177]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
cosine_sim3=linear_kernel(count_matrix,count_matrix)

In [178]:
cosine_sim2_attr=linear_kernel(count_matrix, count_matrix)

In [179]:
#Sanity check to see, if the closest element to one of the word seems to be realistically working 
def similar_movies(movie_name,measure):
    arr=np.where(metadata['title'].values==movie_name)
    arr_attr=arr[0]
    if (len(arr_attr)==0):
        print("No Movie avaiable on the list")
    else: 
        similar_indices=measure[arr_attr.tolist()[0]].argsort()[-6:][::-1]
        print("Movie: "+metadata['title'].values[arr_attr.tolist()[0]])
        print("Similar Movies:")
        for i in similar_indices: 
            print(metadata['title'].values[i])

In [184]:
#Content filtering with cast,generes,crew,keywords
print("Using improved Cotent Based filtering")
similar_movies("Toy Story",cosine_sim2)

Using improved Cotent Based filtering
Movie: Toy Story
Similar Movies:
Toy Story
Toy Story 2
Toy Story 3
Superstar Goofy
Toy Story That Time Forgot
Toy Story of Terror!


In [185]:
print("Using improved Cotent Based filtering with linear Kernel")
similar_movies("Toy Story",cosine_sim2_attr)

Using improved Cotent Based filtering with linear Kernel
Movie: Toy Story
Similar Movies:
Toy Story
Toy Story 2
Toy Story 3
Anina
Toy Story That Time Forgot
Partysaurus Rex


In [186]:
print("Using improved Cotent Based filtering with cosine similarity")
similar_movies("The Dark Knight Rises",cosine_sim2)

Using improved Cotent Based filtering with cosine similarity
Movie: The Dark Knight Rises
Similar Movies:
The Dark Knight Rises
The Dark Knight
Batman Begins
Shiner
Amongst Friends
Mitchell


In [187]:
print("Using improved Cotent Based filtering with linear Kernel")
similar_movies("The Dark Knight Rises",cosine_sim2_attr)

Using improved Cotent Based filtering with linear Kernel
Movie: The Dark Knight Rises
Similar Movies:
The Dark Knight Rises
The Dark Knight
Batman Begins
Mitchell
Romeo Is Bleeding
Quicksand


metadata