In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
#from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [3]:
# Read the movie metadata CSV into `md` and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; it must point at a
# local copy of movies_metadata.csv for the notebook to run elsewhere.
metadata_csv = 'C:/Users/Mayank Gokhale/Desktop/movies_metadata.csv'
md = pd.read_csv(metadata_csv)
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
def _genre_names(raw):
    """Turn one raw `genres` cell into a plain list of genre names.

    `raw` is either the stringified list of ``{'id': ..., 'name': ...}`` dicts
    read from the CSV, or a list produced by an earlier run of this cell.
    Anything that does not evaluate to a list yields ``[]``.
    """
    # Only strings go through literal_eval: it raises ValueError on a list,
    # which is what made this cell fail when executed a second time.
    parsed = literal_eval(raw) if isinstance(raw, str) else raw
    if not isinstance(parsed, list):
        return []
    # Dict entries are reduced to their 'name'; entries that are already
    # plain names (second run) are kept unchanged.
    return [g['name'] if isinstance(g, dict) else g for g in parsed]


# Missing genres are treated as an empty list.
md['genres'] = md['genres'].fillna('[]').apply(_genre_names)

In [5]:
# Global vote statistics used by the weighted-rating formula.
# Missing values are dropped before the integer cast.
vote_counts = md['vote_count'].dropna().astype('int')
# NOTE(review): the int cast truncates each average (e.g. 7.7 -> 7) before the
# mean is taken, so C is lower than the mean of the raw averages.
vote_averages = md['vote_average'].dropna().astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [6]:
# Minimum number of votes needed to enter the chart: the 95th percentile of
# all vote counts.
m = vote_counts.quantile(q=0.95)
m

434.0

In [7]:
# Release year as a string (e.g. '1995'); missing or unparseable dates become NaN.
# The null check must use pd.notnull: NaN/NaT never compare equal to anything,
# so `x != np.nan` is always True and would turn missing dates into the
# string 'NaT' instead of NaN.
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan
)

In [8]:
# Movies eligible for the overall chart: at least `m` votes and a known average.
# `vote_count >= m` is already False for missing counts, so no separate null
# check on vote_count is needed.
has_enough_votes = (md['vote_count'] >= m) & md['vote_average'].notnull()
chart_columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']
# Select with .loc and take an explicit copy so the casts below write to an
# independent frame instead of a chained slice of `md` (SettingWithCopyWarning).
qualified = md.loc[has_enough_votes, chart_columns].copy()
qualified['vote_count'] = qualified['vote_count'].astype('int')
# NOTE(review): the int cast truncates the average (7.7 -> 7); kept so the
# ratings stay on the same scale as the truncated global mean C.
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [9]:
def weighted_rating(x, min_votes=None, mean_vote=None):
    """Return the vote-count-weighted rating of one movie record.

    The score blends the movie's own average ``R`` with a mean rating ``C``,
    weighting ``R`` by ``v / (v + m)`` and ``C`` by ``m / (v + m)``, where
    ``v`` is the movie's vote count and ``m`` the vote threshold.

    Parameters
    ----------
    x : mapping or pandas.Series
        Record providing ``'vote_count'`` and ``'vote_average'``.
    min_votes : number, optional
        Vote threshold ``m``. Defaults to the notebook-level ``m``.
    mean_vote : number, optional
        Mean rating ``C``. Defaults to the notebook-level ``C``.

    Returns
    -------
    number
        The weighted rating.
    """
    # Fall back to the notebook globals only when no explicit value is given,
    # so `qualified.apply(weighted_rating, axis=1)` keeps working unchanged.
    if min_votes is None:
        min_votes = m
    if mean_vote is None:
        mean_vote = C
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+min_votes) * R) + (min_votes/(min_votes+v) * mean_vote)

In [10]:
# Score every eligible movie with weighted_rating, then keep the 250
# highest-scoring rows in descending order of 'wr'.
qualified = (
    qualified
    .assign(wr=lambda frame: frame.apply(weighted_rating, axis=1))
    .sort_values('wr', ascending=False)
    .head(250)
)

In [11]:
# Preview the first 15 rows of the overall chart.
qualified.head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


In [12]:
# Build `gen_md`: one row per (movie, genre) pair, so charts can filter on a
# single genre.
# Series.explode (pandas >= 0.25) repeats a movie's index label once per list
# element, which avoids constructing a pd.Series for every row. An empty genre
# list explodes to NaN; dropna() removes it, so such movies end up with a
# single NaN 'genre' row from the left join below.
s = md['genres'].explode().dropna()
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [13]:
def build_chart(genre, percentile=0.85, source=None):
    """Build the top-250 chart for a single genre, ranked by weighted rating.

    Parameters
    ----------
    genre : str
        Value to match in the ``'genre'`` column.
    percentile : float, default 0.85
        Quantile of the genre's vote counts used as the minimum-votes
        threshold ``m``.
    source : pandas.DataFrame, optional
        Frame with one row per (movie, genre) pair and the columns ``genre``,
        ``title``, ``year``, ``vote_count``, ``vote_average`` and
        ``popularity``. Defaults to the notebook-level ``gen_md``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title, year, vote_count, vote_average, popularity, wr``,
        sorted by ``wr`` descending and limited to 250 rows. A genre with no
        rows gives an empty frame with the same columns.
    """
    if source is None:
        source = gen_md
    df = source[source['genre'] == genre]

    # Genre-level statistics for the weighted-rating formula.
    vote_counts = df['vote_count'].dropna().astype('int')
    # NOTE(review): the int cast truncates averages (7.7 -> 7) before the mean.
    vote_averages = df['vote_average'].dropna().astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # `vote_count >= m` is False for missing counts (and for every row when m
    # is NaN), so it also covers the null check on vote_count.
    keep = (df['vote_count'] >= m) & df['vote_average'].notnull()
    # Explicit copy: the column assignments below must not write to a slice.
    qualified = df.loc[keep, ['title', 'year', 'vote_count', 'vote_average', 'popularity']].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Column-wise arithmetic; unlike a row-wise apply it also works when no
    # row qualifies.
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

    return qualified.sort_values('wr', ascending=False).head(250)

In [14]:
# First 15 rows of the Romance chart (highest weighted rating first).
build_chart(genre='Romance').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.565285
351,Forrest Gump,1994,8147,8,48.3072,7.971357
876,Vertigo,1958,1162,8,18.2082,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.8451,7.745154
1132,Cinema Paradiso,1988,834,8,14.177,7.744878
19901,Paperman,2012,734,8,7.19863,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.9943,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


In [28]:
# First 15 rows of the Comedy chart (highest weighted rating first).
build_chart(genre='Comedy').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.463024
351,Forrest Gump,1994,8147,8,48.3072,7.963363
1225,Back to the Future,1985,6239,8,25.7785,7.952358
18465,The Intouchables,2011,5410,8,16.0869,7.945207
22841,The Grand Budapest Hotel,2014,4644,8,14.442,7.936384
2211,Life Is Beautiful,1997,3643,8,39.395,7.91943
732,Dr. Strangelove or: How I Learned to Stop Worr...,1964,1472,8,9.80398,7.809073
3342,Modern Times,1936,881,8,8.15956,7.695554
883,Some Like It Hot,1959,835,8,11.8451,7.680781
1236,The Great Dictator,1940,756,8,9.24175,7.651762


In [30]:
# First 15 rows of the Action chart (highest weighted rating first).
build_chart(genre='Action').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.955099
12481,The Dark Knight,2008,12269,8,123.167,7.94861
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,7.929579
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,7.924031
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,7.918382
256,Star Wars,1977,6778,8,42.1497,7.908327
1154,The Empire Strikes Back,1980,5998,8,19.471,7.896841
4135,Scarface,1983,3017,8,11.2997,7.802046
9430,Oldboy,2003,2000,8,10.6169,7.711649
1910,Seven Samurai,1954,892,8,15.0178,7.426145


In [32]:
# First 15 rows of the Western chart (highest weighted rating first).
build_chart(genre='Western').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
1159,"The Good, the Bad and the Ugly",1966,2371,8,16.7888,7.935622
1166,Once Upon a Time in the West,1968,1160,8,15.5894,7.871129
20051,Django Unchained,2012,10297,7,19.785,6.989664
31865,The Revenant,2015,6558,7,23.5012,6.983814
28131,The Hateful Eight,2015,4405,7,20.3288,6.97599
16278,True Grit,2010,1701,7,10.7308,6.938889
39663,Hell or High Water,2016,1304,7,12.565896,6.920955
1221,Unforgiven,1992,1133,7,10.9421,6.909519
583,Dances with Wolves,1990,1084,7,11.6543,6.905605
3559,For a Few Dollars More,1965,988,7,13.2953,6.896865


In [15]:
# First 15 rows of the Fantasy chart (highest weighted rating first).
build_chart(genre='Fantasy').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,7.888126
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,7.879484
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,7.870711
3030,The Green Mile,1999,4166,8,19.9668,7.772216
5481,Spirited Away,2001,3968,8,41.0489,7.76188
9698,Howl's Moving Castle,2004,2049,8,16.136,7.574941
2884,Princess Mononoke,1997,2041,8,17.1667,7.573545
5833,My Neighbor Totoro,1988,1730,8,13.5073,7.511144
926,It's a Wonderful Life,1946,1103,8,15.0316,7.306584
14551,Avatar,2009,12114,7,185.071,6.94855


In [16]:
# First 15 rows of the Adventure chart (highest weighted rating first).
build_chart(genre='Adventure').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.906526
22879,Interstellar,2014,11187,8,32.2135,7.883426
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,7.854939
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,7.843867
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,7.832647
256,Star Wars,1977,6778,8,42.1497,7.812801
1225,Back to the Future,1985,6239,8,25.7785,7.797828
1154,The Empire Strikes Back,1980,5998,8,19.471,7.790329
5481,Spirited Away,2001,3968,8,41.0489,7.695056
9698,Howl's Moving Castle,2004,2049,8,16.136,7.465435


In [17]:
# First 15 rows of the Family chart (highest weighted rating first).
build_chart(genre='Family').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
1225,Back to the Future,1985,6239,8,25.7785,7.893053
359,The Lion King,1994,5520,8,21.6058,7.879754
5481,Spirited Away,2001,3968,8,41.0489,7.835635
5833,My Neighbor Totoro,1988,1730,8,13.5073,7.650968
926,It's a Wonderful Life,1946,1103,8,15.0316,7.490637
19901,Paperman,2012,734,8,7.19863,7.301918
39386,Piper,2016,487,8,11.243161,7.071694
20779,Wolf Children,2012,483,8,10.2495,7.066709
31658,Feast,2014,420,8,7.36566,6.98049
25044,Song of the Sea,2014,420,8,6.96736,6.98049


In [18]:
# First 15 rows of the Crime chart (highest weighted rating first).
build_chart(genre='Crime').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
12481,The Dark Knight,2008,12269,8,123.167,7.957677
292,Pulp Fiction,1994,8670,8,140.95,7.940522
314,The Shawshank Redemption,1994,8358,8,51.6454,7.938355
834,The Godfather,1972,6024,8,41.1093,7.915273
46,Se7en,1995,5915,8,18.4574,7.913765
586,The Silence of the Lambs,1991,4549,8,4.30722,7.889007
289,Leon: The Professional,1994,4293,8,20.4773,7.882696
3030,The Green Mile,1999,4166,8,19.9668,7.879291
1057,Reservoir Dogs,1992,3821,8,12.2203,7.868957
1178,The Godfather: Part II,1974,3418,8,36.6293,7.854398


In [19]:
# First 15 rows of the Thriller chart (highest weighted rating first).
build_chart(genre='Thriller').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.95646
12481,The Dark Knight,2008,12269,8,123.167,7.950165
292,Pulp Fiction,1994,8670,8,140.95,7.929996
46,Se7en,1995,5915,8,18.4574,7.898573
24860,The Imitation Game,2014,5895,8,31.5959,7.898242
586,The Silence of the Lambs,1991,4549,8,4.30722,7.869538
11354,The Prestige,2006,4510,8,16.9456,7.868463
289,Leon: The Professional,1994,4293,8,20.4773,7.862142
4099,Memento,2000,4168,8,15.4508,7.858217
1213,The Shining,1980,3890,8,19.6116,7.848633


In [20]:
# First 15 rows of the Horror chart (highest weighted rating first).
build_chart(genre='Horror').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
1213,The Shining,1980,3890,8,19.6116,7.901294
1176,Psycho,1960,2405,8,36.8263,7.843335
1171,Alien,1979,4564,7,23.3774,6.941936
41492,Split,2016,4461,7,28.920839,6.940631
14236,Zombieland,2009,3655,7,11.063,6.927969
1158,Aliens,1986,3282,7,21.7612,6.920081
21276,The Conjuring,2013,3169,7,14.9017,6.917338
42169,Get Out,2017,2978,7,36.894806,6.912248
1338,Jaws,1975,2628,7,19.7261,6.901088
8147,Shaun of the Dead,2004,2479,7,14.9029,6.895426


In [21]:
# First 15 rows of the Animation chart (highest weighted rating first).
build_chart(genre='Animation').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
359,The Lion King,1994,5520,8,21.6058,7.909339
5481,Spirited Away,2001,3968,8,41.0489,7.875933
9698,Howl's Moving Castle,2004,2049,8,16.136,7.772103
2884,Princess Mononoke,1997,2041,8,17.1667,7.771305
5833,My Neighbor Totoro,1988,1730,8,13.5073,7.735274
40251,Your Name.,2016,1030,8,34.461252,7.58982
5553,Grave of the Fireflies,1988,974,8,0.010902,7.570962
19901,Paperman,2012,734,8,7.19863,7.465676
39386,Piper,2016,487,8,11.243161,7.285132
20779,Wolf Children,2012,483,8,10.2495,7.281198


In [22]:
# First 15 rows of the History chart (highest weighted rating first).
build_chart(genre='History').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
24860,The Imitation Game,2014,5895,8,31.5959,7.960225
522,Schindler's List,1993,4436,8,41.7251,7.94745
38244,11.22.63,2016,213,8,5.791978,7.253742
6669,Come and See,1985,174,8,12.6432,7.150132
6833,The Passion of Joan of Arc,1928,159,8,8.90395,7.102189
8601,Andrei Rublev,1966,139,8,6.48985,7.029166
8553,Harakiri,1962,136,8,9.28697,7.017176
1919,Saving Private Ryan,1998,5148,7,21.7581,6.974913
21878,12 Years a Slave,2013,3787,7,30.3162,6.966145
108,Braveheart,1995,3404,7,20.7551,6.962452


In [23]:
# First 15 rows of the Science Fiction chart (highest weighted rating first).
build_chart(genre='Science Fiction').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.939069
22879,Interstellar,2014,11187,8,32.2135,7.923728
256,Star Wars,1977,6778,8,42.1497,7.876106
1225,Back to the Future,1985,6239,8,25.7785,7.865868
1154,The Empire Strikes Back,1980,5998,8,19.471,7.860722
1163,A Clockwork Orange,1971,3432,8,17.1126,7.764533
1901,Metropolis,1927,666,8,14.4879,7.078592
14551,Avatar,2009,12114,7,185.071,6.952299
17818,The Avengers,2012,12000,7,89.8876,6.951856
23753,Guardians of the Galaxy,2014,10014,7,53.2916,6.942571


In [24]:
# First 15 rows of the Mystery chart (highest weighted rating first).
build_chart(genre='Mystery').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.966674
46,Se7en,1995,5915,8,18.4574,7.922088
11354,The Prestige,2006,4510,8,16.9456,7.89877
4099,Memento,2000,4168,8,15.4508,7.890815
9430,Oldboy,2003,2000,8,10.6169,7.782445
877,Rear Window,1954,1531,8,17.9113,7.722961
896,Citizen Kane,1941,1244,8,15.8119,7.667293
876,Vertigo,1958,1162,8,18.2082,7.647028
5157,Rashomon,1950,471,8,9.88735,7.274827
3315,Double Indemnity,1944,425,8,6.49432,7.220079


In [26]:
# First 15 rows of the War chart (highest weighted rating first).
build_chart(genre='War').head(n=15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
24860,The Imitation Game,2014,5895,8,31.5959,7.965906
522,Schindler's List,1993,4436,8,41.7251,7.954908
1165,Apocalypse Now,1979,2112,8,13.5963,7.907246
5857,The Pianist,2002,1927,8,14.8116,7.898725
732,Dr. Strangelove or: How I Learned to Stop Worr...,1964,1472,8,9.80398,7.869161
5553,Grave of the Fireflies,1988,974,8,0.010902,7.807732
43190,Band of Brothers,2001,725,8,7.903731,7.748752
1138,Paths of Glory,1957,565,8,8.54852,7.687067
6669,Come and See,1985,174,8,12.6432,7.217728
13605,Inglourious Basterds,2009,6598,7,16.8956,6.982462
