In [13]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [48]:
meta = pd.read_csv("data/movies_metadata.csv", low_memory=False)
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

In [49]:
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [50]:
meta['genres'] = meta['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
meta['production_companies']= meta['production_companies'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
meta['production_countries'] = meta['production_countries'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
meta['spoken_languages'] = meta['spoken_languages'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
meta['year'] = pd.to_datetime(meta['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
meta = meta.drop([19730, 29503, 35587])
meta['id'] = meta['id'].astype('int')

In [51]:
meta = meta.merge(credits, on='id')
meta = meta.merge(keywords, on='id')
meta['cast'] = meta['cast'].apply(literal_eval)
meta['crew'] = meta['crew'].apply(literal_eval)
meta['keywords'] = meta['keywords'].apply(literal_eval)
meta['cast_size'] = meta['cast'].apply(lambda x: len(x))
meta['crew_size'] = meta['crew'].apply(lambda x: len(x))

In [52]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

meta['director'] = meta['crew'].apply(get_director)

In [53]:
meta['cast'] = meta['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
meta['cast'] = meta['cast'].apply(lambda x: x[:4] if len(x) >=4 else x)
meta['keywords'] = meta['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
meta['cast'] = meta['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
meta['director'] = meta['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

In [54]:
meta.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,False,7.7,5415.0,1995,"[tomhanks, timallen, donrickles, jimvarney]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friends, riva...",13,106,johnlasseter
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,False,6.9,2413.0,1995,"[robinwilliams, jonathanhyde, kirstendunst, br...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board game, disappearance, based on children'...",26,16,joejohnston
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,False,6.5,92.0,1995,"[waltermatthau, jacklemmon, ann-margret, sophi...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best friend, duringcreditsstinger, o...",7,4,howarddeutch
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,False,6.1,34.0,1995,"[whitneyhouston, angelabassett, lorettadevine,...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based on novel, interracial relationship, sin...",10,10,forestwhitaker
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,False,5.7,173.0,1995,"[stevemartin, dianekeaton, martinshort, kimber...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife crisis, confidence, aging, daug...",12,7,charlesshyer


In [55]:
meta = meta.sort_values('vote_average', ascending=False)
meta.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size,director
22543,False,,0,"[Mystery, Crime, Drama]",,388182,tt2669832,fa,هیچ کجا هیچ کس,,...,False,10.0,1.0,2013,"[mohammadrezaforutan, mahnazafshar, saberabar,...","[{'credit_id': '56eadbd5925141388d00011a', 'de...",[iranian],8,3,ebrahimsheibani
3187,False,,0,[Documentary],,49477,tt0192069,en,Gendernauts: A Journey Through Shifting Identi...,Monika Treut explores the worlds and thoughts ...,...,False,10.0,2.0,1999,[],"[{'credit_id': '532c85cec3a3685fac0000c8', 'de...","[lgbt, woman director]",0,1,monikatreut
36822,False,,0,"[Drama, Action, Adventure]",,211139,tt0058289,en,Leone di Tebe,"Fleeing Troy in the wake of its destruction, f...",...,False,10.0,1.0,1964,"[markforest, yvonnefurneaux, massimoserato, pi...","[{'credit_id': '5486b30892514176030010bc', 'de...","[sword and sandal, peplum]",7,10,giorgioferroni
14513,False,,130,[Documentary],,234937,tt0906743,en,Forever,Père-Lachaise - one of the world's most famous...,...,False,10.0,1.0,2006,"[yoshinokimura, bertrandbeyern]","[{'credit_id': '52fe4e3cc3a36847f82853ff', 'de...","[cemetery, artist, grave, interview, woman dir...",2,1,heddyhonigmann
22250,False,,0,[],,143534,tt0015493,en,The White Shadow,The White Shadow is a British drama film direc...,...,False,10.0,1.0,1924,"[bettycompson, clivebrook, henryvictor, a.b.im...","[{'credit_id': '52fe4b0e9251416c750f6d65', 'de...",[twins],5,8,grahamcutts


In [56]:
vote_counts = meta[meta['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = meta[meta['vote_average'].notnull()]['vote_average'].astype('int')

C = vote_averages.mean()
m = vote_counts.quantile(0.96)

def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [57]:
q_movies = meta.copy().loc[meta['vote_count'] >= m]
q_movies.shape

(1866, 31)

In [58]:
q_movies['weight_score'] = q_movies.apply(weighted_rating, axis=1)

In [59]:
q_movies = q_movies.sort_values('weight_score', ascending=False)
q_movies[['title', 'vote_count', 'vote_average', 'weight_score']].head(20)

Unnamed: 0,title,vote_count,vote_average,weight_score
314,The Shawshank Redemption,8358.0,8.5,8.293154
841,The Godfather,6024.0,8.5,8.219894
12589,The Dark Knight,12269.0,8.3,8.165002
2870,Fight Club,9678.0,8.3,8.130857
292,Pulp Fiction,8670.0,8.3,8.112397
351,Forrest Gump,8147.0,8.2,8.007633
15651,Inception,14075.0,8.1,7.989386
23076,Interstellar,11187.0,8.1,7.962206
522,Schindler's List,4436.0,8.3,7.953599
23868,Whiplash,4376.0,8.3,7.949393


In [75]:
features = ['keywords','cast','genres','director']

for feature in features:
   meta[feature] = meta[feature].fillna('')

def combine_features(row):
    keywords = " ".join(row['keywords'])
    genres = " ".join(row['genres'])
    try:
        return keywords +" "+row['cast']+" "+genres+" "+row["director"]
    except:
        return genres + keywords

meta["combined_features"] = meta.apply(combine_features,axis=1)

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cv = CountVectorizer()
count_matrix = cv.fit_transform(meta["combined_features"])

cosine_sim = cosine_similarity(count_matrix) 
sim_df = pd.DataFrame(cosine_sim, index=meta.title, columns=meta.title)
sim_df.head()

title,هیچ کجا هیچ کس,Gendernauts: A Journey Through Shifting Identities,The Lion of Thebes,Forever,The White Shadow,Macbeth,Claymation Comedy of Horrors,Birch Interval,Birch Interval,Vessel of Wrath,...,A Small Act,24 Hours On Craigslist,The Genius of Marie Curie: The Woman Who Lit up the World,The Mole at the Sea,Paul Taylor Creative Domain,Queerama,NaN,NaN,NaN,NaN
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
هیچ کجا هیچ کس,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149071,0.0,0.0
Gendernauts: A Journey Through Shifting Identities,0.0,1.0,0.0,0.471405,0.0,0.0,0.0,0.0,0.0,0.0,...,0.408248,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0
The Lion of Thebes,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.408248,0.408248,0.204124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.204124,0.0,0.096225,0.096225
Forever,0.0,0.471405,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.288675,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0
The White Shadow,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
movie_user_likes = "Avatar"
sim_df[movie_user_likes].sort_values(ascending=False)[:20]

title
Avatar                                1.000000
Rogue One: A Star Wars Story          0.560404
Babylon 5: A Call to Arms             0.542326
Icarus XB 1                           0.532291
Mr. Blot in Space                     0.529256
Alien³                                0.516150
Star Wreck: In the Pirkinning         0.514877
Crash of Moons                        0.513436
Aliens                                0.511310
Planet of the Apes                    0.511310
Star Trek Into Darkness               0.509427
Approaching the Unknown               0.506171
Dante 01                              0.504878
Mission to Mars                       0.504101
Journey to the Far Side of the Sun    0.494312
Alien                                 0.490098
Mission to Mir                        0.477665
Conquest of Space                     0.476331
Moonraker                             0.475349
Prince of Space                       0.457330
Name: Avatar, dtype: float64

In [82]:
movie_user_likes = "Toy Story"
sim_df[movie_user_likes].sort_values(ascending=False)[:20]

title
Toy Story                          1.000000
Toy Story 3                        0.526077
Small Soldiers                     0.520000
Pinocchio                          0.417029
Ted                                0.404145
Child's Play                       0.402492
Barbie and the Three Musketeers    0.400000
Child's Play 2                     0.400000
Toy Story 2                        0.395132
The Indian in the Cupboard         0.361814
Toy Story That Time Forgot         0.357771
Hawaiian Vacation                  0.353553
Toys                               0.353553
Halfaouine: Boy of the Terraces    0.346410
Life-Size                          0.339550
Alma                               0.339550
Ted 2                              0.332820
Home Alone 3                       0.329983
Trolls                             0.326599
Monster House                      0.316228
Name: Toy Story, dtype: float64

In [97]:
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

ratings = pd.read_csv("data/ratings_small.csv")
ratings = ratings[["userId", "movieId", "rating"]]
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
