In [None]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
from ast import literal_eval

In [None]:
md = pd.read_csv('../input/the-movies-dataset/movies_metadata.csv')
md.head()

In [None]:
print('The data has {0} rows'.format(len(md)))

In [None]:
md.dtypes

In [None]:
# Turn the stringified list of genre dicts into a plain list of genre names.
# literal_eval only accepts strings, so values that are already parsed (this
# cell was run before) are passed through unchanged, which keeps the cell
# re-runnable.
md['genres'] = (
    md['genres']
    .fillna('[]')
    .apply(lambda raw: literal_eval(raw) if isinstance(raw, str) else raw)
    .apply(
        lambda parsed: [g['name'] if isinstance(g, dict) else g for g in parsed]
        if isinstance(parsed, list) else []
    )
)
md.head()

In [None]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_average = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_average.mean()
C

In [None]:
m = vote_counts.quantile(0.95)
m

In [None]:
# Release year as a string; dates that are missing or fail to parse become NaN.
# NaN never compares equal to anything, so an `x != np.nan` test cannot detect
# missing dates (they would end up as the string 'NaT'); pd.notnull is used
# instead.
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan
)
md['year']

In [None]:
# Movies with at least m votes and a known average rating.
# A single .loc selection followed by .copy() yields an independent frame, so
# the dtype conversions below do not assign into a slice of md.
quantified = md.loc[
    md['vote_count'].notnull() & md['vote_average'].notnull() & (md['vote_count'] >= m),
    ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres'],
].copy()
quantified['vote_count'] = quantified['vote_count'].astype('int')
quantified['vote_average'] = quantified['vote_average'].astype('int')
quantified

In [None]:
def weighted_rating(x, min_votes=None, mean_vote=None):
    """Return the weighted rating of one movie row.

    The row's own average is blended with a mean vote; the more votes the
    movie has relative to ``min_votes``, the closer the result stays to the
    movie's own average.

    Parameters
    ----------
    x : mapping with 'vote_count' and 'vote_average' entries
        One movie (e.g. a DataFrame row passed by ``apply(..., axis=1)``).
    min_votes : number, optional
        Vote-count threshold. Defaults to the module-level ``m``.
    mean_vote : number, optional
        Mean vote to blend toward. Defaults to the module-level ``C``.

    Returns
    -------
    float
        ``v/(v+min_votes) * R + min_votes/(v+min_votes) * mean_vote``.
    """
    # The globals are only read when no explicit value is given, so callers
    # that pass both arguments are unaffected by later rebinding of m or C.
    if min_votes is None:
        min_votes = m
    if mean_vote is None:
        mean_vote = C
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + min_votes) * R) + (min_votes / (v + min_votes) * mean_vote)

In [None]:
quantified['wr'] = quantified.apply(weighted_rating, axis = 1)
quantified = quantified.sort_values('wr', ascending = False).head(250)
quantified

In [None]:
quantified.head(15)

In [None]:
# One entry per (movie, genre), indexed by the movie's row label. explode()
# does this directly instead of building a Series per row and stacking;
# dropna() removes the placeholder explode() emits for empty genre lists.
s = md['genres'].explode().dropna()
s.name = 'genre'
# Left join on the index: movies without any genre keep one row with NaN genre.
gen_md = md.drop('genres', axis = 1).join(s)
gen_md

In [None]:
def build_chart(genre, percentile = 0.85):
    """Build the top-250 chart for one genre, ranked by weighted rating.

    Parameters
    ----------
    genre : str
        Value matched against the 'genre' column of the module-level ``gen_md``.
    percentile : float, default 0.85
        Quantile of the genre's vote counts used as the minimum number of
        votes a movie needs to qualify.

    Returns
    -------
    pandas.DataFrame
        Up to 250 rows with columns title, year, vote_count, vote_average,
        popularity and wr, sorted by wr in descending order.
    """
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df['vote_count'].dropna().astype('int')
    vote_averages = df['vote_average'].dropna().astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # Select rows and columns in one .loc call and copy, so the assignments
    # below act on an independent frame rather than a slice of gen_md.
    keep = df['vote_count'].notnull() & df['vote_average'].notnull() & (df['vote_count'] >= m)
    qualified = df.loc[keep, ['title', 'year', 'vote_count', 'vote_average', 'popularity']].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Same weighted-rating formula as before, evaluated column-wise.
    v = qualified['vote_count']
    qualified['wr'] = (v / (v + m) * qualified['vote_average']) + (m / (m + v) * C)

    return qualified.sort_values('wr', ascending=False).head(250)

In [None]:
# Top 15 rows of the Romance chart.
build_chart(genre='Romance').head(n=15)

## Content Based Recommender

In [None]:
links_small = pd.read_csv('../input/the-movies-dataset/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
links_small

In [None]:
md = md.drop([19730, 29503, 35587])
md

In [None]:
md['id'] = md['id'].astype('int')
smd = md[md['id'].isin(links_small)]
smd

### Movie Description Based Recommender 

In [None]:
smd['tagline'] = smd['tagline'].fillna('')


In [None]:
smd

In [None]:
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')
smd.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1,2), min_df = 0, stop_words = 'english')
tfidf_matrix = tf.fit_transform(smd['description'])
tfidf_matrix.shape

In [None]:
cosine_sim = tfidf_matrix.dot(tfidf_matrix.T)
cosine_sim = cosine_sim.toarray()
cosine_sim[0]

In [None]:
# Renumber the rows 0..n-1; the old labels are kept in an 'index' column.
smd = smd.reset_index()
titles = smd.loc[:, 'title']
# Lookup from movie title to row number.
indices = pd.Series(data=smd.index, index=smd.loc[:, 'title'])
indices

In [None]:
def get_recommendations(title):
    """Return the titles of the 30 movies most similar to ``title``.

    Uses the module-level ``indices`` (title -> row number), ``cosine_sim``
    (pairwise similarity matrix) and ``titles`` (row number -> title).

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.

    Returns
    -------
    pandas.Series
        Up to 30 titles, ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # When several movies share the title the lookup yields a Series of row
    # numbers; use the first match so exactly one similarity row is read.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    scores = np.asarray(cosine_sim[idx])
    # Stable descending order: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query movie by position instead of assuming it ranks first.
    ranked = ranked[ranked != idx][:30]
    return titles.iloc[ranked]

In [None]:
get_recommendations('The Godfather')

In [None]:
get_recommendations('The Dark Knight')

In [None]:
credits = pd.read_csv('../input/the-movies-dataset/credits.csv')
keywords = pd.read_csv('../input/the-movies-dataset/keywords.csv')

In [None]:
credits.head()

In [None]:
keywords.head()

In [None]:
md.head()

In [None]:
credits['id'] = credits['id'].astype('int')
keywords['id'] = keywords['id'].astype('int')
md['id'] = md['id'].astype('int')

In [None]:
md.shape

In [None]:
md = md.merge(credits, on = 'id')
md = md.merge(keywords, on = 'id')

In [None]:
md.shape

In [None]:
smd = md[md['id'].isin(links_small)]
smd.head()

In [None]:
smd['cast'] = smd['cast'].apply(literal_eval) 
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))
smd.head()

In [None]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    ``x`` is an iterable of crew dicts with 'job' and 'name' keys. When no
    entry has the job 'Director', np.nan is returned.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

In [None]:
smd['director'] = smd['crew'].apply(get_director)
smd.head()

In [None]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)
smd

In [None]:
smd['keywords'][0]

In [None]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd

In [None]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(' ','')) for i in x])
smd.head()

In [None]:
# Normalise the director name (lower-case, spaces removed) and repeat it three
# times — presumably to give the director more weight in the soup.
# A missing director becomes an empty list: casting NaN to str would create
# the token 'nan', shared by every director-less movie. Values that are
# already lists (cell re-run) are left untouched.
smd['director'] = smd['director'].apply(
    lambda x: x if isinstance(x, list)
    else ([x.replace(" ", "").lower()] * 3 if isinstance(x, str) else [])
)
smd.head()

In [None]:
s = smd.apply(lambda x: pd.Series(x['keywords']), axis = 1).stack().reset_index(level = 1, drop = True)
s.name = 'keyword'
s

In [None]:
s = s.value_counts()
s[:5]

In [None]:
s = s[s>1]

In [None]:
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer; applied to the keyword lists further down.
stemmer  = SnowballStemmer('english')
# Quick demonstration of the stemmer on a plural noun.
stemmer.stem('dogs')

In [None]:
def filter_keywords(x):
    """Return the entries of ``x`` that are found in ``s``, in their original order.

    ``s`` is the module-level keyword-count Series; the ``in`` operator on a
    pandas Series tests membership in its index labels.
    """
    return [keyword for keyword in x if keyword in s]

In [None]:
# In one pass per movie: keep the keywords accepted by filter_keywords, stem
# them, then remove spaces and lower-case the result.
smd['keywords'] = smd['keywords'].apply(
    lambda kws: [stemmer.stem(kw).replace(" ", "").lower() for kw in filter_keywords(kws)]
)
smd.head()

In [None]:
# Concatenate the four token lists of each movie and join them into one
# space-separated string.
smd['soup'] = (smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']).apply(' '.join)
smd.head()

In [None]:
count = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])
count_matrix

In [None]:
cosine_sim = count_matrix.dot(count_matrix.T).toarray()
cosine_sim

In [None]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [None]:
get_recommendations('The Dark Knight').head(10)

In [None]:
get_recommendations('Mean Girls').head(10)

## Collaborative Filtering
Collaborative filtering is based on the idea that users similar to me can be used to predict how much I will like a particular product or service that those users have used or experienced but I have not.

In [None]:
ratings  = pd.read_csv('../input/the-movies-dataset/ratings_small.csv')
ratings

In [None]:
m = ratings['movieId']
r = ratings['rating']
xxx = [(m[i], r[i]) for i in range(len(m))]
ratings['xxx'] = xxx
ratings

In [None]:
ratings_group = ratings.groupby(by = 'userId')['xxx'].apply(list).reset_index()
ratings_group

In [None]:
movieid = list(enumerate(np.sort(ratings['movieId'].unique())))
movieid = [x[::-1] for x in movieid]
movieid = dict(movieid)
userid = list(enumerate(np.sort(ratings['userId'].unique())))
userid = [x[::-1] for x in userid]
userid = dict(userid)
rating_matrix = np.zeros((len(userid), len(movieid)))
rating_matrix.shape

In [None]:
# Write every user's ratings into that user's row of rating_matrix.
# userid's values are the row positions, in the same order as its keys.
for user_pos in userid.values():
    for movie_key, score in ratings_group['xxx'].iloc[user_pos]:
        rating_matrix[user_pos, movieid[movie_key]] = score
rating_matrix        