# Popularity Based Recommendation System

In [7]:
import numpy as np
import pandas as pd

In [9]:
ratings = pd.read_csv("D:/RISE - WPU/Internship/archive (2)/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
movies = pd.read_csv("D:/RISE - WPU/Internship/archive (2)/movies.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [14]:
# Merging both the datasets

ratings_movies = pd.merge(movies, ratings, on = 'movieId')
print(ratings_movies.shape)
ratings_movies

(100836, 6)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,18,3.5,1455209816
6,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,4.0,965705637
7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,21,3.5,1407618878
8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,27,3.0,962685262
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,31,5.0,850466616


In [15]:
# Recommending Popular Movies
# Grouping movies together and finding mean ratings

ratings_movies.groupby('title')['rating'].mean().head()

title
'71 (2014)                                 4.0
'Hellboy': The Seeds of Creation (2004)    4.0
'Round Midnight (1986)                     3.5
'Salem's Lot (2004)                        5.0
'Til There Was You (1997)                  4.0
Name: rating, dtype: float64

In [16]:
# Sorting ratings from highest to lowest

ratings_movies.groupby('title')['rating'].mean().sort_values(ascending = False)

title
Karlson Returns (1970)                                5.0
Winter in Prostokvashino (1984)                       5.0
My Love (2006)                                        5.0
Sorority House Massacre II (1990)                     5.0
Winnie the Pooh and the Day of Concern (1972)         5.0
Sorority House Massacre (1986)                        5.0
Bill Hicks: Revelations (1993)                        5.0
My Man Godfrey (1957)                                 5.0
Hellbenders (2012)                                    5.0
In the blue sea, in the white foam. (1984)            5.0
Won't You Be My Neighbor? (2018)                      5.0
Red Sorghum (Hong gao liang) (1987)                   5.0
Love Exposure (Ai No Mukidashi) (2008)                5.0
My Sassy Girl (Yeopgijeogin geunyeo) (2001)           5.0
The Love Bug (1997)                                   5.0
Ballad of Narayama, The (Narayama bushiko) (1983)     5.0
Heidi Fleiss: Hollywood Madam (1995)                  5.0
Louis Th

In [17]:
# Recommending top 10 popular movies

ratings_movies.groupby('title')['rating'].mean().sort_values(ascending = False).head(10)

title
Karlson Returns (1970)                           5.0
Winter in Prostokvashino (1984)                  5.0
My Love (2006)                                   5.0
Sorority House Massacre II (1990)                5.0
Winnie the Pooh and the Day of Concern (1972)    5.0
Sorority House Massacre (1986)                   5.0
Bill Hicks: Revelations (1993)                   5.0
My Man Godfrey (1957)                            5.0
Hellbenders (2012)                               5.0
In the blue sea, in the white foam. (1984)       5.0
Name: rating, dtype: float64

In [18]:
# Calculating how many people have voted for each movie

ratings_movies['title'].value_counts()

Forrest Gump (1994)                                                                                                329
Shawshank Redemption, The (1994)                                                                                   317
Pulp Fiction (1994)                                                                                                307
Silence of the Lambs, The (1991)                                                                                   279
Matrix, The (1999)                                                                                                 278
Star Wars: Episode IV - A New Hope (1977)                                                                          251
Jurassic Park (1993)                                                                                               238
Braveheart (1995)                                                                                                  237
Terminator 2: Judgment Day (1991)               

In [22]:
# Finding the movie rating and count of users

rating_count['rating_counts'] = pd.DataFrame(ratings_movies.groupby('title')['rating'].count())
rating_count.head()

Unnamed: 0_level_0,rating,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.0,1
'Hellboy': The Seeds of Creation (2004),4.0,1
'Round Midnight (1986),3.5,2
'Salem's Lot (2004),5.0,1
'Til There Was You (1997),4.0,2


In [23]:
# Calculating cosine similarity

from math import *

def square_rooted(x):
    return round(sqrt(sum([a*a for a in x])),3)

def cosine_similarity(x,y):
    numerator = sum(a*b for a,b in zip(x,y))
    denominator = square_rooted(x) * square_rooted(y)
    return round(numerator/ float(denominator),3)

print(cosine_similarity([3,45,7,2],[2,54,13,15]))

0.972


# Content Based Recommendation System

In [25]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

In [26]:
movies_metadata = pd.read_csv('D:/RISE - WPU/Internship/archive (1)/movies_metadata.csv')
movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [27]:
movies_metadata['genres'] = movies_metadata['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [28]:
vote_counts = movies_metadata[movies_metadata['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movies_metadata[movies_metadata['vote_average'].notnull()]['vote_average'].astype('int')
vote_mean = vote_averages.mean()
vote_mean

5.244896612406511

In [29]:
vote_quantile = vote_counts.quantile(0.95)
vote_quantile

434.0

In [30]:
movies_metadata['year'] = pd.to_datetime(movies_metadata['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [31]:
qualified = movies_metadata[(movies_metadata['vote_count'] >= vote_quantile) & (movies_metadata['vote_count'].notnull()) & (movies_metadata['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [None]:
# To qualify to be considered for the chart, a movie has to have at least 434 votes. We also see that the average rating for a movie on is 5.244 on a scale of 10. 2274 Movies qualify to be on our chart.

In [33]:
def weighted_rating(x):
    cnt = x['vote_count']
    avg = x['vote_average']
    return (cnt/(cnt+vote_quantile) * avg) + (vote_quantile/(vote_quantile+cnt) * vote_mean)

In [34]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [35]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [37]:
qualified.head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


In [38]:
# For particular genres

s = movies_metadata.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = movies_metadata.drop('genres', axis=1).join(s)

In [39]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    vote_mean = vote_averages.mean()
    vote_quantile = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= vote_quantile) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+vote_quantile) * x['vote_average']) + (vote_quantile/(vote_quantile+x['vote_count']) * vote_mean), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [41]:
build_chart('Adventure').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.906526
22879,Interstellar,2014,11187,8,32.2135,7.883426
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,7.854939
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,7.843867
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,7.832647
256,Star Wars,1977,6778,8,42.1497,7.812801
1225,Back to the Future,1985,6239,8,25.7785,7.797828
1154,The Empire Strikes Back,1980,5998,8,19.471,7.790329
5481,Spirited Away,2001,3968,8,41.0489,7.695056
9698,Howl's Moving Castle,2004,2049,8,16.136,7.465435


# The top radventure movie according to our metrics is Inception.