# Setting up the recommender system environment




In [None]:
! pip install -q kaggle

In [None]:
from google.colab import files

# Opens the Colab upload widget; pick kaggle.json here.
# The result is bound to a name on purpose: left as the cell's last expression,
# the returned mapping (which includes the uploaded file contents, i.e. the
# Kaggle API credentials) would be echoed into the saved notebook output.
uploaded = files.upload()

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the movies dataset archive with the kaggle CLI (uses the kaggle.json
# token installed in ~/.kaggle); the log shows it is saved as the-movies-dataset.zip.
! kaggle datasets download rounakbanik/the-movies-dataset/metadata

Downloading the-movies-dataset.zip to /content
 98% 224M/228M [00:01<00:00, 147MB/s]
100% 228M/228M [00:01<00:00, 148MB/s]


In [None]:
! unzip the-movies-dataset.zip

Archive:  the-movies-dataset.zip
  inflating: credits.csv             
  inflating: keywords.csv            
  inflating: links.csv               
  inflating: links_small.csv         
  inflating: movies_metadata.csv     
  inflating: ratings.csv             
  inflating: ratings_small.csv       


Simple recommender system


In [None]:
import pandas as pd

# low_memory=False silences the mixed-dtype DtypeWarning on import, see
# https://www.roelpeters.be/solved-dtypewarning-columns-have-mixed-types-specify-dtype-option-on-import-or-set-low-memory-in-pandas/
metadata_raw = pd.read_csv('movies_metadata.csv', low_memory=False)

# Parse the release dates before filtering. Comparing the raw strings would let
# any malformed, non-date value through whenever it happens to sort after
# "2000-01-01" (for strings, "22" > "2000-01-01" is True). errors="coerce" turns
# unparseable and missing values into NaT, which never passes the comparison.
release_dates = pd.to_datetime(metadata_raw['release_date'], errors='coerce')

# Keep movies released strictly after 2000-01-01, renumber the rows from 0 and
# keep each row's position in the original file in an "old_index" column.
metadata = (
    metadata_raw[release_dates > pd.Timestamp("2000-01-01")]
    .reset_index()
    .rename(columns={"index": "old_index"})
)
metadata.head()


# from google.colab import data_table
# data_table.DataTable(metadata, num_rows_per_page=10)

In [None]:
# C: mean of the vote_average column over all loaded movies; weighted_rating
# uses it as the baseline rating that low-vote movies are pulled towards.

C = metadata.loc[:, 'vote_average'].mean()
print(C)

5.660172075345689


In [None]:
# m: the 90th percentile of vote_count, i.e. the number of votes a movie must
# have received to count among the most-voted 10% of movies.

m = metadata.loc[:, 'vote_count'].quantile(q=0.90)
print(m)

269.7999999999993


In [None]:
# Qualified movies: those with at least m votes.
# Filter first and copy only the (much smaller) result instead of copying the
# whole frame up front; the explicit copy makes q_movies an independent frame
# that later cells can safely add columns to.
q_movies = metadata.loc[metadata["vote_count"] >= m].copy()
q_movies.shape

(2395, 25)

In [None]:
def weighted_rating(x, m=m , C=C):
  """Blend a movie's own average rating with the baseline rating C.

  x: object indexable by 'vote_count' and 'vote_average' (a DataFrame row
     when used through DataFrame.apply(axis=1)).
  m: weight given to the baseline, expressed as a number of votes;
     defaults to the notebook-level m.
  C: baseline rating; defaults to the notebook-level C.

  Returns votes/(votes+m) * average + m/(m+votes) * C, so the more votes a
  movie has, the closer the result is to its own average.
  """
  votes = x['vote_count']
  average = x['vote_average']

  own_weight = votes/(votes+m)
  baseline_weight = m/(m+votes)
  return (own_weight * average) + (baseline_weight * C)

In [None]:
# weighted_rating only does column arithmetic on 'vote_count' and 'vote_average',
# so it can be given the whole frame once; this yields the same per-row scores as
# apply(axis=1) without a Python-level call for every row.
q_movies['score'] = weighted_rating(q_movies)

In [None]:
# Rank the qualified movies from highest to lowest weighted score.
q_movies = q_movies.sort_values(by="score", ascending=False)

In [None]:
# Show the top of the ranking, limited to the columns that explain the score.
ranking_columns = ['title','vote_count','vote_average','score']
q_movies[ranking_columns].head(20)

Unnamed: 0,title,vote_count,vote_average,score
3743,The Dark Knight,12269.0,8.3,8.243198
10636,Whiplash,4376.0,8.3,8.146695
754,Spirited Away,3968.0,8.3,8.131935
7263,The Intouchables,5410.0,8.2,8.079354
5573,Inception,14075.0,8.1,8.054111
10135,Interstellar,11187.0,8.1,8.042544
1293,The Lord of the Rings: The Return of the King,8226.0,8.1,8.022519
267,Memento,4168.0,8.1,7.951668
512,The Lord of the Rings: The Fellowship of the Ring,8892.0,8.0,7.931096
872,The Lord of the Rings: The Two Towers,7641.0,8.0,7.9202


# Content-Based Recommender

In [None]:
### We will recommend movies based on the text of their description (overview) and how similar it is to the descriptions of other movies.
### E.g. if you love a particular romantic movie, we will find other movies whose descriptions are similar to that movie's description :)

In [None]:
# Peek at the first five plot descriptions.
metadata['overview'].head(n=5)

0    Two Sicilian friends, Nunzio and Pino, share t...
1    An atmospheric coming-of-age story featuring a...
2    The town of Derry has a secret, but no one tol...
3    As her surroundings are invaded by outsiders, ...
4    Using personal stories, this powerful document...
Name: overview, dtype: object

In [None]:
## We use TF-IDF, an NLP weighting scheme, to turn each description into a numeric vector

from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing overviews with empty strings so every row holds text for the vectorizer.
metadata['overview'] = metadata['overview'].fillna('')

# stop_words="english" drops common English words before the terms are weighted.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

# One row per movie, one column per vocabulary term.
tfidf_matrix.shape

(23944, 55383)

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all TF-IDF rows. These equal cosine similarities
# provided the rows are unit-length (the vectorizer's default normalisation;
# re-check this if the TfidfVectorizer settings are changed).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print("this is cosine sim shape",cosine_sim.shape)

# Reverse lookup: movie title -> row number in metadata (and in cosine_sim).
indices = pd.Series(data=metadata.index, index=metadata['title'])
indices

this is cosine sim shape (23944, 23944)


title
Two Friends                      0
Venice                           1
The Sleepover                    2
The Farmer's Wife                3
A Place at the Table             4
                             ...  
Shadow of the Blair Witch    23939
The Burkittsville 7          23940
Century of Birthing          23941
Betrayal                     23942
Queerama                     23943
Length: 23944, dtype: int64

In [None]:
def get_recommendations(title, cosine_sim = cosine_sim, top_n = 10):
  """Return the titles of the movies whose overview is most similar to `title`.

  title:      movie title to look up in the notebook-level `indices` Series.
  cosine_sim: square similarity matrix whose rows/columns follow the row order
              of `metadata`; defaults to the notebook-level matrix.
  top_n:      number of recommendations to return (default 10).

  Returns a Series of titles taken from `metadata`, most similar first.
  Raises KeyError if `title` is not present in `indices`.
  """
  if title not in indices.index:
    raise KeyError(f"Title not found in metadata: {title!r}")

  indx = indices[title]
  # Several movies can share one title; the lookup then returns a Series of row
  # numbers instead of a single one. Use the first match so that cosine_sim[indx]
  # is always a single row of scores.
  if isinstance(indx, pd.Series):
    indx = indx.iloc[0]
  indx = int(indx)

  # Pair every row number with its similarity to the queried movie, leaving the
  # queried movie itself out explicitly: it is not guaranteed to sort first
  # (e.g. another movie with an identical overview ties with it).
  sim_scores = [pair for pair in enumerate(cosine_sim[indx]) if pair[0] != indx]

  sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
  movies_indices = [i[0] for i in sim_scores[:top_n]]
  return metadata['title'].iloc[movies_indices]

In [None]:
# Example query: movies whose description is closest to this title's description.
get_recommendations(title="Guardians of the Galaxy")

10686
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.010805111625656415), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.011485712307745222), (17, 0.0), (18, 0.017182564589177093), (19, 0.0), (20, 0.009183945687913234), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.024464596636198786), (25, 0.0), (26, 0.01981541453914738), (27, 0.021864160776409798), (28, 0.059619304026145806), (29, 0.0), (30, 0.027001951689626238), (31, 0.0), (32, 0.009486799313563473), (33, 0.02317494328587022), (34, 0.014073157383310149), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0), (42, 0.0), (43, 0.0), (44, 0.0), (45, 0.0), (46, 0.0), (47, 0.025193225171883552), (48, 0.0), (49, 0.0), (50, 0.017376806863511028), (51, 0.0), (52, 0.0), (53, 0.0), (54, 0.0), (55, 0.0), (56, 0.0), (57, 0.036623879405706036), (58, 0.0), (59, 0.0), (60, 0.0), (61, 0.0), (62, 0.0), (63, 0.0), (64, 0.0), (65, 0.0), (66, 0.0), (67, 0.0), (6

12209              Guardians of the Galaxy Vol. 2
4484                         Gardens of the Night
16531                                     Forever
3899              Quill:  The Life of a Guide Dog
7321                 Johnny Cash at Folsom Prison
19113                                 Dutch Light
20068    The Fjällbacka Murders: Friends for Life
3751                                          CJ7
10342                    The Amazing Spider-Man 2
20909                             Girl in the Box
Name: title, dtype: object