In [1]:
# Recommender systems - Suggest similar items based on user's profile and search or views.

# 2 Types of Systems
# 1) Collaborative Filtering - Recommedns items based on similarity measures between users or items. The basic assumption is 
#    users with similar interest have common preferences. Collaborative Filtering uses a user-item matrix to generate 
#    recommendation system. 
#    The matrix contains values that indicates users preference towards a given item. User preference values can be both
#    a) Explicit feedback - Users Ratings, Scores, Stars, etc.
#    b) Implicit feedback - Indirect user behaviour like number of times watched, purchased, or usage.

# 2) Content Based Recommendation System - In this system overview or story or feedback or written feedback is colleced and
#    using NLP to process the content convert into TFIDF matrix. After which Similarity is calculated using distance metrics
#    like Cosine, Euclidean, etc. to find similar items.



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
moviedf = pd.read_csv('movies_metadata.csv',low_memory=False)

In [5]:
moviedf.shape

(45466, 24)

In [4]:
moviedf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [6]:
moviedf.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [8]:
moviedf.isnull().sum()*100/len(moviedf)

adult                     0.000000
belongs_to_collection    90.115691
budget                    0.000000
genres                    0.000000
homepage                 82.883913
id                        0.000000
imdb_id                   0.037391
original_language         0.024194
original_title            0.000000
overview                  2.098271
popularity                0.010997
poster_path               0.848986
production_companies      0.006598
production_countries      0.006598
release_date              0.191352
revenue                   0.013197
runtime                   0.578454
spoken_languages          0.013197
status                    0.191352
tagline                  55.104914
title                     0.013197
video                     0.013197
vote_average              0.013197
vote_count                0.013197
dtype: float64

In [10]:
C = moviedf.vote_average.mean()
C

5.618207215134185

In [13]:
m = moviedf.vote_count.quantile(0.90)
m

160.0

In [14]:
top_movies = moviedf.copy().loc[moviedf.vote_count>=m]

In [16]:
top_movies.shape

(4555, 24)

In [18]:
def weighted_rating(x,m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m)*R) + (m/(m+v)*C)

In [19]:
top_movies['weighted_rating'] = top_movies.apply(weighted_rating, axis = 1)

In [21]:
top_movies = top_movies.sort_values('weighted_rating', ascending = False)

In [22]:
top_movies[['title','weighted_rating']].head(10)

Unnamed: 0,title,weighted_rating
314,The Shawshank Redemption,8.445869
834,The Godfather,8.425439
10309,Dilwale Dulhania Le Jayenge,8.421453
12481,The Dark Knight,8.265477
2843,Fight Club,8.256385
292,Pulp Fiction,8.251406
522,Schindler's List,8.206639
23673,Whiplash,8.205404
5481,Spirited Away,8.196055
2211,Life Is Beautiful,8.187171


In [24]:
movie_mat = top_movies.pivot_table(index = 'id', columns='title', values='weighted_rating')

In [25]:
movie_mat.head()

title,'71,(500) Days of Summer,10 Cloverfield Lane,10 Things I Hate About You,10 Years,"10,000 BC",101 Dalmatians,101 Dalmatians II: Patch's London Adventure,102 Dalmatians,11.22.63,...,[REC]²,[REC]³ Genesis,[REC]⁴ Apocalypse,eXistenZ,iBoy,xXx,xXx: Return of Xander Cage,xXx: State of the Union,¡Three Amigos!,Æon Flux
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100,,,,,,,,,,,...,,,,,,,,,,
10003,,,,,,,,,,,...,,,,,,,,,,
100042,,,,,,,,,,,...,,,,,,,,,,
10009,,,,,,,,,,,...,,,,,,,,,,
10010,,,,,,,,,,,...,,,,,,,,,,


In [26]:
# Content Based Recommender Systems

moviedf.overview.head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
tfidf = TfidfVectorizer(stop_words="english")

In [30]:
moviedf.overview = moviedf.overview.fillna('')

In [31]:
tfidf_matrix = tfidf.fit_transform(moviedf.overview)

In [32]:
tfidf_matrix.shape

(45466, 75827)

In [None]:
# Cosine similarity is widely used in sentance segments.

In [35]:
from sklearn.metrics.pairwise import linear_kernel

In [36]:
cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)

In [37]:
indices = pd.Series(moviedf.index, index = moviedf.title).drop_duplicates()

In [38]:
def get_recommend(title, cosine_sim = cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores,key = lambda x:x[1], reverse=False)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return moviedf.title.iloc[movie_indices]

In [46]:
get_recommend('Jumanji')

4        Father of the Bride Part II
6                            Sabrina
7                       Tom and Huck
9                          GoldenEye
11       Dracula: Dead and Loving It
12                             Balto
14                  Cutthroat Island
15                            Casino
16             Sense and Sensibility
18    Ace Ventura: When Nature Calls
Name: title, dtype: object

In [54]:
from surprise import NormalPredictor,Dataset,Reader,SVD
from surprise.model_selection import cross_validate

In [55]:
data = Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to C:\Users\Hp/.surprise_data/ml-100k


In [58]:
algo = SVD()

In [59]:
cross_validate(algo, data, measures=['RMSE','MAE'], cv = 5, verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9447  0.9339  0.9312  0.9364  0.9389  0.9370  0.0046  
MAE (testset)     0.7443  0.7324  0.7317  0.7413  0.7429  0.7385  0.0054  
Fit time          0.99    1.01    0.85    0.89    0.85    0.92    0.07    
Test time         0.20    0.15    0.15    0.15    0.15    0.16    0.02    


{'test_rmse': array([0.94471857, 0.93386079, 0.93117122, 0.93640002, 0.93891221]),
 'test_mae': array([0.74428782, 0.7323616 , 0.73173409, 0.74131291, 0.74291204]),
 'fit_time': (0.9919836521148682,
  1.0145654678344727,
  0.8459200859069824,
  0.8854517936706543,
  0.8507156372070312),
 'test_time': (0.20499801635742188,
  0.1549997329711914,
  0.15400075912475586,
  0.1490004062652588,
  0.1510004997253418)}

In [None]:
reader = Reader(rating_scale=(moviedf.vote_count.min(),
                             ))