**Simple Recommender System**

The Full MovieLens Dataset comprises 26 million ratings and 750,000 tag applications from 270,000 users on all 45,000 movies in the dataset. It can be accessed from the official GroupLens website.

In [1]:
# Import Pandas
import pandas as pd

# Read the movie metadata CSV into a DataFrame; low_memory=False makes pandas
# parse the file in one pass instead of in chunks
metadata = pd.read_csv(
    'movies_metadata.csv',
    low_memory=False,
)

# Preview the first three rows
metadata.head(n=3)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92


In [2]:
# Report the (rows, columns) dimensions of the loaded metadata frame
metadata.shape

(2000, 24)

In [3]:
# C: the mean of the 'vote_average' column over the whole metadata frame
C = metadata.loc[:, 'vote_average'].mean()
print(C)


6.06225


In [4]:
# m: the 90th-percentile vote count, used as the minimum number of votes
# a movie needs in order to appear in the chart
m = metadata.loc[:, 'vote_count'].quantile(q=0.9)
print(m)


855.1000000000001


In [5]:
# Keep only the movies whose vote count reaches the threshold m, as an
# independent copy so that columns added later do not touch `metadata`
q_movies = metadata.loc[metadata['vote_count'].ge(m)].copy()
q_movies.shape


(200, 24)

In [6]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating for one movie record.

    Args:
        x: Mapping-like record (e.g. a DataFrame row) providing
            'vote_count' and 'vote_average'.
        m: Vote-count weight given to the prior mean ``C``; defaults to the
            notebook-level ``m`` at definition time.
        C: Prior mean rating; defaults to the notebook-level ``C`` at
            definition time.

    Returns:
        A blend of the movie's own average and ``C``, weighted by
        ``votes / (votes + m)`` and ``m / (m + votes)`` respectively.
    """
    votes = x['vote_count']
    rating = x['vote_average']

    # Calculation based on the IMDB formula
    own_part = votes / (votes + m) * rating
    prior_part = m / (m + votes) * C
    return own_part + prior_part


v is the number of votes for the movie;

m is the minimum votes required to be listed in the chart;

R is the average rating of the movie;

C is the mean vote across the whole report.

In [7]:
# Add a 'score' column by evaluating `weighted_rating()` once per row
q_movies['score'] = q_movies.apply(func=weighted_rating, axis='columns')


In [8]:
# Rank the qualified movies from the highest to the lowest weighted score
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show the 20 best-scoring movies with the columns that explain the ranking
q_movies.loc[:, ['title', 'vote_count', 'vote_average', 'score']].head(20)


Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358,8.5,8.273744
834,The Godfather,6024,8.5,8.196978
292,Pulp Fiction,8670,8.3,8.09911
351,Forrest Gump,8147,8.2,7.996937
522,Schindler's List,4436,8.3,7.938355
1154,The Empire Strikes Back,5998,8.2,7.933261
256,Star Wars,6778,8.1,7.871721
1178,The Godfather: Part II,3418,8.3,7.852199
289,Leon: The Professional,4293,8.2,7.844919
46,Se7en,5915,8.1,7.842621


**Content-Based Recommender**

In [9]:
# Show the plot overview text of the first five movies
metadata['overview'].head(n=5)

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [10]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer only receives text
metadata['overview'] = metadata['overview'].fillna('')

# TF-IDF vectorizer that discards common English stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix in a single step
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

# Dimensions of the resulting matrix: (overviews, vocabulary terms)
tfidf_matrix.shape


(2000, 13918)

In [11]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all TF-IDF rows. TfidfVectorizer L2-normalises
# each row by default, so these dot products are the cosine similarities.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)


In [12]:
# The similarity matrix is square: one row and one column per movie
cosine_sim.shape


(2000, 2000)

In [13]:
# Similarity of the movie at row 1 to every movie (its own entry is at position 1)
cosine_sim[1]


array([0.01729311, 1.        , 0.04843596, ..., 0.        , 0.        ,
       0.01222168])

In [14]:
#Construct a reverse map from movie titles to row indices
indices = pd.Series(metadata.index, index=metadata['title'])

# Series.drop_duplicates() compares the values (the row indices, which never
# repeat), so it cannot remove repeated titles. De-duplicate on the title
# index instead, keeping the first occurrence, so that indices[title] is
# always a single row index.
indices = indices[~indices.index.duplicated(keep='first')]


In [15]:
# Preview the first ten title -> row index entries
indices[:10]


title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [16]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to ``title``.

    Args:
        title: Movie title to look up in the notebook-level ``indices`` map.
        cosine_sim: Square pairwise similarity matrix whose rows and columns
            follow the row order of ``metadata``.

    Returns:
        A Series with up to 10 entries of ``metadata['title']``, ordered from
        most to least similar. The queried movie itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Several movies can share one title, in which case the lookup yields a
    # Series of row indices rather than a scalar; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all *other* movies with that movie.
    # The movie is excluded by index rather than by assuming it sorts first:
    # ties (identical overviews) or an empty overview (self-similarity 0)
    # would otherwise drop the wrong entry and recommend the movie itself.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[:10]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]


In [17]:
# Example: the ten overview-based recommendations for 'Toy Story'
get_recommendations('Toy Story')


1071       Rebel Without a Cause
1932                   Condorman
485                       Malice
448            For Love or Money
1032               The Sunchaser
1884              Child's Play 3
748           I Shot Andy Warhol
591              Window to Paris
314     The Shawshank Redemption
180                 Mute Witness
Name: title, dtype: object

**Collaborative Filtering**

In [None]:
!pip install surprise

In [19]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import SVD, KNNBasic, NMF, SlopeOne, CoClustering, BaselineOnly
from surprise import accuracy

In [20]:
import pandas as pd
# Load the user ratings straight into `df` (read_csv already returns a
# DataFrame). The name `metadata` is deliberately not reused here: it still
# holds the movie metadata that get_recommendations() reads.
df = pd.read_csv('/content/ratings_small.csv', low_memory=False)
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


In [21]:
# Keep exactly the user, item and rating columns, in the order that
# Dataset.load_from_df expects. Selecting the columns (rather than dropping
# 'timestamp') yields the same frame and lets this cell be re-run without a
# KeyError once 'timestamp' is already gone.
df = df[['userId', 'movieId', 'rating']]
df.head(3)

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0


In [22]:
# Summary statistics, transposed so that each column of df becomes one row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
userId,100004.0,347.01131,195.163838,1.0,182.0,367.0,520.0,671.0
movieId,100004.0,12548.664363,26369.198969,1.0,1028.0,2406.5,5418.0,163949.0
rating,100004.0,3.543608,1.058064,0.5,3.0,4.0,4.0,5.0


In [23]:
# The ratings in `df` range from 0.5 to 5.0 (see the df.describe() summary),
# so declare that scale; Surprise uses these bounds when clipping estimates.
reader = Reader(rating_scale=(0.5, 5))

# load_from_df reads the columns as (user id, item id, rating), in that order
data = Dataset.load_from_df(df, reader)

In [24]:
# Hold out 20% of the ratings for evaluation. A fixed random_state makes the
# split, and therefore the reported metrics, reproducible across runs.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [25]:
# Candidate algorithms to compare. SVD, NMF and CoClustering start from a
# random initialisation, so they are seeded to keep the comparison
# reproducible; the remaining algorithms take no random_state.
algorithms = [
    SVD(random_state=42),
    KNNBasic(),
    NMF(random_state=42),
    SlopeOne(),
    CoClustering(random_state=42),
    BaselineOnly(),
]

In [26]:
# Per-algorithm error metrics, stored in the same order as `algorithms`
rmse_vals, mae_vals = [], []

for algo in algorithms:
    # Train on the training split, then predict the held-out ratings
    algo.fit(train_set)
    preds = algo.test(test_set)

    # accuracy.rmse / accuracy.mae print the metric and also return it
    rmse_vals.append(accuracy.rmse(preds))
    mae_vals.append(accuracy.mae(preds))

RMSE: 0.8913
MAE:  0.6899
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9613
MAE:  0.7403
RMSE: 0.9447
MAE:  0.7254
RMSE: 0.9276
MAE:  0.7104
RMSE: 0.9620
MAE:  0.7448
Estimating biases using als...
RMSE: 0.8894
MAE:  0.6891


In [27]:
# Combined score per algorithm: the plain average of its RMSE and MAE
combined_acc = []
for rmse, mae in zip(rmse_vals, mae_vals):
    combined_acc.append((rmse + mae) / 2)

In [28]:
# The lowest combined error wins; ties resolve to the earliest list entry
best_algo_index = min(range(len(combined_acc)), key=combined_acc.__getitem__)
best_algo_name = type(algorithms[best_algo_index]).__name__

In [29]:
# Report the winning algorithm together with its individual and combined errors
print(
    f"Best Algorithm: {best_algo_name}",
    f"RMSE: {rmse_vals[best_algo_index]}",
    f"MAE: {mae_vals[best_algo_index]}",
    f"Combined Score: {combined_acc[best_algo_index]}",
    sep="\n",
)

Best Algorithm: BaselineOnly
RMSE: 0.8893908818330238
MAE: 0.6891299325086455
Combined Score: 0.7892604071708347


In [30]:
# Build a training set from every available rating (no hold-out split)
data_train = data.build_full_trainset()

# Fit a fresh BaselineOnly model on the full training set
bsl = BaselineOnly()
bsl.fit(data_train)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7bd0fdbd3280>

In [31]:
# Estimate the rating that user id 3 would give to movie id 872
bsl.predict(3, 872)

Prediction(uid=3, iid=872, r_ui=None, est=3.494574995057245, details={'was_impossible': False})

This simple model predicts a rating of about 3.49 for movie ID 872 by user ID 3.