In [1]:
%load_ext autotime

import numpy as np
import random
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 

from surprise import NormalPredictor, SVD, KNNBasic, NMF
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, KFold

from tqdm import tqdm

time: 680 ms (started: 2022-11-12 01:44:08 +05:30)


## 1. Introduction
Recommender systems goal is to push *relevant* items to a given user. Understanding and modelling the user's preferences is required to reach this goal. In this project you will learn how to model the user's preferences with the [Surprise library](http://surpriselib.com/) to build different recommender systems. The first one will be a pure *collaborative filtering* approach, and the second one will rely on item attributes in a *content-based* way.

## 2. Loading Data


In [2]:
RATINGS_DATA_FILE = './goodbooks-10k/ratings.csv'
BOOKS_DATA_FILE = './goodbooks-10k/books.csv'

time: 144 µs (started: 2022-11-12 01:44:09 +05:30)


In [3]:
# load the raw csv into a data_frame
df_ratings = pd.read_csv(RATINGS_DATA_FILE)


# movies dataframe
df_books = pd.read_csv(BOOKS_DATA_FILE)

time: 102 ms (started: 2022-11-12 01:44:09 +05:30)


In [4]:
df_ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


time: 5.77 ms (started: 2022-11-12 01:44:09 +05:30)


In [5]:
# check we have 25M users' ratings
df_ratings.user_id.count()

981756

time: 1.96 ms (started: 2022-11-12 01:44:09 +05:30)


In [6]:
def get_subset(df, number):
    """
    Get a subset of a large dataset for debug purpose.
    """
    rids = np.arange(df.shape[0])
    np.random.shuffle(rids)
    df_subset = df.iloc[rids[:number], :].copy()
    return df_subset

df_ratings_100k = get_subset(df_ratings, 100000)
df_books_1000 = get_subset(df_books, 1000)

time: 16.3 ms (started: 2022-11-12 01:44:09 +05:30)


In [7]:
# Surprise reader
reader = Reader(rating_scale=(0, 5))

# Finally load all ratings
ratings = Dataset.load_from_df(df_ratings_100k, reader)

time: 32.4 ms (started: 2022-11-12 01:44:09 +05:30)


In [8]:
df_ratings_100k.head(5)

Unnamed: 0,book_id,user_id,rating
758792,7644,25278,5
953484,9693,22902,4
58538,586,13000,4
752743,7582,28653,3
400689,4013,10724,1


time: 1.91 ms (started: 2022-11-12 01:44:09 +05:30)


## 3. Collaborative Filtering
We can test first any of the [Surprise algorithms](https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html).

In [9]:
# define a cross-validation iterator
kf = KFold(n_splits=3)

algos = [SVD(), NMF(), KNNBasic()]    

time: 193 µs (started: 2022-11-12 01:44:09 +05:30)


In [10]:
def get_rmse(algo, testset):
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
        
for trainset, testset in tqdm(kf.split(ratings)): 
    """
        get an evaluation with cross-validation for different algorithms
    """  
    for algo in algos:
        algo.fit(trainset)
        get_rmse(algo, testset)

0it [00:00, ?it/s]

RMSE: 0.9298


1it [00:03,  3.25s/it]

RMSE: 1.0952
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9910
RMSE: 0.9337
RMSE: 1.1031
Computing the msd similarity matrix...
Done computing similarity matrix.


2it [00:06,  3.39s/it]

RMSE: 0.9959
RMSE: 0.9280
RMSE: 1.1075
Computing the msd similarity matrix...


3it [00:10,  3.38s/it]

Done computing similarity matrix.
RMSE: 0.9915
time: 10.1 s (started: 2022-11-12 01:44:09 +05:30)





## 4. Content-based Filtering
Here we will rely directly on items attributes. First we have to describe a user profile with an attributes vector. Then we will use these vectors to generate recommendations.

In [11]:
# computing similarities requires too much ressources on the whole dataset, so we take the subset with 100 items
df_books_1000 = df_books_1000.reset_index(drop=True)
df_books_1000.head(5)
df_books_1000 = df_books_1000.dropna()
df_books_1000 = df_books_1000.reset_index(drop=True)

time: 2.86 ms (started: 2022-11-12 01:44:19 +05:30)


In [12]:
# we compute a TFIDF on the titles of the movies
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df_books_1000['original_title'])

time: 7.11 ms (started: 2022-11-12 01:44:19 +05:30)


In [13]:
# we get cosine similarities: this takes a lot of time on the real dataset
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

time: 1.31 ms (started: 2022-11-12 01:44:19 +05:30)


In [14]:
# we generate in 'results' the most similar movies for each movie: we put a pair (score, movie_id)
results = {}
for idx, row in df_books_1000.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1] 
    similar_items = [(cosine_similarities[idx][i], df_books_1000['book_id'].iloc[[i]].tolist()[0]) for i in similar_indices] 
    results[idx] = similar_items[1:]

time: 1.05 s (started: 2022-11-12 01:44:19 +05:30)


In [15]:
# transform a 'movieId' into its corresponding movie title
def item(id):  
    return df_books_1000.loc[df_books_1000['book_id'] == id]['original_title'].tolist()[0].split(' - ')[0] 

time: 191 µs (started: 2022-11-12 01:44:20 +05:30)


In [16]:
# transform a 'movieId' into the index id
def get_idx(id):
    return df_books_1000[df_books_1000['book_id'] == id].index.tolist()[0]

time: 179 µs (started: 2022-11-12 01:44:20 +05:30)


In [17]:
# Finally we put everything together here:
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")   
    print("-------")    
    recs = results[get_idx(item_id)][:num]   
    for rec in recs: 
        print("\tRecommended: " + item(rec[1]) + " (score:" +      str(rec[0]) + ")")

time: 275 µs (started: 2022-11-12 01:44:20 +05:30)


In [18]:
df_ratings_100k.head()

Unnamed: 0,book_id,user_id,rating
758792,7644,25278,5
953484,9693,22902,4
58538,586,13000,4
752743,7582,28653,3
400689,4013,10724,1


time: 1.99 ms (started: 2022-11-12 01:44:20 +05:30)


In [19]:
df_books_1000.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,5287,47956,47956,353853,25,451456718,9780451000000.0,Anne Bishop,1998.0,Daughter of the Blood,...,25432,28102,1656,958,1490,4128,8035,13491,https://images.gr-assets.com/books/1400569100m...,https://images.gr-assets.com/books/1400569100s...
1,4649,99296,99296,301388,16,006000150X,9780060000000.0,"Doreen Cronin, Harry Bliss",1996.0,Diary of a Worm,...,23320,23423,691,322,987,4006,6599,11509,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
2,4281,6758331,6758331,6954915,41,778303764,9780778000000.0,Gena Showalter,2010.0,"The Darkest Passion (Lords of the Underworld, #5)",...,32610,34290,1155,168,623,4157,10738,18604,https://images.gr-assets.com/books/1351278057m...,https://images.gr-assets.com/books/1351278057s...
3,3755,114230,114230,15956900,431,974607800,9780975000000.0,Herman Melville,1853.0,"Bartleby, the Scrivener: A Story of Wall Street",...,26265,31730,1915,710,2028,7207,11693,10092,https://images.gr-assets.com/books/1320404048m...,https://images.gr-assets.com/books/1320404048s...
4,6544,2897258,2897258,2923667,33,307381757,9780307000000.0,Michelle Moran,2008.0,The Heretic Queen,...,16155,18301,1442,144,428,2816,6916,7997,https://images.gr-assets.com/books/1422755729m...,https://images.gr-assets.com/books/1422755729s...


time: 4.99 ms (started: 2022-11-12 01:44:20 +05:30)


Suppose a user wants the 10 most 'similar' (from a CBF point of view) movies from the movie 'Alley Cats Strike':

In [20]:
random_item_id = random.choice(df_books_1000["book_id"].tolist())
recommend(item_id=random_item_id, num=10)

Recommending 10 products similar to The Mermaid Chair...
-------
	Recommended: The Dragonbone Chair (score:0.3071703642268616)
	Recommended: Sacré Bleu: A Comedy d'Art (score:0.0)
	Recommended: Sea Glass (score:0.0)
	Recommended: Ash (score:0.0)
	Recommended: Revelation Space (score:0.0)
	Recommended: On the Genealogy of Morals / Ecce Homo (score:0.0)
	Recommended: The Shell Seekers (score:0.0)
	Recommended: Ghost Wars: The Secret History of the CIA, Afghanistan, and Bin Laden, from the Soviet Invasion to September 10, 2001 (score:0.0)
	Recommended: In Cold Blood (score:0.0)
	Recommended: Sunset Express (score:0.0)
time: 2.75 ms (started: 2022-11-12 01:44:20 +05:30)
