In [265]:
import pandas as pd
import numpy as np
from SGD import SGDMF
import pickle

Documentation

Content based recommendation fundamentals: https://www.linkedin.com/pulse/content-based-recommender-engine-under-hood-venkat-raman 

Content base recommendation system: https://towardsdatascience.com/building-a-content-based-book-recommendation-engine-9fd4d57a4da

Sparse matrix recommendation system: https://towardsdatascience.com/building-my-own-2021-book-recommendation-engine-903ea10d5021


In [266]:
def get_datasets():
    goodbooks_dataset_path = "/home/joacopolo/Documents/software/python/draft-projects/books-recommender/goodbooks-10k"
    books = pd.read_csv(goodbooks_dataset_path+"/books.csv")
    book_tags = pd.read_csv(goodbooks_dataset_path+"/book_tags.csv")
    ratings = pd.read_csv(goodbooks_dataset_path+"/ratings.csv")
    tags = pd.read_csv(goodbooks_dataset_path+"/tags.csv")
    to_read = pd.read_csv(goodbooks_dataset_path+"/to_read.csv")
    return (books, book_tags, ratings, tags, to_read)

books, book_tags, ratings, tags, to_read = get_datasets()

n_users = ratings.user_id.unique().shape[0]
n_books = ratings.book_id.unique().shape[0]
ratings_matrix = np.zeros((n_users, n_books))


In [267]:
for row in ratings.itertuples():
    # row[1] - 1 is the user id readjusted to start by index 0
    # row[2] - 1 is the item id readjusted to start by index 0
    ratings_matrix[row[1]-1, row[2]-1] = row[3]

sample_row, sample_col = ratings_matrix.nonzero()
#takes sample row and col from the same index and its guaranteed to be non zero
n = 1
row = sample_row[n] 
col = sample_col[n]
ratings_matrix[row, col]

In [268]:
def new_sgd(ratings):  
    sgd = SGDMF(ratings)
    return sgd

In [269]:
def dump_sgd_to_pickle(sgd):
    with open('sgd_predictor.pkl', 'wb') as sgd_predictor_file:
        pickle.dump(sgd, sgd_predictor_file)

In [270]:
def load_sgd_from_pickle():    
    file = open("sgd_predictor.pkl",'rb')
    sgd =  pickle.load(file)
    return sgd

In [271]:
sgd = new_sgd(ratings_matrix)
sgd.train()

  (error * self.item_vecs[i, :] - \
  (error * self.user_vecs[u, :] - \
  (error * self.user_vecs[u, :] - \


Finished epoch 1 of 10
Finished epoch 2 of 10
Finished epoch 3 of 10
Finished epoch 4 of 10
Finished epoch 5 of 10
Finished epoch 6 of 10
Finished epoch 7 of 10
Finished epoch 8 of 10
Finished epoch 9 of 10
Finished epoch 10 of 10


In [278]:
sgd.user_vecs.shape #returns (53424, 40)
sgd.item_vecs.T.shape #returns (40, 10000)
ratings_matrix.shape #returns (53424, 10000)
#so user_vecs dot (items_vecs transposed) has the shape of the ratings matrix

(53424, 10000)

In [279]:
sgd.item_vecs[4]

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan])

# list(books[books["book_id"].isin([1,2])]["title"]) returns the name of the books from ids
# ratings[ratings["user_id"] == 1] returns the books id and ratings that a user has rated

In [259]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class BookDataset():
    def __init__(self, 
                 books,
                 ratings,
                 ratings_matrix,
                ):
        self.ratings = ratings
        self.books = books
        self.ratings_matrix = ratings_matrix
        self.ratings_dict = {}
        self.books["title_lowercase"] = books["title"].str.lower()

    def get_book_by_title(self, title):
        match = self.books[self.books["title_lowercase"].str.contains(title.lower())]
        return match.head(1)

    def get_book_by_id(self, book_id):
        return self.books[self.books["book_id"]==book_id]

    
    def rate_book(self, book_id, rating):
        self.ratings_dict[book_id] = rating
        return self.ratings_dict

    def see_ratings_dict(self, ratings_dict = None):
        if ratings_dict is None:
            ratings_dict = self.ratings_dict
        ratings_dict_title = {}
        for book_id, rating in ratings_dict.items():
            title = self.get_book_by_id(book_id)["title"]
            ratings_dict_title[str(title)] = rating
        return ratings_dict_title

    def generate_ratings_row(self):
        ratings_row = np.zeros((n_books))
        for book_id, rating in self.ratings_dict.items():
            ratings_row[book_id-1] = rating
        return ratings_row

    def get_most_similar_user_id(self,my_row):
        similarities=cosine_similarity([my_row],ratings_matrix)[0]
        most_similar_user = np.argmax(similarities)
        similarity = similarities[most_similar_user]
        return most_similar_user, similarity

    def ratings_dict_from_row(self, rating_row):
        ratings_dict = {}
        for i, rating in enumerate(rating_row):
            if rating != 0:
                ratings_dict[i+1] = rating
        return ratings_dict


In [260]:
books_dataset = BookDataset(books = books, ratings = ratings, ratings_matrix = ratings_matrix)

In [261]:
book_titles = [["siddhartha",5], ["demian",5], ["dune",5], ["brief history of time",5]]
for book_title, rating in book_titles:
    book_to_rate_id = int(books_dataset.get_book_by_title(book_title).book_id)
    books_dataset.rate_book(book_to_rate_id,rating)

In [262]:
books_dataset.see_ratings_dict()

{'179    Siddhartha\nName: title, dtype: object': 5,
 '2096    Demian. Die Geschichte von Emil Sinclairs Jugend\nName: title, dtype: object': 5,
 '125    Dune (Dune Chronicles #1)\nName: title, dtype: object': 5,
 '438    A Brief History of Time\nName: title, dtype: object': 5}

In [263]:
ratings_row = books_dataset.generate_ratings_row()

In [256]:
most_similar_user_id, similarity = books_dataset.get_most_similar_user_id(ratings_row) 

[0. 0. 0. ... 0. 0. 0.]


In [257]:
similar_ratings_dict = books_dataset.ratings_dict_from_row(ratings_matrix[most_similar_user_id])

In [258]:
print(books_dataset.see_ratings_dict(similar_ratings_dict))

{'3    To Kill a Mockingbird\nName: title, dtype: object': 3.0, '6    The Hobbit\nName: title, dtype: object': 5.0, '13    Animal Farm\nName: title, dtype: object': 5.0, '28    Romeo and Juliet\nName: title, dtype: object': 4.0, '34    The Alchemist\nName: title, dtype: object': 4.0, '38    A Game of Thrones (A Song of Ice and Fire, #1)\nName: title, dtype: object': 5.0, '49    Where the Sidewalk Ends\nName: title, dtype: object': 5.0, '57    The Adventures of Huckleberry Finn\nName: title, dtype: object': 4.0, '64    Slaughterhouse-Five\nName: title, dtype: object': 4.0, '93    One Hundred Years of Solitude\nName: title, dtype: object': 5.0, '125    Dune (Dune Chronicles #1)\nName: title, dtype: object': 5.0, '166    American Gods (American Gods, #1)\nName: title, dtype: object': 4.0, '179    Siddhartha\nName: title, dtype: object': 5.0, '188    The Lord of the Rings (The Lord of the Rings, ...\nName: title, dtype: object': 5.0, '190    Watchmen\nName: title, dtype: object': 5.0, '193