In [1]:
!pip install -r requirements.txt



In [2]:
import gzip
from collections import Counter
from json import loads
from typing import Union

import numpy as np
import pandas as pd
import torch
from IPython.display import display  # removes unnecessary error reports in PyCharm
from scipy import sparse
from tqdm import tqdm


tqdm.pandas()

In [3]:
def parse_json(filename: str, read_max: int = None) -> pd.DataFrame:
    """
    Reads the file line by line, parsing each line as json.
    """
    file = gzip.open(filename, "r")
    data = []
    for index, line in enumerate(tqdm(file)):
        if index == read_max:
            break
        data.append(loads(line))
    print(f"Read {len(data)} rows.")
    return pd.DataFrame.from_dict(data)

# Dataset

The dataset consists of three separate files:

- The books; we will only need this data to make sense of the recommendations. Using this data to make recommendations
is (at least for now) not required for the project. It would require analysis and comparisons of the books, which is not
a part of the base algorithm. However, being able to show which books are being recommended, rather than just showing an
ID, is quite valuable in itself.

- The reviews; this file contains all the reviews and associated ratings that the users have given. This is essentially
the explicit feedback that we can use to generate recommendations.

- The interactions; this file contains all the interactions between users and books. It contains explicit and implicit
feedback, both of which we can use to generate recommendations. We will probably only use the implicit data if we do use
the data in this file.

We have the option of using either explicit (i.e. ratings) or implicit (i.e. interactions) data. Because the paper
discusses the prediction of ratings, this is also what we will be doing.

The following cells load the data from the files and convert them into the appropriate types. This includes parsing
datetime strings, converting integers to numpy types, etc.

## Important variables/settings

- *n* determines the maximum number of rows read from any of the files.
- *datafile*, which can be either `interactions` or `reviews`, determines the file from which the data matrix will be
read.
- *k* determines the number of predictions the recommender will make, and on how many predictions it will be evaluated,
e.g. by using `Recall@k`.
- *epochs* determines the number of epochs we will use to train the recommender.

In [4]:
data_path = "data/"
books = "goodreads_books_comics_graphic.json.gz"
interactions = "goodreads_interactions_comics_graphic.json.gz"
reviews = "goodreads_reviews_comics_graphic.json.gz"
datasets = {books: None, interactions: None, reviews: None}

n = 100000
datafile = reviews
k = 10
epochs = 10

for filename in datasets.keys():
    print(filename)
    datasets[filename] = parse_json(data_path + filename, n)
    display(datasets[filename].head(5))

books_df = datasets[books][["book_id", "title"]].copy()
interactions_df = datasets[datafile][["user_id", "book_id", "rating", "date_updated"]].copy()

goodreads_books_comics_graphic.json.gz


89411it [00:05, 15358.35it/s]


Read 89411 rows.


Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,,1,[],US,,"[{'count': '228', 'name': 'to-read'}, {'count'...",B00NLXQ534,True,4.12,,...,,,,https://www.goodreads.com/book/show/25742454-t...,https://s.gr-assets.com/assets/nophoto/book/11...,25742454,1,42749946,The Switchblade Mamma,The Switchblade Mamma
1,2205073346.0,2,[],US,fre,"[{'count': '2', 'name': 'bd'}, {'count': '2', ...",,False,3.94,,...,1.0,,2016.0,https://www.goodreads.com/book/show/30128855-c...,https://images.gr-assets.com/books/1462644346m...,30128855,16,50558228,Cruelle,Cruelle
2,,5,"[246830, 362583, 362581, 623032]",US,eng,"[{'count': '493', 'name': 'to-read'}, {'count'...",,False,4.28,,...,,,2012.0,https://www.goodreads.com/book/show/13571772-c...,https://images.gr-assets.com/books/1333287305m...,13571772,51,102217,Captain America: Winter Soldier (The Ultimate ...,Captain America: Winter Soldier (The Ultimate ...
3,,1,[],US,eng,"[{'count': '222', 'name': 'to-read'}, {'count'...",B06XKGGSB7,True,4.05,B06XKGGSB7,...,,,,https://www.goodreads.com/book/show/35452242-b...,https://s.gr-assets.com/assets/nophoto/book/11...,35452242,6,54276229,Bounty Hunter 4/3: My Life in Combat from Mari...,Bounty Hunter 4/3: My Life in Combat from Mari...
4,930289765.0,6,"[266759, 1096220]",US,en-US,"[{'count': '20', 'name': 'to-read'}, {'count':...",,False,4.06,,...,11.0,,1997.0,https://www.goodreads.com/book/show/707611.Sup...,https://images.gr-assets.com/books/1307838888m...,707611,51,693886,"Superman Archives, Vol. 2","Superman Archives, Vol. 2"


goodreads_interactions_comics_graphic.json.gz


100000it [00:00, 152962.46it/s]


Read 100000 rows.


Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
0,8842281e1d1347389f2ab93d60773d4d,836610,6b4db26aafeaf0da77c7de6214331e1e,False,0,,Mon Aug 21 12:11:00 -0700 2017,Mon Aug 21 12:11:00 -0700 2017,,
1,8842281e1d1347389f2ab93d60773d4d,7648967,99b27059f711c37de8f90ee8e4dc0d1b,False,0,,Fri Feb 24 08:59:44 -0800 2017,Fri Feb 24 08:59:44 -0800 2017,,
2,8842281e1d1347389f2ab93d60773d4d,15704307,cb944d94854df5afd22210bb0aa0c903,False,0,,Wed May 20 21:28:56 -0700 2015,Wed May 20 21:28:57 -0700 2015,,
3,8842281e1d1347389f2ab93d60773d4d,6902644,2711bac2a8cc600dae1590a6ca0edb34,False,0,,Sun Jun 01 17:25:23 -0700 2014,Sun Jun 01 17:25:23 -0700 2014,,
4,8842281e1d1347389f2ab93d60773d4d,9844623,b72979076d1cded25dded922195e5b1c,False,0,,Sun Sep 02 08:45:08 -0700 2012,Sun Sep 02 08:45:08 -0700 2012,,


goodreads_reviews_comics_graphic.json.gz


100000it [00:01, 84043.66it/s]


Read 100000 rows.


Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,dc3763cdb9b2cae805882878eebb6a32,18471619,66b2ba840f9bd36d6d27f46136fe4772,3,Sherlock Holmes and the Vampires of London \n ...,Thu Dec 05 10:44:25 -0800 2013,Thu Dec 05 10:45:15 -0800 2013,Tue Nov 05 00:00:00 -0800 2013,,0,0
1,bafc2d50014200cda7cb2b6acd60cd73,6315584,72f1229aba5a88f9e72f0dcdc007dd22,4,"I've never really liked Spider-Man. I am, howe...",Wed Aug 10 06:06:48 -0700 2016,Fri Aug 12 08:49:54 -0700 2016,Fri Aug 12 08:49:54 -0700 2016,Wed Aug 10 00:00:00 -0700 2016,0,0
2,bafc2d50014200cda7cb2b6acd60cd73,29847729,a75309355f8662caaa5e2c92ab693d3f,4,"A very quick introduction, this is coming out ...",Thu Apr 21 07:44:00 -0700 2016,Thu Apr 21 07:59:28 -0700 2016,Thu Apr 21 07:59:28 -0700 2016,Thu Apr 21 00:00:00 -0700 2016,0,0
3,bafc2d50014200cda7cb2b6acd60cd73,18454118,c3cc5a3e1d6b6c9cf1c044f306c8e752,5,I've been waiting so long for this. I first st...,Mon Mar 03 17:45:56 -0800 2014,Mon Mar 03 17:54:11 -0800 2014,Sat Mar 01 00:00:00 -0800 2014,Sat Mar 01 00:00:00 -0800 2014,1,0
4,bafc2d50014200cda7cb2b6acd60cd73,2239435,cc444be37ab0a42bfb4dd818cb5edd10,4,The only thing more entertaining than this boo...,Wed Apr 03 12:37:48 -0700 2013,Wed Apr 03 13:03:36 -0700 2013,Wed Apr 03 13:03:36 -0700 2013,,0,0


In [5]:
format_str = "%a %b %d %H:%M:%S %z %Y"
interactions_df["date_updated"] = pd.to_datetime(interactions_df["date_updated"], format=format_str)

books_df["book_id"] = books_df["book_id"].astype("int64")
interactions_df["book_id"] = interactions_df["book_id"].astype("int64")

interactions_df = interactions_df.sort_values(by=["user_id", "date_updated"], ascending=[True, True])

In [6]:
def apply_consecutive_mapping(dataframe: pd.DataFrame, column: str, new_column: str, *additional: pd.DataFrame) -> None:
    """
    Generates a consecutive ID column for the values of an existing column. Also adds this column to additional data
    frames with the exact same mapping of old ID to new (consecutive) ID.
    """
    ids = {}

    def map_to_consecutive_ids(uuid: Union[int, np.int64]) -> int:
        """
        To be used with `pd.Dataframe.apply()` or `pd.Dataframe.progress_apply()`; returns a unique ID per distinct value.
        (Using the function attribute `map_to_consecutive_ids.ids` to avoid namespace pollution.
        """
        if uuid not in ids:
            ids[uuid] = len(ids)
        return ids[uuid]

    dataframe[new_column] = dataframe[column].progress_apply(map_to_consecutive_ids)
    for frame in additional:
        frame[new_column] = frame[column].progress_apply(lambda old_id: ids.get(old_id, -1))


apply_consecutive_mapping(interactions_df, "user_id", "user_id_int")
apply_consecutive_mapping(books_df, "book_id", "book_id_int", interactions_df)

display(books_df.head(10))
display(interactions_df.head(10))

100%|██████████| 100000/100000 [00:00<00:00, 1109874.36it/s]
100%|██████████| 89411/89411 [00:00<00:00, 1067636.46it/s]
100%|██████████| 100000/100000 [00:00<00:00, 978391.11it/s]


Unnamed: 0,book_id,title,book_id_int
0,25742454,The Switchblade Mamma,0
1,30128855,Cruelle,1
2,13571772,Captain America: Winter Soldier (The Ultimate ...,2
3,35452242,Bounty Hunter 4/3: My Life in Combat from Mari...,3
4,707611,"Superman Archives, Vol. 2",4
5,2250580,"A.I. Revolution, Vol. 1",5
6,27036536,"War Stories, Volume 3",6
7,27036537,"Crossed, Volume 15",7
8,27036538,"Crossed + One Hundred, Volume 2 (Crossed +100 #2)",8
9,27036539,"War Stories, Volume 4",9


Unnamed: 0,user_id,book_id,rating,date_updated,user_id_int,book_id_int
54004,000192962b87d560f00b06fdcbd71681,30025791,5,2017-01-28 09:56:08-08:00,0,38788
60060,0019be35f5c9e0ea5cb8263aba085de4,533016,5,2014-01-23 20:17:21-08:00,1,52841
50111,001eb567b3331c3ef3291a801d31be4a,30515,5,2016-06-28 01:30:08-07:00,2,31925
15201,0021e047a599f9827d75628db22097b6,11020991,3,2014-03-04 08:08:19-08:00,3,24192
32571,00254cd48d3d8a99ca9f0ed44fa69d5f,29800,4,2013-04-01 16:54:17-07:00,4,76189
32570,00254cd48d3d8a99ca9f0ed44fa69d5f,29801,4,2013-04-01 16:55:34-07:00,4,76190
75011,002a023d3de233b4bd3ec4fc3e9c581a,916755,4,2016-04-01 00:17:23-07:00,5,18645
75025,002a023d3de233b4bd3ec4fc3e9c581a,12550154,4,2016-04-01 00:44:45-07:00,5,19450
75021,002a023d3de233b4bd3ec4fc3e9c581a,8044557,3,2016-04-01 00:58:56-07:00,5,945
75024,002a023d3de233b4bd3ec4fc3e9c581a,10974311,0,2016-04-01 01:01:29-07:00,5,85254


# Preprocessing

We apply the following preprocessing steps:

- Reconsumption item removal, although there aren't many (or even none at all) of these in the dataset.

- Infrequent item removal; we remove any items that have less than a certain number (5) of interactions/ratings.

- Infrequent user removal; we remove users with less that the same certain number (5) of interactions/ratings.

In [7]:
def preprocess(df: pd.DataFrame, min_support: int = 5) -> pd.DataFrame:
    """
    Removes reconsumption items, items that have less than `min_support` interactions, and users that have less than
    `min_support` interactions. In some cases, removing an infrequent item may turn a frequent user into an infrequent
    one, and vice versa. In these cases, we don't remove the now infrequent user/item, we only consider the original
    frequency. As such, the preprocessed dataset may contain some users and items that don't reach the minimum support
    limit.
    """
    print(df.shape[0], "initial rows")
    # Drop reconsumption items
    df = df.drop_duplicates(subset=["user_id", "item_id"])
    print(df.shape[0], "rows after removing reconsumption items")
    # Compute user and item counts
    g1 = df.groupby("item_id", as_index=False)["user_id"].size()
    g1 = g1.rename({"size": "users_per_item"}, axis="columns")
    g2 = df.groupby("user_id", as_index=False)["item_id"].size()
    g2 = g2.rename({"size": "items_per_user"}, axis="columns")
    df = pd.merge(df, g1, how="left", on=["item_id"])
    df = pd.merge(df, g2, how="left", on=["user_id"])
    # Drop items and users with less than `min_support` interactions
    df = df[(df["users_per_item"] >= min_support) & (df["items_per_user"] >= min_support)]
    print(df.shape[0], "rows after removing infrequent items and users")
    df.drop(columns=["users_per_item", "items_per_user"], inplace=True)
    return df


processed_df = interactions_df[["user_id_int", "book_id_int", "date_updated", "rating"]].copy()
processed_df = processed_df.rename(
    columns={"user_id_int": "user_id", "book_id_int": "item_id", "date_updated": "datetime"})
display(processed_df.head(10))
print(f"Number of unique users:", processed_df["user_id"].nunique())
print(f"Number of unique items:", processed_df["item_id"].nunique())
processed_df = preprocess(processed_df)
print(f"Number of unique users:", processed_df["user_id"].nunique())
print(f"Number of unique items:", processed_df["item_id"].nunique())
display(processed_df.head(10))

Unnamed: 0,user_id,item_id,datetime,rating
54004,0,38788,2017-01-28 09:56:08-08:00,5
60060,1,52841,2014-01-23 20:17:21-08:00,5
50111,2,31925,2016-06-28 01:30:08-07:00,5
15201,3,24192,2014-03-04 08:08:19-08:00,3
32571,4,76189,2013-04-01 16:54:17-07:00,4
32570,4,76190,2013-04-01 16:55:34-07:00,4
75011,5,18645,2016-04-01 00:17:23-07:00,4
75025,5,19450,2016-04-01 00:44:45-07:00,4
75021,5,945,2016-04-01 00:58:56-07:00,3
75024,5,85254,2016-04-01 01:01:29-07:00,0


Number of unique users: 9016
Number of unique items: 35714
100000 initial rows
100000 rows after removing reconsumption items
46657 rows after removing infrequent items and users
Number of unique users: 2657
Number of unique items: 3682


Unnamed: 0,user_id,item_id,datetime,rating
8,5,945,2016-04-01 00:58:56-07:00,3
9,5,85254,2016-04-01 01:01:29-07:00,0
10,5,63080,2016-04-01 01:02:25-07:00,3
13,5,44739,2016-04-01 01:14:37-07:00,4
15,5,36018,2016-04-21 00:32:09-07:00,4
17,5,17524,2016-04-21 18:43:32-07:00,3
18,5,2749,2016-08-11 09:20:40-07:00,3
21,5,31777,2016-08-11 09:35:30-07:00,5
24,5,973,2016-09-30 21:08:43-07:00,3
25,5,54896,2016-09-30 21:08:44-07:00,4


In [8]:
def split(items: list[int], percentage_train: float) -> pd.Series:
    nr_train_items = int(len(items) * percentage_train)
    return pd.Series((items[: nr_train_items], items[nr_train_items:]))


sessions_df = processed_df.groupby(by="user_id", as_index=False)[["item_id", "datetime", "rating"]].agg(list)
display(sessions_df.head(5))

percentage_train = 0.8
sessions_df[["item_id_history", "item_id_future"]] = sessions_df["item_id"].progress_apply(split,
                                                                                           args=(percentage_train,))
sessions_df[["rating_history", "rating_future"]] = sessions_df["rating"].progress_apply(split, args=(percentage_train,))
display(sessions_df.head(5))

Unnamed: 0,user_id,item_id,datetime,rating
0,5,"[945, 85254, 63080, 44739, 36018, 17524, 2749,...","[2016-04-01 00:58:56-07:00, 2016-04-01 01:01:2...","[3, 0, 3, 4, 4, 3, 3, 5, 3, 4, 4, 4, 4, 3, 4, ..."
1,6,"[58655, 21663, 84762, 5703, 47916, 70258, 43524]","[2017-05-24 14:07:48-07:00, 2017-07-10 10:55:2...","[1, 3, 1, 3, 3, 4, 3]"
2,10,"[44739, 49712, 34074, 64190, 10700, 71828, 430...","[2014-06-04 11:35:45-07:00, 2014-11-11 20:50:2...","[1, 3, 4, 1, 3, 3, 4, 5, 3, 4, 4, 4, 5, 2, 5, ..."
3,13,"[10740, 73775, 46111, 72982, 88824, 42840, 827...","[2008-04-21 10:17:04-07:00, 2009-08-16 20:40:0...","[4, 3, 5, 3, 4, 3, 4, 4, 3, 4, 4, 5, 5, 4, 5]"
4,15,"[87706, 3861, 53792, 30798, 40410, 67727]","[2014-10-17 16:50:25-07:00, 2015-04-18 15:30:2...","[5, 4, 5, 5, 4, 3]"


100%|██████████| 2657/2657 [00:00<00:00, 8576.89it/s] 
100%|██████████| 2657/2657 [00:00<00:00, 8363.13it/s] 


Unnamed: 0,user_id,item_id,datetime,rating,item_id_history,item_id_future,rating_history,rating_future
0,5,"[945, 85254, 63080, 44739, 36018, 17524, 2749,...","[2016-04-01 00:58:56-07:00, 2016-04-01 01:01:2...","[3, 0, 3, 4, 4, 3, 3, 5, 3, 4, 4, 4, 4, 3, 4, ...","[945, 85254, 63080, 44739, 36018, 17524, 2749,...","[27227, 71060, 22211, 62482]","[3, 0, 3, 4, 4, 3, 3, 5, 3, 4, 4, 4, 4]","[3, 4, 3, 5]"
1,6,"[58655, 21663, 84762, 5703, 47916, 70258, 43524]","[2017-05-24 14:07:48-07:00, 2017-07-10 10:55:2...","[1, 3, 1, 3, 3, 4, 3]","[58655, 21663, 84762, 5703, 47916]","[70258, 43524]","[1, 3, 1, 3, 3]","[4, 3]"
2,10,"[44739, 49712, 34074, 64190, 10700, 71828, 430...","[2014-06-04 11:35:45-07:00, 2014-11-11 20:50:2...","[1, 3, 4, 1, 3, 3, 4, 5, 3, 4, 4, 4, 5, 2, 5, ...","[44739, 49712, 34074, 64190, 10700, 71828, 430...","[48597, 29015, 19669, 2624, 84659, 12857, 48767]","[1, 3, 4, 1, 3, 3, 4, 5, 3, 4, 4, 4, 5, 2, 5, ...","[3, 5, 5, 5, 2, 4, 5]"
3,13,"[10740, 73775, 46111, 72982, 88824, 42840, 827...","[2008-04-21 10:17:04-07:00, 2009-08-16 20:40:0...","[4, 3, 5, 3, 4, 3, 4, 4, 3, 4, 4, 5, 5, 4, 5]","[10740, 73775, 46111, 72982, 88824, 42840, 827...","[16907, 84322, 81142]","[4, 3, 5, 3, 4, 3, 4, 4, 3, 4, 4, 5]","[5, 4, 5]"
4,15,"[87706, 3861, 53792, 30798, 40410, 67727]","[2014-10-17 16:50:25-07:00, 2015-04-18 15:30:2...","[5, 4, 5, 5, 4, 3]","[87706, 3861, 53792, 30798]","[40410, 67727]","[5, 4, 5, 5]","[4, 3]"


In [9]:
def create_sparse_matrix(dataframe: pd.DataFrame, item_id_column: str, value_column: str = None,
                         shape: tuple[int, int] = None) -> sparse.csr_matrix:
    """
    Creates a sparse matrix from the data in `dataframe`.
    """
    # Flatten the dataframe
    user_ids = []
    item_ids = []
    values = []
    for index, row in dataframe.iterrows():
        user_ids.extend([row["user_id"]] * len(row[item_id_column]))
        item_ids.extend(row[item_id_column])
        if value_column is not None:
            values.extend(row[value_column])
    if value_column is None:
        values = np.ones(len(user_ids))
    # Create the CSR matrix
    return sparse.csr_matrix((values, (user_ids, item_ids)), shape=shape, dtype=np.int32)


shape = (processed_df["user_id"].max() + 1, processed_df["item_id"].max() + 1)
# train = create_sparse_matrix(sessions_df, "item_id_history", "rating_history", shape)
# true = create_sparse_matrix(sessions_df, "item_id_future", "rating_future", shape)
train = create_sparse_matrix(sessions_df, "item_id_history", None, shape)
true = create_sparse_matrix(sessions_df, "item_id_future", None, shape)

# Popularity recommender

In [10]:
class PopularityRecommender:
    def __init__(self, k: int = 10) -> None:
        self.k = k
        self.scores = []

    def fit(self, data: sparse.csr_matrix) -> None:
        items = list(data.nonzero()[1])
        scores = Counter(items).most_common(self.k)
        self.scores = [(item, score / scores[0][1]) for item, score in scores]

    def predict(self, data: sparse.csr_matrix) -> sparse.csr_matrix:
        items, values = zip(*self.scores)
        users = set(data.nonzero()[0])

        matrix_data = ([], ([], []))
        for user in users:
            matrix_data[0].extend(values)
            matrix_data[1][0].extend([user] * self.k)
            matrix_data[1][1].extend(items)
        return sparse.csr_matrix(matrix_data, shape=data.shape)

# recommender = PopularityRecommender(k)
# recommender.fit(train)
# predicted = recommender.predict(train)
# print(predicted)

# Evaluation

The evaluation metrics we will be using are the following:

- Recall @ 10: the percentage of users where the top-10 recommendations are relevant.

- NDCG @ 10: similar to recall but the sum of the hits is weighted by the place in the top 10.

- Qualitative results, i.e. examples of the recommendations.

In [11]:
def sparse_invert_nonzero(a: sparse.csr_matrix) -> sparse.csr_matrix:
    inverse = a.copy()
    inverse.data = 1 / inverse.data
    return inverse


def sparse_divide_nonzero(a: sparse.csr_matrix, b: sparse.csr_matrix) -> sparse.csr_matrix:
    return a.multiply(sparse_invert_nonzero(b))


def compute_recall(true: sparse.csr_matrix, predicted: sparse.csr_matrix) -> float:
    scores = sparse.lil_matrix(predicted.shape)
    scores[predicted.multiply(true).astype(bool)] = 1
    scores = sparse_divide_nonzero(scores.tocsr(), sparse.csr_matrix(true.sum(axis=1))).sum(axis=1)
    return scores.mean()

# recall = compute_recall(true, predicted)
# print(f"Recall @ {k}: {recall:.4f}")

# RBM-based recommender

In [12]:
class RBMRecommender:
    """
    A recommender system for ratings based on Restricted Boltzmann Machines (RBMs).
    """

    def __init__(self, nr_items: int, nr_ratings: int, nr_hidden: int, learning_rate: float, k: int = 10) -> None:
        # self.rng = np.random.default_rng()
        # Training and evaluation parameters
        self.nr_items = nr_items
        self.nr_ratings = nr_ratings
        self.nr_hidden = nr_hidden
        self.learning_rate = learning_rate
        self.k = k
        # RBM weights and biases
        self.weights = torch.randn((nr_items, nr_hidden))
        self.bias_items = torch.randn(nr_items)
        self.bias_features = torch.randn(nr_hidden)

    def probability_hidden(self, visible: torch.Tensor) -> torch.Tensor:
        """ Computes `p(h_j = 1 | V)`. """
        return torch.sigmoid(self.bias_features + torch.matmul(visible, self.weights))

    def probability_visible(self, hidden: torch.Tensor) -> torch.Tensor:
        """ Computes `p(v^k_i = 1 | h)`. """
        return torch.sigmoid(self.bias_items + torch.matmul(hidden, self.weights.transpose(0, 1)))

    def sample_hidden(self, visible: torch.Tensor) -> torch.Tensor:
        """ Randomly samples from the conditional Bernoulli distribution defined by probability_hidden(). """
        return torch.bernoulli(self.probability_hidden(visible))

    def sample_visible(self, hidden: torch.Tensor) -> torch.Tensor:
        """ Randomly samples from the conditional Bernoulli distribution defined by probability_visible(). """
        return torch.bernoulli(self.probability_visible(hidden))

    def fit(self, data: sparse.csr_matrix, batch_size: int = 10, sampling_iterations: int = 1) -> None:
        """
        Trains the underlying RBM with the given input data. For now, we will ignore the value of the ratings, and
        instead look at the data as a binary interaction matrix (i.e. a value => 1, no value => 0).

        :param data: A |users|x|items| sparse matrix, where each entry is the rating given by that user to that item.
        :param batch_size: The number of users to use per mini-batch.
        :param sampling_iterations: The number of iterations of alternating Gibbs sampling per iteration of CD.
        """
        for batch_start in tqdm(range(0, data.shape[0], batch_size)):
            batch = torch.Tensor(data[batch_start : batch_start + batch_size, :].toarray())
            batch_t = batch.transpose(0, 1)

            visible = batch
            for _ in range(sampling_iterations):
                hidden = self.sample_hidden(visible)
                visible = self.sample_visible(hidden)
                # We don't want to update any missing values, i.e. values that are 0
                visible[batch == 0] = 0

            data_sample = torch.matmul(batch_t, self.probability_hidden(batch))
            reconstruction_sample = torch.matmul(visible.transpose(0, 1), self.probability_hidden(visible))

            self.weights += self.learning_rate * (data_sample - reconstruction_sample)

    def predict(self, data: sparse.csr_matrix) -> sparse.csr_matrix:
        """ Returns predictions for each user in the data matrix. """
        matrix_data = ([], ([], []))  # (scores, (user IDs, item IDs))

        for user_id, history in enumerate(data.iterrow()):
            hidden = self.sample_hidden(torch.Tensor(history.toarray()))
            visible = self.probability_visible(hidden)
            item_ids = np.argpartition(visible, self.k)[: self.k]
            scores = [visible[item_id] for item_id in item_ids]

            matrix_data[0].extend(scores)
            matrix_data[1][0].extend([user_id] * self.k)
            matrix_data[1][1].extend(item_ids)
        return sparse.csr_matrix(matrix_data)

In [13]:
recommender = RBMRecommender(train.shape[1], 0, 100, 0.01)
for _ in range(epochs):
    recommender.fit(train)

100%|██████████| 902/902 [00:33<00:00, 26.65it/s]
100%|██████████| 902/902 [00:34<00:00, 26.49it/s]
100%|██████████| 902/902 [00:33<00:00, 27.28it/s]
100%|██████████| 902/902 [00:32<00:00, 27.48it/s]
100%|██████████| 902/902 [00:34<00:00, 26.02it/s]
100%|██████████| 902/902 [00:34<00:00, 26.28it/s]
100%|██████████| 902/902 [00:32<00:00, 28.12it/s]
100%|██████████| 902/902 [00:33<00:00, 27.31it/s]
100%|██████████| 902/902 [00:32<00:00, 27.59it/s]
100%|██████████| 902/902 [00:32<00:00, 27.37it/s]


In [14]:
for user_id in range(train.shape[0]):
    if user_id == 10:
        break
    elif train.getrow(user_id).getnnz() == 0:
        continue

    prediction = recommender.predict(torch.Tensor(train.getrow(user_id).toarray()))
    top_items = np.argpartition(prediction, k)[:k]

    for item_id in true.indices[true.indptr[user_id]:true.indptr[user_id + 1]]:
        print(f"{item_id:7} {prediction[0][item_id]}")

  22211 0.001902350108139217
  27227 0.5362015962600708
  62482 0.0015613083960488439
  71060 0.9978786706924438
  43524 0.9998751878738403
  70258 0.9669559597969055
