In [1]:
!pip install -r requirements.txt



In [2]:
import gzip
from collections import Counter

import numpy as np
import pandas as pd
from IPython.display import display  # removes unnecessary error reports in PyCharm
from scipy import sparse
from tqdm import tqdm


tqdm.pandas()

In [3]:
def parse_json(filename: str, read_max: int = None) -> pd.DataFrame:
    """
    Reads the file line by line, parsing each line as json.
    """
    file = gzip.open(filename, "r")
    data = []
    for index, line in enumerate(tqdm(file)):
        if index == read_max:
            break
        line = line.decode()
        line = line.replace("true", "True")
        line = line.replace("false", "False")
        data.append(eval(line))
    print(f"Read {len(data)} rows.")
    return pd.DataFrame.from_dict(data)

In [4]:
data_path = "data/"
books = "goodreads_books_comics_graphic.json.gz"
interactions = "goodreads_interactions_comics_graphic.json.gz"
reviews = "goodreads_reviews_comics_graphic.json.gz"
datasets = {books: None, interactions: None, reviews: None}

for filename in datasets.keys():
    print(filename)
    datasets[filename] = parse_json(data_path + filename, 500000)
    display(datasets[filename].head(5))

goodreads_books_comics_graphic.json.gz


89411it [00:38, 2342.76it/s]


Read 89411 rows.


Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,,1,[],US,,"[{'count': '228', 'name': 'to-read'}, {'count'...",B00NLXQ534,True,4.12,,...,,,,https://www.goodreads.com/book/show/25742454-t...,https://s.gr-assets.com/assets/nophoto/book/11...,25742454,1,42749946,The Switchblade Mamma,The Switchblade Mamma
1,2205073346.0,2,[],US,fre,"[{'count': '2', 'name': 'bd'}, {'count': '2', ...",,False,3.94,,...,1.0,,2016.0,https://www.goodreads.com/book/show/30128855-c...,https://images.gr-assets.com/books/1462644346m...,30128855,16,50558228,Cruelle,Cruelle
2,,5,"[246830, 362583, 362581, 623032]",US,eng,"[{'count': '493', 'name': 'to-read'}, {'count'...",,False,4.28,,...,,,2012.0,https://www.goodreads.com/book/show/13571772-c...,https://images.gr-assets.com/books/1333287305m...,13571772,51,102217,Captain America: Winter Soldier (The Ultimate ...,Captain America: Winter Soldier (The Ultimate ...
3,,1,[],US,eng,"[{'count': '222', 'name': 'to-read'}, {'count'...",B06XKGGSB7,True,4.05,B06XKGGSB7,...,,,,https://www.goodreads.com/book/show/35452242-b...,https://s.gr-assets.com/assets/nophoto/book/11...,35452242,6,54276229,Bounty Hunter 4/3: My Life in Combat from Mari...,Bounty Hunter 4/3: My Life in Combat from Mari...
4,930289765.0,6,"[266759, 1096220]",US,en-US,"[{'count': '20', 'name': 'to-read'}, {'count':...",,False,4.06,,...,11.0,,1997.0,https://www.goodreads.com/book/show/707611.Sup...,https://images.gr-assets.com/books/1307838888m...,707611,51,693886,"Superman Archives, Vol. 2","Superman Archives, Vol. 2"


goodreads_interactions_comics_graphic.json.gz


500000it [00:14, 34542.26it/s]


Read 500000 rows.


Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
0,8842281e1d1347389f2ab93d60773d4d,836610,6b4db26aafeaf0da77c7de6214331e1e,False,0,,Mon Aug 21 12:11:00 -0700 2017,Mon Aug 21 12:11:00 -0700 2017,,
1,8842281e1d1347389f2ab93d60773d4d,7648967,99b27059f711c37de8f90ee8e4dc0d1b,False,0,,Fri Feb 24 08:59:44 -0800 2017,Fri Feb 24 08:59:44 -0800 2017,,
2,8842281e1d1347389f2ab93d60773d4d,15704307,cb944d94854df5afd22210bb0aa0c903,False,0,,Wed May 20 21:28:56 -0700 2015,Wed May 20 21:28:57 -0700 2015,,
3,8842281e1d1347389f2ab93d60773d4d,6902644,2711bac2a8cc600dae1590a6ca0edb34,False,0,,Sun Jun 01 17:25:23 -0700 2014,Sun Jun 01 17:25:23 -0700 2014,,
4,8842281e1d1347389f2ab93d60773d4d,9844623,b72979076d1cded25dded922195e5b1c,False,0,,Sun Sep 02 08:45:08 -0700 2012,Sun Sep 02 08:45:08 -0700 2012,,


goodreads_reviews_comics_graphic.json.gz


500000it [00:19, 25936.44it/s]


Read 500000 rows.


Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,dc3763cdb9b2cae805882878eebb6a32,18471619,66b2ba840f9bd36d6d27f46136fe4772,3,Sherlock Holmes and the Vampires of London \n ...,Thu Dec 05 10:44:25 -0800 2013,Thu Dec 05 10:45:15 -0800 2013,Tue Nov 05 00:00:00 -0800 2013,,0,0
1,bafc2d50014200cda7cb2b6acd60cd73,6315584,72f1229aba5a88f9e72f0dcdc007dd22,4,"I've never really liked Spider-Man. I am, howe...",Wed Aug 10 06:06:48 -0700 2016,Fri Aug 12 08:49:54 -0700 2016,Fri Aug 12 08:49:54 -0700 2016,Wed Aug 10 00:00:00 -0700 2016,0,0
2,bafc2d50014200cda7cb2b6acd60cd73,29847729,a75309355f8662caaa5e2c92ab693d3f,4,"A very quick introduction, this is coming out ...",Thu Apr 21 07:44:00 -0700 2016,Thu Apr 21 07:59:28 -0700 2016,Thu Apr 21 07:59:28 -0700 2016,Thu Apr 21 00:00:00 -0700 2016,0,0
3,bafc2d50014200cda7cb2b6acd60cd73,18454118,c3cc5a3e1d6b6c9cf1c044f306c8e752,5,I've been waiting so long for this. I first st...,Mon Mar 03 17:45:56 -0800 2014,Mon Mar 03 17:54:11 -0800 2014,Sat Mar 01 00:00:00 -0800 2014,Sat Mar 01 00:00:00 -0800 2014,1,0
4,bafc2d50014200cda7cb2b6acd60cd73,2239435,cc444be37ab0a42bfb4dd818cb5edd10,4,The only thing more entertaining than this boo...,Wed Apr 03 12:37:48 -0700 2013,Wed Apr 03 13:03:36 -0700 2013,Wed Apr 03 13:03:36 -0700 2013,,0,0


In [5]:
books_df = datasets[books][["book_id", "title"]].copy()
interactions_df = datasets[interactions][["user_id", "book_id", "is_read", "rating", "date_updated"]].copy()
ratings_df = datasets[reviews][["user_id", "book_id", "rating", "date_updated"]].copy()

In [6]:
# Parse dates
format_str = "%a %b %d %H:%M:%S %z %Y"
interactions_df["date_updated"] = pd.to_datetime(interactions_df["date_updated"], format=format_str)
ratings_df["date_updated"] = pd.to_datetime(ratings_df["date_updated"], format=format_str)

# Convert the book IDs to numpy integers
books_df["book_id"] = books_df["book_id"].astype("int64")
interactions_df["book_id"] = interactions_df["book_id"].astype("int64")
ratings_df["book_id"] = ratings_df["book_id"].astype("int64")

interactions_df = interactions_df.sort_values(by=["user_id", "date_updated"], ascending=[True, True])
ratings_df = ratings_df.sort_values(by=["user_id", "date_updated"], ascending=[True, True])

In [7]:
ids = {}


def map_to_consecutive_ids(uuid):
    if uuid not in ids:
        ids[uuid] = len(ids)
    return ids[uuid]


interactions_df["user_id_int"] = interactions_df["user_id"].progress_apply(map_to_consecutive_ids)
ratings_df["user_id_int"] = ratings_df["user_id"].progress_apply(lambda user_id: ids.get(user_id, -1))
ids.clear()
books_df["book_id_int"] = books_df["book_id"].progress_apply(map_to_consecutive_ids)
interactions_df["book_id_int"] = interactions_df["book_id"].progress_apply(lambda book_id: ids.get(book_id, -1))
ratings_df["book_id_int"] = ratings_df["book_id"].progress_apply(lambda book_id: ids.get(book_id, -1))

display(books_df.head(10))
display(interactions_df.head(10))

100%|██████████| 500000/500000 [00:00<00:00, 1395300.50it/s]
100%|██████████| 500000/500000 [00:00<00:00, 1365930.04it/s]
100%|██████████| 89411/89411 [00:00<00:00, 1132290.61it/s]
100%|██████████| 500000/500000 [00:00<00:00, 1131442.27it/s]
100%|██████████| 500000/500000 [00:00<00:00, 1105213.20it/s]


Unnamed: 0,book_id,title,book_id_int
0,25742454,The Switchblade Mamma,0
1,30128855,Cruelle,1
2,13571772,Captain America: Winter Soldier (The Ultimate ...,2
3,35452242,Bounty Hunter 4/3: My Life in Combat from Mari...,3
4,707611,"Superman Archives, Vol. 2",4
5,2250580,"A.I. Revolution, Vol. 1",5
6,27036536,"War Stories, Volume 3",6
7,27036537,"Crossed, Volume 15",7
8,27036538,"Crossed + One Hundred, Volume 2 (Crossed +100 #2)",8
9,27036539,"War Stories, Volume 4",9


Unnamed: 0,user_id,book_id,is_read,rating,date_updated,user_id_int,book_id_int
496200,00153d136ac254437511fad5e10e246d,7619292,True,3,2012-08-30 11:53:39-07:00,0,73347
496199,00153d136ac254437511fad5e10e246d,29396738,True,0,2017-08-07 03:40:09-07:00,0,2624
350064,0018ce6296baeccf95c3502deeff0600,472331,True,5,2012-08-05 09:26:00-07:00,1,2749
350062,0018ce6296baeccf95c3502deeff0600,3058907,True,4,2012-08-05 13:11:48-07:00,1,55521
350061,0018ce6296baeccf95c3502deeff0600,4280231,True,4,2012-08-05 13:11:52-07:00,1,59877
350059,0018ce6296baeccf95c3502deeff0600,11470711,True,5,2012-08-05 13:14:57-07:00,1,20568
350058,0018ce6296baeccf95c3502deeff0600,844355,True,4,2012-08-05 13:19:32-07:00,1,22652
350057,0018ce6296baeccf95c3502deeff0600,1568491,True,4,2012-08-05 13:19:39-07:00,1,88449
350056,0018ce6296baeccf95c3502deeff0600,23754,True,4,2012-08-09 03:53:26-07:00,1,85188
350063,0018ce6296baeccf95c3502deeff0600,13480845,True,5,2012-08-15 04:26:40-07:00,1,24277


In [8]:
def preprocess(df: pd.DataFrame, min_support: int = 5) -> pd.DataFrame:
    """
    Removes reconsumption items, items that have less than `min_support` interactions, and users that have less than
    `min_support` interactions.
    """
    print(df.shape[0], "initial rows")
    # Drop reconsumption items
    df = df.drop_duplicates(subset=["user_id", "item_id"])
    print(df.shape[0], "rows after removing reconsumption items")
    # Compute user and item counts
    g1 = df.groupby("item_id", as_index=False)["user_id"].size()
    g1 = g1.rename({"size": "users_per_item"}, axis="columns")
    g2 = df.groupby("user_id", as_index=False)["item_id"].size()
    g2 = g2.rename({"size": "items_per_user"}, axis="columns")
    df = pd.merge(df, g1, how="left", on=["item_id"])
    df = pd.merge(df, g2, how="left", on=["user_id"])
    # Drop items and users with less than `min_support` interactions
    df = df[df["users_per_item"] >= min_support]
    print(df.shape[0], "rows after removing infrequent items")
    df = df[df["items_per_user"] >= min_support]
    print(df.shape[0], "rows after removing infrequent users")
    df = df[["user_id", "item_id", "datetime", "rating"]].copy()
    return df


interactions_processed = interactions_df[["user_id_int", "book_id_int", "date_updated", "rating"]].copy()
interactions_processed = interactions_processed.rename(
    columns={"user_id_int": "user_id", "book_id_int": "item_id", "date_updated": "datetime"})
display(interactions_processed.head(5))
print(f"Number of unique users:", interactions_processed["user_id"].nunique())
print(f"Number of unique items:", interactions_processed["item_id"].nunique())
interactions_processed = preprocess(interactions_processed)
print(f"Number of unique users:", interactions_processed["user_id"].nunique())
print(f"Number of unique items:", interactions_processed["item_id"].nunique())
display(interactions_processed.head(5))

Unnamed: 0,user_id,item_id,datetime,rating
496200,0,73347,2012-08-30 11:53:39-07:00,3
496199,0,2624,2017-08-07 03:40:09-07:00,0
350064,1,2749,2012-08-05 09:26:00-07:00,5
350062,1,55521,2012-08-05 13:11:48-07:00,4
350061,1,59877,2012-08-05 13:11:52-07:00,4


Number of unique users: 18246
Number of unique items: 53635
500000 initial rows
500000 rows after removing reconsumption items
433878 rows after removing infrequent items
417596 rows after removing infrequent users
Number of unique users: 8924
Number of unique items: 17714


Unnamed: 0,user_id,item_id,datetime,rating
2,1,2749,2012-08-05 09:26:00-07:00,5
3,1,55521,2012-08-05 13:11:48-07:00,4
4,1,59877,2012-08-05 13:11:52-07:00,4
5,1,20568,2012-08-05 13:14:57-07:00,5
6,1,22652,2012-08-05 13:19:32-07:00,4


In [9]:
def split(items: list[int], percentage_train: float) -> pd.Series:
    nr_train_items = int(len(items) * percentage_train)
    return pd.Series((items[: nr_train_items], items[nr_train_items:]))


sessions_df = interactions_processed.groupby(by="user_id", as_index=False)[["item_id", "datetime", "rating"]].agg(list)
display(sessions_df.head(5))

percentage_train = 0.8
sessions_df[["history", "future"]] = sessions_df["item_id"].progress_apply(split, args=(percentage_train,))
display(sessions_df.head(5))

Unnamed: 0,user_id,item_id,datetime,rating
0,1,"[2749, 55521, 59877, 20568, 22652, 88449, 8518...","[2012-08-05 09:26:00-07:00, 2012-08-05 13:11:4...","[5, 4, 4, 5, 4, 4, 4, 5, 5, 3, 4, 0, 0, 4, 0, 0]"
1,2,"[1027, 41468, 16631, 58923, 53726, 46006, 6472...","[2015-04-05 11:03:44-07:00, 2015-04-05 11:04:0...","[0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,"[78364, 2961, 17726, 47072, 2177, 72410, 25660...","[2013-02-10 19:48:55-08:00, 2013-12-25 11:43:1...","[5, 5, 5, 5, 5, 5, 5, 3, 0, 0, 5, 3, 0, 5, 0, ..."
3,4,"[76189, 76190, 11457, 36906, 74822]","[2013-04-01 16:54:17-07:00, 2013-04-01 16:55:3...","[4, 4, 0, 0, 0]"
4,7,"[64368, 30375, 47642, 48180, 86871]","[2012-07-09 10:41:56-07:00, 2012-07-09 10:41:5...","[4, 4, 4, 5, 5]"


100%|██████████| 8924/8924 [00:01<00:00, 8248.80it/s] 


Unnamed: 0,user_id,item_id,datetime,rating,history,future
0,1,"[2749, 55521, 59877, 20568, 22652, 88449, 8518...","[2012-08-05 09:26:00-07:00, 2012-08-05 13:11:4...","[5, 4, 4, 5, 4, 4, 4, 5, 5, 3, 4, 0, 0, 4, 0, 0]","[2749, 55521, 59877, 20568, 22652, 88449, 8518...","[1138, 34891, 4357, 82031]"
1,2,"[1027, 41468, 16631, 58923, 53726, 46006, 6472...","[2015-04-05 11:03:44-07:00, 2015-04-05 11:04:0...","[0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1027, 41468, 16631, 58923, 53726, 46006, 6472...","[1286, 38245, 47059, 77648, 64113, 9313, 37629..."
2,3,"[78364, 2961, 17726, 47072, 2177, 72410, 25660...","[2013-02-10 19:48:55-08:00, 2013-12-25 11:43:1...","[5, 5, 5, 5, 5, 5, 5, 3, 0, 0, 5, 3, 0, 5, 0, ...","[78364, 2961, 17726, 47072, 2177, 72410, 25660...","[25189, 57748, 2946, 12843, 42538, 79275, 3495]"
3,4,"[76189, 76190, 11457, 36906, 74822]","[2013-04-01 16:54:17-07:00, 2013-04-01 16:55:3...","[4, 4, 0, 0, 0]","[76189, 76190, 11457, 36906]",[74822]
4,7,"[64368, 30375, 47642, 48180, 86871]","[2012-07-09 10:41:56-07:00, 2012-07-09 10:41:5...","[4, 4, 4, 5, 5]","[64368, 30375, 47642, 48180]",[86871]


In [10]:
def create_sparse_matrix(dataframe: pd.DataFrame, column: str = "history",
                         shape: tuple[int, int] = None) -> sparse.csr_matrix:
    """
    Creates a sparse matrix from the data in `dataframe`.
    """
    # Flatten the dataframe
    user_ids = []
    item_ids = []
    for index, row in dataframe.iterrows():
        user_ids.extend([row["user_id"]] * len(row[column]))
        item_ids.extend(row[column])
    # Create the CSR matrix
    values = np.ones(len(user_ids))
    return sparse.csr_matrix((values, (user_ids, item_ids)), shape=shape, dtype=np.int32)


shape = (interactions_processed["user_id"].max() + 1, interactions_processed["item_id"].max() + 1)
train = create_sparse_matrix(sessions_df, "history", shape)
true = create_sparse_matrix(sessions_df, "future", shape)

In [11]:
class PopularityRecommender:
    def __init__(self, k: int = 10) -> None:
        self.k = k
        self.scores = []

    def fit(self, data: sparse.csr_matrix) -> None:
        items = list(data.nonzero()[1])
        scores = Counter(items).most_common(self.k)
        self.scores = [(item, score / scores[0][1]) for item, score in scores]

    def predict(self, data: sparse.csr_matrix) -> sparse.csr_matrix:
        items, values = zip(*self.scores)
        users = set(data.nonzero()[0])

        matrix_data = ([], ([], []))
        for user in users:
            matrix_data[0].extend(values)
            matrix_data[1][0].extend([user] * self.k)
            matrix_data[1][1].extend(items)
        return sparse.csr_matrix(matrix_data, shape=data.shape)


k = 20
recommender = PopularityRecommender(k)
recommender.fit(train)
predicted = recommender.predict(train)
print(predicted)

  (1, 469)	0.30438756855575866
  (1, 2227)	0.29981718464351004
  (1, 2749)	1.0
  (1, 3495)	0.5123400365630713
  (1, 7451)	0.31855575868372943
  (1, 7453)	0.5287934186471663
  (1, 8570)	0.5269652650822669
  (1, 12857)	0.40036563071297987
  (1, 23144)	0.3212979890310786
  (1, 29015)	0.353290676416819
  (1, 31264)	0.6773308957952467
  (1, 33784)	0.4218464351005484
  (1, 41468)	0.4076782449725777
  (1, 44739)	0.3240402193784278
  (1, 47066)	0.3697440585009141
  (1, 65107)	0.3592321755027422
  (1, 75565)	0.4428702010968921
  (1, 76189)	0.36745886654478976
  (1, 85188)	0.6672760511882998
  (1, 86659)	0.29890310786106034
  (2, 469)	0.30438756855575866
  (2, 2227)	0.29981718464351004
  (2, 2749)	1.0
  (2, 3495)	0.5123400365630713
  (2, 7451)	0.31855575868372943
  :	:
  (18244, 65107)	0.3592321755027422
  (18244, 75565)	0.4428702010968921
  (18244, 76189)	0.36745886654478976
  (18244, 85188)	0.6672760511882998
  (18244, 86659)	0.29890310786106034
  (18245, 469)	0.30438756855575866
  (18245, 222

In [12]:
def sparse_invert_nonzero(a: sparse.csr_matrix) -> sparse.csr_matrix:
    inverse = a.copy()
    inverse.data = 1 / inverse.data
    return inverse


def sparse_divide_nonzero(a: sparse.csr_matrix, b: sparse.csr_matrix) -> sparse.csr_matrix:
    return a.multiply(sparse_invert_nonzero(b))


def compute_recall(true: sparse.csr_matrix, predicted: sparse.csr_matrix) -> float:
    scores = sparse.lil_matrix(predicted.shape)
    scores[predicted.multiply(true).astype(bool)] = 1
    scores = sparse_divide_nonzero(scores.tocsr(), sparse.csr_matrix(true.sum(axis=1))).sum(axis=1)
    return scores.mean()


recall = compute_recall(true, predicted)
print(f"Recall @ {k}: {recall:.4f}")

Recall @ 20: 0.0369
