In [1]:
!pip install -r requirements.txt



In [9]:
# Tensorflow library. Used to implement machine learning models
import tensorflow as tf
# Numpy contains helpful functions for efficient mathematical calculations
import numpy as np
# Dataframe manipulation library
import pandas as pd
# Graph plotting library
import matplotlib.pyplot as plt
%matplotlib inline
# Gzip
import gzip

from IPython.display import display

from json import loads
from tqdm.notebook import tqdm
from typing import Iterable, Union
tqdm.pandas()

# Data loading

In [3]:
def parse_json(filename: str, read_max: Union[int, None] = None,
               attributes: Union[Iterable[str], None] = None) -> pd.DataFrame:
    """
    Reads a gzip-compressed file line by line, parsing each line as json.

    :param filename: The path to the gzip-compressed datafile.
    :param read_max: The maximum number of lines to read from the datafile; `None` reads the whole file.
    :param attributes: The attributes of each JSON object that should be extracted; other attributes are ignored.
        `None` keeps every attribute.
    :return: A data frame with one row per parsed line.
    :raises KeyError: If a parsed object lacks one of the requested attributes.
    """
    # Materialise the attribute names once: a one-shot iterator would otherwise be exhausted after the first line,
    # leaving every later row without any attributes.
    if attributes is not None:
        attributes = tuple(attributes)
    data = []
    # The context manager closes the file handle, also when stopping early or when a line fails to parse.
    with gzip.open(filename, "r") as file:
        for index, line in enumerate(tqdm(file)):
            if index == read_max:
                break
            entry = loads(line)
            if attributes is not None:
                entry = {key: entry[key] for key in attributes}
            data.append(entry)
    return pd.DataFrame.from_dict(data)

In [4]:
# Locations of the gzip-compressed JSON files inside the data directory
data_path = "data/"
books = data_path + "goodreads_books_comics_graphic.json.gz"
interactions = data_path + "goodreads_interactions_comics_graphic.json.gz"
reviews = data_path + "goodreads_reviews_comics_graphic.json.gz"

# Maximum number of lines to read per file; `None` reads the files completely
n = None

# Only the attributes that are needed later on are extracted
books_df = parse_json(books, read_max=n, attributes=("book_id", "title"))
interactions_df = parse_json(
    interactions,
    read_max=n,
    attributes=("user_id", "book_id", "rating", "date_updated"),
)

display(books_df.head(10), interactions_df.head(10))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [6]:
# Parse the timestamp strings (e.g. "Fri Jun 21 10:23:44 -0700 2013") into datetimes
timestamp_format = "%a %b %d %H:%M:%S %z %Y"
interactions_df["date_updated"] = pd.to_datetime(interactions_df["date_updated"], format=timestamp_format)

# The book IDs must have the same integer type in both frames
books_df["book_id"] = books_df["book_id"].astype(np.int64)
interactions_df["book_id"] = interactions_df["book_id"].astype(np.int64)

# Order the interactions chronologically within each user
interactions_df = interactions_df.sort_values(["user_id", "date_updated"], ascending=True)

# Preprocessing

In [7]:
def preprocess(dataframe: pd.DataFrame, min_support: int = 5) -> pd.DataFrame:
    """
    Removes users with fewer than `min_support` interactions, and duplicate user-item pairs (which do not exist in the
    dataset anyway). Items with very few interactions are not removed, unless they have no interactions at all after
    removing infrequent users.

    The number of rows before and after the preprocessing is printed. The input frame itself is not modified.

    :param dataframe: The interactions; must contain the columns `user_id` and `book_id`.
    :param min_support: The minimum number of distinct items a user must have interacted with in order to be kept.
    :return: A new data frame with the same columns as the input. Rows are labelled by their position in the
        de-duplicated frame, so the index has gaps wherever the rows of infrequent users were removed.
    """
    print(dataframe.shape[0], "initial rows")
    # Drop reconsumption items; the rows are renumbered consecutively afterwards
    unique_pairs = dataframe.drop_duplicates(subset=["user_id", "book_id"]).reset_index(drop=True)
    # Count the interactions per user, aligned row by row with the de-duplicated frame. This avoids merging a
    # separate count table back in and dropping the helper column in place afterwards.
    items_per_user = unique_pairs.groupby("user_id")["book_id"].transform("size")
    # Drop users with less than `min_support` interactions
    frequent_users = unique_pairs[items_per_user >= min_support]
    # Report the result of the preprocessing
    print(frequent_users.shape[0], "rows after preprocessing")
    return frequent_users


# State of the interactions before the preprocessing
display(interactions_df.head(10))
print("Number of unique users:", interactions_df["user_id"].nunique())
print("Number of unique items:", interactions_df["book_id"].nunique())

# Remove the users with fewer than five interactions
interactions_df = preprocess(interactions_df, min_support=5)

# State of the interactions after the preprocessing
print("Number of unique users:", interactions_df["user_id"].nunique())
print("Number of unique items:", interactions_df["book_id"].nunique())
display(interactions_df.head(10))

Unnamed: 0,user_id,book_id,rating,date_updated
1651325,00004584d524ec468619e81b176cc991,271199,4,2013-06-21 10:23:44-07:00
1651324,00004584d524ec468619e81b176cc991,287380,4,2013-06-21 10:24:05-07:00
1651322,00004584d524ec468619e81b176cc991,287381,4,2013-06-21 10:24:31-07:00
1651316,00004584d524ec468619e81b176cc991,287382,4,2013-06-21 10:25:05-07:00
1651314,00004584d524ec468619e81b176cc991,287388,3,2013-06-21 10:25:13-07:00
1651312,00004584d524ec468619e81b176cc991,287385,4,2013-06-21 10:25:29-07:00
1651311,00004584d524ec468619e81b176cc991,287383,4,2013-06-21 10:25:37-07:00
1651308,00004584d524ec468619e81b176cc991,287364,5,2013-06-21 10:26:36-07:00
1651307,00004584d524ec468619e81b176cc991,287368,4,2013-06-21 10:26:43-07:00
1651306,00004584d524ec468619e81b176cc991,287371,4,2013-06-21 10:26:52-07:00


Number of unique users: 342415
Number of unique items: 89411
7347630 initial rows
6995891 rows after preprocessing
Number of unique users: 148438
Number of unique items: 89276


Unnamed: 0,user_id,book_id,rating,date_updated
0,00004584d524ec468619e81b176cc991,271199,4,2013-06-21 10:23:44-07:00
1,00004584d524ec468619e81b176cc991,287380,4,2013-06-21 10:24:05-07:00
2,00004584d524ec468619e81b176cc991,287381,4,2013-06-21 10:24:31-07:00
3,00004584d524ec468619e81b176cc991,287382,4,2013-06-21 10:25:05-07:00
4,00004584d524ec468619e81b176cc991,287388,3,2013-06-21 10:25:13-07:00
5,00004584d524ec468619e81b176cc991,287385,4,2013-06-21 10:25:29-07:00
6,00004584d524ec468619e81b176cc991,287383,4,2013-06-21 10:25:37-07:00
7,00004584d524ec468619e81b176cc991,287364,5,2013-06-21 10:26:36-07:00
8,00004584d524ec468619e81b176cc991,287368,4,2013-06-21 10:26:43-07:00
9,00004584d524ec468619e81b176cc991,287371,4,2013-06-21 10:26:52-07:00


In [10]:
def apply_consecutive_mapping(dataframe: pd.DataFrame, column: str, new_column: str, *additional: pd.DataFrame) -> None:
    """
    Generates a consecutive ID column for the values of an existing column. Also adds this column to additional data
    frames with the exact same mapping of old ID to new (consecutive) ID.

    The IDs are assigned in order of first appearance in `dataframe`, starting at 0. All frames are modified in place.

    :param dataframe: The data frame that defines the mapping; receives the column `new_column`.
    :param column: The name of the existing column holding the old IDs; must be present in every frame.
    :param new_column: The name of the column that receives the consecutive IDs.
    :param additional: Further data frames that receive `new_column` using the same mapping; values of `column` that
        do not occur in `dataframe` are mapped to -1.
    """
    # `factorize` numbers the distinct values 0, 1, 2, ... in order of first appearance (missing values get -1).
    # It is vectorised, so no per-row Python callback is needed and no prior `tqdm.pandas()` call is required.
    codes, distinct_values = pd.factorize(dataframe[column])
    distinct_values = pd.Index(distinct_values)
    dataframe[new_column] = codes.astype("int64")
    for frame in additional:
        # The position of a value within the distinct values is its consecutive ID; unknown values yield -1
        frame[new_column] = distinct_values.get_indexer(frame[column]).astype("int64")


# Replace the user and book identifiers by consecutive integers; the books receive the same book mapping
apply_consecutive_mapping(interactions_df, "user_id", "user_id_int")
apply_consecutive_mapping(interactions_df, "book_id", "book_id_int", books_df)

# Keep only the new identifiers, the timestamp and the rating, under their final column names
interactions_df = (
    interactions_df[["user_id_int", "book_id_int", "date_updated", "rating"]]
    .rename(columns={"user_id_int": "user_id", "book_id_int": "item_id", "date_updated": "datetime"})
)

display(books_df.head(10), interactions_df.head(10))

  0%|          | 0/6995891 [00:00<?, ?it/s]

  0%|          | 0/6995891 [00:00<?, ?it/s]

  0%|          | 0/89411 [00:00<?, ?it/s]

Unnamed: 0,book_id,title,book_id_int
0,25742454,The Switchblade Mamma,88842
1,30128855,Cruelle,52988
2,13571772,Captain America: Winter Soldier (The Ultimate ...,73011
3,35452242,Bounty Hunter 4/3: My Life in Combat from Mari...,60509
4,707611,"Superman Archives, Vol. 2",20130
5,2250580,"A.I. Revolution, Vol. 1",47138
6,27036536,"War Stories, Volume 3",30014
7,27036537,"Crossed, Volume 15",50287
8,27036538,"Crossed + One Hundred, Volume 2 (Crossed +100 #2)",50415
9,27036539,"War Stories, Volume 4",50006


Unnamed: 0,user_id,item_id,datetime,rating
0,0,0,2013-06-21 10:23:44-07:00,4
1,0,1,2013-06-21 10:24:05-07:00,4
2,0,2,2013-06-21 10:24:31-07:00,4
3,0,3,2013-06-21 10:25:05-07:00,4
4,0,4,2013-06-21 10:25:13-07:00,3
5,0,5,2013-06-21 10:25:29-07:00,4
6,0,6,2013-06-21 10:25:37-07:00,4
7,0,7,2013-06-21 10:26:36-07:00,5
8,0,8,2013-06-21 10:26:43-07:00,4
9,0,9,2013-06-21 10:26:52-07:00,4


In [5]:
# user_interactions_df = interactions_df.pivot(index='user_id', columns='book_id',values='rating')
# user_interactions_df.head()