In [38]:
!pip install -r requirements.txt

You should consider upgrading via the '/home/darin/rbm_3.9/bin/python -m pip install --upgrade pip' command.[0m


In [42]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import tensorflow as tf
import gzip
import torch
from IPython.display import display

from json import loads
from tqdm.notebook import tqdm
from typing import Iterable, Union, TypeVar
from scipy import sparse
from sklearn.model_selection import train_test_split


tqdm.pandas()

# Data loading

In [43]:
def parse_json(filename: str, read_max: int = None, attributes: Iterable[str] = None) -> pd.DataFrame:
    """
    Reads the file line by line, parsing each line as json.

    :param filename: The path to the datafile.
    :param read_max: The maximum number of lines to read from the datafile.
    :param attributes: The attributes of each JSON object that should be extracted; other attributes are ignored.
    """
    file = gzip.open(filename, "r")
    data = []
    for index, line in enumerate(tqdm(file)):
        if index == read_max:
            break
        entry = loads(line)
        if attributes is not None:
            entry = {key: entry[key] for key in attributes}
        data.append(entry)
    return pd.DataFrame.from_dict(data)

In [44]:
data_path = "data/"
books = f"{data_path}goodreads_books_comics_graphic.json.gz"
interactions = f"{data_path}goodreads_interactions_comics_graphic.json.gz"
reviews = f"{data_path}goodreads_reviews_comics_graphic.json.gz"

n = None

books_df = parse_json(books, n, ("book_id", "title"))
interactions_df = parse_json(interactions, n, ("user_id", "book_id", "rating", "date_updated"))

display(books_df.head(10))
display(interactions_df.head(10))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Unnamed: 0,book_id,title
0,25742454,The Switchblade Mamma
1,30128855,Cruelle
2,13571772,Captain America: Winter Soldier (The Ultimate ...
3,35452242,Bounty Hunter 4/3: My Life in Combat from Mari...
4,707611,"Superman Archives, Vol. 2"
5,2250580,"A.I. Revolution, Vol. 1"
6,27036536,"War Stories, Volume 3"
7,27036537,"Crossed, Volume 15"
8,27036538,"Crossed + One Hundred, Volume 2 (Crossed +100 #2)"
9,27036539,"War Stories, Volume 4"


Unnamed: 0,user_id,book_id,rating,date_updated
0,8842281e1d1347389f2ab93d60773d4d,836610,0,Mon Aug 21 12:11:00 -0700 2017
1,8842281e1d1347389f2ab93d60773d4d,7648967,0,Fri Feb 24 08:59:44 -0800 2017
2,8842281e1d1347389f2ab93d60773d4d,15704307,0,Wed May 20 21:28:57 -0700 2015
3,8842281e1d1347389f2ab93d60773d4d,6902644,0,Sun Jun 01 17:25:23 -0700 2014
4,8842281e1d1347389f2ab93d60773d4d,9844623,0,Sun Sep 02 08:45:08 -0700 2012
5,8842281e1d1347389f2ab93d60773d4d,13163846,0,Tue Jul 24 14:36:13 -0700 2012
6,8842281e1d1347389f2ab93d60773d4d,1137635,0,Fri Mar 26 08:58:33 -0700 2010
7,8842281e1d1347389f2ab93d60773d4d,44735,0,Fri Jun 05 12:28:44 -0700 2009
8,8842281e1d1347389f2ab93d60773d4d,472331,0,Fri Jun 05 12:28:47 -0700 2009
9,8842281e1d1347389f2ab93d60773d4d,24815,5,Thu Apr 17 23:42:49 -0700 2008


In [45]:
# Convert the columns to the correct types
interactions_df["date_updated"] = pd.to_datetime(interactions_df["date_updated"], format="%a %b %d %H:%M:%S %z %Y")
books_df["book_id"] = books_df["book_id"].astype("int64")
interactions_df["book_id"] = interactions_df["book_id"].astype("int64")

# Sort the interactions by user ID and the timestamp
interactions_df = interactions_df.sort_values(by=["user_id", "date_updated"], ascending=[True, True])

# Preprocessing

In [46]:
def preprocess(dataframe: pd.DataFrame, min_support: int = 5) -> pd.DataFrame:
    """
    Removes users with fewer than `min_support` interactions, and duplicate user-item pairs (which do not exist in the
    dataset anyway). Items with very few interactions are not removed, unless they have no interactions at all after
    removing infrequent users.
    """
    print(dataframe.shape[0], "initial rows")
    # Drop reconsumption items
    dataframe = dataframe.drop_duplicates(subset=["user_id", "book_id"])
    # Drop users with less than `min_support` interactions
    items_per_user = dataframe.groupby("user_id", as_index=False)["book_id"].size()
    items_per_user = items_per_user.rename({"size": "items_per_user"}, axis="columns")
    dataframe = pd.merge(dataframe, items_per_user, how="left", on=["user_id"])
    dataframe = dataframe[dataframe["items_per_user"] >= min_support]
    # Report and clean up after the preprocessing
    print(dataframe.shape[0], "rows after preprocessing")
    dataframe.drop(columns=["items_per_user"], inplace=True)
    return dataframe


display(interactions_df.head(10))
print(f"Number of unique users:", interactions_df["user_id"].nunique())
print(f"Number of unique items:", interactions_df["book_id"].nunique())
interactions_df = preprocess(interactions_df, min_support=5)
print(f"Number of unique users:", interactions_df["user_id"].nunique())
print(f"Number of unique items:", interactions_df["book_id"].nunique())
display(interactions_df.head(10))

Unnamed: 0,user_id,book_id,rating,date_updated
1651325,00004584d524ec468619e81b176cc991,271199,4,2013-06-21 10:23:44-07:00
1651324,00004584d524ec468619e81b176cc991,287380,4,2013-06-21 10:24:05-07:00
1651322,00004584d524ec468619e81b176cc991,287381,4,2013-06-21 10:24:31-07:00
1651316,00004584d524ec468619e81b176cc991,287382,4,2013-06-21 10:25:05-07:00
1651314,00004584d524ec468619e81b176cc991,287388,3,2013-06-21 10:25:13-07:00
1651312,00004584d524ec468619e81b176cc991,287385,4,2013-06-21 10:25:29-07:00
1651311,00004584d524ec468619e81b176cc991,287383,4,2013-06-21 10:25:37-07:00
1651308,00004584d524ec468619e81b176cc991,287364,5,2013-06-21 10:26:36-07:00
1651307,00004584d524ec468619e81b176cc991,287368,4,2013-06-21 10:26:43-07:00
1651306,00004584d524ec468619e81b176cc991,287371,4,2013-06-21 10:26:52-07:00


Number of unique users: 342415
Number of unique items: 89411
7347630 initial rows
6995891 rows after preprocessing
Number of unique users: 148438
Number of unique items: 89276


Unnamed: 0,user_id,book_id,rating,date_updated
0,00004584d524ec468619e81b176cc991,271199,4,2013-06-21 10:23:44-07:00
1,00004584d524ec468619e81b176cc991,287380,4,2013-06-21 10:24:05-07:00
2,00004584d524ec468619e81b176cc991,287381,4,2013-06-21 10:24:31-07:00
3,00004584d524ec468619e81b176cc991,287382,4,2013-06-21 10:25:05-07:00
4,00004584d524ec468619e81b176cc991,287388,3,2013-06-21 10:25:13-07:00
5,00004584d524ec468619e81b176cc991,287385,4,2013-06-21 10:25:29-07:00
6,00004584d524ec468619e81b176cc991,287383,4,2013-06-21 10:25:37-07:00
7,00004584d524ec468619e81b176cc991,287364,5,2013-06-21 10:26:36-07:00
8,00004584d524ec468619e81b176cc991,287368,4,2013-06-21 10:26:43-07:00
9,00004584d524ec468619e81b176cc991,287371,4,2013-06-21 10:26:52-07:00


In [47]:
def apply_consecutive_mapping(dataframe: pd.DataFrame, column: str, new_column: str, *additional: pd.DataFrame) -> None:
    """
    Generates a consecutive ID column for the values of an existing column. Also adds this column to additional data
    frames with the exact same mapping of old ID to new (consecutive) ID.
    """
    ids = {}

    def map_to_consecutive_ids(uuid: Union[int, np.int64]) -> int:
        """
        To be used with `pd.Dataframe.apply()` or `pd.Dataframe.progress_apply()`; returns a unique ID per distinct
        value.
        """
        if uuid not in ids:
            ids[uuid] = len(ids)
        return ids[uuid]

    dataframe[new_column] = dataframe[column].progress_apply(map_to_consecutive_ids)
    for frame in additional:
        frame[new_column] = frame[column].progress_apply(lambda old_id: ids.get(old_id, -1))


apply_consecutive_mapping(interactions_df, "user_id", "user_id_int")
apply_consecutive_mapping(interactions_df, "book_id", "book_id_int", books_df)

interactions_df = interactions_df[["user_id_int", "book_id_int", "date_updated", "rating"]]
interactions_df = interactions_df.rename(
    columns={"user_id_int": "user_id", "book_id_int": "item_id", "date_updated": "datetime"})

display(books_df.head(10))
display(interactions_df.head(10))

  0%|          | 0/6995891 [00:00<?, ?it/s]

  0%|          | 0/6995891 [00:00<?, ?it/s]

  0%|          | 0/89411 [00:00<?, ?it/s]

Unnamed: 0,book_id,title,book_id_int
0,25742454,The Switchblade Mamma,88842
1,30128855,Cruelle,52988
2,13571772,Captain America: Winter Soldier (The Ultimate ...,73011
3,35452242,Bounty Hunter 4/3: My Life in Combat from Mari...,60509
4,707611,"Superman Archives, Vol. 2",20130
5,2250580,"A.I. Revolution, Vol. 1",47138
6,27036536,"War Stories, Volume 3",30014
7,27036537,"Crossed, Volume 15",50287
8,27036538,"Crossed + One Hundred, Volume 2 (Crossed +100 #2)",50415
9,27036539,"War Stories, Volume 4",50006


Unnamed: 0,user_id,item_id,datetime,rating
0,0,0,2013-06-21 10:23:44-07:00,4
1,0,1,2013-06-21 10:24:05-07:00,4
2,0,2,2013-06-21 10:24:31-07:00,4
3,0,3,2013-06-21 10:25:05-07:00,4
4,0,4,2013-06-21 10:25:13-07:00,3
5,0,5,2013-06-21 10:25:29-07:00,4
6,0,6,2013-06-21 10:25:37-07:00,4
7,0,7,2013-06-21 10:26:36-07:00,5
8,0,8,2013-06-21 10:26:43-07:00,4
9,0,9,2013-06-21 10:26:52-07:00,4


In [104]:
DataType = TypeVar("DataType", pd.DataFrame, sparse.csr_matrix)

def generate_random_split(data: DataType, seed: int = None) -> tuple[DataType, DataType]:
    return train_test_split(data, test_size=0.2, random_state=seed)


def create_sparse_matrix(dataframe: pd.DataFrame, shape: tuple[int, int] = None) -> sparse.csr_matrix:
    return sparse.csr_matrix((dataframe["rating"], (dataframe["user_id"], dataframe["item_id"])), shape=shape,
                             dtype=np.int8)

def convert_sparse_matrix_to_sparse_tensor(X) -> tf.SparseTensor:
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensor(indices, np.float32(coo.data), coo.shape)



In [105]:
shape = (interactions_df["user_id"].max() + 1, interactions_df["item_id"].max() + 1)
interaction_matrix = create_sparse_matrix(interactions_df,shape)
normalized_matrix = interaction_matrix
normalized_matrix.data = (interaction_matrix.data/5.0)
train, test = generate_random_split(normalized_matrix, seed=1)


In [117]:
tensor_train = convert_sparse_matrix_to_sparse_tensor(train)
tensor_test = convert_sparse_matrix_to_sparse_tensor(test)
tensor_train.shape

TensorShape([118750, 89276])

# Restricted Boltzmann Machine

In [119]:
hiddenUnits = 20
visibleUnits = len(books_df)
print(len(books_df))
# Number of unique books
vb = tf.Variable(tf.zeros([visibleUnits]),tf.float32)

hb = tf.Variable(tf.zeros([hiddenUnits]), tf.float32)

W = tf.Variable(tf.zeros([visibleUnits, hiddenUnits]), tf.float32)

89411


We then move on to creating the visible and hidden layer units and setting their activation functions. In this case, we will be using the <code>tf.sigmoid</code> and <code>tf.relu</code> functions as nonlinear activations since it is commonly used in RBM's.

In [121]:
v0 = tf.zeros([visibleUnits], tf.float32)
print(len([v0]))
# Small test to check if the matrix prod works
tf.matmul([v0], W)

1


<tf.Tensor: shape=(1, 20), dtype=float32, numpy=
array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)>

In [124]:
# Input processing: Defining a function to return only the generated hidden states

def hidden_layer(v0_state, W, hb):
    # Probabilities of the hidden units
    h0_prob = tf.nn.sigmoid(tf.matmul([v0_state], W)+ hb)

    #Sample h  given x
    return tf.nn.relu(tf.sign(h0_prob - tf.random.uniform(tf.shape(h0_prob))))

# Print output of zeros input
h0 = hidden_layer(v0,W,hb)
print("First 15 hidden states: ", h0[0][0:15])

def reconstructed_output(h0_state, W, vb):
    v1_prob = tf.nn.sigmoid(tf.matmul(h0_state,tf.transpose(W))+vb)
    return tf.nn.relu(tf.sign(v1_prob - tf.random.uniform(tf.shape(v1_prob))))[0]

v1 = reconstructed_output(h0, W, vb)
print("Hidden state shape: ", h0.shape)
print("v0 state shape: ", v0.shape)
print("v1 state shape:", v1.shape)

First 15 hidden states:  tf.Tensor([0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1.], shape=(15,), dtype=float32)
Hidden state shape:  (1, 20)
v0 state shape:  (89411,)
v1 state shape: (89411,)


In [110]:
# Error function: Mean absolute

def error(v0_state, v1_state):
    return tf.reduce_mean(tf.square(v0_state - v1_state))

err = error(v0,v1)
print("error", err.numpy())

error 0.50009507


# Training the RBM

In [111]:
# Train the RBM
epochs = 5
batchsize = 500
errors = []
weights = []
K=1
alpha = 0.1 # Learning rate

In [112]:
print(tensor_train.shape)
print(tensor_test.shape)

(118750, 89276)
(29688, 89276)


In [113]:
train_ds = tf.data.Dataset.from_tensor_slices(tensor_train).batch(batchsize)
train_ds

<BatchDataset shapes: (None, 89276), types: tf.float32>

In [None]:
v0_state=v0
for epoch in range(epochs):
    batch_number = 0
    for batch_x in train_ds:

        for i_sample in range(batch_x.shape[0]):
            for k in range(K):

                v0_state = batch_x.values
                print(v0_state)
                print(v0_state.shape)
                break
                h0_state = hidden_layer(v0_state, W, hb)
                v1_state = reconstructed_output(h0_state, W, vb)
                h1_state = hidden_layer(v1_state, W, hb)

                delta_W = tf.matmul(tf.transpose([v0_state]), h0_state) - tf.matmul(tf.transpose([v1_state]), h1_state)
                W = W + alpha * delta_W

                vb = vb + alpha * tf.reduce_mean(v0_state - v1_state, 0)
                hb = hb + alpha * tf.reduce_mean(h0_state - h1_state, 0)

                v0_state = v1_state

            if i_sample == len(batch_x.values)-1:
                err = error(batch_x[i_sample], v1_state)
                errors.append(err)
                weights.append(W)
                print ( 'Epoch: %d' % (epoch + 1),
                        "batch #: %i " % batch_number, "of %i" % (tensor_train.shape[0]/batchsize),
                        "sample #: %i" % i_sample,
                        'reconstruction error: %f' % err)
        batch_number += 1




plt.plot(errors)
plt.ylabel('Error')
plt.xlabel('Epoch')
plt.show()

tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0.  0.  0. ], shape=(23891,), dtype=float32)
(23891,)
tf.Tensor([1.  1.  0.8 ... 0