In [1]:
# Install the pinned dependencies. `%pip` targets the environment of the running kernel,
# whereas `!pip` runs whichever `pip` happens to be first on the shell PATH.
%pip install -r requirements.txt

Collecting pandas==1.3.4
  Downloading pandas-1.3.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 8.6 MB/s eta 0:00:01
[?25hCollecting scikit-learn==1.0.1
  Downloading scikit_learn-1.0.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 11.3 MB/s eta 0:00:01
Collecting tqdm==4.62.3
  Downloading tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 5.2 MB/s  eta 0:00:01
Collecting pytz>=2017.3
  Downloading pytz-2021.3-py2.py3-none-any.whl (503 kB)
[K     |████████████████████████████████| 503 kB 10.2 MB/s eta 0:00:01
[?25hCollecting scipy>=1.1.0
  Downloading scipy-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.3 MB)
[K     |████████████████████████████████| 39.3 MB 3.6 MB/s eta 0:00:013
[?25hCollecting joblib>=0.11
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import tensorflow as tf
import gzip
from IPython.display import display

from json import loads
from tqdm.notebook import tqdm, trange
from typing import Generator, Iterable, Union, TypeVar
from scipy import sparse
from sklearn.model_selection import train_test_split, KFold


tqdm.pandas()

# Data loading

In [3]:
def parse_json(filename: str, read_max: int = None, attributes: Iterable[str] = None) -> pd.DataFrame:
    """
    Reads a gzip-compressed file line by line, parsing each line as json.

    :param filename: The path to the gzip-compressed datafile.
    :param read_max: The maximum number of lines to read from the datafile; `None` reads the whole file.
    :param attributes: The attributes of each JSON object that should be extracted; other attributes are ignored.
        `None` keeps every attribute.
    :return: A dataframe with one row per parsed line.
    :raises KeyError: If a requested attribute is missing from one of the parsed objects.
    """
    # Materialise once so that a one-shot iterable (e.g. a generator) can be reused for every line.
    wanted = None if attributes is None else tuple(attributes)
    data = []
    # The context manager closes the file handle, also when parsing fails halfway through.
    with gzip.open(filename, "r") as file:
        for index, line in enumerate(tqdm(file)):
            if index == read_max:
                break
            entry = loads(line)
            if wanted is not None:
                entry = {key: entry[key] for key in wanted}
            data.append(entry)
    return pd.DataFrame.from_dict(data)

In [4]:
# Directory holding the gzip-compressed Goodreads dumps, and the three files inside it.
data_path = "data/"
books = data_path + "goodreads_books_comics_graphic.json.gz"
interactions = data_path + "goodreads_interactions_comics_graphic.json.gz"
reviews = data_path + "goodreads_reviews_comics_graphic.json.gz"

# Maximum number of lines to parse; `None` reads the complete file.
n = None

# Only the three columns needed to build the user-item rating matrix are kept.
interactions_df = parse_json(
    filename=interactions,
    read_max=n,
    attributes=("user_id", "book_id", "rating"),
)

0it [00:00, ?it/s]

In [5]:
# The two containers a dataset can be passed around in: a dataframe or a sparse matrix.
DataType = TypeVar("DataType", pd.DataFrame, sparse.csr_matrix)


def generate_random_split(data: DataType, seed: int = None) -> tuple:
    """
    Randomly partitions `data` into a train part and a test part.

    :param data: The dataset to split.
    :param seed: Forwarded to `train_test_split` as `random_state`.
    :return: The result of `train_test_split` called with `test_size=0.2`.
    """
    split = train_test_split(data, random_state=seed, test_size=0.2)
    return split


def split_k_folds(data: sparse.csr_matrix, nr_folds: int = 5) -> Generator[tuple, None, None]:
    """
    Lazily produces the train-test splits of a K-fold cross validation.

    :param data: The dataset handed to `KFold.split`.
    :param nr_folds: The number of folds (K).
    :return: A generator yielding each item produced by `KFold.split`, in order.
    """
    splitter = KFold(n_splits=nr_folds)
    for fold in splitter.split(data):
        yield fold


def create_sparse_matrix(dataframe: pd.DataFrame, shape: tuple = None) -> sparse.csr_matrix:
    """
    Builds a sparse matrix from the "user_id", "item_id" and "rating" columns of a dataframe.

    :param dataframe: Supplies the row indices ("user_id"), column indices ("item_id") and values ("rating").
    :param shape: The shape of the resulting matrix; forwarded to `csr_matrix`.
    :return: A CSR matrix whose values are stored as `np.int8`.
    """
    ratings = dataframe["rating"]
    coordinates = (dataframe["user_id"], dataframe["item_id"])
    return sparse.csr_matrix((ratings, coordinates), dtype=np.int8, shape=shape)


def convert_sparse_matrix_to_sparse_tensor(X) -> tf.SparseTensor:
    """
    Converts a scipy sparse matrix into a `tf.SparseTensor` holding the same entries as float32 values.

    :param X: A scipy sparse matrix (anything offering `tocoo()`).
    :return: A sparse tensor with one index pair per stored entry of `X` and the dense shape of `X`.
    """
    coo = X.tocoo()
    # One (row, col) pair per stored entry. A plain ndarray replaces `np.mat`, which is deprecated
    # and no longer available in NumPy 2.0; the indices are cast to int64 before being handed over.
    indices = np.stack([coo.row, coo.col], axis=1).astype(np.int64)
    return tf.SparseTensor(indices, coo.data.astype(np.float32), coo.shape)

In [6]:
# Rescale every rating r to (r + 1) / 6 (assumes raw ratings lie in 0..5, giving values in (0, 1] - TODO confirm).
# Vectorised arithmetic replaces the per-element Python lambda.
# NOTE: this overwrites its own input, so the cell must run exactly once per loaded dataframe.
interactions_df["rating"] = (interactions_df["rating"] + 1) / 6.0

# Restricted Boltzmann Machine

We then move on to creating the visible and hidden layer units and setting their activation functions. In this case, we
will be using the <code>tf.sigmoid</code> and <code>tf.relu</code> functions as nonlinear activations since they are
commonly used in RBMs.

In [7]:
# Input processing: sample the hidden states that a batch of visible states gives rise to.

def hidden_layer(v0_state, W, hb):
    """
    Samples binary hidden states from the given visible states.

    :param v0_state: The visible states; multiplied with `W`.
    :param W: The weights between the visible and the hidden layer.
    :param hb: The biases of the hidden layer.
    :return: A tensor of zeros and ones with the shape of `v0_state @ W`.
    """
    pre_activation = tf.matmul(v0_state, W) + hb
    activation_prob = tf.nn.sigmoid(pre_activation)
    noise = tf.random.uniform(tf.shape(activation_prob))
    # sign(p - u) is +1 where u < p and -1 where u > p; relu clips the -1 to 0.
    return tf.nn.relu(tf.sign(activation_prob - noise))


def reconstructed_output(h0_state, W, vb):
    """
    Samples binary visible states from the given hidden states.

    :param h0_state: The hidden states; multiplied with the transpose of `W`.
    :param W: The weights between the visible and the hidden layer.
    :param vb: The biases of the visible layer.
    :return: A tensor of zeros and ones with the shape of `h0_state @ W^T`.
    """
    pre_activation = tf.matmul(h0_state, tf.transpose(W)) + vb
    activation_prob = tf.nn.sigmoid(pre_activation)
    noise = tf.random.uniform(tf.shape(activation_prob))
    # sign(p - u) is +1 where u < p and -1 where u > p; relu clips the -1 to 0.
    return tf.nn.relu(tf.sign(activation_prob - noise))


def gibbs_sampling(v0_state: tf.Tensor, weights: tf.Variable, bias_visible: tf.Variable, bias_hidden: tf.Variable,
                   nr_iterations: int = 1) -> tuple:
    """
    Performs gibbs sampling starting with the given input vector.

    :param v0_state: The data vector(s) the chain starts from.
    :param weights: The weights between the visible and the hidden layer.
    :param bias_visible: The biases of the visible layer.
    :param bias_hidden: The biases of the hidden layer.
    :param nr_iterations: The number of visible/hidden resampling steps; with 0 the input is returned as `vk_state`.
    :return: A tuple consisting of (in order):
        - The hidden layer `h0_state`, sampled from the given data vector `v0_state`.
        - The final visible layer `vk_state`.
        - The final hidden layer `hk_state`.
    """
    # Use the bias that was passed in; reading a notebook-level `hb` here would sample from stale parameters.
    h0_state = hidden_layer(v0_state, weights, bias_hidden)
    vk_state = v0_state
    hk_state = h0_state
    for _ in range(nr_iterations):
        vk_state = reconstructed_output(hk_state, weights, bias_visible)
        hk_state = hidden_layer(vk_state, weights, bias_hidden)
    return h0_state, vk_state, hk_state

def get_k_recommendations(user_input: tf.Tensor, W: tf.Tensor, hb: tf.Tensor, vb: tf.Tensor,
                          k: int) -> np.ndarray:
    """
    Recommends the `k` highest scoring items for every user in the batch.

    The scores are the visible activation probabilities after one deterministic visible -> hidden -> visible pass.
    Items with a positive value in `user_input` get score 0, which pushes them to the bottom of the ranking.

    :param user_input: One row of item values per user.
    :param W: The weights between the visible and the hidden layer.
    :param hb: The biases of the hidden layer.
    :param vb: The biases of the visible layer.
    :param k: The number of items to recommend per user; at most the number of items.
    :return: An integer array with one row per user holding `k` item indices, best scoring item first.
    """
    user_input = tf.convert_to_tensor(user_input, "float32")
    hh0 = tf.nn.sigmoid(tf.matmul(user_input, W) + hb)
    vv1 = tf.nn.sigmoid(tf.matmul(hh0, tf.transpose(W)) + vb)
    scores = vv1.numpy()
    scores[user_input.numpy() > 0] = 0
    # `argpartition` needs kth < number of items; clamping keeps k == number of items working.
    kth = min(k, scores.shape[1] - 1)
    top_k = np.argpartition(-scores, kth, axis=1)[:, :k]
    # `argpartition` returns the top k in arbitrary order, but rank-aware metrics such as NDCG
    # depend on the position of an item, so order each row by descending score.
    top_k_scores = np.take_along_axis(scores, top_k, axis=1)
    order = np.argsort(-top_k_scores, axis=1, kind="stable")
    return np.take_along_axis(top_k, order, axis=1)

# Conditional RBM

In [8]:
def hidden_layer_conditional(v0_state, W, hb, r, D):
    """
    Samples binary hidden states from the visible states and the conditioning input `r`.

    :param v0_state: The visible states; multiplied with `W`.
    :param W: The weights between the visible and the hidden layer.
    :param hb: The biases of the hidden layer.
    :param r: The conditioning input; multiplied with `D`.
    :param D: The weights between the conditioning input and the hidden layer.
    :return: A tensor of zeros and ones with the shape of `v0_state @ W`.
    """
    pre_activation = tf.matmul(v0_state, W) + tf.matmul(r, D) + hb
    activation_prob = tf.nn.sigmoid(pre_activation)
    noise = tf.random.uniform(tf.shape(activation_prob))
    # sign(p - u) is +1 where u < p and -1 where u > p; relu clips the -1 to 0.
    return tf.nn.relu(tf.sign(activation_prob - noise))


def reconstructed_output_conditional(h0_state, W, vb):
    """
    Samples binary visible states from the given hidden states.

    :param h0_state: The hidden states; multiplied with the transpose of `W`.
    :param W: The weights between the visible and the hidden layer.
    :param vb: The biases of the visible layer.
    :return: A tensor of zeros and ones with the shape of `h0_state @ W^T`.
    """
    pre_activation = tf.matmul(h0_state, tf.transpose(W)) + vb
    activation_prob = tf.nn.sigmoid(pre_activation)
    noise = tf.random.uniform(tf.shape(activation_prob))
    # sign(p - u) is +1 where u < p and -1 where u > p; relu clips the -1 to 0.
    return tf.nn.relu(tf.sign(activation_prob - noise))


def gibbs_sampling_conditional(v0_state: tf.Tensor, weights: tf.Variable, r: tf.Tensor, cond_weights: tf.Variable,
                               bias_visible: tf.Variable, bias_hidden: tf.Variable,
                               nr_iterations: int = 1) -> tuple:
    """
    Performs gibbs sampling for the conditional RBM, starting with the given input vector.

    :param v0_state: The data vector(s) the chain starts from.
    :param weights: The weights between the visible and the hidden layer.
    :param r: The conditioning input that is fed to the hidden layer in every step.
    :param cond_weights: The weights between the conditioning input and the hidden layer.
    :param bias_visible: The biases of the visible layer.
    :param bias_hidden: The biases of the hidden layer.
    :param nr_iterations: The number of visible/hidden resampling steps; with 0 the input is returned as `vk_state`.
    :return: A tuple consisting of (in order):
        - The hidden layer `h0_state`, sampled from the given data vector `v0_state`.
        - The final visible layer `vk_state`.
        - The final hidden layer `hk_state`.
    """
    # Use the bias that was passed in; reading a notebook-level `hb` here would sample from stale parameters.
    h0_state = hidden_layer_conditional(v0_state, weights, bias_hidden, r, cond_weights)
    vk_state = v0_state
    hk_state = h0_state
    for _ in range(nr_iterations):
        vk_state = reconstructed_output_conditional(hk_state, weights, bias_visible)
        hk_state = hidden_layer_conditional(vk_state, weights, bias_hidden, r, cond_weights)
    return h0_state, vk_state, hk_state

def get_k_recommendations_conditional(user_input: tf.Tensor, r: tf.Tensor, W: tf.Tensor, D: tf.Tensor, hb: tf.Tensor,
                                      vb: tf.Tensor, k: int) -> np.ndarray:
    """
    Recommends the `k` highest scoring items for every user in the batch, using the conditional RBM.

    The scores are the visible activation probabilities after one deterministic visible -> hidden -> visible pass
    in which the hidden layer additionally receives `r @ D`. Items with a positive value in `user_input` get
    score 0, which pushes them to the bottom of the ranking.

    :param user_input: One row of item values per user.
    :param r: The conditioning input, one row per user.
    :param W: The weights between the visible and the hidden layer.
    :param D: The weights between the conditioning input and the hidden layer.
    :param hb: The biases of the hidden layer.
    :param vb: The biases of the visible layer.
    :param k: The number of items to recommend per user; at most the number of items.
    :return: An integer array with one row per user holding `k` item indices, best scoring item first.
    """
    user_input = tf.convert_to_tensor(user_input, "float32")
    hh0 = tf.nn.sigmoid(tf.matmul(user_input, W) + tf.matmul(r, D) + hb)
    vv1 = tf.nn.sigmoid(tf.matmul(hh0, tf.transpose(W)) + vb)
    scores = vv1.numpy()
    scores[user_input.numpy() > 0] = 0
    # `argpartition` needs kth < number of items; clamping keeps k == number of items working.
    kth = min(k, scores.shape[1] - 1)
    top_k = np.argpartition(-scores, kth, axis=1)[:, :k]
    # `argpartition` returns the top k in arbitrary order, but rank-aware metrics such as NDCG
    # depend on the position of an item, so order each row by descending score.
    top_k_scores = np.take_along_axis(scores, top_k, axis=1)
    order = np.argsort(-top_k_scores, axis=1, kind="stable")
    return np.take_along_axis(top_k, order, axis=1)

def get_r(data: tf.SparseTensor) -> tf.Tensor:
    """
    Builds a dense indicator of the entries that are stored in the sparse tensor `data`.

    :param data: The sparse tensor whose index list marks the stored entries.
    :return: A dense float32 tensor of the shape of `data` with a 1 added for every listed index and 0 elsewhere.
    """
    rows_and_cols = data.indices.numpy().T
    ones = np.ones(len(data.indices))
    indicator = sparse.coo_matrix((ones, rows_and_cols), shape=data.shape)
    dense_indicator = indicator.toarray()
    return tf.constant(dense_indicator, tf.float32)

In [9]:
def error(v0_state, v1_state):
    """
    Computes the reconstruction error of one batch: the mean squared difference of every row, summed over the rows.

    The value is meant to be accumulated over the batches of an epoch, so that the total RMSE can be derived
    from the accumulated sum once the epoch has finished.
    """
    squared_difference = tf.square(v0_state - v1_state)
    mse_per_row = tf.reduce_mean(squared_difference, 1)
    return tf.reduce_sum(mse_per_row)

# Evaluation metrics

In [10]:
def array_to_sparse_matrix(recommendations: np.ndarray, matrix_shape: tuple = None) -> sparse.csr_matrix:
    """
    Converts an array of recommended item ids into a sparse user-item indicator matrix.

    :param recommendations: One row per user, holding the ids of the items recommended to that user.
    :param matrix_shape: The (users, items) shape of the result. When omitted, the notebook-level variable
        `shape` is used, which is what this function always did before the parameter existed.
    :return: A CSR matrix with a 1 for every (user, recommended item) pair; an item that is listed more than
        once for the same user is counted more than once.
    """
    if matrix_shape is None:
        # Legacy fallback on hidden notebook state; prefer passing the shape explicitly.
        matrix_shape = shape
    # Derive the user count from the input itself instead of from the global shape.
    nr_users, nr_recommended = recommendations.shape
    user_ids = np.repeat(np.arange(nr_users), nr_recommended)
    item_ids = recommendations.flatten()
    scores = np.ones(item_ids.size)
    return sparse.csr_matrix((scores, (user_ids, item_ids)), shape=matrix_shape)


def sparse_invert_nonzero(a: sparse.csr_matrix) -> sparse.csr_matrix:
    """
    Returns a copy of `a` in which every stored value x is replaced by 1 / x.

    Entries that are not stored stay absent, and `a` itself is left untouched.
    """
    reciprocal = a.copy()
    reciprocal.data = np.true_divide(1, reciprocal.data)
    return reciprocal


def sparse_divide_nonzero(a: sparse.csr_matrix, b: sparse.csr_matrix) -> sparse.csr_matrix:
    """
    Divides `a` element-wise by `b`, implemented as a multiplication with the reciprocals of the values stored in `b`.

    Positions that are not stored in `b` contribute a factor of zero instead of a division by zero.
    """
    reciprocal_of_b = sparse_invert_nonzero(b)
    return a.multiply(reciprocal_of_b)

In [11]:
def compute_recall(true: sparse.csr_matrix, predicted: sparse.csr_matrix) -> float:
    """
    Computes the recall of the recommendations, averaged over all users.

    The recall of a user is the number of recommended items that are relevant, divided by the number of relevant
    items of that user. An item is relevant when its entry in `true` is non-zero; the magnitude of the entry is
    deliberately ignored, so non-binary ratings cannot distort (or push above 1) the result.

    :param true: The user-item matrix of held-out interactions.
    :param predicted: The user-item matrix of recommendations (non-zero = recommended), same shape as `true`.
    :return: The mean recall. Users without any relevant item count as recall 0; without users the result is 0.
    """
    relevant = (sparse.csr_matrix(true) != 0).astype(np.float64)
    recommended = (sparse.csr_matrix(predicted) != 0).astype(np.float64)

    hits_per_user = np.asarray(recommended.multiply(relevant).sum(axis=1)).ravel()
    relevant_per_user = np.asarray(relevant.sum(axis=1)).ravel()
    if hits_per_user.size == 0:
        return 0.0

    # Only divide where a user has relevant items; the other users keep the preset recall of 0.
    recall_per_user = np.divide(hits_per_user, relevant_per_user,
                                out=np.zeros_like(hits_per_user), where=relevant_per_user > 0)
    return float(recall_per_user.mean())


def get_ndcg(top_k: np.ndarray, actual_scores: np.ndarray):
    """
    Computes the normalised discounted cumulative gain (NDCG) of one ranked recommendation list.

    :param top_k: The recommended item ids, best recommendation first.
    :param actual_scores: The true relevance of every item, indexed by item id.
    :return: DCG of the recommended list divided by the DCG of the best possible list of the same length
        (IDCG); 0 when none of the recommended items has any relevance.
    """
    top_k = np.asarray(top_k)
    actual_scores = np.asarray(actual_scores)

    gains = actual_scores[top_k]
    discounts = np.log2(np.arange(2, len(top_k) + 2))
    dcg = np.divide(gains, discounts).sum()
    if dcg == 0:
        return 0.0

    # The ideal list holds the most relevant items overall, not merely a reordering of the recommended ones;
    # normalising by the latter would reward a single hit as if the ranking were perfect.
    ideal_length = min(len(top_k), actual_scores.size)
    ideal_gains = -np.sort(np.partition(-actual_scores, ideal_length - 1)[:ideal_length])
    idcg = np.divide(ideal_gains, discounts[:ideal_length]).sum()
    return dcg / idcg


def compute_ndcg(true: sparse.csr_matrix, recommendations: np.ndarray) -> float:
    """
    Averages the per-user NDCG over all users.

    :param true: The user-item matrix of true scores; row `u` is densified and scored against the list of user `u`.
    :param recommendations: One row of recommended item ids per user.
    :return: The sum of the `get_ndcg` results divided by the number of rows of `true`.
    """
    nr_users = true.shape[0]
    ndcg_per_user = (
        get_ndcg(recommendations[user_id], true[user_id].toarray()[0])
        for user_id in trange(nr_users, leave=False)
    )
    return sum(ndcg_per_user) / nr_users

# Training the RBM

In [12]:
def train_rbm(train: tf.data.Dataset, validation: None, weights: tf.Variable, vb: tf.Variable,
              hb: tf.Variable, cond_weights: tf.Variable = None, nr_epochs: int = None,
              learning_rate: float = None, nr_gibbs_iterations: int = None) -> tuple:
    """
    The training loop of the (conditional) RBM, using contrastive divergence.

    :param train: The training dataset; every element is one sparse batch of visible states.
    :param validation: The validation dataset. Currently not evaluated, see `validation_errors` below.
    :param weights: The weights of the RBM.
    :param vb: The biases of the visible layer of the RBM.
    :param hb: The biases of the hidden layer of the RBM.
    :param cond_weights: The interaction vector weights of the conditional RBM; `None` trains a plain RBM.
    :param nr_epochs: The number of passes over `train`; defaults to the notebook-level `epochs`.
    :param learning_rate: The step size of every update; defaults to the notebook-level `alpha`.
    :param nr_gibbs_iterations: The number of gibbs sampling steps per batch; defaults to the notebook-level
        `gibbs_sampling_iterations`.

    :return: A tuple `(weights, cond_weights, vb, hb, train_errors, validation_errors)`. `train_errors` holds
        the reconstruction RMSE of every epoch; `validation_errors` is returned empty because no validation
        pass is implemented.
    """
    # Fall back on the notebook-level hyperparameters so that existing calls keep working unchanged.
    nr_epochs = epochs if nr_epochs is None else nr_epochs
    learning_rate = alpha if learning_rate is None else learning_rate
    nr_gibbs_iterations = gibbs_sampling_iterations if nr_gibbs_iterations is None else nr_gibbs_iterations

    train_errors = []
    validation_errors = []

    for _ in trange(nr_epochs, leave=False):
        squared_error_sum = 0.0
        nr_samples = 0
        for batch in tqdm(train, total=len(train), leave=False):
            v0_state = tf.sparse.to_dense(batch)
            batch_len = v0_state.shape[0]

            if cond_weights is None:
                h0_state, vk_state, hk_state = gibbs_sampling(v0_state, weights, vb, hb, nr_gibbs_iterations)
            else:
                r = get_r(batch)
                h0_state, vk_state, hk_state = gibbs_sampling_conditional(
                    v0_state, weights, r, cond_weights, vb, hb, nr_gibbs_iterations)

            # The states are used as the 2-D tensors they already are: squeezing them would drop the batch
            # axis of a final batch that holds a single user and break the products below.
            hidden_difference = h0_state - hk_state
            positive_phase = tf.matmul(tf.transpose(v0_state), h0_state)
            negative_phase = tf.matmul(tf.transpose(vk_state), hk_state)

            weights = weights + learning_rate * (positive_phase - negative_phase) / batch_len
            vb = vb + learning_rate * tf.reduce_mean(v0_state - vk_state, 0)
            hb = hb + learning_rate * tf.reduce_mean(hidden_difference, 0)
            if cond_weights is not None:
                # Averaged over the batch, like every other parameter update in this loop.
                cond_weights = cond_weights + learning_rate * tf.matmul(
                    tf.transpose(r), hidden_difference) / batch_len

            # `error` returns the per-row mean squared error summed over the rows of the batch.
            squared_error_sum += float(error(v0_state, vk_state))
            nr_samples += batch_len

        train_errors.append((squared_error_sum / nr_samples) ** 0.5 if nr_samples else 0.0)

    return weights, cond_weights, vb, hb, train_errors, validation_errors

In [13]:
def plot_errors(train_errors: list, validation_errors: list, file_name: str = None) -> None:
    """
    Plots the training and validation error per epoch.

    :param train_errors: The training error of every epoch, first epoch first.
    :param validation_errors: The validation error of every epoch; the curve is skipped when this is empty.
    :param file_name: When given, the figure is also saved under this path.
    """
    # The x axis follows the data itself, so the plot does not depend on a notebook-level `epochs`.
    plt.plot(range(1, len(train_errors) + 1), train_errors, label="Train")
    if len(validation_errors) > 0:
        plt.plot(range(1, len(validation_errors) + 1), validation_errors, label="Validation")
    plt.ylabel("Error")
    plt.xlabel("Epoch")
    plt.legend()
    if file_name is not None:
        # Save under the requested name instead of a fixed "rmse.png".
        plt.savefig(file_name)
    plt.show()

# Experiment

In [14]:
# Move the raw identifiers to "old_user" / "old_item" and store the item identifier as a 64-bit integer.
# (Presumably this matches the key columns of the per-fold CSV files they are merged with - TODO confirm.)
interactions_df = (
    interactions_df
    .rename(columns={"user_id": "old_user", "book_id": "old_item"})
    .astype({"old_item": np.int64})
)

In [54]:
# --- Training hyperparameters ---
alpha = 0.0002                 # Learning rate
epochs = 5                     # Passes over the training data
batch_size = 100               # Users per batch
gibbs_sampling_iterations = 1  # Gibbs sampling steps per batch

# --- Model size ---
hiddenUnits = 100
# `visibleUnits` is not fixed here: it is derived per fold from the shape of the rating matrix.

# --- Evaluation ---
nr_recommendations = 10

# Seeded generator used to draw the initial model parameters.
rng = tf.random.Generator.from_seed(1)

In [55]:
def generate_matrix(fold_df: pd.DataFrame, interactions_df: pd.DataFrame, shape: tuple) -> sparse.csr_matrix:
    """
    Builds the sparse user-item rating matrix of one fold.

    The ratings are looked up by left-joining the fold onto the interactions on ("old_user", "old_item");
    the matrix is then indexed by the "user_id" (rows) and "item_id" (columns) of the joined frame.

    :param fold_df: The interactions of the fold; joined on "old_user" and "old_item".
    :param interactions_df: The interactions carrying the "rating" of every ("old_user", "old_item") pair.
    :param shape: The (users, items) shape of the resulting matrix.
    :return: A CSR matrix holding the "rating" values of the joined frame.
    """
    joined = fold_df.merge(interactions_df, how="left", on=["old_user", "old_item"])
    coordinates = (joined["user_id"], joined["item_id"])
    return sparse.csr_matrix((joined["rating"], coordinates), shape=shape)

def recommend_in_batches(train_matrix: sparse.csr_matrix, W, hb, vb, k: int) -> np.ndarray:
    """
    Computes `k` recommendations for every user, densifying only `batch_size` users at a time.

    :param train_matrix: The user-item training matrix; one row per user.
    :param W: The weights of the trained RBM.
    :param hb: The biases of the hidden layer.
    :param vb: The biases of the visible layer.
    :param k: The number of items to recommend per user.
    :return: An integer array with one row of `k` item ids per user.
    """
    nr_users = train_matrix.shape[0]
    result = np.empty((nr_users, k), dtype=np.int_)
    for batch_start in trange(0, nr_users, batch_size, leave=False):
        batch_end = batch_start + batch_size
        batch = train_matrix[batch_start: batch_end]
        result[batch_start: batch_end] = get_k_recommendations(batch.toarray(), W, hb, vb, k)
    return result


# One entry per fold: (top-5 recommendations, top-10 recommendations, matrix shape).
all_recommendations = []

for fold_nr in trange(5, leave=False):

    train_df = pd.read_csv(f"data/train_fold{fold_nr}.csv")
    test_df = pd.read_csv(f"data/test_fold{fold_nr}.csv")

    # The matrix has to cover every user and item id that occurs in either part of the fold.
    shape = (max(train_df["user_id"].max(), test_df["user_id"].max()) + 1,
             max(train_df["item_id"].max(), test_df["item_id"].max()) + 1)
    visibleUnits = shape[1]

    train = generate_matrix(train_df, interactions_df, shape)
    test = generate_matrix(test_df, interactions_df, shape)

    tensor_train = convert_sparse_matrix_to_sparse_tensor(train)
    train_ds = tf.data.Dataset.from_tensor_slices(tensor_train).batch(batch_size)

    # Fresh, randomly initialised parameters for every fold.
    W = tf.Variable(rng.normal([visibleUnits, hiddenUnits], 0, 0.1), tf.float32)
    vb = tf.Variable(rng.normal([visibleUnits], 0, 0.1), tf.float32)
    hb = tf.Variable(rng.normal([hiddenUnits], 0, 0.1), tf.float32)

    W, _, vb, hb, _, _ = train_rbm(train_ds, None, W, vb, hb)

    recommendations_5 = recommend_in_batches(train, W, hb, vb, 5)
    recommendations_10 = recommend_in_batches(train, W, hb, vb, 10)
    all_recommendations.append((recommendations_5, recommendations_10, shape))

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1480 [00:00<?, ?it/s]

  0%|          | 0/1480 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# One dictionary of metric values per fold.
metrics = []

# NOTE: the loop variable has to stay named `shape`, because `array_to_sparse_matrix` reads the
# notebook-level `shape` when it is called with the recommendations only.
for fold_nr, (recommendations_5, recommendations_10, shape) in enumerate(all_recommendations):
    fold_metrics = {}
    metrics.append(fold_metrics)

    # Number of distinct items that were recommended at all, followed by those item ids.
    print(len(np.unique(recommendations_10)))
    print(np.unique(recommendations_10))

    test_df = pd.read_csv(f"data/test_fold{fold_nr}.csv")
    test = generate_matrix(test_df, interactions_df, shape)

    for k, recommendations in ((5, recommendations_5), (10, recommendations_10)):
        recall = compute_recall(test, array_to_sparse_matrix(recommendations))
        fold_metrics[f"recall@{k}"] = recall
        tqdm.write(f"Recall @ {k} = {recall}")

        ndcg = compute_ndcg(test, recommendations)
        # The misspelled "ncdg" key is kept so that code reading `metrics` keeps working;
        # only the printed label is corrected to NDCG.
        fold_metrics[f"ncdg@{k}"] = ndcg
        tqdm.write(f"NDCG @ {k} = {ndcg}")