In [1]:
import implicit
import os
import wget
import gzip
import random
import scipy
import tensorflow as tf
import torch
from torch import nn
from collections import defaultdict
from implicit import bpr
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split

implicit.__version__  # implicit 0.5.2 will report errors, we use 0.4.8 instead


'0.4.8'

Data is available at http://cseweb.ucsd.edu/~jmcauley/pml/data/. 
- Download and save to your own directory.
- Or, run following script to save it into `Chapter_5/data` folder automatically.

In [2]:
filenames = [
    'goodreads_fantasy.tsv',
    'goodreads_reviews_fantasy_paranormal.json.gz',
    'goodreads_reviews_comics_graphic.json.gz'
]

dataDir = './data'
url = 'http://jmcauley.ucsd.edu/pml_data'

if not os.path.exists(dataDir):
    os.makedirs(dataDir)
for filename in filenames:
    wget.download(os.path.join(url, filename), out=dataDir)
print("Done!")


Done!


# Latent factor model (`Surprise`)

Using the library's inbuilt data reader, extract tsv-formatted data

In [3]:
reader = Reader(line_format='user item rating', sep='\t')
data = Dataset.load_from_file(os.path.join(
    dataDir, "goodreads_fantasy.tsv"), reader=reader)


Standard latent-factor model

In [4]:
model = SVD()


Inbuilt functions to split into training and test fractions

In [5]:
trainset, testset = train_test_split(data, test_size=.25)


Fit the model and extract predictions

In [6]:
model.fit(trainset)
predictions = model.test(testset)


Estimate for a single (test) rating

In [7]:
predictions[0].est


4.383635103743483

MSE for model predictions (test set)

In [8]:
sse = 0
for p in predictions:
    sse += (p.r_ui - p.est)**2

print(sse / len(predictions))


1.1893084224607475


# Bayesian Personalized Ranking (`Implicit`)

In [9]:
def parseData(fname):
    for l in gzip.open(fname):
        d = eval(l)
        # Discard the reviews, to save memory when we don't use them
        del d['review_text']
        yield d


Full dataset of Goodreads fantasy reviews (fairly memory-hungry, could be replaced by something smaller)

In [10]:
data = list(parseData(os.path.join(
    dataDir, "goodreads_reviews_fantasy_paranormal.json.gz")))


In [11]:
random.shuffle(data)


Example from the dataset

In [12]:
data[0]


{'user_id': 'abbd2f73a185c026dcb37d12cb0399e3',
 'book_id': '18107099',
 'review_id': '5d0dd1c3de78e1679c2cc52d53571c43',
 'rating': 4,
 'date_added': 'Mon Mar 10 14:14:46 -0700 2014',
 'date_updated': 'Sun Mar 23 05:04:45 -0700 2014',
 'read_at': 'Sun Mar 23 05:04:45 -0700 2014',
 'started_at': 'Mon Mar 10 00:00:00 -0700 2014',
 'n_votes': 1,
 'n_comments': 2}

Build a few utility data structures. Since we'll be converting the data to a sparse interaction matrix, the main structure here is to assign each user/item to an ID from 0 to nUsers/nItems.

In [13]:
userIDs, itemIDs = {}, {}

for d in data:
    u, i = d['user_id'], d['book_id']
    if not u in userIDs:
        userIDs[u] = len(userIDs)
    if not i in itemIDs:
        itemIDs[i] = len(itemIDs)

nUsers, nItems = len(userIDs), len(itemIDs)


In [14]:
nUsers, nItems


(256088, 258212)

Convert dataset to sparse matrix. Only storing positive feedback instances (i.e., rated items).

In [15]:
Xiu = scipy.sparse.lil_matrix((nItems, nUsers))
for d in data:
    Xiu[itemIDs[d['book_id']], userIDs[d['user_id']]] = 1

Xui = Xiu.T.tocsr()


Bayesian Personalized Ranking model with 5 latent factors

In [16]:
model = bpr.BayesianPersonalizedRanking(factors=5, use_gpu=False)


Fit the model

In [17]:
model.fit(Xiu)


  0%|          | 0/100 [00:00<?, ?it/s]

Get recommendations for a particular user (the first one) and to get items related to (similar latent factors) to a particular item

In [18]:
recommended = model.recommend(0, Xui)
related = model.similar_items(0)


In [19]:
related


[(0, 0.99999994),
 (43805, 0.989635),
 (32972, 0.9887914),
 (71450, 0.9853674),
 (14346, 0.9845543),
 (102941, 0.98222655),
 (217171, 0.9775458),
 (191999, 0.976213),
 (59827, 0.97602165),
 (3941, 0.97505105)]

Extract user and item factors

In [20]:
itemFactors = model.item_factors
userFactors = model.user_factors


In [21]:
itemFactors[0]


array([ 0.55078983,  0.44424427, -0.47678474,  0.2886802 ,  1.491127  ,
       -0.25753266], dtype=float32)

# Latent factor model (`TensorFlow` and `PyTorch`)

In [22]:
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)


Goodreads comic book data

In [23]:
userIDs = {}
itemIDs = {}
interactions = []

for d in parse(os.path.join(dataDir, "goodreads_reviews_comics_graphic.json.gz")):
    u = d['user_id']
    i = d['book_id']
    r = d['rating']
    if not u in userIDs:
        userIDs[u] = len(userIDs)
    if not i in itemIDs:
        itemIDs[i] = len(itemIDs)
    interactions.append((u, i, r))


In [24]:
random.shuffle(interactions)
len(interactions)


542338

Split into train and test sets

In [25]:
nTrain = int(len(interactions) * 0.9)
nTest = len(interactions) - nTrain
interactionsTrain = interactions[:nTrain]
interactionsTest = interactions[nTrain:]


In [26]:
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for u, i, r in interactionsTrain:
    itemsPerUser[u].append(i)
    usersPerItem[i].append(u)


Mean rating, just for initialization

In [27]:
mu = sum([r for _, _, r in interactionsTrain]) / len(interactionsTrain)


## `TensorFlow` Implementation

Gradient descent optimizer, could experiment with learning rate

In [28]:
optimizer = tf.keras.optimizers.Adam(0.1)


Latent factor model tensorflow class

In [29]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(
            tf.random.normal([len(userIDs)], stddev=0.001))
        self.betaI = tf.Variable(
            tf.random.normal([len(itemIDs)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal(
            [len(userIDs), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal(
            [len(itemIDs), K], stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +
                            tf.reduce_sum(self.betaI**2) +
                            tf.reduce_sum(self.gammaU**2) +
                            tf.reduce_sum(self.gammaI**2))

    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
            tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred

    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)


Initialize the model. Could experiment with number of factors and regularization rate.

In [30]:
modelLFM = LatentFactorModel(mu, 5, 0.00001)


2022-03-24 15:01:29.144658: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-03-24 15:01:29.144703: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: Zhankui
2022-03-24 15:01:29.144710: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: Zhankui
2022-03-24 15:01:29.144809: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.103.1
2022-03-24 15:01:29.144836: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.103.1
2022-03-24 15:01:29.144843: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.103.1
2022-03-24 15:01:29.145591: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in

Training step (for the batch-based model from Chapter 5)

In [31]:
def trainingStep(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u, i, r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(
                                  gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()


Run 100 iterations (really 100 batches) of gradient descent

In [32]:
for i in range(100):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", objective = " + str(obj))


iteration 10, objective = 0.5309396
iteration 20, objective = 0.5107579
iteration 30, objective = 0.5197936
iteration 40, objective = 0.52355415
iteration 50, objective = 0.5162568
iteration 60, objective = 0.51113695
iteration 70, objective = 0.50594246
iteration 80, objective = 0.5036332
iteration 90, objective = 0.5052657
iteration 100, objective = 0.5083184


Prediction for a particular user/item pair

In [33]:
u, i, r = interactionsTest[0]


In [34]:
modelLFM.predict(userIDs[u], itemIDs[i]).numpy()


3.8191333

## `PyTorch` Implementation

Latent factor model tensorflow class

In [35]:
class LatentFactorModel(nn.Module):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = nn.Parameter(torch.Tensor([mu]))
        # Initialize to small random values
        self.betaU = nn.Embedding(len(userIDs), 1)
        nn.init.normal_(self.betaU.weight, std=0.001)
        self.betaI = nn.Embedding(len(itemIDs), 1)
        nn.init.normal_(self.betaI.weight, std=0.001)
        self.gammaU = nn.Embedding(len(userIDs), K)
        nn.init.normal_(self.gammaU.weight, std=0.001)
        self.gammaI = nn.Embedding(len(itemIDs), K)
        nn.init.normal_(self.gammaI.weight, std=0.001)
        self.lamb = lamb

    # Prediction
    def forward(self, u, i):
        p = self.alpha + self.betaU(u).squeeze(-1) + self.betaI(i).squeeze(-1) + \
            (self.gammaU(u) * self.gammaI(i)).sum(dim=-1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (torch.sum(self.betaU.weight**2) +
                            torch.sum(self.betaI.weight**2) +
                            torch.sum(self.gammaU.weight**2) +
                            torch.sum(self.gammaI.weight**2))

    # Loss
    def loss(self, u, i, r):
        pred = self.forward(u, i)
        # match l2_loss in tensorflow (i.e., sum(x**2) / 2)
        return nn.functional.mse_loss(pred, r) / 2 + self.reg()


Initialize the model. Could experiment with number of factors and regularization rate.

Gradient descent optimizer, could experiment with learning rate.

In [36]:
modelLFM = LatentFactorModel(mu, 5, 0.00001)
optimizer = torch.optim.Adam(modelLFM.parameters(), lr=0.1)


Training step (for the batch-based model from Chapter 5)

In [37]:

def trainingStep(model, interactions):
    # gradient reset
    optimizer.zero_grad()
    # data generation
    Nsamples = 50000
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u, i, r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)
    sampleU, sampleI, sampleR = torch.LongTensor(
        sampleU), torch.LongTensor(sampleI), torch.Tensor(sampleR)
    # loss calculation
    loss = model.loss(sampleU, sampleI, sampleR)
    # gradient calculation
    loss.backward()
    # weight update
    optimizer.step()
    return loss.item()


Run 100 iterations (really 100 batches) of gradient descent

In [38]:

for i in range(100):
    loss = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", loss = " + str(loss))


iteration 10, loss = 0.5609049797058105
iteration 20, loss = 0.5348504781723022
iteration 30, loss = 0.5309914350509644
iteration 40, loss = 0.525666356086731
iteration 50, loss = 0.5169359445571899
iteration 60, loss = 0.5135524272918701
iteration 70, loss = 0.5127942562103271
iteration 80, loss = 0.5091582536697388
iteration 90, loss = 0.5123672485351562
iteration 100, loss = 0.509558916091919


Prediction for a particular user/item pair

In [39]:

u, i, r = interactionsTest[0]


In [40]:
modelLFM(torch.LongTensor([userIDs[u]]), torch.LongTensor([itemIDs[i]])).item()


3.3774330615997314

# Bayesian personalized ranking (`TensorFlow` and `PyTorch`)

In [41]:
items = list(itemIDs.keys())


## `TensorFlow` Implementation

Batch-based version from Chapter 5

In [42]:
class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(
            tf.random.normal([len(itemIDs)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal(
            [len(userIDs), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal(
            [len(itemIDs), K], stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    # Prediction for a single instance
    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +
                            tf.nn.l2_loss(self.gammaU) +
                            tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))


In [43]:
optimizer = tf.keras.optimizers.Adam(0.1)


In [44]:
modelBPR = BPRbatch(5, 0.00001)


In [45]:
def trainingStepBPR(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(Nsamples):
            u, i, _ = random.choice(interactions)  # positive sample
            j = random.choice(items)  # negative sample
            while j in itemsPerUser[u]:
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])

        loss = model(sampleU, sampleI, sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(
                                  gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()


Run 100 batches of gradient descent

In [46]:
for i in range(100):
    obj = trainingStepBPR(modelBPR, interactions)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", objective = " + str(obj))


iteration 10, objective = 0.52969164
iteration 20, objective = 0.47901565
iteration 30, objective = 0.47639802
iteration 40, objective = 0.47310835
iteration 50, objective = 0.47093636
iteration 60, objective = 0.47369748
iteration 70, objective = 0.47500864
iteration 80, objective = 0.4744386
iteration 90, objective = 0.4741045
iteration 100, objective = 0.46990678


Prediction for a particular user/item pair. Note that this is an unnormalized score (which can be used for ranking)

In [47]:
u, i, _ = interactionsTest[0]


In [48]:
# In this case just a score (that can be used for ranking), rather than a prediction of a rating
modelBPR.predict(userIDs[u], itemIDs[i]).numpy()


3.420997

## `PyTorch` Implementation

Batch-based version from Chapter 5

In [49]:
class BPRbatch(nn.Module):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize to small random values
        self.betaI = nn.Embedding(len(itemIDs), 1)
        nn.init.normal_(self.betaI.weight, std=0.001)
        self.gammaU = nn.Embedding(len(userIDs), K)
        nn.init.normal_(self.gammaU.weight, std=0.001)
        self.gammaI = nn.Embedding(len(itemIDs), K)
        nn.init.normal_(self.gammaI.weight, std=0.001)
        self.lamb = lamb

    # Prediction
    def forward(self, u, i):
        p = self.betaI(i).squeeze(-1) + (self.gammaU(u)
                                         * self.gammaI(i)).sum(dim=-1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (torch.sum(self.betaI.weight**2) / 2 +
                            torch.sum(self.gammaU.weight**2) / 2 +
                            torch.sum(self.gammaI.weight**2) / 2)  # match l2_loss in tensorflow (i.e., sum(x**2) / 2)

    # Loss
    def loss(self, u, i, j):
        x_ui = self.forward(u, i)
        x_uj = self.forward(u, j)
        return - torch.sigmoid(x_ui - x_uj).log().mean() + self.reg()


In [50]:
modelBPR = BPRbatch(5, 0.00001)
optimizer = torch.optim.Adam(modelBPR.parameters(), lr=0.1)


In [51]:
def trainingStepBPR(model, interactions):
    # gradient reset
    optimizer.zero_grad()
    # data generation
    Nsamples = 50000
    sampleU, sampleI, sampleJ = [], [], []
    for _ in range(Nsamples):
        u, i, _ = random.choice(interactions)  # positive sample
        j = random.choice(items)  # negative sample
        while j in itemsPerUser[u]:
            j = random.choice(items)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleJ.append(itemIDs[j])
    sampleU, sampleI, sampleJ = torch.LongTensor(
        sampleU), torch.LongTensor(sampleI), torch.LongTensor(sampleJ)
    # loss calculation
    loss = model.loss(sampleU, sampleI, sampleJ)
    # gradient calculation
    loss.backward()
    # weight update
    optimizer.step()
    return loss.item()


Run 100 batches of gradient descent

In [52]:
for i in range(100):
    loss = trainingStepBPR(modelBPR, interactionsTrain)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", loss = " + str(loss))


iteration 10, loss = 0.5430517196655273
iteration 20, loss = 0.49784502387046814
iteration 30, loss = 0.48478055000305176
iteration 40, loss = 0.48229995369911194
iteration 50, loss = 0.4817325472831726
iteration 60, loss = 0.47607892751693726
iteration 70, loss = 0.4738577902317047
iteration 80, loss = 0.4703591465950012
iteration 90, loss = 0.4687100052833557
iteration 100, loss = 0.46969711780548096


Prediction for a particular user/item pair. Note that this is an unnormalized score (which can be used for ranking)

In [53]:
u, i, _ = interactionsTest[0]


In [54]:
modelLFM(torch.LongTensor([userIDs[u]]), torch.LongTensor([itemIDs[i]])).item()


3.3774330615997314

# Exercises

### 5.1 (`TensorFlow` Implementation)

Adapt the latent factor model above, simply deleting any terms associated with latent factors

In [59]:
optimizer = tf.keras.optimizers.Adam(0.1)


In [60]:
class LatentFactorModelBiasOnly(tf.keras.Model):
    def __init__(self, mu, lamb):
        super(LatentFactorModelBiasOnly, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(
            tf.random.normal([len(userIDs)], stddev=0.001))
        self.betaI = tf.Variable(
            tf.random.normal([len(itemIDs)], stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i]
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +
                            tf.reduce_sum(self.betaI**2))

    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        pred = self.alpha + beta_u + beta_i
        return pred

    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)


In [61]:
modelBiasOnly = LatentFactorModelBiasOnly(mu, 0.00001)


In [62]:
def trainingStepBiasOnly(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u, i, r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(
                                  gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()


In [63]:
for i in range(50):
    obj = trainingStepBiasOnly(modelBiasOnly, interactionsTrain)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", objective = " + str(obj))


iteration 10, objective = 0.5325793
iteration 20, objective = 0.5129128
iteration 30, objective = 0.49926892
iteration 40, objective = 0.4964696
iteration 50, objective = 0.4938159


Compute the MSEs for a model which always predicts the mean, versus one which involves bias terms

In [64]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)


In [65]:
alwaysPredictMean = [mu for _ in interactionsTest]
labels = [r for _, _, r in interactionsTest]


In [66]:
MSE(alwaysPredictMean, labels)


1.3588585417861048

In [67]:
biasOnlyPredictions =\
    [modelBiasOnly.predict(userIDs[u], itemIDs[i]).numpy()
     for u, i, _ in interactionsTest]


In [68]:
biasOnlyPredictions[0]


3.642407

In [69]:
MSE(biasOnlyPredictions, labels)


0.9985292734595719

### 5.1 (`PyTorch` Implementation)

In [70]:
class LatentFactorModelBiasOnly(nn.Module):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModelBiasOnly, self).__init__()
        # Initialize to average
        self.alpha = nn.Parameter(torch.Tensor([mu]))
        # Initialize to small random values
        self.betaU = nn.Embedding(len(userIDs), 1)
        nn.init.normal_(self.betaU.weight, std=0.001)
        self.betaI = nn.Embedding(len(itemIDs), 1)
        nn.init.normal_(self.betaI.weight, std=0.001)
        self.lamb = lamb

    # Prediction
    def forward(self, u, i):
        p = self.alpha + self.betaU(u).squeeze(-1) + self.betaI(i).squeeze(-1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (torch.sum(self.betaU.weight**2) +
                            torch.sum(self.betaI.weight**2))

    # Loss
    def loss(self, u, i, r):
        pred = self.forward(u, i)
        # match l2_loss in tensorflow (i.e., sum(x**2) / 2)
        return nn.functional.mse_loss(pred, r) / 2 + self.reg()


modelBiasOnly = LatentFactorModelBiasOnly(mu, 5, 0.00001)
optimizer = torch.optim.Adam(modelBiasOnly.parameters(), lr=0.1)


def trainingStepBiasOnly(model, interactions):
    # gradient reset
    optimizer.zero_grad()
    # data generation
    Nsamples = 50000
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u, i, r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)
    sampleU, sampleI, sampleR = torch.LongTensor(
        sampleU), torch.LongTensor(sampleI), torch.Tensor(sampleR)
    # loss calculation
    loss = model.loss(sampleU, sampleI, sampleR)
    # gradient calculation
    loss.backward()
    # weight update
    optimizer.step()
    return loss.item()


for i in range(50):
    loss = trainingStepBiasOnly(modelBiasOnly, interactionsTrain)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", loss = " + str(loss))


iteration 10, loss = 0.5496252179145813
iteration 20, loss = 0.5112858414649963
iteration 30, loss = 0.5048273801803589
iteration 40, loss = 0.5003482103347778
iteration 50, loss = 0.48966529965400696


Compute the MSEs for a model which always predicts the mean, versus one which involves bias terms

In [71]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)


alwaysPredictMean = [mu for _ in interactionsTest]
labels = [r for _, _, r in interactionsTest]

MSE(alwaysPredictMean, labels)


1.3588585417861048

In [72]:
biasOnlyPredictions = \
    [modelBiasOnly(torch.LongTensor([userIDs[u]]),
                   torch.LongTensor([itemIDs[i]])).item() for u, i, _ in interactionsTest]


In [73]:
biasOnlyPredictions[0]


3.7933297157287598

In [74]:
MSE(biasOnlyPredictions, labels)


1.001839570283527

### 5.2 (`TensorFlow` Implementation)

Performance of a complete latent factor model (the latent factor model implementation is as the same as LatentFactorMOdel in the examples above)

In [75]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(
            tf.random.normal([len(userIDs)], stddev=0.001))
        self.betaI = tf.Variable(
            tf.random.normal([len(itemIDs)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal(
            [len(userIDs), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal(
            [len(itemIDs), K], stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +
                            tf.reduce_sum(self.betaI**2) +
                            tf.reduce_sum(self.gammaU**2) +
                            tf.reduce_sum(self.gammaI**2))

    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
            tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred

    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)


def trainingStep(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u, i, r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(
                                  gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()


optimizer = tf.keras.optimizers.Adam(0.1)
modelLFM = LatentFactorModel(mu, 10, 0.00001)


In [76]:
for i in range(50):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", objective = " + str(obj))


iteration 10, objective = 0.5405107
iteration 20, objective = 0.523776
iteration 30, objective = 0.53565085
iteration 40, objective = 0.5329913
iteration 50, objective = 0.53082687


In [77]:
predictions = [modelLFM.predict(userIDs[u], itemIDs[i]).numpy()
               for u, i, _ in interactionsTest]


In [78]:
MSE(predictions, labels)


1.022115720965297

(probably needs a little more tuning in terms of number of latent factors, learning rate, etc.)

### 5.2 (`PyTorch` Implementation)
Performance of a complete latent factor model (the latent factor model implementation is as the same as LatentFactorMOdel in the examples above)

In [79]:
class LatentFactorModel(nn.Module):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = nn.Parameter(torch.Tensor([mu]))
        # Initialize to small random values
        self.betaU = nn.Embedding(len(userIDs), 1)
        nn.init.normal_(self.betaU.weight, std=0.001)
        self.betaI = nn.Embedding(len(itemIDs), 1)
        nn.init.normal_(self.betaI.weight, std=0.001)
        self.gammaU = nn.Embedding(len(userIDs), K)
        nn.init.normal_(self.gammaU.weight, std=0.001)
        self.gammaI = nn.Embedding(len(itemIDs), K)
        nn.init.normal_(self.gammaI.weight, std=0.001)
        self.lamb = lamb

    # Prediction
    def forward(self, u, i):
        p = self.alpha + self.betaU(u).squeeze(-1) + self.betaI(i).squeeze(-1) + \
            (self.gammaU(u) * self.gammaI(i)).sum(dim=-1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (torch.sum(self.betaU.weight**2) +
                            torch.sum(self.betaI.weight**2) +
                            torch.sum(self.gammaU.weight**2) +
                            torch.sum(self.gammaI.weight**2))

    # Loss
    def loss(self, u, i, r):
        pred = self.forward(u, i)
        # match l2_loss in tensorflow (i.e., sum(x**2) / 2)
        return nn.functional.mse_loss(pred, r) / 2 + self.reg()


modelLFM = LatentFactorModel(mu, 10, 0.00001)
optimizer = torch.optim.Adam(modelLFM.parameters(), lr=0.1)


def trainingStep(model, interactions):
    # gradient reset
    optimizer.zero_grad()
    # data generation
    Nsamples = 50000
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u, i, r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)
    sampleU, sampleI, sampleR = torch.LongTensor(
        sampleU), torch.LongTensor(sampleI), torch.Tensor(sampleR)
    # loss calculation
    loss = model.loss(sampleU, sampleI, sampleR)
    # gradient calculation
    loss.backward()
    # weight update
    optimizer.step()
    return loss.item()


for i in range(50):
    loss = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", loss = " + str(loss))


iteration 10, loss = 0.5724738836288452
iteration 20, loss = 0.5555902719497681
iteration 30, loss = 0.5478214621543884
iteration 40, loss = 0.5373783707618713
iteration 50, loss = 0.5360277891159058


In [80]:
predictions = [modelLFM(torch.LongTensor([userIDs[u]]), torch.LongTensor(
    [itemIDs[i]])).item() for u, i, _ in interactionsTest]
MSE(predictions, labels)


1.0284807624281143

### 5.3

Experiment with rounding the predictions

In [81]:
predictionsRounded = [int(p + 0.5) for p in predictions]


In [82]:
MSE(predictionsRounded, labels)


1.1147066415901463

Seems to result in worse performance. For a rough explanation, consider a random variable that takes a value of "1" half the time and "2" half the time; in terms of the MSE, always predicting 1.5 (and always incurring moderate errors) is preferable to always predicting either of 1 or 2 (and incurring a large error half the time).

### 5.4 (`TensorFlow` Implementation)

Following the BPR code from examples above

In [83]:
class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(
            tf.random.normal([len(itemIDs)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal(
            [len(userIDs), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal(
            [len(itemIDs), K], stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    # Prediction for a single instance
    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +
                            tf.nn.l2_loss(self.gammaU) +
                            tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))


def trainingStepBPR(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(Nsamples):
            u, i, _ = random.choice(interactions)  # positive sample
            j = random.choice(items)  # negative sample
            while j in itemsPerUser[u]:
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])

        loss = model(sampleU, sampleI, sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(
                                  gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()


optimizer = tf.keras.optimizers.Adam(0.1)
modelBPR = BPRbatch(10, 0.00001)


In [84]:
for i in range(50):
    obj = trainingStepBPR(modelBPR, interactionsTrain)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", objective = " + str(obj))


iteration 10, objective = 0.5254287
iteration 20, objective = 0.48577458
iteration 30, objective = 0.48419735
iteration 40, objective = 0.48588872
iteration 50, objective = 0.48943776


In [85]:
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for u, i, _ in interactionsTest:
    interactionsTestPerUser[u].add(i)
    itemSet.add(i)


AUC implementation

In [86]:
def AUCu(u, N):  # N samples per user
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    positive = random.sample(interactionsTestPerUser[u], N)
    negative = random.sample(itemSet.difference(interactionsTestPerUser[u]), N)
    for i, j in zip(positive, negative):
        si = modelBPR.predict(userIDs[u], itemIDs[i]).numpy()
        sj = modelBPR.predict(userIDs[u], itemIDs[j]).numpy()
        if si > sj:
            win += 1
    return win/N


In [87]:
def AUC():
    av = []
    for u in interactionsTestPerUser:
        av.append(AUCu(u, 10))
    return sum(av) / len(av)


In [88]:
AUC()


0.7919068070383867

### 5.4 (`PyTorch` Implementation)

In [89]:
class BPRbatch(nn.Module):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize to small random values
        self.betaI = nn.Embedding(len(itemIDs), 1)
        nn.init.normal_(self.betaI.weight, std=0.001)
        self.gammaU = nn.Embedding(len(userIDs), K)
        nn.init.normal_(self.gammaU.weight, std=0.001)
        self.gammaI = nn.Embedding(len(itemIDs), K)
        nn.init.normal_(self.gammaI.weight, std=0.001)
        self.lamb = lamb

    # Prediction
    def forward(self, u, i):
        p = self.betaI(i).squeeze(-1) + (self.gammaU(u)
                                         * self.gammaI(i)).sum(dim=-1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (torch.sum(self.betaI.weight**2) / 2 +
                            torch.sum(self.gammaU.weight**2) / 2 +
                            torch.sum(self.gammaI.weight**2) / 2)  # match l2_loss in tensorflow (i.e., sum(x**2) / 2)

    # Loss
    def loss(self, u, i, j):
        x_ui = self.forward(u, i)
        x_uj = self.forward(u, j)
        return - torch.sigmoid(x_ui - x_uj).log().mean() + self.reg()


modelBPR = BPRbatch(10, 0.00001)
optimizer = torch.optim.Adam(modelBPR.parameters(), lr=0.1)


def trainingStepBPR(model, interactions):
    # gradient reset
    optimizer.zero_grad()
    # data generation
    Nsamples = 50000
    sampleU, sampleI, sampleJ = [], [], []
    for _ in range(Nsamples):
        u, i, _ = random.choice(interactions)  # positive sample
        j = random.choice(items)  # negative sample
        while j in itemsPerUser[u]:
            j = random.choice(items)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleJ.append(itemIDs[j])
    sampleU, sampleI, sampleJ = torch.LongTensor(
        sampleU), torch.LongTensor(sampleI), torch.LongTensor(sampleJ)
    # loss calculation
    loss = model.loss(sampleU, sampleI, sampleJ)
    # gradient calculation
    loss.backward()
    # weight update
    optimizer.step()
    return loss.item()


for i in range(50):
    loss = trainingStepBPR(modelBPR, interactionsTrain)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", loss = " + str(loss))


iteration 10, loss = 0.5562138557434082
iteration 20, loss = 0.5252612829208374
iteration 30, loss = 0.5091630816459656
iteration 40, loss = 0.5088016986846924
iteration 50, loss = 0.5018982887268066


In [90]:
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for u, i, _ in interactionsTest:
    interactionsTestPerUser[u].add(i)
    itemSet.add(i)


AUC Implementation

In [91]:
def AUCu(u, N):  # N samples per user
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    positive = random.sample(interactionsTestPerUser[u], N)
    negative = random.sample(itemSet.difference(interactionsTestPerUser[u]), N)
    for i, j in zip(positive, negative):
        user, i, j = torch.LongTensor([userIDs[u]]), torch.LongTensor(
            [itemIDs[i]]), torch.LongTensor([itemIDs[j]])
        si = modelBPR(user, i).item()
        sj = modelBPR(user, j).item()
        if si > sj:
            win += 1
    return win/N


In [92]:
def AUC():
    av = []
    for u in interactionsTestPerUser:
        av.append(AUCu(u, 10))
    return sum(av) / len(av)


In [93]:
AUC()


0.7893403814456449