In [2]:
import os 
import wget
import gzip
import random
import scipy
import tensorflow as tf
import torch 
from torch import nn
from collections import defaultdict
from implicit import bpr
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split

import implicit
implicit.__version__ # implicit 0.5.2 will report errors, we use 0.4.8 instead

  from .autonotebook import tqdm as notebook_tqdm


'0.4.8'

Data is available at http://cseweb.ucsd.edu/~jmcauley/pml/data/. 
- Download and save to your own directory.
- Or, run following script to save it into `Chapter_5/data` folder automatically.

In [3]:
filenames = [
    'goodreads_fantasy.tsv', 
    'goodreads_reviews_fantasy_paranormal.json.gz', 
    'goodreads_reviews_comics_graphic.json.gz'
]

dataDir = './data'
url = 'http://jmcauley.ucsd.edu/pml_data'

if not os.path.exists(dataDir):
    os.makedirs(dataDir)
for filename in filenames:
    wget.download(os.path.join(url, filename), out=dataDir)
print("Done!")

Done!


# Latent factor model (`Surprise`)

Using the library's inbuilt data reader, extract tsv-formatted data

In [38]:
reader = Reader(line_format='user item rating', sep='\t')
data = Dataset.load_from_file(os.path.join(dataDir, "goodreads_fantasy.tsv"), reader=reader)

Standard latent-factor model

In [39]:
model = SVD()

Inbuilt functions to split into training and test fractions

In [40]:
trainset, testset = train_test_split(data, test_size=.25)

Fit the model and extract predictions

In [41]:
model.fit(trainset)
predictions = model.test(testset)

Estimate for a single (test) rating

In [42]:
predictions[0].est

3.41674281295034

MSE for model predictions (test set)

In [43]:
sse = 0
for p in predictions:
    sse += (p.r_ui - p.est)**2

print(sse / len(predictions))

1.18535715288669


# Bayesian Personalized Ranking (`Implicit`)

In [44]:
def parseData(fname):
    for l in gzip.open(fname):
        d = eval(l)
        del d['review_text'] # Discard the reviews, to save memory when we don't use them
        yield d

Full dataset of Goodreads fantasy reviews (fairly memory-hungry, could be replaced by something smaller)

In [45]:
data = list(parseData(os.path.join(dataDir, "goodreads_reviews_fantasy_paranormal.json.gz")))

In [46]:
random.shuffle(data)

Example from the dataset

In [47]:
data[0]

{'user_id': '660fecee4a96d682c821d0f36b56d073',
 'book_id': '13648459',
 'review_id': '7f59659b0f3ea6fdcf0943699bfd361d',
 'rating': 5,
 'date_added': 'Sun Apr 14 19:20:01 -0700 2013',
 'date_updated': 'Sun Apr 14 19:21:12 -0700 2013',
 'read_at': 'Sun Apr 14 00:00:00 -0700 2013',
 'started_at': '',
 'n_votes': 2,
 'n_comments': 0}

Build a few utility data structures. Since we'll be converting the data to a sparse interaction matrix, the main structure here is to assign each user/item to an ID from 0 to nUsers/nItems.

In [48]:
userIDs,itemIDs = {},{}

for d in data:
    u,i = d['user_id'],d['book_id']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)

nUsers,nItems = len(userIDs),len(itemIDs)

In [49]:
nUsers,nItems

(256088, 258212)

Convert dataset to sparse matrix. Only storing positive feedback instances (i.e., rated items).

In [50]:
Xiu = scipy.sparse.lil_matrix((nItems, nUsers))
for d in data:
    Xiu[itemIDs[d['book_id']],userIDs[d['user_id']]] = 1
    
Xui = Xiu.T.tocsr()

Bayesian Personalized Ranking model with 5 latent factors

In [51]:
model = bpr.BayesianPersonalizedRanking(factors = 5, use_gpu = False)

Fit the model

In [52]:
model.fit(Xiu)

100%|██████████| 100/100 [00:11<00:00,  8.89it/s, train_auc=90.07%, skipped=1.81%]


Get recommendations for a particular user (the first one) and to get items related to (similar latent factors) to a particular item

In [53]:
recommended = model.recommend(0, Xui)
related = model.similar_items(0)

In [54]:
related

[(0, 1.0000001),
 (46541, 0.9904368),
 (112012, 0.98807365),
 (114601, 0.98681265),
 (63634, 0.9843552),
 (154377, 0.9819736),
 (55813, 0.981672),
 (136790, 0.98098093),
 (7517, 0.9808126),
 (99516, 0.9805903)]

Extract user and item factors

In [55]:
itemFactors = model.item_factors
userFactors = model.user_factors

In [56]:
itemFactors[0]

array([-0.1206227 , -0.6256624 , -0.8490893 ,  0.689599  , -0.50638354,
       -0.19383602], dtype=float32)

# Latent factor model (`TensorFlow` and `PyTorch`)

In [3]:
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)

Goodreads comic book data

In [4]:
userIDs = {}
itemIDs = {}
interactions = []

for d in parse(os.path.join(dataDir, "goodreads_reviews_comics_graphic.json.gz")):
    u = d['user_id']
    i = d['book_id']
    r = d['rating']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((u,i,r))

In [5]:
random.shuffle(interactions)
len(interactions)

542338

Split into train and test sets

In [6]:
nTrain = int(len(interactions) * 0.9)
nTest = len(interactions) - nTrain
interactionsTrain = interactions[:nTrain]
interactionsTest = interactions[nTrain:]

In [7]:
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for u,i,r in interactionsTrain:
    itemsPerUser[u].append(i)
    usersPerItem[i].append(u)

Mean rating, just for initialization

In [8]:
mu = sum([r for _,_,r in interactionsTrain]) / len(interactionsTrain)

## `TensorFlow` Implementation

Gradient descent optimizer, could experiment with learning rate

In [10]:
optimizer = tf.keras.optimizers.Adam(0.1)

Latent factor model tensorflow class

In [11]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

Initialize the model. Could experiment with number of factors and regularization rate.

In [13]:
modelLFM = LatentFactorModel(mu, 5, 0.00001)

Training step (for the batch-based model from Chapter 5)

In [14]:
def trainingStep(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

Run 100 iterations (really 100 batches) of gradient descent

In [15]:
for i in range(100):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.5413822
iteration 20, objective = 0.513759
iteration 30, objective = 0.5110235
iteration 40, objective = 0.51673234
iteration 50, objective = 0.5165422
iteration 60, objective = 0.5135466
iteration 70, objective = 0.51165915
iteration 80, objective = 0.51350456
iteration 90, objective = 0.51042646
iteration 100, objective = 0.5097919


Prediction for a particular user/item pair

In [16]:
u,i,r = interactionsTest[0]

In [None]:
modelLFM.predict(userIDs[u], itemIDs[i]).numpy()

## `PyTorch` Implementation

Latent factor model tensorflow class

In [None]:
class LatentFactorModel(nn.Module):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = nn.Parameter(torch.Tensor([mu]))
        # Initialize to small random values
        self.betaU = nn.Embedding(len(userIDs), 1)
        nn.init.normal_(self.betaU.weight, std=0.001)
        self.betaI = nn.Embedding(len(itemIDs), 1)
        nn.init.normal_(self.betaI.weight, std=0.001)
        self.gammaU = nn.Embedding(len(userIDs), K)
        nn.init.normal_(self.gammaU.weight, std=0.001)
        self.gammaI = nn.Embedding(len(itemIDs), K)
        nn.init.normal_(self.gammaI.weight, std=0.001)
        self.lamb = lamb

    # Prediction
    def forward(self, u, i):
        p = self.alpha + self.betaU(u).squeeze(-1) + self.betaI(i).squeeze(-1) + \
            (self.gammaU(u) * self.gammaI(i)).sum(dim=-1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (torch.sum(self.betaU.weight**2) + \
                            torch.sum(self.betaI.weight**2) + \
                            torch.sum(self.gammaU.weight**2) + \
                            torch.sum(self.gammaI.weight**2))
    
    # Loss
    def loss(self, u, i, r):
        pred = self.forward(u, i)
        return nn.functional.mse_loss(pred, r) + self.reg() # mse_loss in pytorch is 2 * l2_loss in tensorflow


Initialize the model. Could experiment with number of factors and regularization rate.

Gradient descent optimizer, could experiment with learning rate.

In [14]:
modelLFM = LatentFactorModel(mu, 5, 0.00001)
optimizer = torch.optim.Adam(modelLFM.parameters(), lr=0.1)

Training step (for the batch-based model from Chapter 5)

In [None]:

def trainingStep(model, interactions):  
    # gradient reset
    optimizer.zero_grad() 
    # data generation
    Nsamples = 50000
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u,i,r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)
    sampleU, sampleI, sampleR = torch.LongTensor(sampleU), torch.LongTensor(sampleI), torch.Tensor(sampleR)
    # loss calculation
    loss = model.loss(sampleU, sampleI, sampleR)
    # gradient calculation
    loss.backward()
    # weight update
    optimizer.step()
    return loss.item()


Run 100 iterations (really 100 batches) of gradient descent

In [None]:

for i in range(100):
    loss = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", loss = " + str(loss))


Prediction for a particular user/item pair

In [None]:

u,i,r = interactionsTest[0]

In [13]:
modelLFM(torch.LongTensor([userIDs[u]]), torch.LongTensor([itemIDs[i]])).item()

4.046658039093018

# Bayesian personalized ranking (`TensorFlow` and `PyTorch`)

In [16]:
items = list(itemIDs.keys())

## `TensorFlow` Implementation

Batch-based version from Chapter 5

In [55]:
class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    # Prediction for a single instance
    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))
    
    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))

In [56]:
optimizer = tf.keras.optimizers.Adam(0.1)

In [57]:
modelBPR = BPRbatch(5, 0.00001)

In [58]:
def trainingStepBPR(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(Nsamples):
            u,i,_ = random.choice(interactions) # positive sample
            j = random.choice(items) # negative sample
            while j in itemsPerUser[u]:
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])

        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

Run 100 batches of gradient descent

In [59]:
for i in range(100):
    obj = trainingStepBPR(modelBPR, interactions)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.5303632
iteration 20, objective = 0.47583044
iteration 30, objective = 0.4722096
iteration 40, objective = 0.47280103
iteration 50, objective = 0.47669053
iteration 60, objective = 0.47630626
iteration 70, objective = 0.47247666
iteration 80, objective = 0.4745618
iteration 90, objective = 0.47154313
iteration 100, objective = 0.46945256


Prediction for a particular user/item pair. Note that this is an unnormalized score (which can be used for ranking)

In [60]:
u,i,_ = interactionsTest[0]

In [61]:
# In this case just a score (that can be used for ranking), rather than a prediction of a rating
modelBPR.predict(userIDs[u], itemIDs[i]).numpy()

0.6619426

## `PyTorch` Implementation

Batch-based version from Chapter 5

In [46]:
class BPRbatch(nn.Module):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize to small random values
        self.betaI = nn.Embedding(len(itemIDs), 1)
        nn.init.normal_(self.betaI.weight, std=0.001)
        self.gammaU = nn.Embedding(len(userIDs), K)
        nn.init.normal_(self.gammaU.weight, std=0.001)
        self.gammaI = nn.Embedding(len(itemIDs), K)
        nn.init.normal_(self.gammaI.weight, std=0.001)
        self.lamb = lamb

    # Prediction
    def forward(self, u, i):
        p = self.betaI(i).squeeze(-1) + (self.gammaU(u) * self.gammaI(i)).sum(dim=-1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (torch.sum(self.betaI.weight**2) + \
                            torch.sum(self.gammaU.weight**2) + \
                            torch.sum(self.gammaI.weight**2))
    
    # Loss
    def loss(self, u, i, j):
        x_ui = self.forward(u, i)
        x_uj = self.forward(u, j)
        return - torch.sigmoid(x_ui - x_uj).log().mean() + self.reg() # mse_loss in pytorch is 2 * l2_loss in tensorflow


In [47]:
modelBPR = BPRbatch(5, 0.00001)
optimizer = torch.optim.Adam(modelBPR.parameters(), lr=0.1)

In [48]:
def trainingStepBPR(model, interactions):  
    # gradient reset
    optimizer.zero_grad() 
    # data generation
    Nsamples = 50000
    sampleU, sampleI, sampleJ = [], [], []
    for _ in range(Nsamples):
        u,i,_ = random.choice(interactions) # positive sample
        j = random.choice(items) # negative sample
        while j in itemsPerUser[u]:
            j = random.choice(items)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleJ.append(itemIDs[j])
    sampleU, sampleI, sampleJ = torch.LongTensor(sampleU), torch.LongTensor(sampleI), torch.LongTensor(sampleJ)
    # loss calculation
    loss = model.loss(sampleU, sampleI, sampleJ)
    # gradient calculation
    loss.backward()
    # weight update
    optimizer.step()
    return loss.item()

Run 100 batches of gradient descent

In [49]:
for i in range(100):
    loss = trainingStepBPR(modelBPR, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", loss = " + str(loss))

iteration 10, loss = 0.5781843066215515
iteration 20, loss = 0.5346540212631226
iteration 30, loss = 0.5334715843200684
iteration 40, loss = 0.531741738319397
iteration 50, loss = 0.5284981727600098
iteration 60, loss = 0.529007077217102
iteration 70, loss = 0.5291609764099121
iteration 80, loss = 0.5280734896659851
iteration 90, loss = 0.5251089930534363
iteration 100, loss = 0.5288001298904419


Prediction for a particular user/item pair. Note that this is an unnormalized score (which can be used for ranking)

In [50]:
u, i, _ = interactionsTest[0]

In [51]:
modelLFM(torch.LongTensor([userIDs[u]]), torch.LongTensor([itemIDs[i]])).item()

3.779024362564087

# Exercises

### 5.1 (`TensorFlow` Implementation)

Adapt the latent factor model above, simply deleting any terms associated with latent factors

In [62]:
class LatentFactorModelBiasOnly(tf.keras.Model):
    def __init__(self, mu, lamb):
        super(LatentFactorModelBiasOnly, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i]
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        pred = self.alpha + beta_u + beta_i
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [63]:
modelBiasOnly = LatentFactorModelBiasOnly(mu, 0.00001)

In [64]:
def trainingStepBiasOnly(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
        (grad, var) in zip(gradients, model.trainable_variables)
        if grad is not None)
    return loss.numpy()

In [65]:
for i in range(50):
    obj = trainingStepBiasOnly(modelBiasOnly, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.57053554
iteration 20, objective = 0.5389685
iteration 30, objective = 0.5308695
iteration 40, objective = 0.52404946
iteration 50, objective = 0.516751


Compute the MSEs for a model which always predicts the mean, versus one which involves bias terms

In [66]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [67]:
alwaysPredictMean = [mu for _ in interactionsTest]
labels = [r for _,_,r in interactionsTest]

In [68]:
MSE(alwaysPredictMean, labels)

1.3267613992499294

In [69]:
biasOnlyPredictions =\
    [modelBiasOnly.predict(userIDs[u],itemIDs[i]).numpy() for u,i,_ in interactionsTest]

In [70]:
biasOnlyPredictions[0]

4.478474

In [71]:
MSE(biasOnlyPredictions, labels)

1.0027213071269891

### 5.1 (`PyTorch` Implementation)

In [60]:
class LatentFactorModelBiasOnly(nn.Module):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModelBiasOnly, self).__init__()
        # Initialize to average
        self.alpha = nn.Parameter(torch.Tensor([mu]))
        # Initialize to small random values
        self.betaU = nn.Embedding(len(userIDs), 1)
        nn.init.normal_(self.betaU.weight, std=0.001)
        self.betaI = nn.Embedding(len(itemIDs), 1)
        nn.init.normal_(self.betaI.weight, std=0.001)
        self.lamb = lamb

    # Prediction
    def forward(self, u, i):
        p = self.alpha + self.betaU(u).squeeze(-1) + self.betaI(i).squeeze(-1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (torch.sum(self.betaU.weight**2) + \
                            torch.sum(self.betaI.weight**2))
    
    # Loss
    def loss(self, u, i, r):
        pred = self.forward(u, i)
        return nn.functional.mse_loss(pred, r) + self.reg() # mse_loss in pytorch is 2 * l2_loss in tensorflow

modelBiasOnly = LatentFactorModelBiasOnly(mu, 5, 0.00001)
optimizer = torch.optim.Adam(modelBiasOnly.parameters(), lr=0.1)

def trainingStepBiasOnly(model, interactions):  
    # gradient reset
    optimizer.zero_grad() 
    # data generation
    Nsamples = 50000
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u,i,r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)
    sampleU, sampleI, sampleR = torch.LongTensor(sampleU), torch.LongTensor(sampleI), torch.Tensor(sampleR)
    # loss calculation
    loss = model.loss(sampleU, sampleI, sampleR)
    # gradient calculation
    loss.backward()
    # weight update
    optimizer.step()
    return loss.item()

for i in range(50):
    loss = trainingStepBiasOnly(modelBiasOnly, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", loss = " + str(loss))

iteration 10, loss = 1.0379294157028198
iteration 20, loss = 0.9627811908721924
iteration 30, loss = 0.9267162084579468
iteration 40, loss = 0.925595760345459
iteration 50, loss = 0.9355825781822205


Compute the MSEs for a model which always predicts the mean, versus one which involves bias terms

In [61]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

alwaysPredictMean = [mu for _ in interactionsTest]
labels = [r for _,_,r in interactionsTest]

MSE(alwaysPredictMean, labels)

1.3254129874908986

In [62]:
biasOnlyPredictions = \
    [modelBiasOnly(torch.LongTensor([userIDs[u]]), \
         torch.LongTensor([itemIDs[i]])).item() for u,i,_ in interactionsTest]

In [63]:
biasOnlyPredictions[0]

3.969268321990967

In [64]:
MSE(biasOnlyPredictions, labels)

0.9567523000636151

### 5.2 (`TensorFlow` Implementation)

Performance of a complete latent factor model (the latent factor model implementation is as the same as LatentFactorMOdel in the examples above)

In [72]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

def trainingStep(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

optimizer = tf.keras.optimizers.Adam(0.1)
modelLFM = LatentFactorModel(mu, 10, 0.00001)

In [73]:
for i in range(50):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.539807
iteration 20, objective = 0.5344611
iteration 30, objective = 0.532373
iteration 40, objective = 0.53467935
iteration 50, objective = 0.5289248


In [74]:
predictions = [modelLFM.predict(userIDs[u],itemIDs[i]).numpy() for u,i,_ in interactionsTest]

In [75]:
MSE(predictions, labels)

1.0097459352107794

(probably needs a little more tuning in terms of number of latent factors, learning rate, etc.)

### 5.2 (`PyTorch` Implementation)
Performance of a complete latent factor model (the latent factor model implementation is as the same as LatentFactorMOdel in the examples above)

In [67]:
class LatentFactorModel(nn.Module):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = nn.Parameter(torch.Tensor([mu]))
        # Initialize to small random values
        self.betaU = nn.Embedding(len(userIDs), 1)
        nn.init.normal_(self.betaU.weight, std=0.001)
        self.betaI = nn.Embedding(len(itemIDs), 1)
        nn.init.normal_(self.betaI.weight, std=0.001)
        self.gammaU = nn.Embedding(len(userIDs), K)
        nn.init.normal_(self.gammaU.weight, std=0.001)
        self.gammaI = nn.Embedding(len(itemIDs), K)
        nn.init.normal_(self.gammaI.weight, std=0.001)
        self.lamb = lamb

    # Prediction
    def forward(self, u, i):
        p = self.alpha + self.betaU(u).squeeze(-1) + self.betaI(i).squeeze(-1) + \
            (self.gammaU(u) * self.gammaI(i)).sum(dim=-1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (torch.sum(self.betaU.weight**2) + \
                            torch.sum(self.betaI.weight**2) + \
                            torch.sum(self.gammaU.weight**2) + \
                            torch.sum(self.gammaI.weight**2))
    
    # Loss
    def loss(self, u, i, r):
        pred = self.forward(u, i)
        return nn.functional.mse_loss(pred, r) + self.reg() # mse_loss in pytorch is 2 * l2_loss in tensorflow

modelLFM = LatentFactorModel(mu, 10, 0.00001)
optimizer = torch.optim.Adam(modelLFM.parameters(), lr=0.1)

def trainingStep(model, interactions):  
    # gradient reset
    optimizer.zero_grad() 
    # data generation
    Nsamples = 50000
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u,i,r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)
    sampleU, sampleI, sampleR = torch.LongTensor(sampleU), torch.LongTensor(sampleI), torch.Tensor(sampleR)
    # loss calculation
    loss = model.loss(sampleU, sampleI, sampleR)
    # gradient calculation
    loss.backward()
    # weight update
    optimizer.step()
    return loss.item()

for i in range(50):
    loss = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", loss = " + str(loss))

iteration 10, loss = 1.0803641080856323
iteration 20, loss = 1.0124554634094238
iteration 30, loss = 0.9824608564376831
iteration 40, loss = 0.9526613354682922
iteration 50, loss = 0.9274086952209473


In [68]:
predictions = [modelLFM(torch.LongTensor([userIDs[u]]), torch.LongTensor([itemIDs[i]])).item() for u,i,_ in interactionsTest]
MSE(predictions, labels)

1.020429780222474

### 5.3

Experiment with rounding the predictions

In [76]:
predictionsRounded = [int(p + 0.5) for p in predictions]

In [77]:
MSE(predictionsRounded, labels)

1.0911789652247668

Seems to result in worse performance. For a rough explanation, consider a random variable that takes a value of "1" half the time and "2" half the time; in terms of the MSE, always predicting 1.5 (and always incurring moderate errors) is preferable to always predicting either of 1 or 2 (and incurring a large error half the time).

### 5.4 (`TensorFlow` Implementation)

Following the BPR code from examples above

In [78]:
class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    # Prediction for a single instance
    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))
    
    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))

def trainingStepBPR(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(Nsamples):
            u,i,_ = random.choice(interactions) # positive sample
            j = random.choice(items) # negative sample
            while j in itemsPerUser[u]:
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])

        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

optimizer = tf.keras.optimizers.Adam(0.1)
modelBPR = BPRbatch(10, 0.00001)

In [79]:
for i in range(50):
    obj = trainingStepBPR(modelBPR, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

In [None]:
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for u,i,_ in interactionsTest:
    interactionsTestPerUser[u].add(i)
    itemSet.add(i)

AUC implementation

In [None]:
def AUCu(u, N): # N samples per user
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    positive = random.sample(interactionsTestPerUser[u],N)
    negative = random.sample(itemSet.difference(interactionsTestPerUser[u]),N)
    for i,j in zip(positive,negative):
        si = modelBPR.predict(userIDs[u], itemIDs[i]).numpy()
        sj = modelBPR.predict(userIDs[u], itemIDs[j]).numpy()
        if si > sj:
            win += 1
    return win/N

In [None]:
def AUC():
    av = []
    for u in interactionsTestPerUser:
        av.append(AUCu(u, 10))
    return sum(av) / len(av)

In [None]:
AUC()

0.7884310302103841

### 5.4 (`PyTorch` Implementation)

In [69]:
class BPRbatch(nn.Module):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize to small random values
        self.betaI = nn.Embedding(len(itemIDs), 1)
        nn.init.normal_(self.betaI.weight, std=0.001)
        self.gammaU = nn.Embedding(len(userIDs), K)
        nn.init.normal_(self.gammaU.weight, std=0.001)
        self.gammaI = nn.Embedding(len(itemIDs), K)
        nn.init.normal_(self.gammaI.weight, std=0.001)
        self.lamb = lamb

    # Prediction
    def forward(self, u, i):
        p = self.betaI(i).squeeze(-1) + (self.gammaU(u) * self.gammaI(i)).sum(dim=-1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (torch.sum(self.betaI.weight**2) + \
                            torch.sum(self.gammaU.weight**2) + \
                            torch.sum(self.gammaI.weight**2))
    
    # Loss
    def loss(self, u, i, j):
        x_ui = self.forward(u, i)
        x_uj = self.forward(u, j)
        return - torch.sigmoid(x_ui - x_uj).log().mean() + self.reg() # mse_loss in pytorch is 2 * l2_loss in tensorflow

modelBPR = BPRbatch(10, 0.00001)
optimizer = torch.optim.Adam(modelBPR.parameters(), lr=0.1)

def trainingStepBPR(model, interactions):  
    # gradient reset
    optimizer.zero_grad() 
    # data generation
    Nsamples = 50000
    sampleU, sampleI, sampleJ = [], [], []
    for _ in range(Nsamples):
        u,i,_ = random.choice(interactions) # positive sample
        j = random.choice(items) # negative sample
        while j in itemsPerUser[u]:
            j = random.choice(items)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleJ.append(itemIDs[j])
    sampleU, sampleI, sampleJ = torch.LongTensor(sampleU), torch.LongTensor(sampleI), torch.LongTensor(sampleJ)
    # loss calculation
    loss = model.loss(sampleU, sampleI, sampleJ)
    # gradient calculation
    loss.backward()
    # weight update
    optimizer.step()
    return loss.item()

for i in range(50):
    loss = trainingStepBPR(modelBPR, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", loss = " + str(loss))    

iteration 10, loss = 0.5897132158279419
iteration 20, loss = 0.5543973445892334
iteration 30, loss = 0.5526553392410278
iteration 40, loss = 0.547688364982605
iteration 50, loss = 0.5417863130569458


In [74]:
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for u,i,_ in interactionsTest:
    interactionsTestPerUser[u].add(i)
    itemSet.add(i)

AUC Implementation

In [81]:
def AUCu(u, N): # N samples per user
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    positive = random.sample(interactionsTestPerUser[u],N)
    negative = random.sample(itemSet.difference(interactionsTestPerUser[u]),N)
    for i,j in zip(positive,negative):
        user, i, j = torch.LongTensor([userIDs[u]]), torch.LongTensor([itemIDs[i]]), torch.LongTensor([itemIDs[j]])
        si = modelBPR(user, i).item()
        sj = modelBPR(user, j).item()
        if si > sj:
            win += 1
    return win/N

In [82]:
def AUC():
    av = []
    for u in interactionsTestPerUser:
        av.append(AUCu(u, 10))
    return sum(av) / len(av)

In [83]:
AUC()

0.7810019841269845