# Recommendations in pytorch using triplet loss
Along the lines of BPR [1]. 

[1] Rendle, Steffen, et al. "BPR: Bayesian personalized ranking from implicit feedback." Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence. AUAI Press, 2009.

This is implemented (more efficiently) in LightFM (https://github.com/lyst/lightfm). See the MovieLens example (https://github.com/lyst/lightfm/blob/master/examples/movielens/example.ipynb) for results comparable to this notebook.

## Set up the architecture
A simple dense layer for both users and items: this is exactly equivalent to latent factor matrix when multiplied by binary user and item indices. There are three inputs: users, positive items, and negative items. In the triplet objective we try to make the positive item rank higher than the negative item for that user.

Because we want just one single embedding for the items, we use shared weights for the positive and negative item inputs (a siamese architecture).

This is all very simple but could be made arbitrarily complex, with more layers, conv layers and so on. I expect we'll be seeing a lot of papers doing just that.


In [1]:
from __future__ import print_function

import numpy as np
import itertools
import data
import metrics

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from joblib import Parallel, delayed
from functools import partial

In [2]:
n_latent = 20
batch_size = 64

# Read data
train, test = data.get_movielens_data()
train = train.tocsr()
n_users, n_items = train.shape
print(n_users, n_items)


944 1683


In [3]:
class FactNet(nn.Module):
    def __init__(self,
                 n_users, n_items,
                 n_latent,
                ):
        super(FactNet, self).__init__()
        self.user_embedding_layer = nn.Embedding(n_users, n_latent)
        self.item_embedding_layer = nn.Embedding(n_items, n_latent)  # both pos and neg items share these params
        init.uniform(self.user_embedding_layer.weight, -0.05, 0.05)
        init.uniform(self.item_embedding_layer.weight, -0.05, 0.05)  # default was normal
        
    def predict_score(self, uid, iid):
        user_embedding = self.user_embedding_layer(uid)
        item_embedding = self.item_embedding_layer(iid)
        score = (user_embedding * item_embedding).sum(dim=1)
        return score
    
        
    def forward(self, uid, pid, nid):
        # lulzy forward for loss computation
        user_embedding = self.user_embedding_layer(uid)
        pos_item_embedding = self.item_embedding_layer(pid)
        neg_item_embedding = self.item_embedding_layer(nid)
        
        # torch.dot doesnt take in axis :sadface:
        pos_pred = (user_embedding * pos_item_embedding).sum(dim=1)
        neg_pred = (user_embedding * neg_item_embedding).sum(dim=1)
        return pos_pred, neg_pred

    
class TripletLoss(nn.Module):
    def __init__(self, loss='sigmoid'):
        super(TripletLoss, self).__init__()
        loss_d = {
            'sigmoid': self.forward_sigmoid,
            'hinge': self.forward_hinge,
        }
        self.forward = loss_d[loss]

    def forward_sigmoid(self, pos_pred, neg_pred):
        loss = 1.0 - torch.sigmoid(pos_pred - neg_pred)
        return loss.mean()
    
    def forward_hinge(self, pos_pred, neg_pred):
        loss = torch.clamp(1.0 + neg_pred - pos_pred, min=0.0)
        return loss.mean()


net = FactNet(n_users, n_items, n_latent)
criterion = TripletLoss()
# optimizer = optim.Adam(net.parameters())
# optimizer = optim.Adadelta(net.parameters())
optimizer = optim.Adagrad(net.parameters())

In [4]:
def batcher(iterable, batch_size):
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, batch_size))
        if not chunk:
            return
        yield np.array(chunk, dtype=np.int64)

In [5]:
from bisect import bisect_left

def bsearch_membership(csr_mat, row_ind, col_ind):
    """ ehh, this is slower than `in`
    csr_mat : interaction matrix
    row_ind : user_ind
    col_ind : item_ind
    """
    start_idx = csr_mat.indptr[row_ind]
    stop_idx = csr_mat.indptr[row_ind+1]
    pos_vals = csr_mat.indices[start_idx:stop_idx]
    insertion_index = bisect_left(pos_vals, col_ind)
    return (insertion_index < (stop_idx-start_idx)) and (col_ind == pos_vals[insertion_index])

def get_row_nz(csr_mat, col_ind):
    """faster than csr_mat.get_row for some reason"""
    start_idx = csr_mat.indptr[col_ind]
    stop_idx = csr_mat.indptr[col_ind+1]
    return csr_mat.indices[start_idx:stop_idx]
    

# Negative sample generator

def sample_neg(user_ind, interactions,
               max_samples=1000,
               violating_cond=lambda neg_item_ind:True):
    # Given a user index, sample a negative item
    user_pos_item_inds = get_row_nz(interactions, user_ind)

    n_sampled = 0
    while n_sampled < max_samples:
        neg_item_ind = np.random.randint(0, n_items)
        n_sampled += 1
        if neg_item_ind not in user_pos_item_inds:
            if violating_cond(neg_item_ind):
                break
    return neg_item_ind, n_sampled

In [6]:
# Sample from training interactions
nnz_users, nnz_items = train.nonzero()
# preload at the start of every epoch (next epoch can be done on cpu while gpu trains)
nid_preload, n_sampled_preload = zip(*(sample_neg(uid, train) for uid in nnz_users))

shuffle_ind = np.arange(len(nnz_users))

In [7]:
%%time
num_epochs = 10

for epoch in range(num_epochs):
    print('Epoch %s' % epoch, end='\t')
    
    # Resample negs per pos interaction
    nid_preload, n_sampled_preload = map(np.array, zip(*(sample_neg(uid, train) for uid in nnz_users)))

    np.random.shuffle(shuffle_ind)

    uid_gen = batcher(nnz_users[shuffle_ind], batch_size)
    pid_gen = batcher(nnz_items[shuffle_ind], batch_size)
    nid_gen = batcher(nid_preload[shuffle_ind], batch_size)
    n_samp_gen = batcher(n_sampled_preload[shuffle_ind], batch_size)
    
    for uid_batch, pid_batch, nid_batch in zip(uid_gen, pid_gen, nid_gen):
        user_input = Variable(torch.LongTensor(uid_batch))
        p_item_input = Variable(torch.LongTensor(pid_batch))
        n_item_input = Variable(torch.LongTensor(nid_batch))
        out = net(user_input, p_item_input, n_item_input)
        loss = criterion(*out)
        loss.backward()
        optimizer.step()

    print('AUC: train:{}\t test:{}'.format(metrics.full_auc_pytorch(net, train),
                                           metrics.full_auc_pytorch(net, test)))

Epoch 0	AUC: train:0.874332141053	 test:0.827984265212
Epoch 1	AUC: train:0.891638065107	 test:0.848330492851
Epoch 2	AUC: train:0.899276415027	 test:0.855707976277
Epoch 3	AUC: train:0.905289473245	 test:0.860596252704
Epoch 4	AUC: train:0.909699714718	 test:0.864117215455
Epoch 5	AUC: train:0.913067882038	 test:0.867346372079
Epoch 6	AUC: train:0.915902530391	 test:0.869994445547
Epoch 7	AUC: train:0.918714925594	 test:0.872135623793
Epoch 8	AUC: train:0.920984074984	 test:0.873703225204
Epoch 9	AUC: train:0.92295357669	 test:0.875112524169
CPU times: user 43.9 s, sys: 204 ms, total: 44.1 s
Wall time: 20.8 s
