In [1]:
#Bayesian Personalized Ranking (BPR), the baseline provided by Xiangnan He in Neural Collaborative Filtering

In [2]:
#For reference (HR = hit ratio):
#Factor = 8, HR = 0.63, NDCG = 0.36
#Factor = 16, HR = 0.66, NDCG = 0.39
#Factor = 32, HR = 0.68, NDCG = 0.41
#Factor = 64, HR = 0.68, NDCG = 0.41-0.42

In [4]:
import numpy as np
import theano
import theano.tensor as T
import time
import multiprocessing as mp
from math import *

In [11]:
class BPR(object):
    """Bayesian Personalized Ranking matrix factorization trained with Theano.

    Every user and every item gets a `factors`-dimensional latent vector; the
    predicted preference of a user for an item is the dot product of the two.
    Training minimizes the pairwise loss -log(sigmoid(x_ui - x_uj)) plus L2
    regularization over sampled (user, positive item, negative item) triples.
    """

    def __init__(self, train, test, num_user, num_item, num_records, factors, learning_rate, reg_rate, init_mean, init_stddev, negative_pairs):
        """Store hyper-parameters, index the training data and compile the SGD step.

        Args:
            train: iterable of training interactions. Each record is either a
                "user,item" text line or an indexable (user, item, ...) pair.
            test: sequence indexed by user id; element [1] of each entry is the
                held-out item id of that user.
            num_user: number of users (rows of the user factor matrix).
            num_item: number of items (rows of the item factor matrix).
            num_records: number of training interactions; sets how many
                mini-batches make up one epoch.
            factors: dimensionality of the latent vectors.
            learning_rate: SGD step size.
            reg_rate: L2 regularization coefficient.
            init_mean: mean of the normal distribution used for initialization.
            init_stddev: standard deviation of that distribution.
            negative_pairs: negative_pairs[u] is the list of item ids the
                held-out item of user u is ranked against during evaluation.
        """
        self.train = train
        self.test = test
        self.num_user = num_user
        self.num_item = num_item
        self.num_records = num_records
        self.factors = factors
        self.learning_rate = learning_rate
        self.reg_rate = reg_rate
        self.init_mean = init_mean
        self.init_stddev = init_stddev
        self.negative_pairs = negative_pairs

        # Initial gamma_u and gamma_i, kept as Theano shared variables so the
        # compiled SGD step can update them in place.
        gamma_U_init = np.random.normal(loc=init_mean, scale=init_stddev, size=(num_user, factors))
        gamma_I_init = np.random.normal(loc=init_mean, scale=init_stddev, size=(num_item, factors))
        self.gamma_U = theano.shared(value=gamma_U_init.astype(theano.config.floatX), name='U', borrow=True)
        self.gamma_I = theano.shared(value=gamma_I_init.astype(theano.config.floatX), name='I', borrow=True)

        # Build item-of-user: items_of_user[u] lists the training items of user u.
        self.items_of_user = [[] for _ in range(num_user)]
        for record in train:
            user, item = self._parse_pair(record)
            self.items_of_user[user].append(item)

        # Only users with at least one positive item AND at least one item they
        # have not interacted with can yield a (u, i, j) training triple.
        self._active_users = np.array(
            [user for user, items in enumerate(self.items_of_user)
             if items and len(set(items)) < num_item],
            dtype='int64')

        u = T.lvector('u')
        i = T.lvector('i')
        j = T.lvector('j')
        lr = T.scalar('lr')

        # Row-wise dot products: one score per (user, item) pair of the batch.
        x_ui = (self.gamma_U[u] * self.gamma_I[i]).sum(axis=1)
        x_uj = (self.gamma_U[u] * self.gamma_I[j]).sum(axis=1)
        regularization = self.reg_rate * ((self.gamma_U[u] ** 2).sum()
                                          + (self.gamma_I[i] ** 2).sum()
                                          + (self.gamma_I[j] ** 2).sum())
        loss = regularization - T.sum(T.log(T.nnet.sigmoid(x_ui - x_uj)))

        # SGD optimizer: one call performs one gradient step on both matrices.
        self.SGDoptimizer = theano.function(
            [u, i, j, lr], [],
            updates=[(self.gamma_U, self.gamma_U - lr * T.grad(loss, self.gamma_U)),
                     (self.gamma_I, self.gamma_I - lr * T.grad(loss, self.gamma_I))])

    @staticmethod
    def _parse_pair(record):
        """Return (user, item) as ints from a "user,item" line or an indexable pair."""
        if isinstance(record, str):
            fields = record.strip().split(',')
            return int(fields[0]), int(fields[1])
        return int(record[0]), int(record[1])

    def get_next_batch(self, batch_size):
        """Sample `batch_size` (user, positive item, negative item) triples.

        Users are drawn uniformly among those that can produce a triple, the
        positive item uniformly from the user's training items, and the
        negative item uniformly from the items absent from the user's training
        items (by rejection sampling).

        Returns:
            Three int64 arrays of length `batch_size`: users, positive items,
            negative items.

        Raises:
            ValueError: if no user can produce a training triple.
        """
        if len(self._active_users) == 0:
            raise ValueError("no user has both a positive and a negative item to sample")

        batch_u = np.random.choice(self._active_users, size=batch_size)
        batch_i = np.empty(batch_size, dtype='int64')
        batch_j = np.empty(batch_size, dtype='int64')
        for k, user in enumerate(batch_u):
            positives = self.items_of_user[user]
            batch_i[k] = positives[np.random.randint(len(positives))]
            negative = np.random.randint(self.num_item)
            while negative in positives:
                negative = np.random.randint(self.num_item)
            batch_j[k] = negative
        return batch_u, batch_i, batch_j

    def build_model(self, num_epoch = 100, batch_size = 1000):
        """Train for `num_epoch` epochs and print HitRatio/NDCG after each one.

        One epoch consists of num_records // batch_size SGD steps on freshly
        sampled mini-batches.
        """
        print("Training BPR with learning_rate:%f, regularization:%f, #factors:%d, #epoch:%d, batch_size:%d"%\
             (self.learning_rate, self.reg_rate, self.factors, num_epoch, batch_size))
        batches_per_epoch = self.num_records // batch_size
        topK = 10
        for ep in range(num_epoch):
            t1 = time.time()
            for _ in range(batches_per_epoch):
                batch_u, batch_i, batch_j = self.get_next_batch(batch_size)
                self.SGDoptimizer(batch_u, batch_i, batch_j, self.learning_rate)

            # t2 - t1 is the training time of this epoch; evaluation is timed separately.
            t2 = time.time()
            # Snapshot the current factors as NumPy arrays for predict().
            self.U_cur = self.gamma_U.get_value()
            self.I_cur = self.gamma_I.get_value()
            hit_rate, NDCG = self.evaluate_model(top = topK, negative_pairs = self.negative_pairs)
            print("Epoch = %d[%.1f s], HitRatio@%d = %f, NDCG%d = %f[%.1f s]"%\
                  (ep, t2-t1, topK, hit_rate, topK, NDCG, time.time()-t2))

    def predict(self, u, i):
        """Return the predicted score of item `i` for user `u` from the last snapshot."""
        return np.dot(self.U_cur[u], self.I_cur[i])

    def evaluate_model(self, negative_pairs, top = 10):
        """Compute the hit ratio and NDCG at `top`, averaged over all users.

        For each user the held-out item self.test[user][1] is ranked against
        every item in negative_pairs[user]. Its rank is the number of negatives
        scoring strictly higher, so the held-out item wins ties.

        Returns:
            (hit_ratio, ndcg) as floats; (0.0, 0.0) when there are no users.
        """
        if self.num_user == 0:
            return (0.0, 0.0)

        hit = 0
        total_ndcg = 0.0
        for user in range(self.num_user):
            score_truth = self.predict(user, self.test[user][1])
            negatives = np.asarray(negative_pairs[user], dtype='int64')
            negative_scores = np.dot(self.I_cur[negatives], self.U_cur[user])
            rank_truth = int((negative_scores > score_truth).sum())
            if rank_truth < top:
                hit += 1
                # Equals log(2) / log(rank + 2): the gain of a single relevant item.
                total_ndcg += 1.0 / np.log2(rank_truth + 2)

        return (hit / float(self.num_user), total_ndcg / float(self.num_user))
                
            
            

In [13]:
# Load the training interactions: one comma-separated "user,item" pair per line.
train_data = []
with open('train_data.txt', 'r') as infile:
    for line in infile:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        fields = line.split(',')
        train_data.append((int(fields[0]), int(fields[1])))

In [14]:
# Load the held-out interactions: one comma-separated "user,item" pair per line.
test_data = []
with open('test_data.txt', 'r') as infile:
    for line in infile:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        fields = line.split(',')
        test_data.append((int(fields[0]), int(fields[1])))

In [15]:
# Load the evaluation negatives: each "user,item" line appends the item to the
# list of that user, so neg_pairs[user] collects all negatives of the user.
NUM_USERS = 6040  # NOTE(review): presumably the user count of the dataset — confirm
neg_pairs = [[] for _ in range(NUM_USERS)]
with open('negative_pairs.txt', 'r') as infile:
    for line in infile:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        fields = line.split(',')
        neg_pairs[int(fields[0])].append(int(fields[1]))