## **Neural Collaborative Filtering**

This notebook is based on the implementation by the author of the "Neural Collaborative Filtering" paper. [Link](https://github.com/hexiangnan/neural_collaborative_filtering)

Neural Collaborative Filtering
This is our implementation for the paper:

Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu and Tat-Seng Chua (2017). Neural Collaborative Filtering. In Proceedings of WWW '17, Perth, Australia, April 03-07, 2017.

Three collaborative filtering models: Generalized Matrix Factorization (GMF), Multi-Layer Perceptron (MLP), and Neural Matrix Factorization (NeuMF). To target the models for implicit feedback and ranking task, we optimize them using log loss with negative sampling.

Please cite our WWW'17 paper if you use our codes. Thanks!

Author: [Dr. Xiangnan He](http://www.comp.nus.edu.sg/~xiangnan/)

In [None]:
'''
Created on Aug 9, 2016
Updated on May 20, 2018

Keras Implementation of Generalized Matrix Factorization (GMF) recommender model in:
He Xiangnan et al. Neural Collaborative Filtering. In WWW 2017.  

@original author: Xiangnan He (xiangnanhe@gmail.com)
@Updated and placed on notebooks: Guy Shtar (shtar@post.bgu.ac.il)
'''

import numpy as np
import tensorflow as T
from tensorflow import keras
from keras import backend as K
from keras import initializers
from keras.initializers import RandomNormal
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Reshape,  Flatten, Dropout
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from keras.regularizers import l2
from keras.layers import Multiply, Concatenate
from time import time
import multiprocessing as mp
import sys
import math
import argparse

Dataset handling

In [None]:
import scipy.sparse as sp
import numpy as np

class Dataset(object):
    '''
    Loader for an implicit-feedback dataset stored as three tab-separated files.

    Attributes set by the constructor:
      trainMatrix   -- scipy dok matrix (float32) holding 1.0 at every
                       (user, item) pair whose training rating is > 0
      testRatings   -- list of [user, item] pairs, one per test line
      testNegatives -- list of lists of item ids, one list per test line
      num_users, num_items -- shape of trainMatrix
    '''

    def __init__(self, path):
        '''
        Load the three files that share the prefix `path`:
        path + ".train.rating", path + ".test.rating" and path + ".test.negative".
        '''
        self.trainMatrix = self.load_rating_file_as_matrix(path + ".train.rating")
        self.testRatings = self.load_rating_file_as_list(path + ".test.rating")
        self.testNegatives = self.load_negative_file(path + ".test.negative")
        assert len(self.testRatings) == len(self.testNegatives)

        self.num_users, self.num_items = self.trainMatrix.shape

    @staticmethod
    def _iter_fields(filename):
        '''
        Yield the tab-separated fields of every non-blank line of `filename`.
        Blank lines (e.g. a trailing empty line) are skipped instead of
        crashing the parser.
        '''
        with open(filename, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                yield line.split("\t")

    def load_rating_file_as_list(self, filename):
        '''
        Read a .rating file and return a list of [user, item] pairs
        (the first two columns of each line, parsed as int).
        '''
        return [[int(arr[0]), int(arr[1])] for arr in self._iter_fields(filename)]

    def load_negative_file(self, filename):
        '''
        Read a .negative file and return one list of item ids per line.
        The first column of each line is skipped; the remaining columns
        are parsed as int.
        '''
        return [[int(x) for x in arr[1:]] for arr in self._iter_fields(filename)]

    def load_rating_file_as_matrix(self, filename):
        '''
        Read a .rating file and return a dok matrix.

        The matrix shape is (max user id + 1, max item id + 1), where the
        maxima are taken over every line of the file. An entry is set to 1.0
        for each line whose rating (third column) is greater than 0.
        '''
        # A single pass over the file: track the largest ids (over all rows)
        # and remember only the positive interactions.
        num_users, num_items = 0, 0
        positives = []
        for arr in self._iter_fields(filename):
            user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
            num_users = max(num_users, user)
            num_items = max(num_items, item)
            if rating > 0:
                positives.append((user, item))

        # Construct matrix
        mat = sp.dok_matrix((num_users + 1, num_items + 1), dtype=np.float32)
        for user, item in positives:
            mat[user, item] = 1.0
        return mat

Model Evaluation

In [None]:
import math
import heapq # for retrieval topK
import multiprocessing
import numpy as np
from time import time
#from numba import jit, autojit

# Global variables that are shared across processes
# They are assigned by evaluate_model() and read by eval_one_rating().
_model = None  # model whose predict() scores (user, item) pairs
_testRatings = None  # test ratings; each entry is indexed as [user, item]
_testNegatives = None  # per-test-rating list of negative item ids
_K = None  # length of the top-K ranking list

def evaluate_model(model, testRatings, testNegatives, K, num_thread):
    """
    Score every test rating with top-K Hit Ratio and NDCG.

    The arguments are first published through the module-level globals
    (_model, _testRatings, _testNegatives, _K) so that eval_one_rating can
    read them. With num_thread > 1 the per-rating work is mapped over a
    multiprocessing pool; otherwise it runs in the current process.

    Return: (hits, ndcgs), two lists with one entry per test rating.
    """
    global _model, _testRatings, _testNegatives, _K
    _model, _testRatings, _testNegatives, _K = model, testRatings, testNegatives, K

    rating_indices = range(len(_testRatings))
    if num_thread > 1:
        # Parallel path: one (hr, ndcg) tuple per test rating
        workers = multiprocessing.Pool(processes=num_thread)
        results = workers.map(eval_one_rating, rating_indices)
        workers.close()
        workers.join()
    else:
        # Sequential path
        results = [eval_one_rating(position) for position in rating_indices]

    hits = [outcome[0] for outcome in results]
    ndcgs = [outcome[1] for outcome in results]
    return (hits, ndcgs)

def eval_one_rating(idx):
    """
    Evaluate the test rating at position idx of the shared _testRatings.

    The ground-truth item is scored by _model together with the items of
    _testNegatives[idx]; the _K highest-scoring items form the ranking list.

    Return: (hr, ndcg) for this test rating.
    """
    rating = _testRatings[idx]
    u = rating[0]
    gtItem = rating[1]
    # Build a fresh candidate list rather than appending to (and later popping
    # from) the shared negatives list: if predict() raised in between, the
    # ground-truth item would stay inside _testNegatives[idx] for good.
    items = list(_testNegatives[idx]) + [gtItem]

    # Get prediction scores
    users = np.full(len(items), u, dtype='int32')
    predictions = _model.predict([users, np.array(items)],
                                 batch_size=100, verbose=0)
    # Later duplicates of an item id overwrite earlier ones, as before.
    map_item_score = dict(zip(items, predictions))

    # Evaluate top rank list
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)

def getHitRatio(ranklist, gtItem):
    """Return 1 if gtItem occurs in ranklist, otherwise 0."""
    return int(any(candidate == gtItem for candidate in ranklist))

def getNDCG(ranklist, gtItem):
    """
    Return log(2) / log(position + 2) for the first (0-based) position at
    which gtItem occurs in ranklist, or 0 when it does not occur.
    """
    gains = (
        math.log(2) / math.log(position + 2)
        for position, candidate in enumerate(ranklist)
        if candidate == gtItem
    )
    return next(gains, 0)

In [None]:
!ls drive/MyDrive/RS/Data

item_price.csv	   ml-1m.test.negative	ml-1m.train.rating
item_priceold.csv  ml-1m.test.rating


You can download the dataset from the author's github or from my drive:

[ml-1m.test.negative](https://drive.google.com/file/d/1v3XEN7pjtsjzxx5fuioNbIzMOyom1nNn/view?usp=sharing)

[ml-1m.test.rating](https://drive.google.com/file/d/1TldYS-vtNVAFPXvDTYuuhi3cky37yXK0/view?usp=sharing)

[ml-1m.train.rating](https://drive.google.com/file/d/1rFxJ8rG9LVczeCmKC7AXo2Y6sFXo7maG/view?usp=sharing)

In [None]:
# Loading data
path='drive/MyDrive/RS/Data/'
dataset='ml-1m'
#dataset='pinterest-20'
t1 = time()
# Keep `dataset` bound to the dataset *name*: it is later formatted with %s
# into model_out_file, where a Dataset object would render as an object repr.
# The loaded data therefore lives under its own name.
loaded_dataset = Dataset(path + dataset)
train, testRatings, testNegatives = loaded_dataset.trainMatrix, loaded_dataset.testRatings, loaded_dataset.testNegatives
num_users, num_items = train.shape
print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d" 
      %(time()-t1, num_users, num_items, train.nnz, len(testRatings)))

Load data done [20.5 s]. #user=6040, #item=3706, #train=994169, #test=6040


In [None]:
num_items

3706

In [None]:
type(train)

scipy.sparse.dok.dok_matrix

Add Negative Sampling to Train Data

In [None]:
def get_train_instances(train, num_negatives):
    """
    Build one epoch of training instances with negative sampling.

    train: mapping keyed by (user, item) pairs that also exposes .shape,
           such as the dok matrix produced by Dataset.
    num_negatives: number of negative items sampled for each positive pair.

    For every (user, item) key a positive instance (label 1) is emitted,
    followed by num_negatives instances (label 0) whose item is drawn
    uniformly from range(train.shape[1]) and re-drawn while (user, item) is
    present in train. Note that the re-draw loop cannot terminate for a user
    that has an entry for every item.

    Return: (user_input, item_input, labels), three parallel lists.
    """
    user_input, item_input, labels = [], [], []
    # Take the item count from the matrix itself instead of relying on a
    # module-level `num_items` that may be missing or describe other data.
    num_items = train.shape[1]
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for _ in range(num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in train:
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

Define three models: GMF, MLP and NMF

In [None]:
def get_GMF_model(num_users, num_items, latent_dim, regs=[[0,0]]):
    """
    Build the Generalized Matrix Factorization (GMF) Keras model.

    num_users / num_items: input_dim of the user / item embedding tables.
    latent_dim: output_dim of both embeddings.
    regs: regs[0][0] and regs[0][1] are the L2 factors applied to the user
          and item embeddings respectively.

    Return: an uncompiled Model taking [user_input, item_input] and producing
    a single sigmoid output.
    """
    # One integer id per sample for each input
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    user_table = Embedding(
        input_dim=num_users, output_dim=latent_dim, name='user_embedding',
        embeddings_regularizer=l2(regs[0][0]), input_length=1,
        embeddings_initializer=RandomNormal(mean=0.0, stddev=0.01))
    item_table = Embedding(
        input_dim=num_items, output_dim=latent_dim, name='item_embedding',
        embeddings_regularizer=l2(regs[0][1]), input_length=1,
        embeddings_initializer=RandomNormal(mean=0.0, stddev=0.01))

    # The embedding lookups are flattened before being combined
    user_latent = Flatten()(user_table(user_input))
    item_latent = Flatten()(item_table(item_input))

    # Element-wise product of the two latent vectors
    interaction = Multiply()([user_latent, item_latent])

    # Single sigmoid unit on top of the interaction vector
    output_layer = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name='prediction')
    prediction = output_layer(interaction)

    return Model(inputs=[user_input, item_input], outputs=prediction)


def get_MLP_model(num_users, num_items, latent_dim, regs=[[0,0],0,0], layers = [20,10]):
    """
    Build the Multi-Layer Perceptron (MLP) Keras model.

    num_users / num_items: input_dim of the user / item embedding tables.
    latent_dim: output_dim of both embeddings.
    regs: regs[0] holds the [user, item] embedding L2 factors; regs[k + 1] is
          the kernel L2 factor of hidden layer k.
    layers: number of units of each hidden (relu) layer, in order.

    Return: an uncompiled Model taking [user_input, item_input] and producing
    a single sigmoid output.
    """
    assert len(layers) + 1 == len(regs), 'the number of regs is equal to number of layers + the embedding layer'

    # One integer id per sample for each input
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    user_table = Embedding(
        input_dim=num_users, output_dim=latent_dim, name='user_embedding',
        embeddings_regularizer=l2(regs[0][0]), input_length=1,
        embeddings_initializer=RandomNormal(mean=0.0, stddev=0.01))
    item_table = Embedding(
        input_dim=num_items, output_dim=latent_dim, name='item_embedding',
        embeddings_regularizer=l2(regs[0][1]), input_length=1,
        embeddings_initializer=RandomNormal(mean=0.0, stddev=0.01))

    # The embedding lookups are flattened before being combined
    user_latent = Flatten()(user_table(user_input))
    item_latent = Flatten()(item_table(item_input))

    # Concatenate the two latent vectors and feed them through the hidden layers
    hidden = Concatenate(axis=-1)([user_latent, item_latent])
    for depth, units in enumerate(layers):
        dense = Dense(units, kernel_regularizer=l2(regs[depth + 1]), activation='relu', name='layer%d' % depth)
        hidden = dense(hidden)

    # Single sigmoid unit on top of the last hidden layer
    prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name='prediction')(hidden)
    return Model(inputs=[user_input, item_input], outputs=prediction)

def get_NMF_model(num_users, num_items, latent_dim_GMF, latent_dim_MLP, reg_GMF=[[0,0]], regs_MLP=[[0,0],0,0], layers=[20,10]):
    """
    Build the Neural Matrix Factorization model: a GMF branch and an MLP
    branch with separate embeddings, concatenated before the output unit.

    num_users / num_items: input_dim of the user / item embedding tables.
    latent_dim_GMF / latent_dim_MLP: embedding size of each branch.
    reg_GMF: reg_GMF[0] holds the [user, item] L2 factors of the GMF embeddings.
    regs_MLP: regs_MLP[0] holds the [user, item] L2 factors of the MLP
              embeddings; regs_MLP[k + 1] is the kernel L2 factor of hidden layer k.
    layers: number of units of each hidden layer of the MLP branch, in order.

    Return: an uncompiled Model taking [user_input, item_input] and producing
    a single sigmoid output.
    """
    assert len(layers) + 1 == len(regs_MLP), 'the number of regs is equal to number of layers + the embedding layer'

    # One integer id per sample for each input
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Embedding tables of the GMF branch
    gmf_user_table = Embedding(
        input_dim=num_users, output_dim=latent_dim_GMF, name='MF_user_embedding',
        embeddings_regularizer=l2(reg_GMF[0][0]), input_length=1,
        embeddings_initializer=RandomNormal(mean=0.0, stddev=0.01))
    gmf_item_table = Embedding(
        input_dim=num_items, output_dim=latent_dim_GMF, name='MF_item_embedding',
        embeddings_regularizer=l2(reg_GMF[0][1]), input_length=1,
        embeddings_initializer=RandomNormal(mean=0.0, stddev=0.01))

    # Embedding tables of the MLP branch
    mlp_user_table = Embedding(
        input_dim=num_users, output_dim=latent_dim_MLP, name='MLP_user_embedding',
        embeddings_regularizer=l2(regs_MLP[0][0]), input_length=1,
        embeddings_initializer=RandomNormal(mean=0.0, stddev=0.01))
    mlp_item_table = Embedding(
        input_dim=num_items, output_dim=latent_dim_MLP, name='MLP_item_embedding',
        embeddings_regularizer=l2(regs_MLP[0][1]), input_length=1,
        embeddings_initializer=RandomNormal(mean=0.0, stddev=0.01))

    # GMF branch: element-wise product of the flattened embeddings
    gmf_user_latent = Flatten()(gmf_user_table(user_input))
    gmf_item_latent = Flatten()(gmf_item_table(item_input))
    gmf_vector = Multiply()([gmf_user_latent, gmf_item_latent])

    # MLP branch: concatenated flattened embeddings through the hidden layers
    mlp_user_latent = Flatten()(mlp_user_table(user_input))
    mlp_item_latent = Flatten()(mlp_item_table(item_input))
    hidden = Concatenate(axis=-1)([mlp_user_latent, mlp_item_latent])
    # NOTE(review): this tower uses tanh while get_MLP_model uses relu -- confirm this is intended.
    for depth, units in enumerate(layers):
        dense = Dense(units, kernel_regularizer=l2(regs_MLP[depth + 1]), activation='tanh', name='layer%d' % depth)
        hidden = dense(hidden)

    # Join both branches and apply the single sigmoid output unit
    joined = Concatenate(axis=-1)([gmf_vector, hidden])
    prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name="prediction")(joined)

    return Model(inputs=[user_input, item_input], outputs=prediction)

In [None]:
# Hyper-parameters and run settings
num_factors = 8 #size of embedding size. Can be split to 4 different params potentially.
num_negatives = 4 #how many negative samples per positive sample?
learning_rate = 0.001
epochs = 10
batch_size = 256
verbose = 1 #evaluate the model every `verbose` epochs
write_model=False #save the weights of the best epoch to model_out_file?
topK = 10 #used to evaluate the model. Top K recommendations are used.
evaluation_threads = 1 
model_out_file = 'Pretrain/%s_GMF_%d_%d.h5' %(dataset, num_factors, time())

# Build model
#model = get_GMF_model(num_users, num_items, num_factors, regs = [[0,0]])
#model = get_MLP_model(num_users, num_items, num_factors, regs = [[0,0],0,0,0], layers = [32,16,8])
model = get_NMF_model(num_users, num_items, latent_dim_GMF=num_factors, latent_dim_MLP=num_factors, reg_GMF=[[0,0]],
                      regs_MLP=[[0,0],0,0,0], layers=[32,16,8])
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

# `learning_rate` is the supported argument name; the `lr` alias is deprecated
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy')

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
MLP_user_embedding (Embedding)  (None, 1, 8)         48320       user_input[0][0]                 
__________________________________________________________________________________________________
MLP_item_embedding (Embedding)  (None, 1, 8)         29648       item_input[0][0]                 
______________________________________________________________________________________________

Random Baseline

In [None]:
# Performance of the model before any training epoch
t1 = time()
hits, ndcgs = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads)
# Average the per-rating scores over all test ratings
hr = np.array(hits).mean()
ndcg = np.array(ndcgs).mean()
print('Init: HR = %.4f, NDCG = %.4f\t [%.1f s]' % (hr, ndcg, time()-t1))


Init: HR = 0.0892, NDCG = 0.0402	 [238.8 s]


In [None]:
import numpy as np
# Draw one random price per item that appears in the training matrix
item_price = {}
price_levels = [1,2,4,9,25]
price_probabilities = [0.35,0.25,0.2,0.15,0.05]
for (u,i) in train.keys():
    if i in item_price:
        # this item already received its price
        continue
    item_price[i] = np.random.choice(price_levels, p=price_probabilities)

In [None]:
len(item_price)

3704

In [None]:
# Same item -> price mapping, with the entries inserted in ascending item id order
item_price_sorted = {item_id: item_price[item_id] for item_id in sorted(item_price)}

In [None]:
path

'drive/MyDrive/RS/Data/'

In [None]:
import csv
# Write the sorted prices as CSV: a header line, then one "item,price" line per item
with open(path+'item_price_test.csv', 'w') as f:
    f.write('item,price'+'\n')
    for key, price in item_price_sorted.items():
        f.write("%s,%s\n"%(key, price))

In [None]:
import numpy as np
# Draw a random price for every item id in range(num_items) and write
# each "item,price" line (no header) as soon as the price is drawn
item_price = {}
price_levels = [1,2,4,9,25]
price_probabilities = [0.35,0.25,0.2,0.15,0.05]
with open(path+'item_price.csv', 'w') as f:
    for i in range(num_items):
        price = np.random.choice(price_levels, p=price_probabilities)
        item_price[i] = price
        f.write("%s,%s\n"%(i, price))

In [None]:
len(item_price)

3706

In [None]:
# Train model
# Start from the scores measured before training; best_iter == -1 means
# that no epoch has improved on them.
best_hr, best_ndcg, best_iter = hr, ndcg, -1
for epoch in range(epochs):
    t1 = time()
    # Generate training instances (negatives are re-sampled every epoch)
    user_input, item_input, labels = get_train_instances(train, num_negatives)

    # Training: one pass over the freshly sampled instances
    hist = model.fit([np.array(user_input), np.array(item_input)], #input
                     np.array(labels), # labels 
                     batch_size=batch_size, epochs=1, verbose=0, shuffle=True)
    t2 = time()

    # Evaluation every `verbose` epochs. The `verbose > 0` guard avoids a
    # ZeroDivisionError when verbose is 0; in that case no epoch is evaluated.
    if verbose > 0 and epoch % verbose == 0:
        (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads)
        hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), hist.history['loss'][0]
        print('Iteration %d [%.1f s]: HR = %.4f, NDCG = %.4f, loss = %.4f [%.1f s]' 
              % (epoch,  t2-t1, hr, ndcg, loss, time()-t2))
        # Model selection is driven by hit ratio only
        if hr > best_hr:
            best_hr, best_ndcg, best_iter = hr, ndcg, epoch
            if write_model:
                model.save_weights(model_out_file, overwrite=True)

print("End. Best Iteration %d:  HR = %.4f, NDCG = %.4f. " %(best_iter, best_hr, best_ndcg))
if write_model:
    print("The best GMF model is saved to %s" %(model_out_file))

Iteration 0 [54.4 s]: HR = 0.5955, NDCG = 0.3354, loss = 0.3196 [226.4 s]
Iteration 1 [54.7 s]: HR = 0.6316, NDCG = 0.3651, loss = 0.2771 [228.6 s]
Iteration 2 [55.1 s]: HR = 0.6435, NDCG = 0.3755, loss = 0.2683 [226.7 s]
Iteration 3 [53.9 s]: HR = 0.6575, NDCG = 0.3832, loss = 0.2641 [227.7 s]
Iteration 4 [53.2 s]: HR = 0.6573, NDCG = 0.3845, loss = 0.2614 [226.3 s]
Iteration 5 [53.3 s]: HR = 0.6608, NDCG = 0.3875, loss = 0.2592 [227.8 s]
Iteration 6 [53.8 s]: HR = 0.6636, NDCG = 0.3917, loss = 0.2575 [227.2 s]
Iteration 7 [52.9 s]: HR = 0.6659, NDCG = 0.3908, loss = 0.2559 [230.8 s]
Iteration 8 [53.1 s]: HR = 0.6677, NDCG = 0.3920, loss = 0.2546 [227.9 s]
Iteration 9 [53.3 s]: HR = 0.6684, NDCG = 0.3918, loss = 0.2536 [226.0 s]
End. Best Iteration 9:  HR = 0.6684, NDCG = 0.3918. 
