In [3]:
import scipy.sparse as sp
import pandas as pd
import numpy as np


class Dataset(object):
    """Container for the train / test / negative-sample files of one dataset.

    Given a path prefix, reads three tab-separated files (each with a header
    row): ``<path>.train.rating``, ``<path>.test.rating`` and
    ``<path>.test.negative``.

    Attributes set by ``__init__``:
        trainMatrix: sparse DOK matrix with 1.0 at every (userid, itemid)
            found in the train file.
        testRatings: list of (userid, itemid) tuples from the test file.
        testNegatives: list of lists, one per row of the negative file.
        num_users, num_items: the two dimensions of ``trainMatrix``.
    """

    def __init__(self, path):
        train_file = path + ".train.rating"
        test_file = path + ".test.rating"
        negative_file = path + ".test.negative"

        self.trainMatrix = self.load_rating_file_as_matrix(train_file)
        self.testRatings = self.load_rating_file_as_list(test_file)
        self.testNegatives = self.load_negative_file(negative_file)
        # Row k of the negative file is paired with row k of the test file.
        assert len(self.testRatings) == len(self.testNegatives)

        self.num_users, self.num_items = self.trainMatrix.shape

    def load_rating_file_as_list(self, filename):
        """Return the ``userid`` / ``itemid`` columns as a list of tuples."""
        frame = pd.read_csv(filename, sep="\t")
        user_ids = frame.userid.tolist()
        item_ids = frame.itemid.tolist()
        return [pair for pair in zip(user_ids, item_ids)]

    def load_negative_file(self, filename):
        """Return every column except the first, one Python list per row."""
        frame = pd.read_csv(filename, sep="\t")
        negatives_only = frame.iloc[:, 1:]
        return negatives_only.values.tolist()

    def load_rating_file_as_matrix(self, filename):
        """Return a float32 DOK matrix marking each (userid, itemid) row with 1.0.

        The matrix has ``max(userid) + 1`` rows and ``max(itemid) + 1``
        columns, so ids are used directly as indices.
        """
        frame = pd.read_csv(filename, sep="\t")
        shape = (frame.userid.max() + 1, frame.itemid.max() + 1)
        matrix = sp.dok_matrix(shape, dtype=np.float32)
        # e.g. pairs look like [0, 2969], [0, 1178], ...
        for user, item in frame[['userid', 'itemid']].values.tolist():
            matrix[user, item] = 1.
        return matrix

In [4]:
import math
import heapq
import numpy as np


def evaluate_model(model, testRatings, testNegatives, topK):
    """Evaluate ``model`` on every test rating and collect per-case metrics.

    The arguments are published as module-level globals (``_model``,
    ``_testRatings``, ``_testNegatives``, ``_topK``) because
    ``eval_one_rating`` reads them from there.

    Args:
        model: object handed to ``eval_one_rating`` through ``_model``.
        testRatings: sequence of test cases; its length fixes the number of
            evaluations.
        testNegatives: per-test-case negatives, indexed like ``testRatings``.
        topK: size of the ranked list used for the metrics.

    Returns:
        Tuple ``(hits, ndcgs)`` of two lists with one entry per test rating.
    """
    global _model, _testRatings, _testNegatives, _topK

    _model, _testRatings, _testNegatives, _topK = (
        model, testRatings, testNegatives, topK)

    # One (hr, ndcg) pair per test case, evaluated in index order.
    per_case = [eval_one_rating(idx) for idx in range(len(_testRatings))]
    hits = [hr for hr, _ in per_case]
    ndcgs = [ndcg for _, ndcg in per_case]
    return (hits, ndcgs)


def eval_one_rating(idx):
    """Score one test case: rank the held-out item among its negatives.

    Reads the module-level ``_model``, ``_testRatings``, ``_testNegatives``
    and ``_topK`` globals, which ``evaluate_model`` sets before calling this.

    Args:
        idx: position of the test case in ``_testRatings`` /
            ``_testNegatives``.

    Returns:
        Tuple ``(hr, ndcg)``: the results of ``getHitRatio`` and ``getNDCG``
        for the ``_topK`` highest-scored candidate items.
    """
    rating = _testRatings[idx]
    u = rating[0]
    gtItem = rating[1]

    # Build a fresh candidate list instead of append()/pop() on the shared
    # negatives list: if predict() raises or the cell is interrupted, the
    # in-place version leaves the ground-truth item stuck inside
    # _testNegatives[idx] for every later evaluation / sampling pass.
    items = list(_testNegatives[idx]) + [gtItem]

    users = np.full(len(items), u, dtype='int32')
    predictions = _model.predict([users, np.array(items)],
                                 batch_size=100, verbose=0)

    # item id -> predicted score (a repeated item id keeps its last score)
    map_item_score = {item: predictions[i] for i, item in enumerate(items)}

    ranklist = heapq.nlargest(_topK, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)


def getHitRatio(ranklist, gtItem):
    """Return 1 if ``gtItem`` equals any entry of ``ranklist``, otherwise 0."""
    return int(any(candidate == gtItem for candidate in ranklist))


def getNDCG(ranklist, gtItem):
    """Return the NDCG contribution of ``gtItem`` within ``ranklist``.

    For the first 0-based position ``pos`` whose entry equals ``gtItem`` the
    value is ``log(2) / log(pos + 2)``; if no entry matches, 0 is returned.
    """
    for position, candidate in enumerate(ranklist):
        if candidate == gtItem:
            return math.log(2) / math.log(position + 2)
    return 0


In [5]:
def MLP(n_users, n_items, layers, dropouts, reg):
    """Build the MLP recommender: embeddings -> concat -> Dense stack -> sigmoid.

    Args:
        n_users: ``input_dim`` of the user embedding.
        n_items: ``input_dim`` of the item embedding.
        layers: widths; each embedding is ``int(layers[0] / 2)`` wide and
            ``layers[1:]`` are the hidden Dense widths.
        dropouts: dropout rates; hidden layer ``idx`` uses ``dropouts[idx - 1]``.
        reg: L2 factor applied to both embeddings and the hidden kernels.

    Returns:
        A Keras ``Model`` (not compiled here) mapping ``[user, item]`` id
        inputs to a single sigmoid output.
    """
    user_in = Input(shape=(1,), dtype='int32', name='user_input')
    item_in = Input(shape=(1,), dtype='int32', name='item_input')

    # Each embedding contributes half of the first layer's width.
    embedding_dim = int(layers[0] / 2)

    user_embedding = Embedding(input_dim=n_users, output_dim=embedding_dim,
                               name='user_embedding',
                               embeddings_initializer='normal',
                               embeddings_regularizer=l2(reg),
                               input_length=1)
    item_embedding = Embedding(input_dim=n_items, output_dim=embedding_dim,
                               name='item_embedding',
                               embeddings_initializer='normal',
                               embeddings_regularizer=l2(reg),
                               input_length=1)

    # Flatten the embedding outputs before concatenating them.
    user_vec = Flatten()(user_embedding(user_in))
    item_vec = Flatten()(item_embedding(item_in))
    hidden = concatenate([user_vec, item_vec])

    # Hidden stack: Dense(relu) followed by Dropout, named layer1, layer2, ...
    for depth, width in enumerate(layers[1:], start=1):
        dense = Dense(width, activation="relu", kernel_regularizer=l2(reg),
                      name="layer{}".format(depth))
        hidden = dense(hidden)
        hidden = Dropout(dropouts[depth - 1])(hidden)

    # Single-unit sigmoid output.
    score = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform',
                  name='prediction')(hidden)

    return Model(inputs=[user_in, item_in], outputs=score)

In [6]:
def GMF(n_users, n_items, n_emb, reg):
    """Build the GMF recommender: element-wise product of embeddings -> sigmoid.

    Args:
        n_users: ``input_dim`` of the user embedding.
        n_items: ``input_dim`` of the item embedding.
        n_emb: width of both embeddings.
        reg: L2 factor applied to both embeddings and the output kernel.

    Returns:
        A Keras ``Model`` (not compiled here) mapping ``[user, item]`` id
        inputs to a single sigmoid output.
    """
    user_in = Input(shape=(1,), dtype='int32', name='user_input')
    item_in = Input(shape=(1,), dtype='int32', name='item_input')

    user_embedding = Embedding(input_dim=n_users, output_dim=n_emb,
                               name='user_embedding',
                               embeddings_initializer='normal',
                               embeddings_regularizer=l2(reg),
                               input_length=1)
    item_embedding = Embedding(input_dim=n_items, output_dim=n_emb,
                               name='item_embedding',
                               embeddings_initializer='normal',
                               embeddings_regularizer=l2(reg),
                               input_length=1)

    # Flatten each embedding output, then combine by element-wise product.
    user_vec = Flatten()(user_embedding(user_in))
    item_vec = Flatten()(item_embedding(item_in))
    interaction = multiply([user_vec, item_vec])

    # Single-unit sigmoid output on top of the product vector.
    output_layer = Dense(1, activation='sigmoid',
                         kernel_regularizer=l2(reg),
                         kernel_initializer='lecun_uniform',
                         name='prediction')
    score = output_layer(interaction)

    return Model(inputs=[user_in, item_in], outputs=score)

In [8]:
import numpy as np
import math

def get_train_instances(train, n_items, n_neg, testNegatives):
    """Build training triples: every observed pair plus sampled negatives.

    For each ``(user, item)`` key of ``train`` one positive (label 1) is
    emitted, followed by ``n_neg`` negatives (label 0). A negative item is
    drawn uniformly from ``range(n_items)`` and re-drawn while it is either an
    observed pair for that user or listed in ``testNegatives[user]``.

    Args:
        train: object whose ``keys()`` yields ``(user, item)`` tuples and
            supports membership tests (the DOK train matrix in this notebook).
        n_items: exclusive upper bound for sampled item ids.
        n_neg: number of negatives sampled per positive.
        testNegatives: sequence indexed by user id giving the items reserved
            as test negatives for that user.

    Returns:
        Three equal-length numpy arrays ``(users, items, labels)``.

    Note:
        Sampling never terminates for a user with no admissible item left.
    """
    user, item, labels = [], [], []
    observed = train.keys()  # hoisted: one view reused for every membership test
    # user -> set(testNegatives[user]); built on first use so the per-draw
    # membership test is a hash lookup rather than a list scan.
    excluded_by_user = {}
    for (u, i) in observed:
        # positive instance
        user.append(u)
        item.append(i)
        labels.append(1)
        # negative instances: we also need to make sure they are not in the
        # test dataset
        for _ in range(n_neg):
            if u not in excluded_by_user:
                excluded_by_user[u] = set(testNegatives[u])
            excluded = excluded_by_user[u]
            j = np.random.randint(n_items)
            while (u, j) in observed or j in excluded:
                j = np.random.randint(n_items)
            user.append(u)
            item.append(j)
            labels.append(0)
    return np.array(user), np.array(item), np.array(labels)


def getHitRatio(ranklist, gtItem):
    """Return 1 if ``gtItem`` equals any entry of ``ranklist``, otherwise 0.

    NOTE(review): a function with this name and the same logic is also
    defined in the evaluation cell next to ``evaluate_model``; running this
    cell rebinds the name to this definition.
    """
    hit = 0
    for candidate in ranklist:
        if candidate == gtItem:
            hit = 1
            break
    return hit


def getNDCG(ranklist, gtItem):
    """Return ``log(2) / log(pos + 2)`` for the first match of ``gtItem``.

    ``pos`` is the 0-based index of the first entry of ``ranklist`` equal to
    ``gtItem``; 0 is returned when nothing matches.

    NOTE(review): a function with this name and the same logic is also
    defined in the evaluation cell next to ``evaluate_model``; running this
    cell rebinds the name to this definition.
    """
    position = 0
    for candidate in ranklist:
        if candidate == gtItem:
            return math.log(2) / math.log(position + 2)
        position += 1
    return 0

In [9]:
import numpy as np
import pandas as pd
import os
import heapq
import keras

from keras import initializers
from keras.models import Model, load_model, save_model
from keras.layers import Dense, Embedding, Input, Dropout, Flatten, concatenate, multiply
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from keras.regularizers import l2

from time import time

import pdb


def NeuMF(n_users, n_items, n_emb, layers, dropouts, reg_mf, reg_mlp, reg_out):
    """Build the NeuMF model: a GMF branch and an MLP branch fused by one Dense.

    Args:
        n_users: ``input_dim`` of both user embeddings.
        n_items: ``input_dim`` of both item embeddings.
        n_emb: width of the GMF-branch embeddings.
        layers: MLP widths; MLP embeddings are ``int(layers[0] / 2)`` wide and
            ``layers[1:]`` are the hidden Dense widths.
        dropouts: dropout rates; hidden layer ``idx`` uses ``dropouts[idx - 1]``.
        reg_mf: L2 factor for the GMF-branch embeddings.
        reg_mlp: L2 factor for the MLP-branch embeddings and hidden kernels.
        reg_out: L2 factor for the output kernel.

    Returns:
        A Keras ``Model`` (not compiled here) mapping ``[user, item]`` id
        inputs to a single sigmoid output.
    """
    user_in = Input(shape=(1,), dtype='int32', name='user_input')
    item_in = Input(shape=(1,), dtype='int32', name='item_input')

    # GMF-branch embeddings, output shape [?, 1, n_emb]
    mf_user_embedding = Embedding(
        input_dim=n_users, output_dim=n_emb,
        name='mf_user_embedding', embeddings_initializer='normal',
        embeddings_regularizer=l2(reg_mf), input_length=1)
    mf_item_embedding = Embedding(
        input_dim=n_items, output_dim=n_emb,
        name='mf_item_embedding', embeddings_initializer='normal',
        embeddings_regularizer=l2(reg_mf), input_length=1)

    # MLP-branch embeddings, output shape [?, 1, layers[0] / 2]
    mlp_user_embedding = Embedding(
        input_dim=n_users, output_dim=int(layers[0] / 2),
        name='mlp_user_embedding', embeddings_initializer='normal',
        embeddings_regularizer=l2(reg_mlp), input_length=1)
    mlp_item_embedding = Embedding(
        input_dim=n_items, output_dim=int(layers[0] / 2),
        name='mlp_item_embedding', embeddings_initializer='normal',
        embeddings_regularizer=l2(reg_mlp), input_length=1)

    # GMF part: after flattening each embedding is a flat vector [?, n_emb];
    # the branch output is their element-wise product.
    mf_user_vec = Flatten()(mf_user_embedding(user_in))
    mf_item_vec = Flatten()(mf_item_embedding(item_in))
    mf_branch = multiply([mf_user_vec, mf_item_vec])

    # MLP part: flatten, concatenate to [?, layers[0]], then the Dense stack.
    mlp_user_vec = Flatten()(mlp_user_embedding(user_in))
    mlp_item_vec = Flatten()(mlp_item_embedding(item_in))
    mlp_branch = concatenate([mlp_user_vec, mlp_item_vec])
    for depth, width in enumerate(layers[1:], start=1):
        dense = Dense(width, activation="relu",
                      kernel_regularizer=l2(reg_mlp),
                      name="layer{}".format(depth))
        mlp_branch = dense(mlp_branch)
        mlp_branch = Dropout(dropouts[depth - 1])(mlp_branch)

    # Fuse both branches: [?, n_emb + layers[-1]]
    fused = concatenate([mf_branch, mlp_branch])

    # Final prediction layer, output shape [?, 1]
    output_layer = Dense(1, activation='sigmoid',
                         kernel_regularizer=l2(reg_out),
                         kernel_initializer='lecun_uniform',
                         name='prediction')
    score = output_layer(fused)

    return Model(inputs=[user_in, item_in], outputs=score)


def load_pretrain_model(model, gmf_model, mlp_model, num_layers):
    """Copy weights from a GMF model and an MLP model into a NeuMF ``model``.

    Embeddings and the hidden layers ``layer1 .. layer{num_layers - 1}`` are
    copied as-is. The ``prediction`` layer receives the two source kernels
    concatenated along axis 0 and the sum of the two biases, both scaled
    by 0.5.

    Args:
        model: target model with ``mf_*`` / ``mlp_*`` embedding layers,
            ``layer{i}`` layers and a ``prediction`` layer.
        gmf_model: source of ``user_embedding``, ``item_embedding`` and
            ``prediction`` weights for the GMF branch.
        mlp_model: source of ``user_embedding``, ``item_embedding``,
            ``layer{i}`` and ``prediction`` weights for the MLP branch.
        num_layers: hidden layers ``1 .. num_layers - 1`` are copied.

    Returns:
        The same ``model`` object, updated in place.
    """
    # (target layer in `model`, source model, source layer name)
    transfers = [
        ('mf_user_embedding', gmf_model, 'user_embedding'),
        ('mf_item_embedding', gmf_model, 'item_embedding'),
        ('mlp_user_embedding', mlp_model, 'user_embedding'),
        ('mlp_item_embedding', mlp_model, 'item_embedding'),
    ]
    for depth in range(1, num_layers):
        layer_name = "layer{}".format(depth)
        transfers.append((layer_name, mlp_model, layer_name))

    for target_name, source_model, source_name in transfers:
        weights = source_model.get_layer(source_name).get_weights()
        model.get_layer(target_name).set_weights(weights)

    # Prediction layer: stack both kernels, add both biases, halve each so
    # the two pretrained branches contribute equally.
    gmf_out = gmf_model.get_layer('prediction').get_weights()
    mlp_out = mlp_model.get_layer('prediction').get_weights()
    kernel = np.concatenate((gmf_out[0], mlp_out[0]), axis=0)
    bias = gmf_out[1] + mlp_out[1]
    model.get_layer('prediction').set_weights([0.5*kernel, 0.5*bias])

    return model


if __name__ == '__main__':
    # Training script: build NeuMF, optionally warm-start it from pretrained
    # GMF / MLP weights, then train epoch by epoch and keep the weights of the
    # epoch with the best hit ratio.

    # dir
    datadir = "Data_Javier/"
    dataname = "ml-1m"
    modeldir = "models"

    # general parameter
    epochs = 20
    batch_size = 256
    lr = 0.01
    learner = "adam"

    # GMF
    n_emb = 8
    reg_mf = 0.0

    # MLP
    layers = [64,32,16,8]
    reg_mlp = 0.0
    dropouts = [0,0,0]

    # Output layer
    reg_out = 0.0

    # Pretrained model
    freeze = False

    mf_pretrain_name = ""
    mlp_pretrain_name = ""
    # mf_pretrain_name = "keras_GMF_bs_1024_reg_00_lr_0001_n_emb_16.h5"
    # mlp_pretrain_name = "keras_MLP_bs_256_reg_00_lr_0001_n_emb_128_ll_64_dp_wodp.h5"

    # Experiment
    validate_every = 1
    # NOTE(review): this flag shadows `save_model` imported from keras.models.
    save_model = True
    n_neg = 4
    topK = 10

    mf_pretrain = os.path.join(modeldir, mf_pretrain_name)
    mlp_pretrain = os.path.join(modeldir, mlp_pretrain_name)
    # Pretrained weights are loaded only when BOTH checkpoints exist, so the
    # tag in the output filename is derived from the same condition.
    use_pretrain = os.path.isfile(mf_pretrain) and os.path.isfile(mlp_pretrain)
    with_pretrained = "with_pretrain" if use_pretrain else "without_pretrain"
    is_frozen = "frozen" if freeze else "trainable"

    modelfname = "keras_NeuMF" + \
        "_" + with_pretrained + \
        "_" + is_frozen + \
        "_" + learner + \
        ".h5"

    modelpath = os.path.join(modeldir, modelfname)
    resultsdfpath = os.path.join(modeldir, 'results_df.p')
    if save_model:
        # save_weights() does not create missing directories.
        os.makedirs(modeldir, exist_ok=True)

    # Loading data
    dataset = Dataset(os.path.join(datadir, dataname))
    train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
    n_users, n_items = train.shape

    # Build the model
    model = NeuMF(n_users, n_items, n_emb, layers, dropouts, reg_mf, reg_mlp, reg_out)
    if freeze:
        # Freeze everything except the last two layers.
        for layer in model.layers[:-2]:
            layer.trainable = False
    # Compile unconditionally (after any freezing); unknown learner names
    # fall back to SGD.
    optimizer_by_name = {"adagrad": Adagrad, "rmsprop": RMSprop, "adam": Adam}
    optimizer_cls = optimizer_by_name.get(learner.lower(), SGD)
    model.compile(optimizer=optimizer_cls(lr=lr), loss='binary_crossentropy')

    # Load the pretrained models
    if use_pretrain:
        gmf_model = GMF(n_users, n_items, n_emb, reg_mf)
        gmf_model.load_weights(mf_pretrain)
        mlp_model = MLP(n_users, n_items, layers, dropouts, reg_mlp)
        mlp_model.load_weights(mlp_pretrain)
        model = load_pretrain_model(model, gmf_model, mlp_model, len(layers))
        print("Load pretrained GMF {} and MLP {} models done. ".format(mf_pretrain, mlp_pretrain))

    best_hr, best_ndcg, best_iter = 0,0,0
    for epoch in range(1,epochs+1):
        t1 = time()
        # Generate this epoch's training set (negatives are re-sampled every epoch)
        user, item, labels = get_train_instances(train, n_items, n_neg, testNegatives)
        # Train the model for one pass
        hist = model.fit([user, item], labels, batch_size=batch_size, epochs=1, verbose=0, shuffle=True)
        t2 = time()
        if epoch % validate_every ==0:
            (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK)
            hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), hist.history['loss'][0]
            print("Iteration {}: {:.2f}s, HR = {:.4f}, NDCG = {:.4f}, loss = {:.4f}, validated in {:.2f}s"
                .format(epoch, t2-t1, hr, ndcg, loss, time()-t2))
            if hr > best_hr:
                best_hr, best_ndcg, best_iter, train_time = hr, ndcg, epoch, t2-t1
                if save_model:
                    model.save_weights(modelpath, overwrite=True)

    print("End. Best Iteration {}:  HR = {:.4f}, NDCG = {:.4f}. ".format(best_iter, best_hr, best_ndcg))
    # best_iter stays 0 when no epoch improved on the initial HR, in which
    # case nothing was written to modelpath.
    if save_model and best_iter > 0:
        print("The best NeuCF model is saved to {}".format(modelpath))









Iteration 1: 164.85s, HR = 0.5460, NDCG = 0.3035, loss = 0.3364, validated in 11.84s
Iteration 2: 164.55s, HR = 0.6136, NDCG = 0.3416, loss = 0.2956, validated in 16.15s
Iteration 3: 158.22s, HR = 0.6161, NDCG = 0.3475, loss = 0.2836, validated in 16.66s
Iteration 4: 148.73s, HR = 0.6343, NDCG = 0.3657, loss = 0.2748, validated in 15.02s
Iteration 5: 161.11s, HR = 0.6502, NDCG = 0.3780, loss = 0.2677, validated in 14.43s
Iteration 6: 142.70s, HR = 0.6531, NDCG = 0.3813, loss = 0.2625, validated in 11.48s
Iteration 7: 95.38s, HR = 0.6553, NDCG = 0.3844, loss = 0.2589, validated in 4.93s
Iteration 8: 98.71s, HR = 0.6636, NDCG = 0.3894, loss = 0.2557, validated in 5.98s
Iteration 9: 86.98s, HR = 0.6680, NDCG = 0.3943, loss = 0.2533, validated in 5.04s
Iteration 10: 81.48s, HR = 0.6642, NDCG = 0.3923, loss = 0.2515, validated in 5.08s
Iteration 11: 81.73s, HR = 0.6694, NDCG = 0.3968, loss = 0.2497, validated in 4.84s
Iteration 12: 81.75s, HR = 0.6611, NDCG = 0.3869, loss = 0.2480, 