In [1]:
import scipy.sparse as sp
import pandas as pd
import numpy as np


class Dataset(object):
    """Container for the train/test files that share a common path prefix.

    Three tab-separated files (each with a header row) are read:

    * ``<path>.train.rating``  -> ``trainMatrix`` (sparse DOK matrix of 1.0s)
    * ``<path>.test.rating``   -> ``testRatings`` (list of (userid, itemid))
    * ``<path>.test.negative`` -> ``testNegatives`` (one list per row)

    ``num_users`` and ``num_items`` are taken from the shape of the
    training matrix.
    """

    def __init__(self, path):
        """Load the three files located at ``path`` + suffix."""
        self.trainMatrix = self.load_rating_file_as_matrix(path + ".train.rating")
        self.testRatings = self.load_rating_file_as_list(path + ".test.rating")
        self.testNegatives = self.load_negative_file(path + ".test.negative")
        # every test rating needs its own row of negatives
        assert len(self.testRatings) == len(self.testNegatives)

        self.num_users, self.num_items = self.trainMatrix.shape

    def load_rating_file_as_list(self, filename):
        """Return the ``userid``/``itemid`` columns as a list of 2-tuples."""
        frame = pd.read_csv(filename, sep="\t")
        user_column = frame.userid.tolist()
        item_column = frame.itemid.tolist()
        return [pair for pair in zip(user_column, item_column)]

    def load_negative_file(self, filename):
        """Return every row as a list, with the first column dropped."""
        frame = pd.read_csv(filename, sep="\t")
        without_first_column = frame.iloc[:, 1:]
        return without_first_column.values.tolist()

    def load_rating_file_as_matrix(self, filename):
        """Build a float32 DOK matrix holding 1.0 at each (userid, itemid).

        The matrix has ``max(userid) + 1`` rows and ``max(itemid) + 1``
        columns, so ids are used directly as indices.
        """
        frame = pd.read_csv(filename, sep="\t")
        shape = (frame.userid.max() + 1, frame.itemid.max() + 1)
        matrix = sp.dok_matrix(shape, dtype=np.float32)
        # rows look like [0, 2969], [0, 1178], [0, 1574], ...
        for row in frame[['userid', 'itemid']].values.tolist():
            matrix[row[0], row[1]] = 1.
        # matrix now maps (0, 2969) -> 1.0, (0, 1178) -> 1.0, ...
        return matrix

In [2]:
import math
import heapq
import numpy as np


def evaluate_model(model, testRatings, testNegatives, topK):
    """Score every test case and return per-case hit and NDCG lists.

    The arguments are published as module-level globals (``_model``,
    ``_testRatings``, ``_testNegatives``, ``_topK``) because
    ``eval_one_rating`` reads them from there.

    Returns:
        A ``(hits, ndcgs)`` tuple of two lists with one entry per element
        of ``testRatings``.
    """
    global _model, _testRatings, _testNegatives, _topK
    _model, _testRatings, _testNegatives, _topK = (
        model, testRatings, testNegatives, topK)

    per_case = [eval_one_rating(case) for case in range(len(_testRatings))]
    hits = [hr for (hr, _) in per_case]
    ndcgs = [ndcg for (_, ndcg) in per_case]
    return (hits, ndcgs)


def eval_one_rating(idx):
    """Rank one held-out item against its negatives and score the ranking.

    Reads the module-level ``_model``, ``_testRatings``, ``_testNegatives``
    and ``_topK`` set by ``evaluate_model``.

    Args:
        idx: position of the test case in ``_testRatings`` /
            ``_testNegatives``.

    Returns:
        ``(hr, ndcg)`` as computed by ``getHitRatio`` and ``getNDCG`` on the
        ``_topK`` highest-scored candidate items.
    """
    rating = _testRatings[idx]
    u = rating[0]
    gtItem = rating[1]

    # Work on a fresh list rather than append()/pop() on the shared
    # negatives: if predict() raises or the cell is interrupted, the shared
    # list would otherwise keep the ground-truth item and silently corrupt
    # every later evaluation / sampling pass.
    items = list(_testNegatives[idx]) + [gtItem]

    users = np.full(len(items), u, dtype='int32')
    predictions = _model.predict([users, np.array(items)],
                                 batch_size=100, verbose=0)

    # Flatten to plain floats so the ranking compares scalars instead of
    # 1-element arrays.
    # assumes the model emits exactly one score per (user, item) row.
    scores = np.ravel(predictions).tolist()
    map_item_score = dict(zip(items, scores))

    ranklist = heapq.nlargest(_topK, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)


def getHitRatio(ranklist, gtItem):
    """Return 1 if ``gtItem`` equals any entry of ``ranklist``, else 0."""
    found = any(candidate == gtItem for candidate in ranklist)
    return 1 if found else 0


def getNDCG(ranklist, gtItem):
    """Return ``log(2) / log(pos + 2)`` for the first match of ``gtItem``.

    ``pos`` is the 0-based position in ``ranklist``; 0 is returned when the
    item does not appear at all.
    """
    gains = (
        math.log(2) / math.log(pos + 2)
        for pos, candidate in enumerate(ranklist)
        if candidate == gtItem
    )
    return next(gains, 0)

def get_train_instances(train, n_items, n_neg, testNegatives):
    """Build (user, item, label) training arrays with sampled negatives.

    For every key ``(u, i)`` of ``train`` one positive instance (label 1) is
    emitted, followed by ``n_neg`` negative instances (label 0) for the same
    user. A negative item is drawn uniformly from ``range(n_items)`` and
    redrawn while it is a training interaction of ``u`` or is listed in
    ``testNegatives[u]``.

    Args:
        train: mapping-like sparse matrix whose ``keys()`` are (user, item).
        n_items: exclusive upper bound for sampled item ids.
        n_neg: number of negatives generated per positive.
        testNegatives: sequence indexed by user id giving that user's test
            negatives, which must not be used for training.

    Returns:
        Three numpy arrays ``(users, items, labels)`` of equal length.

    Note:
        Sampling loops forever if a user has no admissible item left.
        NOTE(review): the held-out positive test item is not excluded here;
        confirm whether that is intended.
    """
    user, item, labels = [], [], []

    # Hoisted out of the rejection loop: constant-time membership instead of
    # rebuilding the key view / scanning a list for every candidate draw.
    observed = set(train.keys())
    excluded_by_user = {}

    for (u, i) in train.keys():
        # positive instance
        user.append(u)
        item.append(i)
        labels.append(1)

        # negative instances: neither seen in training nor part of the
        # user's test negatives
        for _ in range(n_neg):
            excluded = excluded_by_user.get(u)
            if excluded is None:
                excluded = excluded_by_user[u] = set(testNegatives[u])
            j = np.random.randint(n_items)
            while ((u, j) in observed) or (j in excluded):
                j = np.random.randint(n_items)
            user.append(u)
            item.append(j)
            labels.append(0)
    return np.array(user), np.array(item), np.array(labels)

In [3]:
import numpy as np
import pandas as pd
import os
import heapq
import keras
import multiprocessing as mp
from keras import initializers
from keras.models import Model, load_model, save_model
from keras.layers import Dense, Embedding, Input, merge, Flatten, multiply
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from keras.regularizers import l2

from time import time

def GMF(n_users, n_items, n_emb, reg):
    """Assemble the Generalized Matrix Factorization Keras model.

    A user id and an item id are each embedded into ``n_emb`` dimensions,
    the two vectors are multiplied element-wise and the product is fed to a
    single sigmoid unit.

    Args:
        n_users: size of the user embedding table.
        n_items: size of the item embedding table.
        n_emb: embedding dimensionality.
        reg: L2 factor applied to both embeddings and to the output kernel.

    Returns:
        An uncompiled ``Model`` with inputs ``[user_input, item_input]`` and
        one output named ``prediction``.
    """

    def _latent_table(table_size, layer_name):
        # Both sides use the same embedding settings; only the table size
        # and the layer name differ.
        return Embedding(
            input_dim=table_size,
            output_dim=n_emb,
            name=layer_name,
            embeddings_initializer='normal',
            embeddings_regularizer=l2(reg),
            input_length=1)

    user_in = Input(shape=(1,), dtype='int32', name='user_input')
    item_in = Input(shape=(1,), dtype='int32', name='item_input')

    # each lookup yields [?, 1, n_emb]
    user_table = _latent_table(n_users, 'user_embedding')
    item_table = _latent_table(n_items, 'item_embedding')

    # drop the length-1 axis -> [?, n_emb] on both sides
    user_vec = Flatten()(user_table(user_in))
    item_vec = Flatten()(item_table(item_in))

    # element-wise interaction, still [?, n_emb]
    interaction = multiply([user_vec, item_vec])

    # single sigmoid unit producing the score
    output_layer = Dense(1, activation='sigmoid',
                         kernel_regularizer=l2(reg),
                         kernel_initializer='lecun_uniform',
                         name='prediction')
    score = output_layer(interaction)

    return Model(inputs=[user_in, item_in], outputs=score)


if __name__ == '__main__':
    # ---- configuration -------------------------------------------------
    datadir = "Data_Javier/"
    dataname = "ml-1m"
    modeldir = "models"
    n_emb = 8
    reg = 0.0
    batch_size = 256
    epochs = 20
    learner = "adam"
    lr = 0.01
    validate_every = 1
    # NOTE: this rebinds the name `save_model` imported from keras.models;
    # the name is kept because other cells may read the flag.
    save_model = True
    topK = 10
    n_neg = 4

    # e.g. "keras_GMF_bs_256_reg_00_lr_001_n_emb_8.h5"
    modelfname = "keras_GMF" + \
                 "_".join(["_bs", str(batch_size)]) + \
                 "_".join(["_reg", str(reg).replace(".", "")]) + \
                 "_".join(["_lr", str(lr).replace(".", "")]) + \
                 "_".join(["_n_emb", str(n_emb)]) + ".h5"
    modelpath = os.path.join(modeldir, modelfname)
    resultsdfpath = os.path.join(modeldir, 'results_df.p')

    # Make sure the checkpoint directory exists before training starts;
    # otherwise the first save_weights() call fails after a full epoch.
    if save_model:
        os.makedirs(modeldir, exist_ok=True)

    # ---- data ----------------------------------------------------------
    dataset = Dataset(os.path.join(datadir, dataname))
    train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
    n_users, n_items = train.shape

    # ---- model ---------------------------------------------------------
    model = GMF(n_users, n_items, n_emb, reg)
    # Any unrecognised learner name falls back to plain SGD.
    learner_classes = {"adagrad": Adagrad, "rmsprop": RMSprop, "adam": Adam}
    learner_class = learner_classes.get(learner.lower(), SGD)
    model.compile(optimizer=learner_class(lr=lr), loss='binary_crossentropy')

    # ---- training loop -------------------------------------------------
    best_hr, best_ndcg, best_iter = 0, 0, 0
    for epoch in range(1, epochs + 1):
        t1 = time()
        # negatives are resampled every epoch
        user, item, labels = get_train_instances(train, n_items, n_neg, testNegatives)
        hist = model.fit([user, item], labels, batch_size=batch_size, epochs=1, verbose=0, shuffle=True)
        t2 = time()
        if epoch % validate_every == 0:
            (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK)
            hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), hist.history['loss'][0]
            print("Iteration {}: {:.2f}s, HR = {:.4f}, NDCG = {:.4f}, loss = {:.4f}, validated in {:.2f}s"
                  .format(epoch, t2 - t1, hr, ndcg, loss, time() - t2))
            # keep only the weights of the best epoch by hit ratio
            if hr > best_hr:
                best_hr, best_ndcg, best_iter, train_time = hr, ndcg, epoch, t2 - t1
                if save_model:
                    model.save_weights(modelpath, overwrite=True)

    print("End. Best Iteration {}:  HR = {:.4f}, NDCG = {:.4f}. "
          .format(best_iter, best_hr, best_ndcg))
    # best_iter stays 0 when no epoch improved on the initial HR of 0, in
    # which case nothing was written to modelpath.
    if save_model and best_iter > 0:
        print("The best GMF model is saved to {}".format(modelpath))

Using TensorFlow backend.


Iteration 1: 36.37s, HR = 0.5954, NDCG = 0.3313, loss = 0.3269, validated in 4.13s
Iteration 2: 43.90s, HR = 0.6096, NDCG = 0.3423, loss = 0.2915, validated in 5.66s
Iteration 3: 43.02s, HR = 0.6202, NDCG = 0.3505, loss = 0.2870, validated in 4.78s
Iteration 4: 39.48s, HR = 0.6258, NDCG = 0.3517, loss = 0.2831, validated in 4.93s
Iteration 5: 39.76s, HR = 0.6187, NDCG = 0.3482, loss = 0.2803, validated in 4.68s
Iteration 6: 40.26s, HR = 0.6298, NDCG = 0.3570, loss = 0.2796, validated in 5.31s
Iteration 7: 90.90s, HR = 0.6298, NDCG = 0.3583, loss = 0.2798, validated in 7.91s
Iteration 8: 54.74s, HR = 0.6371, NDCG = 0.3623, loss = 0.2797, validated in 6.67s
Iteration 9: 58.15s, HR = 0.6308, NDCG = 0.3549, loss = 0.2800, validated in 6.82s
Iteration 10: 57.15s, HR = 0.6310, NDCG = 0.3594, loss = 0.2806, validated in 5.66s
Iteration 11: 38.04s, HR = 0.6334, NDCG = 0.3597, loss = 0.2811, validated in 4.28s
Iteration 12: 36.78s, HR = 0.6356, NDCG = 0.3640, loss = 0.2814, validated in 5.63s
I