# Testing LightFM

In [1]:
TEST_CODE = "1561003029.019894"
CHOSEN_EPOCH = 600

MODEL_PATH = "../log/{}/models/epoch_{}".format(TEST_CODE, CHOSEN_EPOCH)
LOG_PATH = "../log/{}/log.txt".format(TEST_CODE)

In [2]:
import pickle

from datetime import datetime

from tqdm import tqdm
from sklearn.model_selection import train_test_split

from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k

In [3]:
import pickle
import os

class Logger:

    def set_default_filename(self, filename):
        self.default_filename = filename

    def create_session_folder(self, path):
        try:  
            os.makedirs(path)
        except OSError:  
            print ("Creation of the directory %s failed" % path)
        else:  
            print ("     =====> Successfully created the directory %s \n" % path)

        try:
            os.makedirs(path + "models/")
        except OSError:  
            print ("Creation of the model directory failed")
        else:  
            print ("     =====> Successfully created the model directory")


    def log(self, text):
        with open(self.default_filename, 'a') as f:
            f.writelines(text)
            f.write("\n")

    def save_model(self, model, filename):
        pickle.dump(model, open(filename, 'wb'))

In [4]:
ratings_pivot_csr_filename = "../data/intersect-20m/ratings.csr"

ratings_pivot = pickle.load(open(ratings_pivot_csr_filename, 'rb'))
train, test = random_train_test_split(ratings_pivot, test_percentage=0.2)

train_csr = train.tocsr()
test_csr = test.tocsr()

### ========== Models ==========

In [5]:
model = pickle.load(open(MODEL_PATH, 'rb'))

### ========== Preparation ==========

In [6]:
from scipy.sparse import identity

user_identity = identity(train.shape[0])
item_identity = identity(train.shape[1])

In [7]:
test_user, test_item = test.nonzero()

### ========== Predict ==========

In [8]:
def get_top_suggestion(sample_user, k):
    
    test_item_idx = [i for i in range(0, test.shape[1])]    
    
    prediction = model.predict(user_ids=sample_user, item_ids=test_item_idx, user_features=user_identity, item_features=item_identity)
    prediction = [(prediction[i], i) for i in range(0, len(prediction))]
    prediction = sorted(prediction, reverse=True)
    
    return prediction[:k]


def get_top_truth(sample_user, k):
    
    truth = []

    user_ratings = test_csr[sample_user].todense().tolist()[0]
    user_rated_item = test_csr[sample_user].nonzero()[1]
    for item in user_rated_item:
        truth.append((user_ratings[item], item))

    user_ratings = train_csr[sample_user].todense().tolist()[0]
    user_rated_item = train_csr[sample_user].nonzero()[1]
    for item in user_rated_item:
        truth.append((user_ratings[item], item))

    truth = sorted(truth, reverse=True)
    
    return truth[:k]

In [9]:
def get_intersect_pred_truth(prediction, truth, k):
    pred_item_set = {x[1] for x in prediction[:k]}
    truth_item_set = {x[1] for x in truth[:k]}
    
    return pred_item_set.intersection(truth_item_set)

In [10]:
def check_precision_at_k(sample_user, k):
    prediction = get_top_suggestion(sample_user, k)
    truth = get_top_truth(sample_user, k)
    
    intersect = get_intersect_pred_truth(prediction, truth, k)
    return intersect, len(intersect)/k

In [11]:
from tqdm import tqdm
import numpy as np

n = 500
k = 10

sample_user = [np.random.randint(1, 138000) for i in range(0, n)]

prec_with_train = []
for i in tqdm(sample_user):
    _, p = check_precision_at_k(i, k)
    prec_with_train.append(p)

100%|██████████| 500/500 [00:20<00:00, 25.18it/s]


In [12]:
print("Prec@k :", np.average(prec_with_train))

Prec@k : 0.091


## Check diversity

In [13]:
offset = 0 # discard top n suggestion
n = 3000
k = 10

sample_user = [np.random.randint(0, 138493) for i in range(0, k)]

intersect = {x[1] for x in get_top_suggestion(sample_user[0], k + offset)[offset:]}
uni = intersect
for i in range(1, n):

    try:
        s = {x[1] for x in get_top_suggestion(sample_user[i], k + offset)[offset:]}
        intersect = intersect.intersection(s)
        uni = uni.union(s)
    except:
        pass
    
print("\nintersect")
print(intersect, len(intersect))
print("\nunion")
print(uni, len(uni))
print("\ndistinct rate")
print((len(uni)) / (n*k))


intersect
{5058, 10659, 13122, 7304, 14025, 11536, 2073} 7

union
{5058, 13122, 7304, 14025, 911, 11536, 2073, 13277, 10659, 1321, 6825, 10796, 9334} 13

distinct rate
0.0004333333333333333


# ===================

In [None]:
# sample_user = [32, 1949, 1128, 4321, 7828, 8242, 2119, 1827, 6240, 12282]
sample_user = [np.random.randint(0, 138493) for i in range(0, 10)]
# offset = int(138493 * 0.9)
offset = 0
sample_user = [i + offset for i in sample_user]

In [14]:
sample_user = ['520169',
 '566966',
 '582374',
 '504296',
 '510204',
 '623623',
 '615870',
 '628970',
 '583750',
 '597239']

sample_user = [int(x) - 500001 for x in sample_user]

In [15]:
for user in sample_user:
    prediction = get_top_suggestion(user, 10)
    prediction = [x[1] for x in prediction]
    
    truth = get_top_truth(user, 10)
    truth = [x[1] for x in truth]
    
    print((prediction))
#     display((truth))
#     display(check_precision_at_k(user, 10))
#     display("==================")

[5058, 10659, 7304, 2073, 13122, 14025, 11536, 6825, 1321, 911]
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 1321, 6825, 911]
[13712, 8497, 13405, 10659, 5058, 7304, 1839, 2073, 13310, 13122]
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 6825, 1321, 10796]
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 1321, 6825, 911]
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 6825, 1321, 911]
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 1321, 6825, 911]
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 6825, 1321, 911]
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 6825, 1321, 10796]
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 1321, 6825, 911]
