 # **Test KPRN Result**

In [1]:
# Name of the sub-directory of ../logs to evaluate (looks like the start
# timestamp of the training run - TODO confirm).
TEST_CODE = "2019-07-19 05:47:47"
# Directory whose files are listed and loaded as model checkpoints below.
MODEL_DIR = "../logs/{}".format(TEST_CODE)
# 1-based position of the checkpoint to load within the sorted directory listing.
CHOSEN_EPOCH = 5

# Passed as max_seed_num when sampling a user's seed items in get_suggestion.
MAX_SEED_NUM = 3
# Passed as max_relation_num when run_thread generates item-to-item paths.
MAX_RELATION_NUM = 15

# > Config

In [2]:
import warnings
# Suppress every Python warning for the rest of the session (this also hides
# deprecation and resource warnings raised by later cells).
warnings.filterwarnings(action='ignore')

In [3]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# TF1-style session configuration.
config = tf.ConfigProto()
# Allocate GPU memory on demand instead of reserving it all up front.
config.gpu_options.allow_growth = True
# Log the device every op is placed on (verbose).
config.log_device_placement = True

# Create the session with this configuration and register it as the session Keras uses.
sess = tf.Session(config=config)
set_session(sess)

Using TensorFlow backend.


In [4]:
from tqdm import tqdm

# > Load model

In [5]:
import os
# NOTE(review): sorted() orders file names lexicographically, so names that
# embed a non zero-padded epoch number (e.g. "10" vs "2") would not sort in
# training order - confirm against the checkpoint naming scheme.
trained_weights = sorted(os.listdir(MODEL_DIR))
# Path of the CHOSEN_EPOCH-th (1-based) entry of the sorted listing.
choosen_weight = "{}/{}".format(MODEL_DIR, trained_weights[CHOSEN_EPOCH - 1])

In [6]:
from keras.models import load_model
from keras.models import Model

# Load the selected checkpoint as a full Keras model; the trailing ";"
# keeps the notebook from echoing a value for this cell.
model = load_model(choosen_weight);

W0725 11:02:18.091543 140396732733184 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0725 11:02:18.096777 140396732733184 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0725 11:02:18.121473 140396732733184 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0725 11:02:18.232672 140396732733184 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_w

# > Load data

## >> Read data

In [7]:
def _read_lines(path):
    """Return every line of the text file at ``path`` (line endings kept).

    The file is opened in a ``with`` block so the handle is closed as soon as
    the lines have been read instead of being left to the garbage collector.
    """
    with open(path) as handle:
        return handle.readlines()


# Raw line lists of every data file used by the notebook.
file_ratings_re = _read_lines("../data/ratings_re.csv")
file_triples_idx = _read_lines("../data/triples_idx.txt")

file_moviesIdx = _read_lines("../data/moviesIdx.txt")
file_types = _read_lines("../data/types.txt")
file_entities = _read_lines("../data/entities.txt")
file_relations = _read_lines("../data/relations.txt")

In [8]:
# Build the entity id -> name lookup.
# Movie lines are processed first, then entity lines; when an id occurs in
# both files the name from entities.txt wins because it is written last.
entity_id_to_name = {}
for id_lines in (file_moviesIdx, file_entities):
    for raw_line in id_lines:
        # every line must split into exactly two tokens: "<name> <id>"
        name, ident = raw_line.strip().split()
        entity_id_to_name[ident] = name

In [9]:
# Build two lookups from types.txt (one tab-separated "<entity>\t<type>" pair per line):
#   movie_title_to_entity_type : entity -> type
#   entity_list_with_type      : type   -> entities of that type, in file order

movie_title_to_entity_type = {}
entity_list_with_type = {}

for raw_line in file_types:
    name, type_name = raw_line.strip().split('\t')
    movie_title_to_entity_type[name] = type_name
    # setdefault creates the per-type list the first time a type is seen
    entity_list_with_type.setdefault(type_name, []).append(name)

In [10]:
# Build the relation id -> name lookup.
# File ids are shifted by 200000; ids below 200023 additionally register the
# next id as "<name>_inverse" (the last two relations, spouse and relative,
# have no inverse).

relation_id_to_name = {}
for raw_line in file_relations:
    name, raw_id = raw_line.strip().split()
    shifted_id = 200000 + int(raw_id)

    has_inverse = shifted_id < 200023
    if has_inverse:
        relation_id_to_name[str(shifted_id + 1)] = name + "_inverse"

    # written after the inverse entry, exactly as before
    relation_id_to_name[str(shifted_id)] = name

In [11]:
# Entity types known to the model; the position in this list is the type id.
list_of_type = [
    "Category",
    "Company",
    "Country",
    "Genre",
    "Movie",
    "Person",
    "User",

    "#PAD_TOKEN",
    "#UNK_ENTITY_TYPE",
]

# type name -> id, plus the same pairs rendered as "<name>\t<id>" lines
entity_type_to_id = {}
list_of_type_with_id = []

for type_id, type_name in enumerate(list_of_type):
    entity_type_to_id[type_name] = type_id
    list_of_type_with_id.append("{}\t{}".format(type_name, type_id))

In [12]:
# Load the knowledge-graph lookup from its pickle cache.
# It is accessed below as kg_path[entity_id][relation_id] -> collection of entity ids.
# NOTE(security): pickle.load can execute arbitrary code - only load cache
# files that were produced by this project.

import pickle
with open("../data/cache_kg_path", "rb") as kg_cache_file:
    # the with-block closes the cache file once it has been deserialised
    kg_path = pickle.load(kg_cache_file)

In [13]:
# Entity ids strictly greater than this value are treated as users
# by get_type_from_entity_id.
user_entity_id_padding = 500000
# NOTE(review): not referenced anywhere else in the visible notebook.
relation_rated_good_by_id = '200026'
# Same value as the hard-coded "200027" used as the first hop of user paths.
relation_given_good_rating_id = '200027'
# NOTE(review): not referenced anywhere else in the visible notebook.
good_movie_rating_threshold = 4

## >> Prepare data

In [14]:
import threading

In [15]:
# Relations that are used unchanged as their own inverse when a path is built.
IRREGULAR_RELATION = ['200024', '200025']
# Number of lines in moviesIdx.txt; candidate item ids are range(NUM_OF_ITEMS).
NUM_OF_ITEMS = len(file_moviesIdx)

def generate_path_from_entity_to_all_others(entity_id, keep_relation_ratio=0.5, max_relation_num=30, max_entity_per_relation=1):
    """Generate two-hop paths from ``entity_id`` to every other item.

    Every id in ``range(NUM_OF_ITEMS)`` (as a string) except ``entity_id``
    itself is a candidate end point ``e2``. A path
    ``"<entity_id> <relation> <mid> <inverse_relation> <e2>"`` is emitted when
    both end points list the same ``mid`` entity under the same relation in
    ``kg_path``. The inverse relation is the relation itself for members of
    ``IRREGULAR_RELATION`` and ``str(int(relation) + 1)`` otherwise.

    Args:
        entity_id: start entity id (a string key of ``kg_path``).
        keep_relation_ratio: fraction of the shared relations considered once
            their number reaches ``max_relation_num``.
        max_relation_num: pairs sharing fewer relations than this use all of
            them; otherwise ``max(max_relation_num, int(count * keep_relation_ratio))``
            randomly chosen relations are used.
        max_entity_per_relation: maximum number of shared middle entities
            (randomly chosen) turned into paths per relation.

    Returns:
        list of path strings; the selection depends on numpy's global RNG.

    Raises:
        KeyError: if ``entity_id`` or a candidate id is missing from ``kg_path``.
    """
    generated_paths = []

    r1 = kg_path[entity_id]
    # Loop-invariant work hoisted out of the per-candidate loop: the start
    # entity's relation set, and (lazily) the entity set behind each relation.
    r1_relations = set(r1.keys())
    r1_entity_sets = {}

    for e2_id in range(0, NUM_OF_ITEMS):
        e2_id = str(e2_id)
        # Skip if e1 == e2 (path to itself)
        if e2_id == entity_id:
            continue

        r2 = kg_path[e2_id]
        intersect_relations = r1_relations.intersection(set(r2.keys()))

        # downsample relation
        if len(intersect_relations) < max_relation_num:
            n_intersect_relation = len(intersect_relations)
        else:
            n_intersect_relation = max(max_relation_num, int(len(intersect_relations) * keep_relation_ratio))

        intersect_relations = list(intersect_relations)
        np.random.shuffle(intersect_relations)

        for relation in intersect_relations[:n_intersect_relation]:

            t1_entities = r1_entity_sets.get(relation)
            if t1_entities is None:
                t1_entities = r1_entity_sets[relation] = set(r1[relation])

            intersect_entity = list(t1_entities.intersection(set(r2[relation])))
            np.random.shuffle(intersect_entity)

            # the inverse relation does not depend on the middle entity
            inverse_relation = relation if (relation in IRREGULAR_RELATION) else str(int(relation) + 1)

            for mid_entity in intersect_entity[:max_entity_per_relation]:
                path = "{} {} {} {} {}".format(entity_id, relation, mid_entity, inverse_relation, e2_id)
                generated_paths.append(path)

    return generated_paths

In [16]:
def run_thread(thread_id, thread_items, result):
    """Thread worker: append the generated paths of every seed item to ``result``.

    ``thread_id`` is accepted for symmetry with the thread set-up but is not
    used. ``result`` is the shared list that is extended in place.
    """
    for seed_item in thread_items:
        seed_paths = generate_path_from_entity_to_all_others(
            seed_item,
            keep_relation_ratio=0.5,
            max_relation_num=MAX_RELATION_NUM,
            max_entity_per_relation=1,
        )
        result.extend(seed_paths)
            
def generate_all_path_from_user(user_id, max_seed_num=10, max_seed_ratio=0.2):
    """Generate user -> item -> entity -> item paths for one user.

    All entities linked to the user under any relation in ``kg_path`` are
    candidate seeds. Users with fewer than ``max_seed_num`` seeds keep them
    all; otherwise ``max(max_seed_num, int(count * max_seed_ratio))`` randomly
    chosen seeds are kept. One thread per kept seed runs ``run_thread`` and
    every resulting path is prefixed with ``"<user_id> 200027 "``.

    Returns:
        list of path strings (order depends on thread scheduling and on
        numpy's global RNG).
    """
    # Collect the distinct entities reachable from the user, in sorted order.
    linked_entities = set()
    for targets in kg_path[user_id].values():
        linked_entities.update(targets)
    seeds = sorted(linked_entities)

    # downsample seeds
    seed_count = len(seeds)
    if seed_count >= max_seed_num:
        n_seed = max(max_seed_num, int(seed_count * max_seed_ratio))
    else:
        n_seed = seed_count

    # Randomize, then keep the first n_seed
    np.random.shuffle(seeds)
    seeds = seeds[:n_seed]

    # ====== Threading ======
    user_paths = []
    workers = []
    for worker_idx in range(n_seed):
        # strided slice: worker i gets seeds i, i + n_seed, ...
        worker = threading.Thread(
            target=run_thread,
            args=(worker_idx, seeds[worker_idx::n_seed], user_paths),
        )
        worker.start()
        workers.append(worker)

    for worker_idx in range(n_seed):
        workers[worker_idx].join()

    return ["{} {} {}".format(user_id, "200027", tail) for tail in user_paths]

In [17]:
def get_type_from_entity_id(entity_id):
    """Return the entity type name for ``entity_id``.

    Ids numerically above ``user_entity_id_padding`` are users. Any other id
    is resolved to its name and looked up in ``movie_title_to_entity_type``;
    names without a registered type yield "#UNK_ENTITY_TYPE". A non-user id
    missing from ``entity_id_to_name`` raises KeyError.
    """
    if int(entity_id) > user_entity_id_padding:
        return "User"

    entity_name = entity_id_to_name[entity_id]
    return movie_title_to_entity_type.get(entity_name, "#UNK_ENTITY_TYPE")

In [18]:
import numpy as np

# Relation id attached to the last entity of every reformatted path.
END_RELATION = '200030'
# Relation under which kg_path stores the items used as a user's ground truth.
GIVEN_RATING_RELATION = '200027'
# NOTE(review): not referenced anywhere else in the visible notebook.
BATCH_COUNT = 72

def reformat_user_path(user_paths):
    """Turn path strings into model input arrays and binary labels.

    Each path must consist of seven whitespace-separated ids
    ``e1 r1 e2 r2 e3 r3 e4``. It becomes four ``[entity, type_id, relation]``
    triples, the last one using ``END_RELATION`` as its relation. The label is
    1 when ``e4`` is listed under ``GIVEN_RATING_RELATION`` for ``e1`` in
    ``kg_path`` and 0 otherwise.

    Args:
        user_paths: iterable of path strings.

    Returns:
        (paths, labels): integer numpy arrays; for a non-empty input ``paths``
        has shape (n_paths, 4, 3) and ``labels`` has shape (n_paths,).
    """
    new_paths = []
    labels = []
    # e1 -> set of rated item ids. Built once per distinct start entity so the
    # membership test below is a hash lookup instead of a list scan per path.
    rated_by_start = {}

    for path in user_paths:
        e1, r1, e2, r2, e3, r3, e4 = path.strip().split()

        hops = ((e1, r1), (e2, r2), (e3, r3), (e4, END_RELATION))
        # ids are converted to int here so numpy never has to build (and then
        # parse) an intermediate string array
        new_paths.append([
            [int(entity), entity_type_to_id[get_type_from_entity_id(entity)], int(relation)]
            for entity, relation in hops
        ])

        if e1 not in rated_by_start:
            rated_by_start[e1] = set(kg_path[e1][GIVEN_RATING_RELATION])
        labels.append(1 if e4 in rated_by_start[e1] else 0)

    return np.array(new_paths).astype('int'), np.array(labels).astype('int')

In [19]:
def path_to_string(paths):
    """Render id paths as readable strings.

    Every whitespace-separated id of a path is replaced by the name returned
    from ``get_entity_name`` and the names are joined with " -> ".
    Returns one string per input path, in input order.
    """
    return [
        " -> ".join(get_entity_name(str(token)) for token in path.split())
        for path in paths
    ]

# > Evaluation

## >> Evaluation function

In [20]:
def _get_k_prediction(X_test, y_pred, k=10, get_best=True, max_pooling_size=-1):
    """
    max_pooling_size -1 means without pooling
    """
    if max_pooling_size > 0:
        return _get_top_k_items_with_score_pooling(X_test, y_pred, k=10, max_pooling_size=max_pooling_size, get_best=get_best)
    else:
        return _get_top_k_items_without_score_pooling(X_test, y_pred, k=10, get_best=get_best)

def _get_top_k_items_without_score_pooling(X_test, y_pred, k=10, get_best=True):
    """
    Get the top-k items from X_test based on y_pred scores, 
    pick all path in sorted X_test until the amount of unique items is equal to k
    """
    choosen_paths = []
    choosen_items = set()

    path_scores = sorted(zip(X_test, y_pred), key=lambda x: x[1], reverse=get_best)
    for path, score in path_scores:
        choosen_paths.append((path, score))
        choosen_items.add(path[3][0]) # Add the last item
        
        if len(choosen_items) >= k:
            break
            
    return choosen_paths, choosen_items

def _get_top_k_items_with_score_pooling(X_test, y_pred, k=10, max_pooling_size=5, get_best=True):
    """
    Get the top-k items from X_test based on averaged y_pred scores,
    pick 'max_pooling_size' paths for each item, rank item based on average score, pick top k items.
    """
    item_paths = {}
    average_item_score = {}

    path_scores = sorted(zip(X_test, y_pred), key=lambda x: x[1], reverse=get_best)
    for path, score in path_scores:

        ending_item = path[3][0] # The last item before END RELATION

        if ending_item not in item_paths:
            item_paths[ending_item] = []
            average_item_score[ending_item] = []

        if len(item_paths[ending_item]) < max_pooling_size:
            item_paths[ending_item].append((path, score))
            average_item_score[ending_item].append(score)

    # Calculate average
    for item in average_item_score:
        average_item_score[item] = sum(average_item_score[item]) / len(average_item_score[item])

    sorted_average_score = sorted(average_item_score.items(), key=lambda kv: kv[1], reverse=get_best)
    choosen_items = {key for key, v in sorted_average_score[:k]}

    choosen_paths = []
    for item_id in choosen_items:
        choosen_paths += item_paths[item_id]

    return choosen_paths, choosen_items

In [21]:
def get_suggestion(user_id, k=10, max_pooling_size=-1):
    """Score the generated paths of one user and select the best and worst items.

    Paths are generated with ``generate_all_path_from_user`` (seed limit
    ``MAX_SEED_NUM``, seed ratio 0.3), converted with ``reformat_user_path``
    and scored by ``model``.

    Args:
        user_id: id of the user (string key of ``kg_path``).
        k: number of items to select. It is now forwarded to
            ``_get_k_prediction``; previously ``k=10`` was hard-coded.
        max_pooling_size: forwarded to ``_get_k_prediction`` (<= 0 disables pooling).

    Returns:
        (top_paths, top_items, worst_paths, worst_items)
    """
    user_paths = generate_all_path_from_user(user_id, max_seed_num=MAX_SEED_NUM, max_seed_ratio=0.3)
    # the labels returned alongside X_test are not needed for ranking
    X_test, y_true = reformat_user_path(user_paths)
    y_pred = model.predict(X_test, batch_size=2048, verbose=1)

    top_paths, top_choosen_items = _get_k_prediction(X_test, y_pred, k=k, get_best=True, max_pooling_size=max_pooling_size)
    worst_path, worst_choosen_items = _get_k_prediction(X_test, y_pred, k=k, get_best=False, max_pooling_size=max_pooling_size)

    return top_paths, top_choosen_items, worst_path, worst_choosen_items

In [22]:
def get_top_truth(user_id):
    """Return the set of item ids stored for ``user_id`` under GIVEN_RATING_RELATION."""
    rated_items = kg_path[user_id][GIVEN_RATING_RELATION]
    return set(rated_items)

In [23]:
def check_precision(pred, truth, k=10):
    """Compare predicted items with ground-truth items.

    Both collections are compared as sets of ``str(item)``. The score is the
    number of common items divided by the number of truth items when that
    number lies in ``1..k``, and divided by ``k`` otherwise.

    Returns:
        (common_items, score)
    """
    # normalise both sides to strings so int and str ids compare equal
    predicted_ids = set(map(str, pred))
    truth_ids = set(map(str, truth))

    hits = predicted_ids & truth_ids

    if 0 < len(truth_ids) <= k:
        denominator = len(truth_ids)
    else:
        denominator = k

    return hits, len(hits) / denominator

## >> Reasoning

In [24]:
# NOTE(review): these six files were already read by the "Read data" cell
# above; this cell reads them a second time.
# Each file is read inside a with-block so its handle is closed right away.
with open("../data/ratings_re.csv") as ratings_file:
    file_ratings_re = ratings_file.readlines()
with open("../data/triples_idx.txt") as triples_file:
    file_triples_idx = triples_file.readlines()

with open("../data/moviesIdx.txt") as movies_file:
    file_moviesIdx = movies_file.readlines()
with open("../data/types.txt") as types_file:
    file_types = types_file.readlines()
with open("../data/entities.txt") as entities_file:
    file_entities = entities_file.readlines()
with open("../data/relations.txt") as relations_file:
    file_relations = relations_file.readlines()

In [25]:
# Rebuild the entity id -> name lookup (same construction as the earlier
# "Read data" cell): movie lines first, entity lines second, later lines win.
entity_id_to_name = {}
for name_lines in (file_moviesIdx, file_entities):
    # each stripped line must split into exactly a name and an id
    for name, ident in (raw_line.strip().split() for raw_line in name_lines):
        entity_id_to_name[ident] = name

In [26]:
# Rebuild the relation id -> name lookup and add the END marker.
# File ids are shifted by 200000; ids below 200023 also register the next id
# as "<name>_inverse" (the last two relations, spouse and relative, have none).
relation_id_to_name = {}
for raw_line in file_relations:
    name, raw_id = raw_line.strip().split()
    shifted_id = int(raw_id) + 200000

    if shifted_id < 200023:
        inverse_key = str(shifted_id + 1)
        relation_id_to_name[inverse_key] = name + "_inverse"

    relation_id_to_name[str(shifted_id)] = name

# pseudo relation that closes every reformatted path
relation_id_to_name['200030'] = "END"

In [27]:
def get_entity_name(entity_id):
    """Resolve an id to a display name.

    The entity lookup is consulted first, then the relation lookup; ids found
    in neither are rendered as "user_<id>".
    """
    for lookup in (entity_id_to_name, relation_id_to_name):
        if entity_id in lookup:
            return lookup[entity_id]

    return "user_{}".format(entity_id)

In [28]:
def get_reasoning_paths(paths, truth_items=None):
    """Render scored paths as readable " -> "-joined explanation strings.

    Args:
        paths: one list per user of ``(path, score)`` tuples, where ``path``
            is a sequence of ``(entity, type, relation)`` triples.
        truth_items: one collection of ground-truth item ids (strings) per
            user, aligned with ``paths``. ``None`` (the default) is treated as
            "no known truth": every path is then marked "nope". Previously the
            default raised a TypeError.

    Returns:
        list of strings of the form
        "<score> -> <entity> -> <relation> -> ... -> <watched|nope>".
        The watched/nope marker is appended after the hop whose relation is
        ``END_RELATION``.
    """
    reasoning_paths = []
    for i, user_path in enumerate(paths):
        user_truth = truth_items[i] if truth_items is not None else ()

        for path in user_path:
            score = path[1]
            reason_path = [str(score)]

            for sequence in path[0]:
                entity, _, relation = sequence
                reason_path.append(get_entity_name(str(entity)))
                reason_path.append(get_entity_name(str(relation)))

                if str(relation) == END_RELATION:
                    reason_path.append('watched' if str(entity) in user_truth else "nope")

            reasoning_paths.append(" -> ".join(reason_path))
    return reasoning_paths

In [29]:
import pandas as pd

def get_reasoning_paths_df(paths, truth_items=None):
    """Render scored paths as a DataFrame with one explanation per row.

    Each row holds the score (as a string) followed by the entity and relation
    names of every hop; a "watched"/"nope" marker follows the hop whose
    relation is ``END_RELATION``.

    Args:
        paths: one list per user of ``(path, score)`` tuples, where ``path``
            is a sequence of ``(entity, type, relation)`` triples.
        truth_items: one collection of ground-truth item ids (strings) per
            user, aligned with ``paths``. ``None`` (the default) is treated as
            "no known truth" instead of raising a TypeError.

    Returns:
        pandas.DataFrame with one row per path.

    Example (this snippet used to sit, unreachable, after the ``return``)::

        frames = [get_reasoning_paths_df([top_paths[i]], [truth_items[i]]) for i in range(10)]
        pd.concat(frames, axis=0).to_csv("KPRN_pooling5.csv")
    """
    reasoning_paths = []
    for i, user_path in enumerate(paths):
        user_truth = truth_items[i] if truth_items is not None else ()

        for path in user_path:
            score = path[1]
            reason_path = [str(score)]

            for sequence in path[0]:
                entity, _, relation = sequence
                reason_path.append(get_entity_name(str(entity)))
                reason_path.append(get_entity_name(str(relation)))

                if str(relation) == END_RELATION:
                    reason_path.append('watched' if str(entity) in user_truth else "nope")

            reasoning_paths.append(reason_path)
    return pd.DataFrame(reasoning_paths)

In [30]:
def compare_prediction_truth(predictions, truths):
    """Print the predicted item names next to the ground-truth item names.

    Both id collections are converted to names with ``get_entity_name``. The
    sorted predictions are printed first, each tagged "watched" when its name
    also occurs among the truth names and "nope" otherwise; the sorted truth
    names follow.
    """
    predicted_names = [get_entity_name(str(item)) for item in predictions]
    truth_names = [get_entity_name(str(item)) for item in truths]

    print("Predictions : ")
    for name in sorted(predicted_names):
        if str(name) in truth_names:
            status = "watched"
        else:
            status = "nope"
        print("{} > {}".format(name, status))

    print("\n")
    print("Truth : ")
    for name in sorted(truth_names):
        print(name)

## >> Run evaluation

In [34]:
# Evaluate the model on n randomly drawn users and collect per-user results
# in the lists below (later cells read them).
import numpy as np

k = 10
n = 10

# Draw n user ids from [500001, 630000).
# NOTE(review): no random seed is set and a drawn id is not checked against
# kg_path, so a run is not reproducible and may hit a KeyError - confirm.
sample_user = np.random.randint(500001, 630000, n)
sample_user = [str(x) for x in sample_user]

# Per-user results, aligned by position with sample_user.
top_paths = []
top_items = []
worst_paths = []
worst_items = []

truth_items = []

n_paths = []
intersects = []
scores = []

# Items suggested to every user / to at least one user.
all_intersect = None
all_union = None

for user in tqdm(sample_user):
    top_suggested_path, top_suggested_items, worst_suggested_path, worst_suggested_items = get_suggestion(user, k=k, max_pooling_size=5)
    top_truth_items = get_top_truth(user)
    
    # check_precision is called without k, so its default (10) applies
    intersect, score = check_precision(top_suggested_items, top_truth_items)
    
    top_paths.append(top_suggested_path)
    top_items.append(top_suggested_items)
    
    worst_paths.append(worst_suggested_path)
    worst_items.append(worst_suggested_items)
    
    intersects.append(intersect)
    truth_items.append(top_truth_items)
    
    n_paths.append(len(top_suggested_path))
    scores.append(score)
    
    # intersection()/union() return new sets, so the per-user sets stored in
    # top_items are not modified by the running aggregates
    if all_intersect is None:
        all_intersect = top_suggested_items
    else:
        all_intersect = all_intersect.intersection(top_suggested_items)
   
    if all_union is None:
        all_union = top_suggested_items
    else:
        all_union = all_union.union(top_suggested_items)
   

  0%|          | 0/10 [00:00<?, ?it/s]



 10%|█         | 1/10 [00:20<03:03, 20.35s/it]



 20%|██        | 2/10 [01:00<03:30, 26.30s/it]



 30%|███       | 3/10 [02:07<04:28, 38.39s/it]



 40%|████      | 4/10 [02:31<03:24, 34.12s/it]

KeyboardInterrupt: 

In [46]:
# Summary of the evaluation run: mean precision, the items suggested to every
# user, the items suggested to any user, and the share of distinct items
# among the n * k suggestion slots.
print("Prec@k score:", np.average(scores))

for title, item_set in (("intersect", all_intersect), ("union", all_union)):
    print("\n" + title)
    print(item_set, len(item_set))

print("\ndistinct rate")
print(len(all_union) / (n * k))

Prec@k score: 0.18666666666666668

intersect
{11536, 10659} 2

union
{5058, 8962, 14025, 3401, 4687, 11536, 911, 1168, 12436, 10388, 1557, 2073, 11677, 5277, 5278, 5279, 5409, 546, 10659, 676, 8672, 11494, 616, 10796, 11502, 1392, 6834, 5619, 9846, 4791, 5629} 31

distinct rate
0.31


In [83]:
# sample_user_idx = 0
# get_reasoning_paths([top_paths[sample_user_idx]], [truth_items[sample_user_idx]])
# compare_prediction_truth(top_items[sample_user_idx], truth_items[sample_user_idx])

Predictions : 
http://dbpedia.org/resource/Dances_with_Wolves > nope
http://dbpedia.org/resource/Fargo_(film) > nope
http://dbpedia.org/resource/Forrest_Gump > nope
http://dbpedia.org/resource/Good_Will_Hunting > nope
http://dbpedia.org/resource/Goodfellas > nope
http://dbpedia.org/resource/Monty_Python_and_the_Holy_Grail > nope
http://dbpedia.org/resource/Pulp_Fiction > nope
http://dbpedia.org/resource/Raiders_of_the_Lost_Ark > nope
http://dbpedia.org/resource/Schindler's_List > nope
http://dbpedia.org/resource/Toy_Story > nope


Truth : 
http://dbpedia.org/resource/Dazed_and_Confused_(film)
http://dbpedia.org/resource/Deliverance_(1919_film)
http://dbpedia.org/resource/Dumbo
http://dbpedia.org/resource/Fast_Times_at_Ridgemont_High
http://dbpedia.org/resource/Lost_in_Translation_(film)
http://dbpedia.org/resource/Mean_Creek


In [32]:
# Fixed users for the comparison below (this overwrites the randomly drawn
# sample_user of the evaluation cell).
sample_user = ['520169',
 '566966',
 '582374',
 '504296',
 '510204',
 '623623',
 '615870',
 '628970',
 '583750',
 '597239']

# Ten item ids per user, aligned by position with sample_user.
# NOTE(review): presumably the top-10 output of a separate AutoRec baseline
# pasted in by hand - confirm the source.
autorec_result = [[5058, 10659, 7304, 2073, 13122, 14025, 11536, 6825, 1321, 911],
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 1321, 6825, 911],
[13712, 8497, 13405, 10659, 5058, 7304, 1839, 2073, 13310, 13122],
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 6825, 1321, 10796],
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 1321, 6825, 911],
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 6825, 1321, 911],
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 1321, 6825, 911],
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 6825, 1321, 911],
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 6825, 1321, 10796],
[5058, 10659, 7304, 2073, 13122, 14025, 11536, 1321, 6825, 911]]
        
# Ground-truth item sets, aligned by position with sample_user / autorec_result.
top_truth = [get_top_truth(user) for user in sample_user]

# One block per user: every recommended item with a watched / nope marker.
for recommended, rated in zip(autorec_result, top_truth):
    for item in recommended:
        status = "watched" if str(item) in rated else "nope"
        print("{} > {}".format(get_entity_name(str(item)), status))
    print()

http://dbpedia.org/resource/Forrest_Gump > nope
http://dbpedia.org/resource/Pulp_Fiction > nope
http://dbpedia.org/resource/Jurassic_Park_(film) > nope
http://dbpedia.org/resource/Braveheart_(1925_film) > nope
http://dbpedia.org/resource/Terminator_2:_Judgment_Day > nope
http://dbpedia.org/resource/Toy_Story > nope
http://dbpedia.org/resource/Schindler's_List > nope
http://dbpedia.org/resource/Independence_Day_(2000_film) > nope
http://dbpedia.org/resource/Batman_(1989_film) > nope
http://dbpedia.org/resource/Apollo_13_(film) > nope

http://dbpedia.org/resource/Forrest_Gump > nope
http://dbpedia.org/resource/Pulp_Fiction > nope
http://dbpedia.org/resource/Jurassic_Park_(film) > nope
http://dbpedia.org/resource/Braveheart_(1925_film) > nope
http://dbpedia.org/resource/Terminator_2:_Judgment_Day > nope
http://dbpedia.org/resource/Toy_Story > watched
http://dbpedia.org/resource/Schindler's_List > nope
http://dbpedia.org/resource/Batman_(1989_film) > nope
http://dbpedia.org/resource/Indepe

## >> Custom test

In [None]:
def run_filtered_path_thread(thread_id, thread_items, result, keep_list=None):
    """Thread worker: generate all paths of the given items and keep selected tails.

    For every item all paths are generated (no relation or entity
    down-sampling) and those whose last token (the end item id) is in
    ``keep_list`` are appended to the shared ``result`` list.

    Args:
        thread_id: number used in the start/exit log lines.
        thread_items: item ids to expand.
        result: shared list that is extended in place.
        keep_list: end item ids to keep. ``None`` (the default) keeps every
            path; previously the default raised a TypeError.
    """
    print("Thread {} : start".format(thread_id))

    # Built once: set membership instead of scanning a list for every path.
    allowed_tails = None if keep_list is None else set(keep_list)

    for item_id in thread_items:
        candidate_paths = generate_path_from_entity_to_all_others(item_id, keep_relation_ratio=1, max_relation_num=99, max_entity_per_relation=99999)
        if allowed_tails is None:
            result.extend(candidate_paths)
        else:
            result.extend(p for p in candidate_paths if p.split()[-1] in allowed_tails)

    print("Thread {} : exit".format(thread_id))
    
def generate_filtered_path_from_user(user_id, n_truth_tail=10, n_false_tail=10):
    """Generate user paths that end in a small, pre-selected set of items.

    Up to ``n_truth_tail`` of the entities linked to the user and up to
    ``n_false_tail`` randomly drawn ids that are not linked to the user are
    chosen as allowed end items. One thread per linked entity then generates
    that entity's paths and keeps those ending in an allowed item.

    Returns:
        (user_paths, choosen_truth, choosen_false): the path strings prefixed
        with "<user_id> 200027 ", and the two lists of allowed end items.
    """
    
    # every entity linked to the user under any relation, de-duplicated
    user_interacted_items = []
    for relation in kg_path[user_id]:
        user_interacted_items += kg_path[user_id][relation]
    user_interacted_items = sorted(set(user_interacted_items))
    
    print("user_interacted_items:", user_interacted_items)
    
    # Random select truth items
    # (shuffles in place, so the thread loop below sees the shuffled order)
    np.random.shuffle(user_interacted_items)
    choosen_truth = user_interacted_items[:n_truth_tail]
    
    # Random select false items
    # NOTE(review): 15440 is hard-coded; presumably the number of items
    # (cf. NUM_OF_ITEMS) - confirm.
    # Duplicate draws collapse in set() and linked items are removed, so fewer
    # than n_false_tail ids can remain.
    choosen_false = np.random.randint(0, 15440, size=n_false_tail+len(user_interacted_items)).astype("str")
    choosen_false = list(set(choosen_false) - set(user_interacted_items))
    choosen_false = choosen_false[:n_false_tail]
    
    keep_list = choosen_truth + choosen_false
    
    # ====== Threading ======
    user_paths = []
    threads = []

    # one thread per linked entity; all threads append to the shared user_paths
    for i in range(0, len(user_interacted_items)):        
        # Split item id equally
        thread_items = [user_interacted_items[i]]
        thread = threading.Thread(target=run_filtered_path_thread, args=(i, thread_items, user_paths, keep_list))    
        thread.start()
        threads.append(thread)
    
    print("n threads:", len(threads))
    
    for thread in threads:
        thread.join()
    
    # prefix every path with the user and the "200027" relation
    user_paths = ["{} {} {}".format(user_id, "200027", x) for x in user_paths]
    return user_paths, choosen_truth, choosen_false

In [None]:
def get_custom_suggestion(user_id, k=10, n_truth_tail=10, n_false_tail=10, max_pooling_size=-1):
    """Score a user's filtered paths and select the best and worst items.

    Paths come from ``generate_filtered_path_from_user``, which restricts the
    end items to a sample of the user's own items and a sample of other items.

    Args:
        user_id: id of the user (string key of ``kg_path``).
        k: number of items to select. It is now forwarded to
            ``_get_k_prediction``; previously ``k=10`` was hard-coded.
        n_truth_tail: forwarded to ``generate_filtered_path_from_user``.
        n_false_tail: forwarded to ``generate_filtered_path_from_user``.
        max_pooling_size: forwarded to ``_get_k_prediction`` (<= 0 disables pooling).

    Returns:
        (top_paths, top_items, worst_paths, worst_items, choosen_truth, choosen_false)
    """
    user_paths, choosen_truth, choosen_false = generate_filtered_path_from_user(user_id, n_truth_tail=n_truth_tail, n_false_tail=n_false_tail)
    # the labels returned alongside X_test are not needed for ranking
    X_test, y_true = reformat_user_path(user_paths)
    y_pred = model.predict(X_test, batch_size=2048, verbose=1)

    top_paths, top_choosen_items = _get_k_prediction(X_test, y_pred, k=k, get_best=True, max_pooling_size=max_pooling_size)
    worst_path, worst_choosen_items = _get_k_prediction(X_test, y_pred, k=k, get_best=False, max_pooling_size=max_pooling_size)

    return top_paths, top_choosen_items, worst_path, worst_choosen_items, choosen_truth, choosen_false

In [199]:
# Custom test: like the evaluation run, but candidate end items are limited
# to n_truth_tail of the user's own items plus n_false_tail other items.
# This cell overwrites the result lists of the evaluation run.
import numpy as np

k = 10
n = 1
n_truth_tail=10
n_false_tail=10

# Draw n user ids from [500001, 630000).
# NOTE(review): no random seed is set and a drawn id is not checked against
# kg_path - confirm.
sample_user = np.random.randint(500001, 630000, n)
sample_user = [str(x) for x in sample_user]

top_paths = []
worst_paths = []
truth_items = []

n_paths = []
intersects = []
scores = []

all_intersect = None
all_union = None

for user in tqdm(sample_user):
    top_suggested_path, top_suggested_items, worst_suggested_path, worst_suggested_items, choosen_truth, choosen_false = get_custom_suggestion(user, k, n_truth_tail, n_false_tail, max_pooling_size=-1)
    top_truth_items = get_top_truth(user)
    intersect, score = check_precision(top_suggested_items, top_truth_items)
    
    top_paths.append(top_suggested_path)
    worst_paths.append(worst_suggested_path)
    intersects.append(intersect)
    truth_items.append(top_truth_items)
    
    n_paths.append(len(top_suggested_path))
    scores.append(score)
    
    if all_intersect is None:
        all_intersect = top_suggested_items
    else:
        all_intersect = all_intersect.intersection(top_suggested_items)
   
    if all_union is None:
        all_union = top_suggested_items
    else:
        all_union = all_union.union(top_suggested_items)
        
print("Prec@k score:", np.average(scores))
print("\nintersect")
print(all_intersect, len(all_intersect))
print("\nunion")
print(all_union, len(all_union))
print("\ndistinct rate")
print((len(all_union)) / (n*k))
# choosen_truth / choosen_false hold the values of the last user in the loop
print("choosen_truth", choosen_truth) 
print("choosen_false", choosen_false)





  0%|          | 0/1 [00:00<?, ?it/s][A[A[A[A

user_interacted_items: ['10187', '10201', '10306', '10390', '10422', '10515', '10545', '10546', '10659', '10796', '1085', '10878', '10880', '11006', '11145', '11146', '11177', '11243', '11251', '11361', '11363', '11494', '11507', '11677', '1168', '11690', '11797', '11964', '11979', '12072', '12121', '12153', '12196', '12250', '12339', '12342', '12348', '12372', '12404', '12451', '12452', '12455', '12457', '12458', '12459', '12461', '12462', '12475', '12489', '12626', '12677', '12889', '1300', '13043', '13115', '13122', '13126', '13168', '1320', '1324', '136', '13654', '13770', '13837', '13872', '13881', '13891', '13974', '1402', '14162', '14516', '14618', '1483', '14868', '14920', '15073', '15075', '15154', '15294', '15333', '1599', '1772', '1774', '1782', '2474', '2506', '2514', '26', '2632', '2767', '2897', '2903', '2959', '3071', '3137', '3264', '327', '3302', '3401', '3414', '3428', '3436', '3523', '3701', '3732', '3785', '3815', '3855', '3922', '394', '3987', '4080', '4113', '4195

Thread 14 : exitThread 137 : exit

Thread 166 : len 1139 set 1139
Thread 166 : exit
Thread 134 : len 1718 set 1718
Thread 168 : len 1516 set 1516Thread 134 : exit

Thread 168 : exit
Thread 123 : len 2381 set 2381Thread 189 : len 1105 set 1105
Thread 123 : exit
Thread 189 : exit

Thread 106 : len 2400 set 2400
Thread 106 : exit
Thread 167 : len 1815 set 1815
Thread 167 : exitThread 178 : len 1463 set 1463

Thread 178 : exit
Thread 75 : len 2652 set 2652
Thread 75 : exit
Thread 155 : len 2341 set 2341
Thread 155 : exit
Thread 101 : len 2447 set 2447
Thread 101 : exit
Thread 18 : len 4251 set 4251
Thread 36 : len 3763 set 3763
Thread 36 : exit
Thread 18 : exit
Thread 138 : len 2721 set 2721
Thread 138 : exit
Thread 110 : len 3488 set 3488
Thread 110 : exit
Thread 122 : len 3259 set 3259
Thread 122 : exit
Thread 34 : len 3599 set 3599
Thread 34 : exit
Thread 15 : len 4692 set 4692
Thread 15 : exit
Thread 97 : len 3382 set 3382
Thread 97 : exit
Thread 31 : len 4684 set 4684
Thread 31 : exit





100%|██████████| 1/1 [30:35<00:00, 1835.98s/it][A[A[A[A

Prec@k score: 0.9

intersect
{14561, 11363, 7526, 3302, 12072, 3401, 3436, 13168, 12626, 12372} 10

union
{14561, 11363, 7526, 3302, 12072, 3401, 3436, 13168, 12626, 12372} 10

distinct rate
1.0


In [None]:
# Render the best and worst scored paths of the most recent run as readable
# strings (uses whatever top_paths / worst_paths / truth_items currently hold).
top_reasoning_paths = get_reasoning_paths(top_paths, truth_items)
worst_reasoning_paths = get_reasoning_paths(worst_paths, truth_items)

display(top_reasoning_paths)
print("=====================================================================================")
display(worst_reasoning_paths)

In [83]:
# Manual check for one fixed user: generate paths, score them, and pool the
# five best paths per item to pick the top 10 items.
# NOTE: this rebinds top_paths, which earlier cells use as a per-user list.
user_paths = generate_all_path_from_user("500302", max_seed_num=3, max_seed_ratio=0.3)
X_test, y_true = reformat_user_path(user_paths)
y_pred = model.predict(X_test, batch_size=2048, verbose=1)

# descending_path_score = sorted(zip(X_test, y_pred), key=lambda x: x[1], reverse=True)
top_paths, top_choosen_items = _get_top_k_items_with_score_pooling(X_test, y_pred, k=10, max_pooling_size=5, get_best=True)

