 # **Test KPRN Result**

In [1]:
import warnings
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
from tqdm import tqdm
import os
from keras.models import load_model
from keras.models import Model
import pickle
import threading
import numpy as np
import pandas as pd

warnings.filterwarnings(action='ignore')

Using TensorFlow backend.


# > Config

In [2]:
TEST_CODE = "2019-07-19 05:47:47"
MODEL_DIR = "../logs/{}".format(TEST_CODE)
CHOSEN_EPOCH = 5

MAX_SEED_NUM = 3
MAX_RELATION_NUM = 15

In [3]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True

sess = tf.Session(config=config)
set_session(sess)

# > Load model

In [4]:
trained_weights = sorted(os.listdir(MODEL_DIR))
choosen_weight = "{}/{}".format(MODEL_DIR, trained_weights[CHOSEN_EPOCH - 1])

In [5]:
model = load_model(choosen_weight)

W0801 08:49:10.539453 140241513215744 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0801 08:49:10.545511 140241513215744 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0801 08:49:10.572175 140241513215744 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0801 08:49:10.695865 140241513215744 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_w

# > Load data

## >> Read data

In [6]:
file_ratings_re = open("../data/ratings_re.csv").readlines()
file_triples_idx = open("../data/triples_idx.txt").readlines()
file_moviesIdx = open("../data/moviesIdx.txt").readlines()
file_types = open("../data/types.txt").readlines()
file_entities = open("../data/entities.txt").readlines()
file_relations = open("../data/relations.txt").readlines()

In [7]:
def _get_entity_to_name():

    entity_id_to_name = {}
    for line in file_moviesIdx:
        movie_title, entity_id = line.strip().split()
        entity_id_to_name[entity_id] = movie_title

    for line in file_entities:
        entity_name, entity_id = line.strip().split()
        entity_id_to_name[entity_id] = entity_name

    return entity_id_to_name

def _get_movie_title_to_entity_type():

    movie_title_to_entity_type = {}
    for line in file_types:

        entity, entity_type = line.strip().split('\t')
        movie_title_to_entity_type[entity] = entity_type

    return movie_title_to_entity_type

def _get_entity_list_with_type():

    entity_list_with_type = {}
    for line in file_types:

        entity, entity_type = line.strip().split('\t')
        if entity_type not in entity_list_with_type:
            entity_list_with_type[entity_type] = []
        entity_list_with_type[entity_type].append(entity)

    return entity_list_with_type

REL_ID_END = '200030'
def _get_relation_to_name():

    # Create relation id to name mapping
    relation_id_to_name = {}
    for line in file_relations:
        relation_name, relation_id = line.strip().split()
        relation_id = int(relation_id)
        relation_id += 200000

        # last 2 relation : spouse and relative has no inverse
        if relation_id < 200023:
            relation_id_to_name[str(relation_id + 1)] = relation_name + "_inverse"

        relation_id_to_name[str(relation_id)] = relation_name

    relation_id_to_name[REL_ID_END] = "END"
    return relation_id_to_name

def _get_entity_type_to_id():

    list_of_type = [
        "Category",
        "Company",
        "Country",
        "Genre",
        "Movie",
        "Person",
        "User",
        "#PAD_TOKEN",
        "#UNK_ENTITY_TYPE",
    ]

    entity_type_to_id = {list_of_type[i]: i for i in range(0, len(list_of_type))}
    return entity_type_to_id

In [8]:
movie_title_to_entity_type = _get_movie_title_to_entity_type()
entity_list_with_type = _get_entity_list_with_type()
entity_id_to_name = _get_entity_to_name()
relation_id_to_name = _get_relation_to_name()
entity_type_to_id = _get_entity_type_to_id()

In [9]:
USER_ENTITY_ID_PADDING = 500000
def get_type_from_entity_id(entity_id):
    # user
    if int(entity_id) > USER_ENTITY_ID_PADDING:
        return "User"
    elif entity_id_to_name[entity_id] in movie_title_to_entity_type:
        return movie_title_to_entity_type[entity_id_to_name[entity_id]]
    else:
        return "#UNK_ENTITY_TYPE"
    
def path_to_string(paths):
    string_paths = []
    for path in paths:

        entity_string = []
        for entity in path.split():
            entity_string.append(get_entity_name(str(entity)))
        entity_string = " -> ".join(entity_string)

        string_paths.append(entity_string)

    return string_paths

def get_entity_name(entity_id):

    if entity_id in entity_id_to_name:
        return entity_id_to_name[entity_id]
    elif entity_id in relation_id_to_name:
        return relation_id_to_name[entity_id]
    else:
        return "user_{}".format(entity_id)

## >> Load KG

In [10]:
# Load KG from cache
kg_path = pickle.load(open("../data/cache_kg_path", "rb"))

## >> Prepare path for prediction

In [11]:
IRREGULAR_RELATION = ['200024', '200025']
NUM_OF_ITEMS = len(file_moviesIdx)

def _generate_path_from_entity_to_all_others(entity_id, keep_relation_ratio=0.5, max_relation_num=30, max_entity_per_relation=1):

    generated_paths = []

    # Get direct relation attached to entity1
    r1 = kg_path[entity_id]

    # List all possible item-entities
    all_item_entities = [str(x) for x in range(0, NUM_OF_ITEMS)]
    for e2_id in all_item_entities:

        # Skip if e1 == e2 (path to itself)
        if e2_id == entity_id:
            continue

        # Get relations attached to entity2
        r2 = kg_path[e2_id]

        # Downsample intersect relation
        intersect_relations = list(set(r1.keys()).intersection(set(r2.keys())))
        if len(intersect_relations) < max_relation_num:
            n_intersect_relation = len(intersect_relations)
        else:
            n_intersect_relation = max(max_relation_num, int(len(intersect_relations) * keep_relation_ratio))

        # Find intersect between relation 1 and relation 2
        np.random.shuffle(intersect_relations)
        for relation in intersect_relations[:n_intersect_relation]:

            # Find entities attached to relation (non-item entities)
            intersect_entity = list(set(r1[relation]).intersection(set(r2[relation])))

            # Down sample intersecting entities
            np.random.shuffle(intersect_entity)
            for mid_entity in intersect_entity[:max_entity_per_relation]:

                # Format path and add to result
                inverse_relation = relation if (relation in IRREGULAR_RELATION) else str(int(relation) + 1)
                path = "{} {} {} {} {}".format(entity_id, relation, mid_entity, inverse_relation, e2_id)
                generated_paths.append(path)

    return generated_paths

In [12]:
REL_ID_GIVEN_GOOD_RATING = '200027'
def _generate_all_path_from_user(user_id, max_seed_num=10, max_seed_ratio=0.2):

    # List down all user interacted items
    user_interacted_items = []
    for relation in kg_path[user_id]:
        user_interacted_items += kg_path[user_id][relation]
    user_interacted_items = sorted(set(user_interacted_items))

    # Downsample seeds (item interacted)
    if len(user_interacted_items) < max_seed_num:
        n_seed = len(user_interacted_items)
    else:
        n_seed = max(max_seed_num, int(len(user_interacted_items) * max_seed_ratio))
    np.random.shuffle(user_interacted_items)
    user_interacted_items = user_interacted_items[:n_seed]

    # ====== Threading ======
    user_paths = []
    threads = []
    for i in range(0, n_seed):

        # Split items id equally
        thread_items = user_interacted_items[i::n_seed]
        thread = threading.Thread(target=_run_thread, args=(i, thread_items, user_paths))
        thread.start()
        threads.append(thread)

    # Wait for all thread to finish
    for i in range(0, n_seed):
        threads[i].join()

    # Reformat : add user id and 'REL_ID_GIVEN_GOOD_RATING'
    user_paths = ["{} {} {}".format(user_id, REL_ID_GIVEN_GOOD_RATING, x) for x in user_paths]
    return user_paths

In [13]:
# Run path preparation using thread to reduce time
def _run_thread(thread_id, thread_items, result):
    for item_id in (thread_items):
        result += _generate_path_from_entity_to_all_others(item_id, keep_relation_ratio=0.5,
                                                           max_relation_num=MAX_RELATION_NUM, max_entity_per_relation=1)

 ## >> Utility function

In [14]:
def _reformat_user_path(user_paths):

    new_paths = []
    labels = []

    for path in user_paths:
        e1, r1, e2, r2, e3, r3, e4 = path.strip().split()

        t1 = entity_type_to_id[get_type_from_entity_id(e1)]
        t2 = entity_type_to_id[get_type_from_entity_id(e2)]
        t3 = entity_type_to_id[get_type_from_entity_id(e3)]
        t4 = entity_type_to_id[get_type_from_entity_id(e4)]

        r4 = REL_ID_END

        entity_rated = kg_path[e1][REL_ID_GIVEN_GOOD_RATING]
        label = 1 if e4 in entity_rated else 0

        new_paths.append([
            [e1, t1, r1],
            [e2, t2, r2],
            [e3, t3, r3],
            [e4, t4, r4],
        ])

        labels.append(label)

    return np.array(new_paths).astype('int'), np.array(labels).astype('int')

# > Evaluation

## >> Evaluation function

In [15]:
def _get_k_prediction(X_test, y_pred, k=10, get_best=True, max_pooling_size=-1):
    """
    max_pooling_size -1 means without pooling
    """
    if max_pooling_size > 0:
        return _get_top_k_items_with_score_pooling(X_test, y_pred, k=10, max_pooling_size=max_pooling_size, get_best=get_best)
    else:
        return _get_top_k_items_without_score_pooling(X_test, y_pred, k=10, get_best=get_best)

def _get_top_k_items_without_score_pooling(X_test, y_pred, k=10, get_best=True):
    """
    Get the top-k items from X_test based on y_pred scores, 
    pick all path in sorted X_test until the amount of unique items is equal to k
    """
    choosen_paths = []
    choosen_items = set()

    path_scores = sorted(zip(X_test, y_pred), key=lambda x: x[1], reverse=get_best)
    for path, score in path_scores:
        choosen_paths.append((path, score))
        choosen_items.add(path[3][0])  # Add the last item

        if len(choosen_items) >= k:
            break

    return choosen_paths, choosen_items

def _get_top_k_items_with_score_pooling(X_test, y_pred, k=10, max_pooling_size=5, get_best=True):
    """
    Get the top-k items from X_test based on averaged y_pred scores,
    pick 'max_pooling_size' paths for each item, rank item based on average score, pick top k items.
    """
    paths_group_by_items = {}
    score_group_by_items = {}

    path_scores = sorted(zip(X_test, y_pred), key=lambda x: x[1], reverse=get_best)
    for path, score in path_scores:

        ending_item = path[3][0]  # The last item before END RELATION

        if ending_item not in paths_group_by_items:
            paths_group_by_items[ending_item] = []
            score_group_by_items[ending_item] = []

        # Pool the paths and score
        if len(paths_group_by_items[ending_item]) < max_pooling_size:
            paths_group_by_items[ending_item].append((path, score))
            score_group_by_items[ending_item].append(score)

    # Calculate average
    for item in score_group_by_items:
        score_group_by_items[item] = sum(score_group_by_items[item]) / len(score_group_by_items[item])

    # Sort pooled result
    sorted_average_score = sorted(score_group_by_items.items(), key=lambda kv: kv[1], reverse=get_best)
    choosen_items = {key for key, v in sorted_average_score[:k]}

    # Filter top-k items with highest average score
    choosen_paths = []
    for item_id in choosen_items:
        choosen_paths += paths_group_by_items[item_id]

    return choosen_paths, choosen_items

In [16]:
def get_suggestion(user_id, k=10, max_pooling_size=-1):

    user_paths = _generate_all_path_from_user(user_id, max_seed_num=MAX_SEED_NUM, max_seed_ratio=0.3)
    X_test, _ = _reformat_user_path(user_paths)

    y_pred = model.predict(X_test, batch_size=2048, verbose=1)

    top_paths, top_choosen_items = _get_k_prediction(X_test, y_pred, k=10, get_best=True, max_pooling_size=max_pooling_size)
    worst_path, worst_choosen_items = _get_k_prediction(X_test, y_pred, k=10, get_best=False, max_pooling_size=max_pooling_size)

    return top_paths, top_choosen_items, worst_path, worst_choosen_items

In [17]:
def get_top_truth(user_id):
    return set(kg_path[user_id][REL_ID_GIVEN_GOOD_RATING])

In [18]:
def check_precision(pred, truth, k=10):

    # Make sure same type
    pred = {str(x) for x in pred}
    truth = {str(x) for x in truth}

    intersect = pred.intersection(truth)

    len_intersect = len(intersect)
    len_truth = len(truth) if 0 < len(truth) <= k else k

    return intersect, len_intersect / len_truth

## >> Reasoning

In [19]:
def get_reasoning_paths(paths, truth_items=None):
    reasoning_paths = []
    for i in range(0, len(paths)):
        user_path = paths[i]
        user_truth = truth_items[i]

        for path in user_path:

            score = path[1]
            reason_path = [str(score)]

            for sequence in path[0]:
                entity, _, relation = sequence
                e = get_entity_name(str(entity))
                r = get_entity_name(str(relation))

                reason_path.append(e)
                reason_path.append(r)

                if str(relation) == REL_ID_END:
                    watched = 'watched' if str(entity) in user_truth else "nope"
                    reason_path.append(watched)

            reason = " -> ".join(reason_path)
            reasoning_paths.append(reason)
    return reasoning_paths


def get_reasoning_paths_df(paths, truth_items=None):
    reasoning_paths = []
    for i in range(0, len(paths)):
        user_path = paths[i]
        user_truth = truth_items[i]

        for path in user_path:

            score = path[1]
            reason_path = [str(score)]

            for sequence in path[0]:
                entity, _, relation = sequence
                e = get_entity_name(str(entity))
                r = get_entity_name(str(relation))

                reason_path.append(e)
                reason_path.append(r)

                if str(relation) == REL_ID_END:
                    watched = 'watched' if str(entity) in user_truth else "nope"
                    reason_path.append(watched)

            reasoning_paths.append(reason_path)
    return pd.DataFrame(reasoning_paths)

In [20]:
def compare_prediction_truth(predictions, truths):

    predictions = [get_entity_name(str(p)) for p in predictions]
    truths = [get_entity_name(str(t)) for t in truths]

    print("Predictions : ")
    for p in sorted(predictions):
        is_watched = "watched" if str(p) in truths else "nope"
        print("{} > {}".format(p, is_watched))

    print("\n")
    print("Truth : ")
    for t in sorted(truths):
        print(t)

## >> Run evaluation

In [21]:
k_suggestions = 10
n_users = 30

sample_user = np.random.randint(500001, 630000, n_users)
sample_user = [str(x) for x in sample_user]

top_paths = []
top_items = []
worst_paths = []
worst_items = []

truth_items = []

n_paths = []
intersects = []
scores = []

all_intersect = None
all_union = None

for user in tqdm(sample_user):
    top_suggested_path, top_suggested_items, worst_suggested_path, worst_suggested_items = get_suggestion(user, k=k_suggestions, max_pooling_size=3)
    top_truth_items = get_top_truth(user)

    intersect, score = check_precision(top_suggested_items, top_truth_items)

    top_paths.append(top_suggested_path)
    top_items.append(top_suggested_items)

    worst_paths.append(worst_suggested_path)
    worst_items.append(worst_suggested_items)

    intersects.append(intersect)
    truth_items.append(top_truth_items)

    n_paths.append(len(top_suggested_path))
    scores.append(score)

    if all_intersect is None:
        all_intersect = top_suggested_items
    else:
        all_intersect = all_intersect.intersection(top_suggested_items)

    if all_union is None:
        all_union = top_suggested_items
    else:
        all_union = all_union.union(top_suggested_items)

  0%|          | 0/30 [00:00<?, ?it/s]



  3%|▎         | 1/30 [01:54<55:32, 114.90s/it]



  7%|▋         | 2/30 [02:12<40:01, 85.77s/it] 



 10%|█         | 3/30 [03:45<39:31, 87.85s/it]



 13%|█▎        | 4/30 [05:12<37:56, 87.55s/it]



 17%|█▋        | 5/30 [06:16<33:34, 80.60s/it]



 20%|██        | 6/30 [10:21<52:00, 130.03s/it]



 23%|██▎       | 7/30 [17:38<1:25:07, 222.05s/it]



 27%|██▋       | 8/30 [18:16<1:01:10, 166.85s/it]



 30%|███       | 9/30 [24:57<1:22:58, 237.05s/it]



 33%|███▎      | 10/30 [25:43<59:56, 179.83s/it] 



 37%|███▋      | 11/30 [27:28<49:49, 157.37s/it]



 40%|████      | 12/30 [28:29<38:30, 128.34s/it]



 43%|████▎     | 13/30 [28:53<27:30, 97.07s/it] 



 47%|████▋     | 14/30 [29:26<20:46, 77.94s/it]



 50%|█████     | 15/30 [30:15<17:17, 69.19s/it]



 53%|█████▎    | 16/30 [32:26<20:27, 87.70s/it]



 57%|█████▋    | 17/30 [33:12<16:17, 75.19s/it]



 60%|██████    | 18/30 [35:28<18:41, 93.46s/it]



 63%|██████▎   | 19/30 [37:36<19:02, 103.85s/it]



 67%|██████▋   | 20/30 [38:08<13:42, 82.29s/it] 



 70%|███████   | 21/30 [46:22<30:50, 205.59s/it]



 73%|███████▎  | 22/30 [52:46<34:34, 259.33s/it]



 77%|███████▋  | 23/30 [53:32<22:45, 195.13s/it]



 80%|████████  | 24/30 [1:00:25<26:03, 260.65s/it]



 83%|████████▎ | 25/30 [1:02:06<17:42, 212.58s/it]



 87%|████████▋ | 26/30 [1:02:40<10:36, 159.07s/it]



 90%|█████████ | 27/30 [1:04:25<07:08, 142.87s/it]



 93%|█████████▎| 28/30 [1:05:56<04:14, 127.24s/it]



 97%|█████████▋| 29/30 [1:06:40<01:42, 102.38s/it]



100%|██████████| 30/30 [1:07:51<00:00, 93.09s/it] 


In [22]:
print("Prec@k score:", np.average(scores))
# print("top_suggested_items:", top_suggested_items)
# print("truth_items:", truth_items)

print("\nintersect")
print(all_intersect, len(all_intersect))
print("\nunion")
print(all_union, len(all_union))
print("\ndistinct rate")
print((len(all_union)) / (n_users * k_suggestions))

Prec@k score: 0.25111111111111106

intersect
{10659} 1

union
{8962, 10371, 911, 11536, 1169, 1170, 1168, 9104, 14868, 534, 2073, 11677, 546, 10659, 676, 1321, 11435, 10796, 5548, 1457, 6834, 15154, 5939, 693, 4791, 9399, 5441, 5058, 13122, 12355, 14025, 2506, 3401, 10825, 11212, 4687, 7631, 3538, 2899, 4820, 90, 10842, 2525, 8672, 14048, 11494, 3815, 4071, 4473, 5738, 8811, 491, 1774, 12016, 5619, 11892, 9846, 5497, 2298, 5629, 2942} 61

distinct rate
0.20333333333333334


In [23]:
sample_user_idx = 0
get_reasoning_paths([top_paths[sample_user_idx]], [truth_items[sample_user_idx]])
compare_prediction_truth(top_items[sample_user_idx], truth_items[sample_user_idx])

Predictions : 
http://dbpedia.org/resource/Back_to_the_Future_Part_III > nope
http://dbpedia.org/resource/Being_John_Malkovich > nope
http://dbpedia.org/resource/Fargo_(film) > nope
http://dbpedia.org/resource/Forrest_Gump > watched
http://dbpedia.org/resource/Good_Will_Hunting > watched
http://dbpedia.org/resource/Goodfellas > nope
http://dbpedia.org/resource/Pulp_Fiction > nope
http://dbpedia.org/resource/Saving_Private_Ryan > nope
http://dbpedia.org/resource/Schindler's_List > watched
http://dbpedia.org/resource/Toy_Story > nope


Truth : 
http://dbpedia.org/resource/28_Days_(film)
http://dbpedia.org/resource/Adventures_in_Babysitting_(2016_film)
http://dbpedia.org/resource/Apollo_13_(film)
http://dbpedia.org/resource/As_Good_as_It_Gets
http://dbpedia.org/resource/Big_Trouble_in_Little_China
http://dbpedia.org/resource/Boys_on_the_Side
http://dbpedia.org/resource/Braveheart_(1925_film)
http://dbpedia.org/resource/Cast_Away
http://dbpedia.org/resource/Cinderella_Man
http://dbpedia.or

--------