 # **Test KPRN Result**

In [1]:
# Name of the training run to evaluate and the 1-based epoch whose
# checkpoint is picked from that run's log directory.
TEST_CODE = "2019-07-19 05:47:47"
CHOSEN_EPOCH = 5

# Directory that is listed later to find the saved weights of the run.
MODEL_DIR = "../logs/" + TEST_CODE

# > Config

In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings that may matter
# while debugging; narrow the filter if something looks off.
warnings.filterwarnings(action='ignore')

In [3]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# TF1-style session settings: log the device every op is placed on, and let
# the GPU allocator grow instead of taking its memory up front.
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True

# Create the session and hand it to Keras as its backend session.
sess = tf.Session(config=config)
set_session(sess)

Using TensorFlow backend.


In [4]:
from tqdm import tqdm

# > Load model

In [5]:
import os

# Entries of the run directory in lexicographic order; the checkpoint for
# CHOSEN_EPOCH is taken by position (epoch 1 -> index 0).
# NOTE(review): this assumes the directory holds only checkpoint files and
# that their names sort in epoch order (e.g. zero-padded) -- confirm.
trained_weights = os.listdir(MODEL_DIR)
trained_weights.sort()
choosen_weight = MODEL_DIR + "/" + trained_weights[CHOSEN_EPOCH - 1]

In [6]:
from keras.models import load_model
from keras.models import Model

# Load the full saved Keras model from the selected checkpoint file.
# The trailing ';' suppresses the cell's output repr in the notebook.
model = load_model(choosen_weight);

W0722 06:55:42.524200 139948314203904 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0722 06:55:42.587604 139948314203904 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0722 06:55:42.634318 139948314203904 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0722 06:55:42.946032 139948314203904 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_w

# > Load data

## >> Read data

In [7]:
def _read_lines(path):
    """Return every line of the text file at ``path`` (line endings kept).

    The file is opened in a ``with`` block so the handle is closed as soon
    as the lines are read instead of being left for the garbage collector.
    """
    with open(path) as handle:
        return handle.readlines()


# Raw lines of each data file; parsing happens in the cells below.
file_ratings_re = _read_lines("../data/ratings_re.csv")
file_triples_idx = _read_lines("../data/triples_idx.txt")

file_moviesIdx = _read_lines("../data/moviesIdx.txt")
file_types = _read_lines("../data/types.txt")
file_entities = _read_lines("../data/entities.txt")
file_relations = _read_lines("../data/relations.txt")

In [8]:
# Map every entity id (kept as a string) to its name. Each row must hold
# exactly two whitespace-separated tokens: "<name> <id>". Movie rows are
# processed first, so a row from the entities file with the same id wins.
entity_id_to_name = {}
for id_rows in (file_moviesIdx, file_entities):
    for row in id_rows:
        name, entity_key = row.strip().split()
        entity_id_to_name[entity_key] = name

In [9]:
# Build two lookups from the tab-separated "<entity>\t<type>" rows:
#   movie_title_to_entity_type : entity -> entity type
#   entity_list_with_type      : entity type -> entities of that type (file order)
movie_title_to_entity_type = {}
entity_list_with_type = {}

for row in file_types:
    entity, entity_type = row.strip().split('\t')

    movie_title_to_entity_type[entity] = entity_type
    # setdefault creates the per-type list the first time a type is seen
    entity_list_with_type.setdefault(entity_type, []).append(entity)

In [10]:
# Map relation id (string) -> relation name. Ids from the file are shifted
# by 200000, and every relation whose shifted id is below 200023 also gets an
# "<name>_inverse" entry at id + 1.
relation_id_to_name = {}
for row in file_relations:
    relation_name, raw_id = row.strip().split()
    relation_id = 200000 + int(raw_id)

    # The last two relations (spouse and relative) have no inverse.
    has_inverse = relation_id < 200023
    if has_inverse:
        relation_id_to_name[str(relation_id + 1)] = relation_name + "_inverse"

    # Written after the inverse entry, exactly as before, so the order in
    # which keys are (over)written is unchanged.
    relation_id_to_name[str(relation_id)] = relation_name

In [11]:
# Entity types in id order; the two '#'-prefixed entries are special tokens.
list_of_type = [
    "Category",
    "Company",
    "Country",
    "Genre",
    "Movie",
    "Person",
    "User",

    "#PAD_TOKEN",
    "#UNK_ENTITY_TYPE",
]

# entity_type_to_id    : type name -> position in list_of_type
# list_of_type_with_id : "<type>\t<id>" strings in the same order
entity_type_to_id = {}
list_of_type_with_id = []

for type_id, type_name in enumerate(list_of_type):
    entity_type_to_id[type_name] = type_id
    list_of_type_with_id.append("{}\t{}".format(type_name, type_id))

In [12]:
# Load the cached knowledge-graph path structure.
# It is indexed below as kg_path[entity_id][relation_id] -> list of entity ids.
#
# NOTE: unpickling can execute arbitrary code, so only load a cache file that
# was produced by this project. The file is opened in a ``with`` block so the
# handle is closed right after loading.

import pickle

with open("../data/cache_kg_path", "rb") as cache_file:
    kg_path = pickle.load(cache_file)

In [13]:
# Entity ids strictly greater than this value are treated as users
# (see get_type_from_entity_id).
user_entity_id_padding = 500000
# Relation ids (as strings) for the rating relations.
# NOTE(review): judging by the names these are a relation and its inverse
# (movie -> user, user -> movie) -- confirm against the KG builder.
relation_rated_good_by_id = '200026'
relation_given_good_rating_id = '200027'
# NOTE(review): presumably the minimum rating counted as "good"; it is not
# referenced in the visible cells.
good_movie_rating_threshold = 4

## >> Prepare data

In [14]:
import threading

In [15]:
# Relations that are used as their own inverse when a path is built
# (every other relation uses relation id + 1 as the inverse).
IRREGULAR_RELATION = ['200024', '200025']
# Number of rows in the movie index file; candidate target ids are
# 0 .. NUM_OF_ITEMS - 1.
NUM_OF_ITEMS = len(file_moviesIdx)

def generate_path_from_entity_to_all_others(entity_id, keep_relation_ratio=0.1, max_relation_num=3, max_entity_per_relation=1):
    """Build two-hop paths from ``entity_id`` to every other item id.

    For each target id in ``0 .. NUM_OF_ITEMS - 1`` (as a string, skipping
    ``entity_id`` itself) the relations present for both entities in
    ``kg_path`` are collected, randomly down-sampled, and for each kept
    relation up to ``max_entity_per_relation`` shared neighbour entities are
    turned into a path string::

        "<entity_id> <relation> <mid_entity> <inverse_relation> <target_id>"

    Parameters
    ----------
    entity_id : str
        Source entity; must be a key of ``kg_path``.
    keep_relation_ratio : float
        Fraction of shared relations kept when at least ``max_relation_num``
        are shared.
    max_relation_num : int
        When fewer relations than this are shared, all are kept; otherwise
        this is the *minimum* number kept (the larger of this and the ratio).
    max_entity_per_relation : int
        Upper bound on intermediate entities used per kept relation.

    Returns
    -------
    list of str
        Generated paths, ordered by target id. Sampling uses the global
        ``np.random`` state, so results vary unless it is seeded.
    """
    source_relations = kg_path[entity_id]
    paths = []

    for target_id in map(str, range(0, NUM_OF_ITEMS)):
        # A path from an entity to itself is not useful.
        if target_id == entity_id:
            continue

        target_relations = kg_path[target_id]
        shared_relations = set(source_relations.keys()).intersection(set(target_relations.keys()))

        # Down-sample the shared relations.
        n_shared = len(shared_relations)
        if n_shared >= max_relation_num:
            n_keep = max(max_relation_num, int(n_shared * keep_relation_ratio))
        else:
            n_keep = n_shared

        shared_relations = list(shared_relations)
        np.random.shuffle(shared_relations)

        for relation in shared_relations[:n_keep]:
            shared_neighbours = list(
                set(source_relations[relation]).intersection(set(target_relations[relation]))
            )
            np.random.shuffle(shared_neighbours)

            for mid_entity in shared_neighbours[:max_entity_per_relation]:
                # Irregular relations are their own inverse; the rest use id + 1.
                if relation in IRREGULAR_RELATION:
                    inverse_relation = relation
                else:
                    inverse_relation = str(int(relation) + 1)

                paths.append(
                    "{} {} {} {} {}".format(entity_id, relation, mid_entity, inverse_relation, target_id)
                )

    return paths

In [16]:
def run_thread(thread_id, thread_items, result):
    """Thread worker: append the paths generated for each item to ``result``.

    ``thread_id`` is accepted for identification only and is not used.
    ``result`` is a list shared with the caller and is extended in place.
    """
    for seed_item in thread_items:
        seed_paths = generate_path_from_entity_to_all_others(
            seed_item,
            keep_relation_ratio=0.2,
            max_relation_num=5,
            max_entity_per_relation=1,
        )
        result.extend(seed_paths)
            
def generate_all_path_from_user(user_id, max_seed_num=10, max_seed_ratio=0.2):
    """Generate three-hop paths that start at ``user_id``.

    Every entity reachable from the user through any relation in
    ``kg_path[user_id]`` is a candidate seed. Seeds are de-duplicated,
    sorted, shuffled with the global ``np.random`` state and truncated; one
    thread per kept seed then runs ``run_thread`` to collect the seed's
    paths into a shared list.

    Parameters
    ----------
    user_id : str
        Key of the user in ``kg_path``.
    max_seed_num : int
        When the user has fewer seeds than this, all are used; otherwise
        this is the *minimum* number used.
    max_seed_ratio : float
        Fraction of the seeds used when that exceeds ``max_seed_num``.

    Returns
    -------
    list of str
        Each collected path prefixed with ``"<user_id> 200027 "``. The order
        across seeds depends on thread completion order.
    """
    seed_pool = set()
    for related_entities in kg_path[user_id].values():
        seed_pool.update(related_entities)
    seeds = sorted(seed_pool)

    # Down-sample the seeds.
    n_available = len(seeds)
    if n_available >= max_seed_num:
        n_seed = max(max_seed_num, int(n_available * max_seed_ratio))
    else:
        n_seed = n_available

    np.random.shuffle(seeds)
    seeds = seeds[:n_seed]

    # One worker per slot; slot i handles seeds[i::n_seed].
    collected = []
    workers = []
    for slot in range(n_seed):
        worker = threading.Thread(target=run_thread, args=(slot, seeds[slot::n_seed], collected))
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    return ["{} {} {}".format(user_id, "200027", tail) for tail in collected]

In [17]:
def get_type_from_entity_id(entity_id):
    """Return the entity-type name for ``entity_id``.

    Ids strictly greater than ``user_entity_id_padding`` are ``"User"``.
    Any other id is translated to its name through ``entity_id_to_name`` and
    then to a type through ``movie_title_to_entity_type``; names without a
    recorded type yield ``"#UNK_ENTITY_TYPE"``.

    Raises ``KeyError`` when a non-user id is missing from
    ``entity_id_to_name``.
    """
    if int(entity_id) > user_entity_id_padding:
        return "User"

    entity_name = entity_id_to_name[entity_id]
    return movie_title_to_entity_type.get(entity_name, "#UNK_ENTITY_TYPE")

In [18]:
import numpy as np

# Relation id placed on the last hop of every reformatted path
# (see reformat_user_path).
END_RELATION = '200030'
# Relation under which kg_path stores the items used as a user's
# ground truth / positive labels.
GIVEN_RATING_RELATION = '200027'
# NOTE(review): not referenced in the visible cells -- confirm whether it is
# still needed.
BATCH_COUNT = 72

def reformat_user_path(user_paths):
    """Convert path strings into model inputs and labels.

    Each path must split into seven whitespace-separated tokens
    ``e1 r1 e2 r2 e3 r3 e4``. Every hop becomes ``[entity, type_id,
    relation]``; the last hop uses ``END_RELATION`` as its relation. A path
    is labelled 1 when ``e4`` is among ``kg_path[e1][GIVEN_RATING_RELATION]``
    and 0 otherwise.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Integer array of the encoded paths (one 4 x 3 block per path) and an
        integer array of labels, in input order.
    """
    def type_id_of(entity):
        # entity id -> type name -> numeric type id
        return entity_type_to_id[get_type_from_entity_id(entity)]

    encoded_paths = []
    labels = []

    for raw_path in user_paths:
        e1, r1, e2, r2, e3, r3, e4 = raw_path.strip().split()
        hops = ((e1, r1), (e2, r2), (e3, r3), (e4, END_RELATION))

        encoded = [[entity, type_id_of(entity), relation] for entity, relation in hops]

        rated_by_user = kg_path[e1][GIVEN_RATING_RELATION]
        if e4 in rated_by_user:
            label = 1
        else:
            label = 0

        encoded_paths.append(encoded)
        labels.append(label)

    return np.array(encoded_paths).astype('int'), np.array(labels).astype('int')

# > Evaluation

In [19]:
def get_top_suggestion(user_id, k=10):
    """Score generated paths for ``user_id`` and keep the best-scoring ones.

    Paths are generated with ``generate_all_path_from_user``, encoded with
    ``reformat_user_path`` and scored by ``model.predict``. They are then
    walked in descending score order until ``k`` distinct final items (the
    entity of the last hop) have been seen.

    Returns
    -------
    (list, set)
        ``(path, score)`` pairs that were visited, best first, and the set
        of distinct final items among them. Several visited paths can end
        at the same item, so the list may be longer than the set.
    """
    candidate_paths = generate_all_path_from_user(user_id, max_seed_num=15, max_seed_ratio=0.3)
    X_test, _unused_labels = reformat_user_path(candidate_paths)
    predicted = model.predict(X_test, batch_size=2048, verbose=1)

    ranked = sorted(zip(X_test, predicted), key=lambda pair: pair[1], reverse=True)

    top_paths = []
    top_items = set()
    for encoded_path, path_score in ranked:
        top_paths.append((encoded_path, path_score))
        final_item = encoded_path[3][0]  # entity of the last hop
        top_items.add(final_item)

        if len(top_items) >= k:
            break

    return top_paths, top_items

In [20]:
def get_top_truth(user_id):
    """Return the set of items stored for ``user_id`` under GIVEN_RATING_RELATION in ``kg_path``."""
    rated_items = kg_path[user_id][GIVEN_RATING_RELATION]
    return set(rated_items)

In [21]:
def check_precision(pred, truth):
    """Compute the precision of a set of suggested items.

    Both collections are compared by the string form of their elements, so
    ``10`` and ``"10"`` count as the same item.

    Precision is the share of *suggested* items that are relevant, i.e.
    ``len(pred & truth) / len(pred)``. (Dividing by ``len(truth)`` would be
    recall, which does not match the reported "Prec@k" figure.)

    Parameters
    ----------
    pred : iterable
        Suggested item ids.
    truth : iterable
        Ground-truth item ids.

    Returns
    -------
    (set, number)
        The matching ids (as strings) and the precision. When nothing was
        suggested the result is ``(set(), 0)``.
    """
    # Normalise both sides to strings so numpy ints and str ids compare equal.
    pred = {str(x) for x in pred}
    truth = {str(x) for x in truth}

    if not pred:
        # Always return a set here so callers get a consistent type.
        return set(), 0

    intersect = pred.intersection(truth)
    return intersect, len(intersect) / len(pred)

----

In [None]:
# Draw 50 user ids (with replacement) from [500001, 630000) and keep them as
# strings, which is how kg_path is indexed.
# NOTE(review): the id range is hard-coded and the RNG is not seeded, so the
# sample differs on every run -- confirm the range matches the user id space.
sample_user = [str(user_id) for user_id in np.random.randint(500001, 630000, 50)]

# Per-user results, aligned by position with sample_user.
top_paths, n_paths, intersects, scores = [], [], [], []

for user in tqdm(sample_user):
    top_suggested_path, top_suggested_items = get_top_suggestion(user, k=10)
    top_truth_items = get_top_truth(user)
    intersect, score = check_precision(top_suggested_items, top_truth_items)

    for bucket, value in (
        (top_paths, top_suggested_path),
        (n_paths, len(top_suggested_path)),
        (intersects, intersect),
        (scores, score),
    ):
        bucket.append(value)

  0%|          | 0/50 [00:00<?, ?it/s]



  2%|▏         | 1/50 [02:05<1:42:25, 125.43s/it]



  4%|▍         | 2/50 [03:57<1:37:09, 121.44s/it]



  6%|▌         | 3/50 [07:38<1:58:27, 151.22s/it]



  8%|▊         | 4/50 [10:05<1:55:00, 150.00s/it]



 10%|█         | 5/50 [12:22<1:49:37, 146.17s/it]



 12%|█▏        | 6/50 [15:15<1:53:04, 154.19s/it]



 14%|█▍        | 7/50 [17:45<1:49:31, 152.83s/it]



 16%|█▌        | 8/50 [21:17<1:59:27, 170.66s/it]



 18%|█▊        | 9/50 [22:17<1:33:52, 137.38s/it]



 20%|██        | 10/50 [23:15<1:15:48, 113.71s/it]



 22%|██▏       | 11/50 [24:07<1:01:54, 95.24s/it] 



 24%|██▍       | 12/50 [27:43<1:23:14, 131.44s/it]



 26%|██▌       | 13/50 [28:58<1:10:39, 114.58s/it]



 28%|██▊       | 14/50 [29:01<48:34, 80.97s/it]   



 30%|███       | 15/50 [30:42<50:44, 86.98s/it]



 32%|███▏      | 16/50 [34:45<1:15:48, 133.77s/it]



 34%|███▍      | 17/50 [42:17<2:06:03, 229.19s/it]



 36%|███▌      | 18/50 [43:32<1:37:33, 182.91s/it]



 38%|███▊      | 19/50 [44:54<1:18:51, 152.64s/it]



 40%|████      | 20/50 [48:14<1:23:26, 166.90s/it]



 42%|████▏     | 21/50 [53:15<1:40:07, 207.16s/it]



 44%|████▍     | 22/50 [53:44<1:11:41, 153.64s/it]



 46%|████▌     | 23/50 [56:03<1:07:11, 149.31s/it]



 48%|████▊     | 24/50 [57:06<53:25, 123.29s/it]  



 50%|█████     | 25/50 [58:23<45:39, 109.60s/it]



 52%|█████▏    | 26/50 [59:28<38:29, 96.21s/it] 



 54%|█████▍    | 27/50 [1:01:35<40:22, 105.31s/it]



 56%|█████▌    | 28/50 [1:03:22<38:50, 105.95s/it]



 58%|█████▊    | 29/50 [1:06:49<47:41, 136.26s/it]



 60%|██████    | 30/50 [1:09:54<50:15, 150.75s/it]



 62%|██████▏   | 31/50 [1:10:12<35:09, 111.02s/it]



 64%|██████▍   | 32/50 [1:11:26<29:59, 99.97s/it] 



 66%|██████▌   | 33/50 [1:14:07<33:27, 118.10s/it]



 68%|██████▊   | 34/50 [1:15:50<30:16, 113.53s/it]



 70%|███████   | 35/50 [1:19:15<35:15, 141.02s/it]



 72%|███████▏  | 36/50 [1:20:23<27:50, 119.29s/it]



 74%|███████▍  | 37/50 [1:21:03<20:41, 95.51s/it] 



 76%|███████▌  | 38/50 [1:21:26<14:44, 73.70s/it]



 78%|███████▊  | 39/50 [1:22:09<11:49, 64.51s/it]



 80%|████████  | 40/50 [1:28:46<27:23, 164.30s/it]



 82%|████████▏ | 41/50 [1:32:18<26:47, 178.64s/it]



 84%|████████▍ | 42/50 [1:33:08<18:38, 139.78s/it]



 86%|████████▌ | 43/50 [1:35:42<16:48, 144.08s/it]



 88%|████████▊ | 44/50 [1:38:25<14:59, 149.97s/it]

## Check prec@k result 

In [None]:
import numpy as np

# Mean of the per-user scores collected in `scores` by the evaluation loop.
print("Prec@k score:", np.average(scores))

## Check suggestion diversity

In [24]:
# Compare the top-k suggestions of n random users: how many items every user
# shares (intersection) and how many distinct items appear overall (union).
k = 10
n = 10

sample_user = np.random.randint(500001, 630000, n)
sample_user = [str(x) for x in sample_user]

# Each user is evaluated exactly once. Evaluating the first user a second
# time inside the loop would cost one extra expensive call and, because path
# generation is random, would add a second run's items to the union.
intersect = None
uni = set()

for user in sample_user:
    top_suggested_path, top_suggested_items = get_top_suggestion(user, k)

    if intersect is None:
        intersect = set(top_suggested_items)
    else:
        intersect = intersect.intersection(top_suggested_items)
    uni = uni.union(top_suggested_items)

# No users sampled (n == 0): report an empty intersection.
if intersect is None:
    intersect = set()

print("\nintersect")
print(intersect, len(intersect))
print("\nunion")
print(uni, len(uni))
print("\ndistinct rate")
# Share of distinct items among the n * k suggestion slots.
print((len(uni)) / (n*k))


intersect
set() 0

union
{4353, 8962, 4365, 11536, 7442, 14868, 534, 6167, 2840, 2073, 10524, 12067, 10791, 10796, 12077, 2863, 5167, 6452, 14136, 4666, 10811, 1599, 578, 14403, 3401, 4687, 854, 8029, 2914, 8811, 9326, 8559, 7024, 2160, 9846, 5246, 6534, 11143, 136, 7049, 14222, 1423, 911, 3214, 10388, 9620, 14231, 11677, 15267, 10659, 1457, 6834, 3252, 4791, 1976, 1719, 5058, 14532, 15302, 14025, 15049, 7113, 7631, 9176, 15323, 13023, 5601, 9189, 12262, 11494, 3815, 3816, 14828, 7917, 7405, 7663, 5619} 77

distinct rate
0.77
