 # **Test KPRN Result**

In [1]:
# Name of the log sub-directory to evaluate (looks like a training-run timestamp).
TEST_CODE = "2019-07-17 12:51:28"
# Directory of that run, built from TEST_CODE under ../logs.
MODEL_DIR = "../logs/{}".format(TEST_CODE)
# 1-based position of the checkpoint to load from the sorted directory listing
# (presumably the epoch number; verify against the file names in MODEL_DIR).
CHOSEN_EPOCH = 5

# > Config

In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings are hidden as well.
warnings.filterwarnings(action='ignore')

In [3]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# Build a TensorFlow 1.x session configuration.
config = tf.ConfigProto()
# Let the GPU memory allocation grow on demand instead of reserving it all up front.
config.gpu_options.allow_growth = True
# Log the device each operation is placed on.
config.log_device_placement = True

# Create the session with that configuration and register it with the Keras backend.
sess = tf.Session(config=config)
set_session(sess)

Using TensorFlow backend.


In [4]:
from tqdm import tqdm

# > Load model

In [5]:
import os
# List the entries of the run directory in lexicographic order.
# NOTE(review): assumes MODEL_DIR holds only checkpoint files whose names sort in epoch order; TODO confirm.
trained_weights = sorted(os.listdir(MODEL_DIR))
# Pick the CHOSEN_EPOCH-th entry (1-based) and build its path under MODEL_DIR.
choosen_weight = "{}/{}".format(MODEL_DIR, trained_weights[CHOSEN_EPOCH - 1])

In [6]:
from keras.models import load_model
from keras.models import Model

# Load the saved Keras model from the chosen checkpoint file
# (the trailing semicolon keeps the notebook from echoing the cell's value).
model = load_model(choosen_weight);

W0719 01:50:10.853295 140186200762112 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0719 01:50:10.859103 140186200762112 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0719 01:50:10.883099 140186200762112 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0719 01:50:10.998543 140186200762112 deprecation_wrapper.py:119] From /home/jessinra/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_w

# > Load data

## >> Read data

In [7]:
def _read_lines(path):
    """Return every line of the text file at `path`, closing the file afterwards."""
    with open(path) as text_file:
        return text_file.readlines()


# Raw lines of each input file; the cells below parse them into lookups.
file_ratings_re = _read_lines("../data/ratings_re.csv")
file_triples_idx = _read_lines("../data/triples_idx.txt")

file_moviesIdx = _read_lines("../data/moviesIdx.txt")
file_types = _read_lines("../data/types.txt")
file_entities = _read_lines("../data/entities.txt")
file_relations = _read_lines("../data/relations.txt")

In [8]:
# Build the entity id -> name lookup from the movie index and the entity list.
# Every line is split on whitespace into exactly (name, id). The entity list is
# processed last, so its name wins when an id appears in both files.

entity_id_to_name = {}
for name_lines in (file_moviesIdx, file_entities):
    for raw_line in name_lines:
        name, ent_id = raw_line.strip().split()
        entity_id_to_name[ent_id] = name

In [9]:
# Build two lookups from the types file (one tab-separated "entity, type" pair per line):
#   movie_title_to_entity_type : entity name -> its type
#   entity_list_with_type      : type -> entity names of that type, in file order

movie_title_to_entity_type = {}
entity_list_with_type = {}

for raw_line in file_types:
    entity_name, type_name = raw_line.strip().split('\t')

    movie_title_to_entity_type[entity_name] = type_name
    # setdefault creates the per-type list the first time a type is seen
    entity_list_with_type.setdefault(type_name, []).append(entity_name)

In [10]:
# Build the relation id -> name lookup. Ids read from the relations file are
# shifted by 200000 and stored as string keys.

relation_id_to_name = {}
for raw_line in file_relations:
    name, raw_id = raw_line.strip().split()
    shifted_id = int(raw_id) + 200000

    # Relations below 200023 also get an "_inverse" entry at the next id;
    # the last two relations (spouse and relative) have no inverse.
    if shifted_id < 200023:
        relation_id_to_name[str(shifted_id + 1)] = name + "_inverse"

    # Written after the inverse entry, exactly as before, so a real relation
    # always overrides an inverse generated for the same id on this line.
    relation_id_to_name[str(shifted_id)] = name

In [31]:
# Every entity type known to the model; a type's position in this list is its id.
# The two trailing "#..." entries are the padding token and the unknown-type fallback.
list_of_type = [
    "Category",
    "Company",
    "Country",
    "Genre",
    "Movie",
    "Person",
    "User",

    "#PAD_TOKEN",
    "#UNK_ENTITY_TYPE",
]

# type name -> integer id (its index in list_of_type)
entity_type_to_id = {
    type_name: type_id for type_id, type_name in enumerate(list_of_type)
}

# One tab-separated "type, id" string per type, in list order
list_of_type_with_id = [
    "{}\t{}".format(type_name, type_id)
    for type_id, type_name in enumerate(list_of_type)
]

In [11]:
# Load KG from cache

import pickle
# Open the cache inside a `with` block so the file handle is closed once loaded.
# NOTE: unpickling can execute arbitrary code; only load a cache file you trust.
# kg_path is indexed below as kg_path[entity_id][relation_id], giving a collection of entity ids.
with open("../data/cache_kg_path", "rb") as kg_cache_file:
    kg_path = pickle.load(kg_cache_file)

In [12]:
# Entity ids strictly greater than this value are treated as users
# (see get_type_from_entity_id).
user_entity_id_padding = 500000
# Relation ids, as strings, of the two rating relations.
# NOTE(review): the names suggest movie-to-user ("rated good by") and
# user-to-movie ("given good rating") directions; confirm.
relation_rated_good_by_id = '200026'
relation_given_good_rating_id = '200027'
# Presumably the minimum rating counted as "good"; not referenced in the visible cells. TODO confirm.
good_movie_rating_threshold = 4

## >> Prepare data

In [17]:
import threading

In [18]:
# Relation ids that are reused unchanged as their own inverse when a path is built.
IRREGULAR_RELATION = ['200024', '200025']
# Number of lines in the movie index file; candidate target ids run from 0 to NUM_OF_ITEMS - 1.
NUM_OF_ITEMS = len(file_moviesIdx)

def generate_path_from_entity_to_all_others(entity_id, path_per_relation=1):
    """Generate two-hop paths from `entity_id` to every other item.

    Every id from "0" to str(NUM_OF_ITEMS - 1), except `entity_id` itself, is a
    candidate target. For each relation that both the start entity and the
    candidate have in `kg_path`, the neighbours they share under that relation
    are shuffled and up to `path_per_relation` of them are used, each giving
    one space-separated path string:

        entity_id relation shared_neighbour inverse_relation candidate_id

    The inverse relation is the relation itself for ids in IRREGULAR_RELATION,
    otherwise the relation id plus one.

    Args:
        entity_id: start entity id, used as a key of `kg_path` and compared
            against string candidate ids.
        path_per_relation: maximum number of paths per (candidate, relation).

    Returns:
        list of path strings. The chosen neighbours depend on numpy's global
        random state.

    Lookups into `kg_path` are not guarded: a missing start or candidate id
    propagates the lookup error (KeyError for a dict).
    """
    generated_paths = []

    r1 = kg_path[entity_id]

    # Loop-invariant data of the start entity, built once rather than once per
    # candidate item. The sets hold exactly what the per-iteration
    # set(...) calls used to build, so results and shuffles are unchanged.
    r1_relations = set(r1.keys())
    r1_neighbours = {}  # relation -> set(r1[relation]), filled on first use

    for candidate in range(0, NUM_OF_ITEMS):
        e2_id = str(candidate)

        # Skip if e1 == e2 (path to itself)
        if e2_id == entity_id:
            continue

        r2 = kg_path[e2_id]
        intersect_relations = r1_relations.intersection(set(r2.keys()))

        for relation in intersect_relations:

            if relation not in r1_neighbours:
                r1_neighbours[relation] = set(r1[relation])

            t2 = r2[relation]
            intersect_entity = list(r1_neighbours[relation].intersection(set(t2)))
            np.random.shuffle(intersect_entity)

            for mid_entity in intersect_entity[:path_per_relation]:

                inverse_relation = relation if (relation in IRREGULAR_RELATION) else str(int(relation) + 1)
                path = "{} {} {} {} {}".format(entity_id, relation, mid_entity, inverse_relation, e2_id)
                generated_paths.append(path)

    return generated_paths

In [19]:
def run_thread(thread_id, thread_items, result):
    """Thread worker: append the generated paths of each item to `result`.

    Args:
        thread_id: identifier of the worker; not used by the body.
        thread_items: item ids to generate paths from.
        result: shared list that is extended in place.
    """
    for seed_item in thread_items:
        paths_from_seed = generate_path_from_entity_to_all_others(seed_item)
        result += paths_from_seed
            
def generate_all_path_from_user(user_id, num_seed=3):
    """Generate paths that start at `user_id` and pass through seed items.

    Up to `num_seed` of the entities linked to the user in `kg_path` are picked
    at random as seeds. One thread per seed slot collects the paths from its
    seed to every other item, and every collected path is then prefixed with
    the user id and relation "200027".

    Args:
        user_id: user id, used as a key of `kg_path`.
        num_seed: number of seed items, and also the number of threads started.

    Returns:
        list of path strings. Their order depends on thread scheduling and the
        seeds on numpy's global random state.
    """
    # Gather the entities linked to the user, across all of the user's relations.
    linked_items = [
        item
        for relation in kg_path[user_id]
        for item in kg_path[user_id][relation]
    ]

    # De-duplicate into a sorted list, shuffle it, then keep the first num_seed.
    seed_items = sorted(set(linked_items))
    np.random.shuffle(seed_items)
    seed_items = seed_items[:num_seed]

    user_paths = []

    # ====== Threading ======
    workers = []
    for worker_idx in range(num_seed):
        # Strided slice: worker i takes seeds i, i + num_seed, ...
        worker_items = seed_items[worker_idx::num_seed]
        worker = threading.Thread(target=run_thread, args=(worker_idx, worker_items, user_paths))
        worker.start()
        workers.append(worker)

    # Wait for every worker before reading the shared list.
    for worker in workers:
        worker.join()

    return ["{} {} {}".format(user_id, "200027", tail) for tail in user_paths]

In [21]:
def get_type_from_entity_id(entity_id):
    """Return the type name of an entity id.

    Ids whose integer value is greater than `user_entity_id_padding` are users.
    Any other id is resolved through `entity_id_to_name` and then
    `movie_title_to_entity_type`. An id with no known name, or a name with no
    known type, maps to "#UNK_ENTITY_TYPE" instead of raising KeyError.

    Args:
        entity_id: entity id; must be convertible with int() and is used as-is
            as the key of `entity_id_to_name`.

    Returns:
        "User", a type name from `movie_title_to_entity_type`, or
        "#UNK_ENTITY_TYPE".
    """
    # user
    if int(entity_id) > user_entity_id_padding:
        return "User"

    # An id without a name falls back to the unknown type rather than crashing.
    if entity_id not in entity_id_to_name:
        return "#UNK_ENTITY_TYPE"

    entity_name = entity_id_to_name[entity_id]
    return movie_title_to_entity_type.get(entity_name, "#UNK_ENTITY_TYPE")

In [47]:
import numpy as np

# Relation id placed on the last step of every reformatted path.
END_RELATION = '200030'
# Relation id under which kg_path lists, per start entity, the entities that count as positive labels.
GIVEN_RATING_RELATION = '200027'
# NOTE(review): not referenced in the visible cells; purpose unknown, TODO confirm.
BATCH_COUNT = 72

def reformat_user_path(user_paths):
    """Convert path strings into model inputs and labels.

    Each path is a whitespace-separated string of seven ids, in the order
    e1 r1 e2 r2 e3 r3 e4. It becomes four [entity, type id, relation] steps,
    where the last step uses END_RELATION. The label is 1 when e4 is listed
    under GIVEN_RATING_RELATION for e1 in `kg_path`, otherwise 0.

    Args:
        user_paths: iterable of path strings.

    Returns:
        (paths, labels): integer arrays of shape (n, 4, 3) and (n,). Empty
        input gives shapes (0, 4, 3) and (0,), so the first array keeps the
        same rank as for non-empty input.
    """
    new_paths = []
    labels = []

    # start entity -> set of its rated entities, built once per start entity
    # so the membership test below is not a linear scan for every path
    rated_by_start = {}

    for path in user_paths:
        e1, r1, e2, r2, e3, r3, e4 = path.strip().split()

        t1 = entity_type_to_id[get_type_from_entity_id(e1)]
        t2 = entity_type_to_id[get_type_from_entity_id(e2)]
        t3 = entity_type_to_id[get_type_from_entity_id(e3)]
        t4 = entity_type_to_id[get_type_from_entity_id(e4)]

        r4 = END_RELATION

        if e1 not in rated_by_start:
            rated_by_start[e1] = set(kg_path[e1][GIVEN_RATING_RELATION])
        label = 1 if e4 in rated_by_start[e1] else 0

        new_paths.append([
            [e1, t1, r1],
            [e2, t2, r2],
            [e3, t3, r3],
            [e4, t4, r4],
        ])

        labels.append(label)

    # np.array([]) would be 1-D; return correctly shaped empty arrays instead.
    if not new_paths:
        return np.zeros((0, 4, 3), dtype='int'), np.zeros((0,), dtype='int')

    return np.array(new_paths).astype('int'), np.array(labels).astype('int')

# > Evaluation

In [100]:
def get_top_suggestion(user_id, k=10):
    """Score the generated paths of a user and keep the best ones.

    Paths are generated from 3 seed items, scored with `model`, and walked in
    descending score order until `k` distinct final entities have been seen.

    Args:
        user_id: user id passed to generate_all_path_from_user.
        k: number of distinct final entities to collect.

    Returns:
        (top_paths, top_choosen_items): the visited (path, score) pairs in
        descending score order, and the set of final entity ids taken from the
        last step of each visited path. Fewer than `k` items are returned when
        the paths do not reach `k` distinct entities, and ([], set()) when no
        path could be generated for the user.
    """
    user_paths = generate_all_path_from_user(user_id, num_seed=3)
    X_test, _ = reformat_user_path(user_paths)

    # Nothing to score: skip the model call, which cannot take an empty batch
    # of the wrong rank.
    if len(X_test) == 0:
        return [], set()

    y_pred = model.predict(X_test, batch_size=2048, verbose=1)

    path_score = sorted(zip(X_test, y_pred), key=lambda x: x[1], reverse=True)

    top_paths = []
    top_choosen_items = set()
    for path, score in path_score:

        top_paths.append((path, score))
        top_choosen_items.add(path[3][0])  # entity id of the last step

        if len(top_choosen_items) >= k:
            break

    return top_paths, top_choosen_items

In [101]:
def get_top_truth(user_id):
    """Return the distinct entities stored under GIVEN_RATING_RELATION for `user_id`."""
    rated_entities = kg_path[user_id][GIVEN_RATING_RELATION]
    return set(rated_entities)

In [112]:
def check_precision(pred, truth):
    """Compare predicted ids with true ids.

    Both collections are converted to sets of strings first, so ids of
    different types (for example numpy integers and strings) still match.

    Args:
        pred: iterable of predicted ids.
        truth: iterable of true ids.

    Returns:
        (hits, ratio): `hits` is the set of string ids present in both, and
        `ratio` is len(hits) / len(truth). When `truth` is empty the result is
        (set(), 0).

    NOTE(review): dividing by len(truth) is a recall-style ratio; precision
    would divide by len(pred). Confirm which one is intended.
    """
    # Make sure same type
    pred_ids = {str(x) for x in pred}
    truth_ids = {str(x) for x in truth}

    # Always return a set for the hits, including the empty-truth case.
    if not truth_ids:
        return set(), 0

    hits = pred_ids.intersection(truth_ids)
    return hits, len(hits) / len(truth_ids)

In [120]:
# Draw 50 user ids at random (repeats possible) from 500001 up to, but not
# including, 630000, and keep them as strings.
sample_user = list(map(str, np.random.randint(500001, 630000, 50)))

# Per-user results, index-aligned with sample_user.
top_paths, intersects, scores = [], [], []

for sampled_user in tqdm(sample_user):
    # Suggestions are computed before the ground truth is looked up.
    suggested_paths, suggested_items = get_top_suggestion(sampled_user, k=10)
    hit_items, hit_ratio = check_precision(suggested_items, get_top_truth(sampled_user))

    top_paths.append(suggested_paths)
    intersects.append(hit_items)
    scores.append(hit_ratio)

  0%|          | 0/50 [00:00<?, ?it/s]



  2%|▏         | 1/50 [00:51<42:24, 51.92s/it]



  4%|▍         | 2/50 [01:53<43:55, 54.90s/it]



  6%|▌         | 3/50 [02:12<34:25, 43.94s/it]



  8%|▊         | 4/50 [02:41<30:17, 39.51s/it]



 10%|█         | 5/50 [02:52<23:15, 31.00s/it]



 12%|█▏        | 6/50 [03:16<21:08, 28.83s/it]



 14%|█▍        | 7/50 [04:07<25:29, 35.56s/it]



 16%|█▌        | 8/50 [04:18<19:48, 28.29s/it]



 18%|█▊        | 9/50 [04:44<18:48, 27.52s/it]



 20%|██        | 10/50 [05:00<16:02, 24.07s/it]



 22%|██▏       | 11/50 [05:45<19:39, 30.24s/it]



 24%|██▍       | 12/50 [06:36<23:14, 36.70s/it]



 26%|██▌       | 13/50 [07:01<20:18, 32.93s/it]



 28%|██▊       | 14/50 [07:15<16:19, 27.22s/it]



 30%|███       | 15/50 [07:38<15:16, 26.19s/it]



 32%|███▏      | 16/50 [08:12<16:12, 28.59s/it]



 34%|███▍      | 17/50 [09:39<25:19, 46.03s/it]



 36%|███▌      | 18/50 [09:59<20:25, 38.30s/it]



 38%|███▊      | 19/50 [10:27<18:07, 35.09s/it]



 40%|████      | 20/50 [11:31<21:51, 43.72s/it]



 42%|████▏     | 21/50 [11:49<17:25, 36.06s/it]



 44%|████▍     | 22/50 [11:59<13:09, 28.20s/it]



 46%|████▌     | 23/50 [12:28<12:48, 28.47s/it]



 48%|████▊     | 24/50 [12:48<11:14, 25.94s/it]



 50%|█████     | 25/50 [13:48<15:04, 36.17s/it]



 52%|█████▏    | 26/50 [14:16<13:30, 33.78s/it]



 54%|█████▍    | 27/50 [14:57<13:47, 35.97s/it]



 56%|█████▌    | 28/50 [15:03<09:47, 26.72s/it]



 58%|█████▊    | 29/50 [15:43<10:49, 30.91s/it]



 60%|██████    | 30/50 [16:03<09:10, 27.54s/it]



 62%|██████▏   | 31/50 [16:35<09:11, 29.05s/it]



 64%|██████▍   | 32/50 [16:59<08:12, 27.38s/it]



 66%|██████▌   | 33/50 [17:20<07:11, 25.37s/it]



 68%|██████▊   | 34/50 [18:10<08:45, 32.85s/it]



 70%|███████   | 35/50 [18:16<06:13, 24.91s/it]



 72%|███████▏  | 36/50 [19:09<07:45, 33.24s/it]



 74%|███████▍  | 37/50 [19:40<07:02, 32.54s/it]



 76%|███████▌  | 38/50 [20:16<06:44, 33.74s/it]



 78%|███████▊  | 39/50 [20:57<06:32, 35.71s/it]



 80%|████████  | 40/50 [21:14<05:00, 30.10s/it]



 82%|████████▏ | 41/50 [21:58<05:09, 34.38s/it]



 84%|████████▍ | 42/50 [22:06<03:32, 26.52s/it]



 86%|████████▌ | 43/50 [22:33<03:06, 26.65s/it]



 88%|████████▊ | 44/50 [22:51<02:24, 24.01s/it]



 90%|█████████ | 45/50 [23:09<01:51, 22.29s/it]



 92%|█████████▏| 46/50 [23:23<01:19, 19.80s/it]



 94%|█████████▍| 47/50 [23:47<01:02, 20.90s/it]



 96%|█████████▌| 48/50 [24:13<00:44, 22.47s/it]



 98%|█████████▊| 49/50 [24:40<00:23, 23.76s/it]



100%|██████████| 50/50 [24:53<00:00, 20.64s/it]


## Check user - num of rating dist

In [122]:
import numpy as np

# Mean of the per-user scores collected by the evaluation loop above.
np.average(scores)

0.06699360188619984

## Check suggestion diversity

In [125]:
# k suggestions for each of n randomly drawn users.
k = 10
n = 10

sample_user = np.random.randint(500001, 630000, n)
sample_user = [str(x) for x in sample_user]

# Seed both running sets with the first user's suggestions.
top_suggested_path, top_suggested_items = get_top_suggestion(sample_user[0], k)

intersect = top_suggested_items
uni = top_suggested_items

# Start at the second user: the first one is already counted. Scoring it
# again would cost one more full prediction and, because seed items are drawn
# at random, could add an extra set of items while the rate below still
# divides by n * k.
for i in range(1, n):
    top_suggested_path, top_suggested_items = get_top_suggestion(sample_user[i], k)

    intersect = intersect.intersection(top_suggested_items)
    uni = uni.union(top_suggested_items)

print("\nintersect")
print(intersect, len(intersect))
print("\nunion")
print(uni, len(uni))
# Share of distinct items among the n * k suggestion slots.
print("\ndistinct rate")
print((len(uni)) / (n*k))


intersect
{11536} 1

union
{4224, 2179, 7304, 6796, 3214, 911, 11536, 1168, 10644, 2840, 2073, 8729, 3482, 8860, 3996, 11677, 546, 10659, 676, 10796, 10801, 11314, 9274, 5441, 13122, 5058, 12355, 967, 14025, 3401, 4687, 7631, 12368, 14164, 15323, 477, 3551, 11494, 9066, 491, 8559, 1392, 4080, 5619, 11892, 9846, 5497, 6139} 48

distinct rate
0.48
