# Check reasoning

In [1]:
import pickle
from tqdm import tqdm

In [2]:
# Load the preprocessed bundle plus the id -> DBpedia-URI listings.
# Every file is opened in a `with` block so its handle is closed right after reading.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by a trusted preprocessing run.
with open("../data/movie/preprocessed_data_info_32", 'rb') as data_info_file:
    data_info = pickle.load(data_info_file)

with open("../data/movie/entities2.txt") as entities_file:
    entities = entities_file.readlines()

with open("../data/movie/moviesIdx2.txt") as movies_file:
    movies = movies_file.readlines()

In [3]:
# Element 5 of the preprocessed bundle is used as the per-user ripple set.
ripple_set = data_info[5]

# Map id (kept as a string) -> readable name: second whitespace-separated
# field of each line with the DBpedia resource prefix removed and underscores
# turned into spaces. Movies are processed after entities, so a movie line
# wins when both files contain the same id.
RESOURCE_PREFIX = 'http://dbpedia.org/resource/'

dict_entities = {}
for listing in (entities, movies):
    for raw_line in listing:
        fields = raw_line.strip().split()
        readable_name = fields[1].replace(RESOURCE_PREFIX, '').replace("_", " ")
        dict_entities[fields[0]] = readable_name

In [6]:
# Relation URI -> id offset. Each relation is registered in dict_entities under
# 200000 + offset, and its inverse under 200001 + offset.
# NOTE(review): "spouse" (24) and "relative" (25) are only one apart, so the key
# "200025" is first written as "Inverse spouse" and then overwritten with
# "relative" -- confirm whether that overlap is intended.
relations = {
    "http://dbpedia.org/ontology/director": 0,
    "http://dbpedia.org/ontology/writer": 2,
    "http://dbpedia.org/ontology/starring": 4,
    "http://dbpedia.org/ontology/creator": 6,
    "http://purl.org/dc/terms/subject": 8,
    "http://dbpedia.org/ontology/musicComposer": 10,
    "http://dbpedia.org/ontology/country": 12,
    "http://dbpedia.org/property/story": 14,
    "http://dbpedia.org/property/studio": 16,
    "http://dbpedia.org/ontology/genre": 18,
    "http://dbpedia.org/ontology/cinematography": 20,
    "http://dbpedia.org/ontology/distributor": 22,
    "http://dbpedia.org/ontology/spouse": 24,
    "http://dbpedia.org/ontology/relative": 25,
}

for uri, offset in relations.items():
    # The readable relation name is the last path segment of the URI.
    short_name = uri.rsplit('/', 1)[-1]

    dict_entities[str(offset + 200000)] = short_name
    dict_entities[str(offset + 200001)] = "Inverse " + short_name

# Make String version

In [95]:
# Human-readable ripple sets: user id (int) -> list of hops, where each hop is a
# list of "head => relation => tail" strings.
s_ripple_set = {}

for user in tqdm(ripple_set):

    s_ripple_hop = []
    for ripple_hop in ripple_set[user]:
        # The hop_ prefix keeps this unpacking from rebinding the module-level
        # `relations` dict (relation URI -> id offset) defined earlier.
        # Assumes the three sequences are parallel (same length) -- TODO confirm.
        hop_heads, hop_relations, hop_tails = ripple_hop

        s_ripple_hop.append([
            "{} => {} => {}".format(
                dict_entities[str(int(head))],
                dict_entities[str(int(relation))],
                dict_entities[str(int(tail))],
            )
            for head, relation, tail in zip(hop_heads, hop_relations, hop_tails)
        ])

    s_ripple_set[int(user)] = s_ripple_hop

100%|██████████| 137589/137589 [00:24<00:00, 5527.22it/s]


# Make Inference

In [96]:
def intersection(lst1, lst2):
    """Return the distinct values of ``lst1`` that also occur in ``lst2``.

    The result is ordered by first occurrence in ``lst1``, so it is
    reproducible from run to run (the order no longer depends on set
    iteration order).

    Args:
        lst1: iterable of hashable values; determines the output order.
        lst2: iterable of hashable values to test membership against.

    Returns:
        A list without duplicates; empty when either input is empty.
    """
    # A set gives O(1) membership checks while scanning lst1.
    members = set(lst2)
    # dict.fromkeys removes duplicates while keeping insertion order.
    return list(dict.fromkeys(value for value in lst1 if value in members))

In [97]:
# Two-hop reasoning paths per user: user id (int) -> list of (path, path_s),
# where path = (h0, r0, h1, r1, t1) chains a hop-0 triple into a hop-1 triple
# through an entity that is both a hop-0 tail and a hop-1 head.
inferenced_ripple_set = {}
for user in tqdm(ripple_set):

    # Both hop 0 and hop 1 are required; users without them are skipped and get
    # no entry. Only the lookup errors are caught so that real problems
    # (and KeyboardInterrupt) are no longer silently swallowed.
    try:
        ripple_hop_0 = ripple_set[user][0]
        ripple_hop_1 = ripple_set[user][1]
    except (IndexError, KeyError):
        continue

    hop_0_heads, hop_0_relations, hop_0_tails = ripple_hop_0
    hop_1_heads, hop_1_relations, hop_1_tails = ripple_hop_1

    # Position of the first triple whose tail (hop 0) / head (hop 1) equals a
    # given entity; replaces a linear scan per bridging entity.
    first_tail_pos = {}
    for pos, tail in enumerate(hop_0_tails):
        first_tail_pos.setdefault(tail, pos)

    first_head_pos = {}
    for pos, head in enumerate(hop_1_heads):
        first_head_pos.setdefault(head, pos)

    intersect_t0_h1 = intersection(hop_0_tails, hop_1_heads)
    intersect_path = []
    for entity in intersect_t0_h1:

        # Hop-0 triple that ends in the bridging entity
        i = first_tail_pos[entity]
        h0 = hop_0_heads[i]
        r0 = hop_0_relations[i]

        # Hop-1 triple that starts from the bridging entity
        j = first_head_pos[entity]
        h1 = hop_1_heads[j]
        r1 = hop_1_relations[j]
        t1 = hop_1_tails[j]

        path = (h0, r0, h1, r1, t1)
        path_s = " => ".join([dict_entities[str(int(x))] for x in path])
        intersect_path.append((path, path_s))

    inferenced_ripple_set[int(user)] = intersect_path

100%|██████████| 137589/137589 [00:12<00:00, 11371.92it/s]


# Make history dict

# Liked-movie history from the raw ratings CSV: user id (int) -> names of the
# movies rated at or above `threshold`.
# NOTE: user_preference_history is reassigned further down in this notebook,
# so this version only matters if that later code is not run.
with open("../data/movie/ratings_re.csv", encoding="utf-8") as ratings_file:
    ratings = ratings_file.readlines()

threshold = 4
user_preference_history = {}

for line in tqdm(ratings):
    # Only the first three comma-separated fields are used.
    user, movie_id, rating = line.strip().split(",")[:3]

    # Every user gets an entry, even when none of their ratings pass the threshold.
    liked_movies = user_preference_history.setdefault(int(user), [])

    if float(rating) >= threshold:
        liked_movies.append(dict_entities[str(movie_id)])

# Liked-movie history from the tab-separated ratings file: user id (int) ->
# names of the movies whose rating field equals 1.
# NOTE: user_preference_history is reassigned again further down in this notebook.
with open("../data/movie/ratings_final.txt", encoding="utf-8") as ratings_file:
    ratings = ratings_file.readlines()

user_preference_history = {}

for line in tqdm(ratings):
    # Each line must hold exactly three tab-separated fields.
    user, movie_id, rating = line.strip().split("\t")

    # Every user gets an entry, even when they have no rating equal to 1.
    liked_movies = user_preference_history.setdefault(int(user), [])

    if float(rating) == 1:
        liked_movies.append(dict_entities[str(movie_id)])

In [14]:
import numpy as np

In [15]:
# Elements 0, 1 and 2 of the preprocessed bundle are used as the train, eval
# and test splits; stacking them gives every interaction row in one array.
train_data, eval_data, test_data = data_info[0], data_info[1], data_info[2]

history = np.concatenate([train_data, eval_data, test_data], axis=0)

In [98]:
# Split every interaction row into per-user lists of movie names:
# rating == 1 goes to user_preference_history, anything else to user_hate_history.
user_preference_history = {}
user_hate_history = {}

for line in tqdm(history):
    user, movie_id, rating = line
    user_key = int(user)

    # Both dicts always get their entry for a user at the same time.
    if user_key not in user_preference_history:
        user_preference_history[user_key] = []
        user_hate_history[user_key] = []

    movie_name = dict_entities[str(int(movie_id))]
    bucket = user_preference_history if float(rating) == 1 else user_hate_history
    bucket[user_key].append(movie_name)

100%|██████████| 13724776/13724776 [00:40<00:00, 336751.84it/s]


# Test

In [102]:
def test_reasoning(sample_user):
    
    string_ver = sorted(s_ripple_set[sample_user][0])
    hist = sorted(user_preference_history[sample_user])
    hate = sorted(user_hate_history[sample_user])
    inference = inferenced_ripple_set[sample_user]
    
    print("===== STRING VERSION =====")
    for x in string_ver:
        print(x)
    print("\n===== USER HISTORY =====")
    for x in hist:
        print(x)
#     print("\n-- HATED --")
#     for x in hate:
#         print(x)
    print("\n===== INFERENCE =====")
    for x in inference:
        print(x)

In [104]:
# Print the hop-0 triples, liked history and inferred paths for one example user.
sample_user = 96699
test_reasoning(sample_user)

===== STRING VERSION =====
Ace Ventura: When Nature Calls => genre => Comedy
Ace Ventura: When Nature Calls => starring => Maynard Eziashi
Ace Ventura: When Nature Calls => subject => Category:American sequel films
Ace Ventura: When Nature Calls => subject => Category:Films directed by Steve Oedekerk
Apollo 13 (film) => starring => Bill Paxton
Apollo 13 (film) => starring => Ed Harris
Apollo 13 (film) => subject => Category:American aviation films
Apollo 13 (film) => subject => Category:Films directed by Ron Howard
Babe (film) => genre => Children
Babe (film) => starring => James Cromwell
Babe (film) => subject => Category:Films featuring anthropomorphic characters
Cliffhanger (film) => distributor => TriStar Pictures
Clueless (film) => subject => Category:American buddy films
Clueless (film) => subject => Category:American romantic comedy films
Clueless (film) => subject => Category:Screenplays by Amy Heckerling
Crimson Tide (film) => distributor => Walt Disney Studios Motion Pictures

# Note:
Apparently, the ripple-hop tails do not necessarily receive a high score at prediction time:

In [None]:
# NOTE(review): get_top_suggestion is not defined in the visible part of this
# notebook -- presumably it comes from another module or session; confirm
# before running on a fresh kernel.
suggestion = get_top_suggestion(96699, 10000)

# Hand-picked item ids to look for among the (score, item_id) suggestions.
some_tails = [13240, 7159, 3777, 14994, 12366, 2797, 602, 7191, 6741, 3403]

matching = (
    (score, item_id)
    for score, item_id in suggestion
    if item_id in some_tails
)
for score, item_id in matching:
    print(score, item_id)

0.3980206685321706 3777 <br>
0.10237747024394224 6741 <br>
0.06157134852737707 12366 <br>
0.008790826381946485 3403 <br>
0.0009740564897270111 2797 <br>
0.000362757349141969 602 <br>

So the result also relies on the randomness involved in creating the ripple set: if the ripple set is sampled badly, the user's preferences become distorted.