# Calculate loss score of false triples 

## modules

In [2]:
import torch
import json
import pylab as plt
import pandas as pd
from sources.evaluation import vanilla_hits_at_k
from IPython.display import display, HTML
from pykeen.datasets import get_dataset
from util.databinder import DataBinder
from pykeen.evaluation import RankBasedEvaluator
from sklearn.metrics import precision_recall_curve, auc
from util.databinder import DataBinder

## variables, functions and classes

In [16]:
def load_json_from_file(f:str):
    """Read a JSON file and return the decoded object.

    Parameters
    ----------
    f : str
        Path of the JSON file to read.

    Returns
    -------
    The Python object produced by ``json.load`` (a dict for the
    ``results.json`` files read in this notebook).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(f, 'r', encoding='utf-8') as fin:
        return json.load(fin)

## parameters

In [53]:
# directory for trained model (read through DataBinder and torch.load below)
dir_model = './models/20240803/kge_try1_transe_fb15k237'

# directory for false triples (read through DataBinder below)
dir_false_dataset = './data/processed/20240803/false_dataset_try1_fb15k237'

# directory for report (the evaluation results are written here)
dir_report = './reports/20240803/eval_results_try1_fb15k237_transe'

## main

### input check

In [8]:
# Open the binders that describe the trained models and the false-triple data set.
db_model = DataBinder(target_dir=dir_model)
db_false_dataset = DataBinder(target_dir=dir_false_dataset)

INFO:root:Loaded info from ./models/20240803/kge_try1_transe_fb15k237/info.json
INFO:root:Loaded info from ./data/processed/20240803/false_dataset_try1_fb15k237/info.json


In [9]:
# The model and the false-triple data set must refer to the same data set name.
name_in_model = db_model.get('dataset_name')
name_in_false_dataset = db_false_dataset.get('dataset_name')
if name_in_model != name_in_false_dataset:
    raise Exception('a name of data set is inconsistent between knowledge graph embedding model and data set')

### read model & data

In [19]:
# Load the data set named in the model's metadata, with inverse triples enabled.
dataset = get_dataset(
    dataset=db_model.get('dataset_name'),
    dataset_kwargs={'create_inverse_triples': True},
)

INFO:pykeen.datasets.utils:Loading cached preprocessed dataset from file:///home/acg16558pn/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
INFO:pykeen.triples.triples_factory:Loading from file:///home/acg16558pn/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/training
INFO:pykeen.triples.triples_factory:Loading from file:///home/acg16558pn/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/testing
INFO:pykeen.triples.triples_factory:Loading from file:///home/acg16558pn/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/validation


In [29]:
# One trained model and one results.json per random seed; each seed has its
# own sub-directory under dir_model.
# NOTE: torch.load unpickles the file, so dir_model must only contain trusted artefacts.
dict_models = {
    seed_key: torch.load(f'{dir_model}/{seed_key}/trained_model.pkl')
    for seed_key in db_model.get('list_random_seeds')
}
dict_model_info = {
    seed_key: load_json_from_file(f'{dir_model}/{seed_key}/results.json')
    for seed_key in db_model.get('list_random_seeds')
}

In [30]:
# Collect the stored objects of every false-triple data set, keyed by the
# integer value of its random seed.
# NOTE(review): presumably 'tf' holds the corrupted triples and 'tt' the
# original ones (both expose .mapped_triples further down) - confirm.
db_false_triples = DataBinder(target_dir=dir_false_dataset)
list_random_seed = db_false_triples.get('list_random_seed')
dict_false_triples = {}
for random_seed in list_random_seed:
    seed_key = int(random_seed)
    dict_false_triples[seed_key] = {
        'tf': db_false_triples.get(f'tf_{random_seed}'),
        'tt': db_false_triples.get(f'tt_{random_seed}'),
        'df_feature': db_false_triples.get(f'df_tt_tf_features_{random_seed}'),
    }

INFO:root:Loaded info from ./data/processed/20240803/false_dataset_try1_fb15k237/info.json


### evaluate models

In [36]:
# One (initially empty) result container per model seed; the cells below fill them in.
dict_eval_results = dict(
    (seed, {}) for seed in db_model.get('list_random_seeds')
)

Hits@k of the knowledge graph embedding model (calculated by pykeen)

In [38]:
# Copy the hits@k values stored in each model's results.json
# (metrics -> both -> realistic) into the evaluation results.
for seed in db_model.get('list_random_seeds'):
    realistic_metrics = dict_model_info[seed]['metrics']['both']['realistic']
    dict_data = {k: realistic_metrics[f'hits_at_{k}'] for k in (1, 3, 5, 10)}
    dict_eval_results[seed]['hits@k'] = dict_data

Hits@k of the knowledge graph embedding model (calculated by homemade function)

In [42]:
# Recompute hits@k on the testing triples with the project's own implementation.
for seed in db_model.get('list_random_seeds'):
    model_for_seed = dict_models[seed]
    dict_data = vanilla_hits_at_k(model_for_seed, dataset.testing.mapped_triples)
    dict_eval_results[seed]['vanilla hits@k'] = dict_data

Calculate score and true negative ratio

In [51]:
def _true_negative_ratio_at(df_sorted_feature, k):
    """Summarise the lowest-scored fraction ``k`` of a score-sorted feature table.

    Parameters
    ----------
    df_sorted_feature : pandas.DataFrame
        Table sorted by ascending ``'score'``; the columns ``'score'`` and
        ``'is-error'`` are read.
    k : float
        Fraction of the rows taken from the head of the table
        (``int(len(table) * k)`` rows).

    Returns
    -------
    dict
        ``'value'``: number of selected rows with ``is-error == True`` divided by
        the number of selected rows with ``is-error`` equal to True or False.
        ``'threashold'``: largest score among the selected rows (the key spelling
        is kept because it is the key written to the saved results).
        Both entries are NaN when no row is selected.
    """
    n_top = int(len(df_sorted_feature) * k)
    df_top = df_sorted_feature.head(n_top)

    is_error = df_top['is-error']
    tn = int((is_error == True).sum())
    fp = int((is_error == False).sum())

    # With few rows int(n*k) is 0; report NaN instead of dividing by zero.
    n_labelled = tn + fp
    tnr = tn / n_labelled if n_labelled else float('nan')

    return {'value': tnr, 'threashold': df_top['score'].max()}


for model_random_seed in db_model.get('list_random_seeds'):
    # The model depends on the outer loop only, so look it up once per model seed.
    kge_model = dict_models[model_random_seed]

    for random_seed in dict_false_triples.keys():

        tf = dict_false_triples[random_seed]['tf']
        tt = dict_false_triples[random_seed]['tt']
        # NOTE: this is the object stored in dict_false_triples, so the score
        # columns written below are overwritten for every model seed.
        df_feature = dict_false_triples[random_seed]['df_feature']

        # Scoring is inference only; no_grad avoids building an autograd graph.
        with torch.no_grad():
            score_tf = kge_model.score_hrt(tf.mapped_triples).cpu().detach().numpy()
            df_feature['score'] = score_tf

            score_tt = kge_model.score_hrt(tt.mapped_triples).cpu().detach().numpy()
            df_feature['score(org)'] = score_tt

        # Ascending sort: the lowest-scored triples come first.
        df_sorted_feature = df_feature.sort_values('score')

        dict_false_triples[random_seed]['true_negative_ratio'] = {
            k: _true_negative_ratio_at(df_sorted_feature, k)
            for k in [0.01, 0.03, 0.05]
        }

    # A fresh dict is created per model seed above, so the entries collected
    # here are not touched by later iterations of the outer loop.
    dict_eval_results[model_random_seed]['true_negative_ratio'] = {
        false_seed: false_data['true_negative_ratio']
        for false_seed, false_data in dict_false_triples.items()
    }

### save results

In [54]:
# Binder for the report directory; the evaluation results are added to it below.
db = DataBinder(target_dir=dir_report)

INFO:root:Create ./reports/20240803/eval_results_try1_fb15k237_transe


In [18]:
# Persist the evaluation results together with the metadata of both inputs.
db.add('dict_eval_results', dict_eval_results)
db.add('dict_model_info', db_model.info)
# The false-triple data set metadata gets its own key: re-using
# 'dict_model_info' would overwrite the model metadata stored just above,
# and the previously referenced `db_dataset` is not defined in this notebook.
db.add('dict_dataset_info', db_false_dataset.info)

INFO:root:Saved info at 2024-07-25 11:59:39
INFO:root:Saved info at 2024-07-25 11:59:39
INFO:root:Saved info at 2024-07-25 11:59:39
INFO:root:Saved info at 2024-07-25 11:59:39
INFO:root:Saved info at 2024-07-25 11:59:39
INFO:root:Saved info at 2024-07-25 11:59:39
INFO:root:Saved info at 2024-07-25 11:59:39


In [55]:
# Display the metadata recorded for the trained model.
db_model.info

{'model_0': './models/20240803/kge_try1_transe_fb15k237/model_0.pt',
 'saved_datetime': '2024-08-03 07:01:34',
 'model_name': 'transe',
 'dataset_name': 'fb15k237',
 'dict_args': './models/20240803/kge_try1_transe_fb15k237/dict_args.pt',
 'f_params': '../benchmarking/df_best_param.pkl',
 'list_random_seeds': './models/20240803/kge_try1_transe_fb15k237/list_random_seeds.pt'}

In [56]:
# Inspection only: shows whatever the hits@k cells above last assigned to
# `dict_data`, so the value depends on the order in which those cells were run.
dict_data

{1: 0.003033564928075154,
 3: 0.00621391525589588,
 5: 0.00841569625207946,
 10: 0.013651042176338193}