# imports

In [7]:
import pandas as pd
import os
import math

%run utils.ipynb

# helper methods

In [8]:
def convert_to_ids(df, path_to_dataset):
    """Return a copy of ``df`` with its triple columns mapped through ``ERI``.

    The input frame is left untouched. In the copy, ``h_id`` and ``t_id`` are
    replaced by the result of ``ERI.get_entity_ids`` and ``r_id`` by the
    result of ``ERI.get_relation_ids``.

    Args:
        df: DataFrame with the columns ``h_id``, ``r_id`` and ``t_id``.
        path_to_dataset: Passed to the ``ERI`` constructor.

    Returns:
        The converted copy of ``df``.
    """
    converted = df.copy()
    index = ERI(path_to_dataset)

    # Head and tail go through the same entity lookup.
    for entity_column in ('h_id', 't_id'):
        converted[entity_column] = index.get_entity_ids(converted[entity_column].values)
    converted['r_id'] = index.get_relation_ids(converted['r_id'].values)

    return converted

def convert_to_names(df, path_to_dataset):
    """Return a copy of ``df`` with its triple columns looked up by id via ``ERI``.

    The input frame is left untouched. In the copy, ``h_id`` and ``t_id`` are
    replaced by the result of ``ERI.get_entities_by_id`` and ``r_id`` by the
    result of ``ERI.get_relations_by_id``.

    Args:
        df: DataFrame with the columns ``h_id``, ``r_id`` and ``t_id``.
        path_to_dataset: Passed to the ``ERI`` constructor.

    Returns:
        The converted copy of ``df``.
    """
    converted = df.copy()
    index = ERI(path_to_dataset)

    # Head and tail go through the same entity lookup.
    for entity_column in ('h_id', 't_id'):
        converted[entity_column] = index.get_entities_by_id(converted[entity_column].values)
    converted['r_id'] = index.get_relations_by_id(converted['r_id'].values)

    return converted

def save_testset(path_to_dataset, dataset, set_des, df):
    """Write a test subset to ``testsets/<dataset>/`` in two tab-separated files.

    ``test_<set_des>.del`` contains ``df`` as given; ``test_<set_des>.txt``
    contains the output of ``convert_to_names(df, path_to_dataset)``. Both
    files are written without header and without index.

    Args:
        path_to_dataset: Dataset directory forwarded to ``convert_to_names``.
        dataset: Name of the sub-directory below ``testsets``.
        set_des: Description of the subset, used in the file names.
        df: DataFrame holding the triples of the subset.
    """
    target_dir = os.path.join('testsets', dataset)
    # to_csv does not create missing parent directories, so make sure the
    # target exists before writing.
    os.makedirs(target_dir, exist_ok=True)

    df.to_csv(os.path.join(target_dir, f'test_{set_des}.del'), sep='\t', header=False, index=False)
    convert_to_names(df, path_to_dataset).to_csv(
        os.path.join(target_dir, f'test_{set_des}.txt'), sep='\t', header=False, index=False
    )

In [9]:
# Build and save the test subsets (relation class, relation frequency,
# similar triples in the training data) for every dataset.
for dataset in ['wnrr', 'codex-m', 'yago3-10', 'fb15k-237']:
    # load dataset
    path_to_dataset = os.path.join('experiments', '0_datasets', dataset)
    df = pd.read_csv(
        os.path.join(path_to_dataset, 'test.del'),
        delimiter='\t',
        header=None,
        names=['h_id', 'r_id', 't_id'],
    )

    # relation class test sets
    relation_classes = get_relation_classes(os.path.join('experiments', f'{dataset}_complex_1', 'checkpoint_best.pt'))
    # Both subsets below split on the same condition, so compute the mask once.
    one_to_one_relation_ids = relation_classes[relation_classes['relationClass'] == '1-1']['r_id'].values
    is_one_to_one = df['r_id'].isin(one_to_one_relation_ids)

    ## 1-1
    df_1_1_set = df[is_one_to_one]
    save_testset(path_to_dataset, dataset, 'one-to-one', df_1_1_set)

    ## multi-cardinality (every relation that is not classified as 1-1)
    df_multi_cardinality_set = df[~is_one_to_one]
    save_testset(path_to_dataset, dataset, 'multi-cardinality', df_multi_cardinality_set)

    # relation frequency test sets
    # Sorted by descending 'freq': most frequent relations first, least frequent last.
    relation_frequency = get_relation_frequency_in_training_data(dataset).sort_values('freq', ascending=False)
    amount_relations_25p = math.ceil(relation_frequency.shape[0] * 0.25)
    # NOTE(review): despite its name this is 5% of the relations. It is no longer
    # used for the subsets below and is only kept in case later cells read it.
    amount_relations_10p = math.ceil(relation_frequency.shape[0] * 0.05)

    ## frequent
    # Use the 25% cut-off so the content matches the '25p-most-frequent-relations'
    # file name and mirrors the least-frequent subset.
    df_25p_most_frequent_relations = df[df['r_id'].isin(relation_frequency['r_id'].values[:amount_relations_25p])]
    save_testset(path_to_dataset, dataset, '25p-most-frequent-relations', df_25p_most_frequent_relations)

    ## infrequent
    df_25p_least_frequent_relations = df[df['r_id'].isin(relation_frequency['r_id'].values[-amount_relations_25p:])]
    save_testset(path_to_dataset, dataset, '25p-least-frequent-relations', df_25p_least_frequent_relations)

    # similar triples in trainings data test sets
    # Every test triple appears twice: once with predicted_head=True and once
    # with predicted_head=False.
    df_dir_1 = df.copy()
    df_dir_1['predicted_head'] = True
    df_dir_2 = df.copy()
    df_dir_2['predicted_head'] = False
    df_dir = pd.concat([df_dir_1, df_dir_2])

    # The result is expected to carry a 'similar_triples' column, read below.
    df_dir = get_similar_triples_in_trainings_data(df_dir, dataset)

    ## no similar
    # A triple lands here if at least one of its two rows has a count of 0.
    df_no_similar = df_dir[df_dir['similar_triples'] == 0][['h_id', 'r_id', 't_id']].drop_duplicates()
    save_testset(path_to_dataset, dataset, 'no-similar-triples-in-training', df_no_similar)

    ## at least one similar
    # A triple lands here if at least one of its two rows has a count above 0,
    # so the two subsets are not necessarily disjoint.
    df_similar = df_dir[df_dir['similar_triples'] > 0][['h_id', 'r_id', 't_id']].drop_duplicates()
    save_testset(path_to_dataset, dataset, 'at-least-one-similar-triple-in-training', df_similar)

using precalculated values from temp\_get_relation_classes-experiments-wnrr_complex_1-checkpoint_best-pt.txt
using precalculated values from temp\_get_relation_frequency_in_training_data-wnrr.txt
using precalculated values from temp\get_similar_triples_in_trainings_data-6268wnrr.txt
using precalculated values from temp\_get_relation_classes-experiments-codex-m_complex_1-checkpoint_best-pt.txt
using precalculated values from temp\_get_relation_frequency_in_training_data-codex-m.txt
using precalculated values from temp\get_similar_triples_in_trainings_data-20622codex-m.txt
using precalculated values from temp\_get_relation_classes-experiments-yago3-10_complex_1-checkpoint_best-pt.txt
using precalculated values from temp\_get_relation_frequency_in_training_data-yago3-10.txt
using precalculated values from temp\get_similar_triples_in_trainings_data-10000yago3-10.txt
using precalculated values from temp\_get_relation_classes-experiments-fb15k-237_complex_1-checkpoint_best-pt.txt
using preca