# Imports

In [30]:
import numpy as np
import os
import pandas as pd
import torch

from sklearn.cluster import KMeans

from kge.model import KgeModel
from kge.util.io import load_checkpoint
from kge.indexing import index_relation_types

# Lazy Getter Decorator 

In [31]:
def lazy_df_getter(func):
    """Decorate ``func`` so that its DataFrame result is cached as a CSV file.

    The cache file is ``temp/<function name>-<key>.txt``. The key is built
    from the call arguments with backslashes, dots and slashes replaced by
    ``-``. For a call with a single positional argument the file name is the
    same as it has always been; additional positional or keyword arguments are
    appended to the key so that calls with different arguments no longer share
    one cache file.

    Args:
        func: Callable returning a ``pandas.DataFrame``.

    Returns:
        A wrapper with the same call signature. It returns the DataFrame read
        from the cache file when that file exists; otherwise it calls
        ``func``, writes the result to the cache file and returns it.
    """
    import functools  # local import so the decorator does not rely on a file-level import

    def _clean(value):
        # Make an argument safe to embed in a file name.
        return str(value).replace('\\', '-').replace('.', '-').replace('/', '-')

    @functools.wraps(func)
    def get(*args, **kwargs):
        key_parts = [_clean(arg) for arg in args]
        key_parts += [f'{name}={_clean(value)}' for name, value in sorted(kwargs.items())]
        cache_key = '-'.join(key_parts)
        path_to_file = os.path.join('temp', f'{func.__name__}-{cache_key}.txt')

        if os.path.exists(path_to_file):
            print(f'using precalculated values from {path_to_file}')
            return pd.read_csv(path_to_file)

        df = func(*args, **kwargs)
        # The cache directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(path_to_file), exist_ok=True)
        # NOTE(review): the index is written as well, so a frame loaded from
        # the cache has an extra leading 'Unnamed: 0' column compared with a
        # freshly computed one. Kept unchanged so existing cache files and
        # their readers keep working.
        df.to_csv(path_to_file)
        return df

    return get

# Entities / Relations <-> IDs

In [32]:
class ERI:
    """Look up entity/relation labels by id and ids by label for one dataset.

    The mappings are read from the tab-separated files ``entity_ids.del`` and
    ``relation_ids.del`` inside ``dataset_path``. Each file is read without a
    header row; the first column is stored as ``id`` and the second as
    ``entity`` / ``relation``.

    All single-item lookups return the first matching row and raise
    ``IndexError`` when nothing matches.
    """

    def __init__(self, dataset_path):
        """Load both mapping tables from ``dataset_path``."""
        self.dataset_path = dataset_path
        self.entities = pd.read_csv(os.path.join(dataset_path, 'entity_ids.del'), sep='\t', names=['id', 'entity'])
        self.relations = pd.read_csv(os.path.join(dataset_path, 'relation_ids.del'), sep='\t', names=['id', 'relation'])

    @staticmethod
    def _normalize_label(label):
        """Turn an all-digit string into an ``int``; return anything else unchanged.

        Applied before comparing against the label column so that digit-only
        string labels are looked up by their integer value.
        """
        if isinstance(label, str) and label.isdigit():
            return int(label)
        return label

    def _get_multiple(self, getter, inputs):
        """Apply ``getter`` to every element of ``inputs`` and return the results as a list."""
        return [getter(item) for item in inputs]

    def get_entity_by_id(self, id):
        """Return the entity label stored for ``id``."""
        return self.entities.loc[self.entities['id'] == id].iat[0, 1]

    def get_entities_by_id(self, ids):
        """Return the entity labels for all ``ids``, in the same order."""
        return self._get_multiple(self.get_entity_by_id, ids)

    def get_entity_id(self, entity):
        """Return the id stored for the entity label ``entity``."""
        entity = self._normalize_label(entity)
        return self.entities.loc[self.entities['entity'] == entity].iat[0, 0]

    def get_entity_ids(self, entities):
        """Return the ids for all entity labels in ``entities``, in the same order."""
        return self._get_multiple(self.get_entity_id, entities)

    def get_relation_by_id(self, id):
        """Return the relation label stored for ``id``."""
        return self.relations.loc[self.relations['id'] == id].iat[0, 1]

    def get_relations_by_id(self, ids):
        """Return the relation labels for all ``ids``, in the same order."""
        return self._get_multiple(self.get_relation_by_id, ids)

    def get_relation_id(self, relation):
        """Return the id stored for the relation label ``relation``.

        Non-string labels (e.g. integers parsed from a numeric training file)
        are accepted, exactly as ``get_entity_id`` accepts them.
        """
        relation = self._normalize_label(relation)
        return self.relations.loc[self.relations['relation'] == relation].iat[0, 0]

    def get_relation_ids(self, relations):
        """Return the ids for all relation labels in ``relations``, in the same order."""
        return self._get_multiple(self.get_relation_id, relations)

    def get_all_entities(self):
        """Return the distinct entity labels."""
        return self.entities['entity'].unique()

    def get_all_relations(self):
        """Return the distinct relation labels."""
        return self.relations['relation'].unique()

    def get_all_entity_ids(self):
        """Return the distinct entity ids."""
        return self.entities['id'].unique()

    def get_all_relation_ids(self):
        """Return the distinct relation ids."""
        return self.relations['id'].unique()

# Relation Frequency in Training Data

In [33]:
# In-memory memo: dataset name -> relation frequency DataFrame.
relation_frequency_map = {}


def get_relation_frequency_in_training_data(dataset):
    """Return the relation frequency table for ``dataset``, memoised in memory.

    The first call per dataset delegates to
    ``_get_relation_frequency_in_training_data``; its result is stored in
    ``relation_frequency_map`` here, so later calls are answered from memory
    even when the helper served the table from its on-disk cache.
    """
    if dataset not in relation_frequency_map:
        relation_frequency_map[dataset] = _get_relation_frequency_in_training_data(dataset)
    return relation_frequency_map[dataset]

@lazy_df_getter
def _get_relation_frequency_in_training_data(dataset):
    """Count how often each relation occurs in the dataset's ``train.txt``.

    Reads ``experiments/0_datasets/<dataset>/train.txt`` as header-less,
    tab-separated (head, relation, tail) triples and returns one row per
    distinct relation, in order of first appearance, with the columns
    ``index``, ``r_id``, ``freq`` and the min-max scaled ``norm_freq``.
    The result is also stored in ``relation_frequency_map``.
    """
    dataset_dir = os.path.join('experiments', '0_datasets', dataset)
    triples = pd.read_csv(os.path.join(dataset_dir, 'train.txt'), sep='\t', header=None)
    triples.columns = ['h', 'r', 't']

    occurrences = triples['r'].value_counts()
    distinct_relations = triples['r'].unique()

    df = pd.DataFrame(columns=['r_id', 'freq'])
    id_lookup = ERI(dataset_dir)
    # The frame starts empty and grows by one row per relation, so the
    # running counter equals the next free row label.
    for row_label, relation in enumerate(distinct_relations):
        df.loc[row_label] = [id_lookup.get_relation_id(relation), occurrences[relation]]

    lowest = df['freq'].min()
    highest = df['freq'].max()
    df['norm_freq'] = (df['freq'] - lowest) / (highest - lowest)
    df = df.reset_index()

    relation_frequency_map[dataset] = df
    return df

# Entity Frequency in Training Data

In [34]:
# In-memory memo: dataset name -> entity frequency DataFrame.
entity_frequency_map = {}


def get_entity_frequency_in_training_data(dataset):
    """Return the entity frequency table for ``dataset``, memoised in memory.

    The first call per dataset delegates to
    ``_get_entity_frequency_in_training_data``; its result is stored in
    ``entity_frequency_map`` here, so later calls are answered from memory
    even when the helper served the table from its on-disk cache.
    """
    if dataset not in entity_frequency_map:
        entity_frequency_map[dataset] = _get_entity_frequency_in_training_data(dataset)
    return entity_frequency_map[dataset]

@lazy_df_getter
def _get_entity_frequency_in_training_data(dataset):
    """Count how often each entity occurs as head and as tail in ``train.txt``.

    Reads ``experiments/0_datasets/<dataset>/train.txt`` as header-less,
    tab-separated (head, relation, tail) triples.

    Returns:
        A DataFrame with exactly one row per distinct entity (heads in order
        of first appearance, followed by entities that only occur as tails)
        and the columns ``index``, ``h_id``, ``t_id`` (both hold the entity
        id), ``freq_h``, ``freq_t``, ``freq`` (their sum) and the min-max
        scaled ``norm_freq``, ``norm_freq_h`` and ``norm_freq_t``.
        The result is also stored in ``entity_frequency_map``.
    """
    path_to_dataset = os.path.join('experiments', '0_datasets', dataset)
    path_to_training = os.path.join(path_to_dataset, 'train.txt')

    training_df = pd.read_csv(path_to_training, sep='\t', header=None)
    training_df.columns = ['h', 'r', 't']

    entity_counts_h = training_df['h'].value_counts()
    entity_counts_t = training_df['t'].value_counts()

    # Deduplicate across both columns: an entity that appears as head AND as
    # tail must produce a single row, not two identical ones.
    entities = pd.concat([training_df['h'], training_df['t']]).unique()

    eri = ERI(path_to_dataset)
    rows = []
    for entity in entities:
        e_id = eri.get_entity_id(entity)
        # .get() is a hashed label lookup and yields 0 for entities that
        # never occur in the respective position.
        rows.append([e_id, e_id, entity_counts_h.get(entity, 0), entity_counts_t.get(entity, 0)])
    df = pd.DataFrame(rows, columns=['h_id', 't_id', 'freq_h', 'freq_t'])

    df['freq'] = df['freq_h'] + df['freq_t']
    df['norm_freq'] = (df['freq'] - df['freq'].min())/(df['freq'].max()-df['freq'].min())
    df['norm_freq_h'] = (df['freq_h'] - df['freq_h'].min())/(df['freq_h'].max()-df['freq_h'].min())
    df['norm_freq_t'] = (df['freq_t'] - df['freq_t'].min())/(df['freq_t'].max()-df['freq_t'].min())
    # Kept for compatibility: adds the row number as an explicit 'index' column.
    df = df.reset_index()

    entity_frequency_map[dataset] = df
    return df

# Relation Class

In [35]:
"""
def get_relation_classes(dataset_name, threshold=0.85):
    path_to_dataset = os.path.join('experiments', '0_datasets', dataset_name)
    path_to_training = os.path.join(path_to_dataset, 'train.txt')
    path_to_valid = os.path.join(path_to_dataset, 'valid.txt')
    path_to_test = os.path.join(path_to_dataset, 'test.txt')

    training_df = pd.read_csv(path_to_training, sep='\t', header=None)
    valid_df = pd.read_csv(path_to_valid, sep='\t', header=None)
    test_df = pd.read_csv(path_to_test, sep='\t', header=None)

    eri = ERI(path_to_dataset)
    relations = eri.get_all_relations()

    data = pd.concat([training_df, valid_df, test_df])
    data.columns= ['h','r','t']

    df=pd.DataFrame(columns=['r_id', 'relationClass'])

    for relation in relations:
        data_for_h_r = data[data['r']==relation].groupby(['h', 'r']).agg(set)
        data_for_t_r = data[data['r']==relation].groupby(['t', 'r']).agg(set)

        OneTo = 0
        Nto = 0
        for i in range(data_for_t_r.size):
            h_count  = len(data_for_t_r['h'].iloc[i])    
            if h_count > 1:
                Nto += 1
            elif h_count == 1:
                OneTo += 1 

        toOne = 0
        toM = 0
        for i in range(data_for_h_r.size):
            t_count  = len(data_for_h_r['t'].iloc[i])
            if t_count > 1:
                toM += 1
            elif t_count == 1:
                toOne += 1          

        xTo = ''
        if OneTo/(OneTo+Nto) > threshold:
            xTo = '1'
        else:
            xTo = 'N'

        toX = ''
        if toOne/(toOne+toM) > threshold:
            toX = '1'
        else:
            toX = 'M'

        df.loc[df.shape[0]] = [eri.get_relation_id(relation), f'{xTo}to{toX}']
    
    return df 
"""

"\ndef get_relation_classes(dataset_name, threshold=0.85):\n    path_to_dataset = os.path.join('experiments', '0_datasets', dataset_name)\n    path_to_training = os.path.join(path_to_dataset, 'train.txt')\n    path_to_valid = os.path.join(path_to_dataset, 'valid.txt')\n    path_to_test = os.path.join(path_to_dataset, 'test.txt')\n\n    training_df = pd.read_csv(path_to_training, sep='\t', header=None)\n    valid_df = pd.read_csv(path_to_valid, sep='\t', header=None)\n    test_df = pd.read_csv(path_to_test, sep='\t', header=None)\n\n    eri = ERI(path_to_dataset)\n    relations = eri.get_all_relations()\n\n    data = pd.concat([training_df, valid_df, test_df])\n    data.columns= ['h','r','t']\n\n    df=pd.DataFrame(columns=['r_id', 'relationClass'])\n\n    for relation in relations:\n        data_for_h_r = data[data['r']==relation].groupby(['h', 'r']).agg(set)\n        data_for_t_r = data[data['r']==relation].groupby(['t', 'r']).agg(set)\n\n        OneTo = 0\n        Nto = 0\n        fo

In [36]:
# In-memory memo: checkpoint path -> relation class DataFrame.
relation_classes_map = {}


def get_relation_classes(checkpoint_path):
    """Return the relation class table for ``checkpoint_path``, memoised in memory.

    The first call per checkpoint delegates to ``_get_relation_classes``; its
    result is stored in ``relation_classes_map`` here, so later calls are
    answered from memory even when the helper served the table from its
    on-disk cache.
    """
    if checkpoint_path not in relation_classes_map:
        relation_classes_map[checkpoint_path] = _get_relation_classes(checkpoint_path)
    return relation_classes_map[checkpoint_path]

@lazy_df_getter
def _get_relation_classes(checkpoint_path):
    """Build a table pairing each relation id with its relation class.

    Loads the model stored at ``checkpoint_path``, maps the relation strings
    of its dataset to ids via ``ERI`` and pairs them with the output of
    ``index_relation_types`` for that dataset. Returns a DataFrame with the
    columns ``r_id`` and ``relationClass`` and stores it in
    ``relation_classes_map``.
    """
    model = KgeModel.create_from(load_checkpoint(checkpoint_path))
    dataset = model.dataset

    id_lookup = ERI(dataset.folder)
    r_ids = id_lookup.get_relation_ids(dataset.relation_strings())
    classes = index_relation_types(dataset)

    df = pd.DataFrame({'r_id': r_ids, 'relationClass': classes})

    relation_classes_map[checkpoint_path] = df
    return df

# Formatted Data Name

In [37]:
def get_formatted_data_name(dataset_name, symbolic_name, subsymbolic_name):
    """Return the file name ``<dataset>_<symbolic>_<subsymbolic>.txt`` for the three given names."""
    return '{}_{}_{}.txt'.format(dataset_name, symbolic_name, subsymbolic_name)

# Calculate Mean Reciprocal Rank and Hits@k

In [38]:
def calculate_mean_reciprocal_rank(ranks):
    """Return the mean reciprocal rank: the sum of ``1 / rank`` scaled by ``1 / ranks.count()``."""
    scale = 1 / ranks.count()
    reciprocal_total = (1 / ranks).sum()
    return scale * reciprocal_total

In [39]:
def calculate_hits_at_k(ranks, k):
    """Return the share of entries in ``ranks`` that are less than or equal to ``k``."""
    within_k = ranks <= k
    return within_k.sum() / ranks.count()


def calculate_hits_at_1(ranks):
    """Hits@1: share of ranks equal to or better than 1."""
    return calculate_hits_at_k(ranks, k=1)


def calculate_hits_at_10(ranks):
    """Hits@10: share of ranks equal to or better than 10."""
    return calculate_hits_at_k(ranks, k=10)


def calculate_hits_at_100(ranks):
    """Hits@100: share of ranks equal to or better than 100."""
    return calculate_hits_at_k(ranks, k=100)

# Entity Similarity Score

In [40]:
def sim_matrix(a, b, eps=1e-8):
    """Return the matrix of cosine similarities between the rows of ``a`` and ``b``.

    Entry ``[i, j]`` is the dot product of row ``i`` of ``a`` and row ``j`` of
    ``b`` after each row has been divided by its norm. Norms are clamped to at
    least ``eps`` so that all-zero rows do not cause a division by zero.
    """
    row_norms_a = a.norm(dim=1, keepdim=True)
    row_norms_b = b.norm(dim=1, keepdim=True)
    unit_a = a / row_norms_a.clamp(min=eps)
    unit_b = b / row_norms_b.clamp(min=eps)
    return torch.mm(unit_a, unit_b.t())

In [41]:
# In-memory memo: checkpoint path -> entity similarity score DataFrame.
entity_similarity_matrix_map = {}


def get_entity_similarity_scores(checkpoint_path):
    """Return the entity similarity score table for ``checkpoint_path``, memoised in memory.

    The first call per checkpoint delegates to
    ``_get_entity_similarity_scores``; the result is stored in
    ``entity_similarity_matrix_map`` here so that later calls are answered
    from memory instead of reloading the checkpoint or re-reading the cache
    file.
    """
    if checkpoint_path not in entity_similarity_matrix_map:
        entity_similarity_matrix_map[checkpoint_path] = _get_entity_similarity_scores(checkpoint_path)
    return entity_similarity_matrix_map[checkpoint_path]

@lazy_df_getter
def _get_entity_similarity_scores(checkpoint_path):
    checkpoint = load_checkpoint(checkpoint_path)
    model = KgeModel.create_from(checkpoint)
    
    embeddings = model.get_s_embedder().embed_all()
    entity_keys = model.dataset.entity_ids()
    
    cos_matrix = sim_matrix(embeddings, embeddings)
    cos_scores = cos_matrix.sum(dim=1)
    
    eri = ERI(os.path.join('experiments', '0_datasets', model.dataset.folder.split('\\')[-1]))
    entity_ids = eri.get_entity_ids(entity_keys)
    
    df = pd.DataFrame({
        'h_id': entity_ids,
        't_id': entity_ids,
        'entity_sim_score': cos_scores.detach().numpy()
    })
    
    df['norm_entity_sim_score'] = (df['entity_sim_score'] - df['entity_sim_score'].min())/(df['entity_sim_score'].max()-df['entity_sim_score'].min())
    
    return df

# Cluster Entities

In [45]:
# In-memory memo: checkpoint path -> entity cluster DataFrame.
entity_clusters_map = {}


def get_entity_clusters(checkpoint_path):
    """Return the entity cluster table for ``checkpoint_path``, memoised in memory.

    The first call per checkpoint delegates to ``_get_entity_clusters``; the
    result is stored in ``entity_clusters_map`` here so that later calls are
    answered from memory instead of reloading the checkpoint or re-reading the
    cache file.
    """
    if checkpoint_path not in entity_clusters_map:
        entity_clusters_map[checkpoint_path] = _get_entity_clusters(checkpoint_path)
    return entity_clusters_map[checkpoint_path]

@lazy_df_getter
def _get_entity_clusters(checkpoint_path, n_clusters=100):
    """Cluster the entity embeddings of a checkpoint with k-means.

    Args:
        checkpoint_path: Path of the checkpoint whose model is loaded.
        n_clusters: Number of k-means clusters.

    Returns:
        A DataFrame with one row per entity and the columns ``cluster_id``,
        ``h_id`` and ``t_id`` (both hold the entity id), ``freq`` (number of
        entities in the row's cluster) and the min-max scaled ``norm_freq``.
        The result is also stored in ``entity_clusters_map``.
    """
    checkpoint = load_checkpoint(checkpoint_path)
    model = KgeModel.create_from(checkpoint)

    # .cpu() is a no-op for CPU tensors and makes the conversion work when
    # the embeddings live on another device.
    embeddings = model.get_s_embedder().embed_all().detach().cpu().numpy()

    entity_keys = model.dataset.entity_ids()
    # NOTE(review): the dataset name is taken as the last backslash-separated
    # component of the dataset folder, which presumes Windows-style paths -
    # confirm before running on other platforms.
    eri = ERI(os.path.join('experiments', '0_datasets', model.dataset.folder.split('\\')[-1]))
    entity_ids = eri.get_entity_ids(entity_keys)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(embeddings)
    df = pd.DataFrame({
        'cluster_id': clusters,
        'h_id': entity_ids,
        't_id': entity_ids,
    })
    # Map each row's cluster id to the size of that cluster. Unlike renaming
    # the value_counts() column, this does not depend on how the installed
    # pandas version names the counts.
    df['freq'] = df['cluster_id'].map(df['cluster_id'].value_counts())
    df['norm_freq'] = (df['freq'] - df['freq'].min())/(df['freq'].max()-df['freq'].min())

    # Populate the in-memory memo like the other cached getters do.
    entity_clusters_map[checkpoint_path] = df
    return df