# Imports

In [26]:
%run utils.ipynb

import ast
import numpy as np
import os
import pandas as pd
import re

# Load Data

## AnyBURL

In [27]:
def format_anyburl(path, eri):
    """Parse an AnyBURL prediction file into a DataFrame of filtered ranks.

    The file is read line by line. A line that starts with neither
    ``Heads:`` nor ``Tails:`` is split on whitespace into a
    ``head relation tail`` triple. Each following ``Heads:`` / ``Tails:``
    line is split on tabs and every second field is dropped (presumably the
    confidence scores -- confirm against the AnyBURL output format), leaving
    the ordered candidate list. One output row is produced per ``Heads:`` or
    ``Tails:`` line.

    Args:
        path: Path of the prediction file.
        eri: Object providing ``get_entity_id(name)`` and
            ``get_relation_id(name)``.

    Returns:
        A DataFrame indexed by ``(h_id, r_id, t_id, predicted_head)`` with
        the columns ``h``, ``r``, ``t`` and ``rank_filtered``.
        ``rank_filtered`` is the 1-based position of the true head (for
        ``Heads:`` lines) or true tail (for ``Tails:`` lines) among the
        candidates, or ``None`` when it is not listed. All columns are kept
        as ``object`` dtype so integer ranks and ``None`` are stored as-is.

    Raises:
        ValueError: If a ``Heads:``/``Tails:`` line appears before any
            triple, or a triple line does not have exactly three fields.
    """
    columns = ['h', 'h_id', 'r', 'r_id', 't', 't_id', 'predicted_head', 'rank_filtered']
    index = ['h_id', 'r_id', 't_id', 'predicted_head']

    # Rows are gathered in a plain list and turned into a DataFrame once;
    # concatenating a one-row frame per line copies the whole table each time.
    rows = []
    triple = None

    with open(path) as file:
        for line in file:
            if line.startswith('Heads:'):
                predicted_head = True
            elif line.startswith('Tails:'):
                predicted_head = False
            else:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) carry no triple.
                    continue
                head, relation, tail = line.split()
                triple = (head, relation, tail)
                continue

            if triple is None:
                raise ValueError(
                    f'{path}: found a prediction line before any triple: {line!r}'
                )
            head, relation, tail = triple

            # Skip the 7-character prefix ("Heads: " / "Tails: ") and keep
            # every other tab-separated field, i.e. the candidate names.
            candidates = line[7:].split('\t')[::2]
            target = head if predicted_head else tail
            rank = candidates.index(target) + 1 if target in candidates else None

            rows.append([
                head, eri.get_entity_id(head),
                relation, eri.get_relation_id(relation),
                tail, eri.get_entity_id(tail),
                predicted_head, rank,
            ])

    anyburl = pd.DataFrame(rows, columns=columns, dtype=object)
    anyburl.set_index(index, inplace=True)
    return anyburl

## LibKGE

In [28]:
def add_quotes_to_lists(match):
    """Wrap each item of a matched list fragment in double quotes.

    Intended as the replacement callback of ``re.sub``: ``match`` is a regex
    match object whose full text (``match.group(0)``) is the list fragment.
    An item is a run of characters other than ``]`` and ``,`` that directly
    follows a whitespace character or ``[``; that leading character is kept
    and the run after it is quoted.

    Returns:
        The matched text with every such item surrounded by ``"``.
    """
    fragment = match.group(0)
    item_pattern = r'([\s\[])([^\],]+)'
    quoted_item = r'\1"\2"'
    return re.sub(item_pattern, quoted_item, fragment)

In [29]:
def format_libkge(paths, eri):
    """Collect filtered ranks from several trace files and average them.

    Every line of every file is rewritten with three regular expressions
    (quoting dict values, dict keys and list items) so that
    ``ast.literal_eval`` can parse it as a Python dict. Parsed lines that
    contain the keys ``s``, ``p`` and ``o`` contribute one rank: ``s``/``p``/
    ``o`` are integer ids, ``task == 'po'`` marks a head prediction and
    ``rank_filtered`` is the rank. Lines that cannot be processed are printed
    and skipped.

    Ranks belonging to the same ``(s, p, o, predicted_head)`` combination are
    stored side by side, one column per input file (``rank_filtered_0``,
    ``rank_filtered_1``, ...). If a file reports the same combination more
    than once, the last value wins.

    Args:
        paths: Iterable of trace file paths; the position of a path decides
            the name of its rank column.
        eri: Object providing ``get_entity_by_id(id)`` and
            ``get_relation_by_id(id)``.

    Returns:
        A DataFrame indexed by ``(h_id, r_id, t_id, predicted_head)`` with
        the columns ``h``, ``r``, ``t``, ``rank_filtered`` and one
        ``rank_filtered_<i>`` column per path. ``rank_filtered`` is the mean
        of the per-file ranks rounded to the nearest integer with Python's
        ``round`` (ties go to the even integer). It is ``None`` for a
        combination that is missing from at least one file.
    """
    base_columns = ['h', 'h_id', 'r', 'r_id', 't', 't_id', 'predicted_head', 'rank_filtered']
    index = ['h_id', 'r_id', 't_id', 'predicted_head']

    paths = list(paths)
    rank_columns = [f'rank_filtered_{i}' for i in range(len(paths))]
    columns = base_columns + rank_columns
    mean_slot = base_columns.index('rank_filtered')
    first_rank_slot = len(base_columns)

    # Compiled once instead of being re-parsed for every trace line.
    dict_value = re.compile(r':\s?(?![{\[\s])([^,}]+)')
    dict_key = re.compile(r'(\w+):')
    list_body = re.compile(r'\[[^\]]+')

    # One mutable row per (s_id, p_id, o_id, predicted_head). A dict lookup
    # replaces scanning the whole table for every line, and dicts keep
    # insertion order, so rows appear in first-seen order.
    records = {}

    for run, path in enumerate(paths):
        print(f'{path}...')
        with open(path) as file:
            for line in file:
                line = dict_value.sub(r': "\1"', line)  # Add quotes to dict values
                line = dict_key.sub(r'"\1":', line)  # Add quotes to dict keys
                line = list_body.sub(add_quotes_to_lists, line)  # Add quotes to list items
                try:
                    entry = ast.literal_eval(line)
                    if not {'s', 'p', 'o'}.issubset(entry.keys()):
                        continue
                    s_id = int(entry['s'])
                    o_id = int(entry['o'])
                    p_id = int(entry['p'])
                    s = eri.get_entity_by_id(s_id)
                    o = eri.get_entity_by_id(o_id)
                    p = eri.get_relation_by_id(p_id)
                    predicted_head = entry['task'] == 'po'
                    rank = int(entry['rank_filtered'])
                except Exception:
                    # Best effort: report the line and carry on. Unlike a bare
                    # ``except`` this lets KeyboardInterrupt/SystemExit through.
                    print('Couldn\'t process line: ')
                    print(line)
                    continue

                key = (s_id, p_id, o_id, predicted_head)
                record = records.get(key)
                if record is None:
                    record = [s, s_id, p, p_id, o, o_id, predicted_head, None]
                    record.extend([None] * len(paths))
                    records[key] = record
                record[first_rank_slot + run] = rank

    # Average in plain Python so a combination missing from some file yields
    # None instead of failing on int(NaN).
    all_complete = True
    for record in records.values():
        ranks = record[first_rank_slot:]
        if any(rank is None for rank in ranks):
            all_complete = False
        else:
            record[mean_slot] = int(round(sum(ranks) / len(ranks), 0))

    # object dtype keeps integer ranks and None exactly as stored.
    kge = pd.DataFrame(list(records.values()), columns=columns, dtype=object)
    if all_complete:
        # With no gaps the averaged column is a regular integer column.
        kge['rank_filtered'] = kge['rank_filtered'].astype('int64')

    kge.set_index(index, inplace=True)
    return kge

# Join Data

In [30]:
def get_prediction_data(dataset_name, symbolic_name, subsymbolic_name):
    """Return the joined AnyBURL / LibKGE rank table, building it if needed.

    The table is cached as a CSV file whose name comes from
    ``get_formatted_data_name``. When that file exists it is loaded and
    returned. Otherwise the five trace files
    ``experiments/<dataset>_<subsymbolic>_<1..5>/test_results/trace.yaml``
    and the AnyBURL prediction file
    ``experiments/<dataset>_<symbolic>/predictions/alpha-100`` are formatted,
    outer-joined on their shared index, written to the cache file and
    returned.

    Args:
        dataset_name: Dataset folder name below ``experiments/0_datasets``.
        symbolic_name: Suffix of the AnyBURL experiment folder.
        subsymbolic_name: Middle part of the LibKGE experiment folder names.

    Returns:
        A DataFrame with the index levels as regular columns. On a cache hit
        this is the result of ``pd.read_csv``; otherwise it is the freshly
        joined frame after ``reset_index()``.
        NOTE(review): column dtypes may differ between the two paths --
        confirm callers do not depend on them.
    """
    anyburl_run = 'alpha-100'
    kge_run_ids = range(1, 6)

    filename = get_formatted_data_name(dataset_name, symbolic_name, subsymbolic_name)
    if os.path.exists(filename):
        return pd.read_csv(filename)

    print('formatted predictions don\'t exist yet. Creating...')

    # Only needed when the table has to be rebuilt, so it is not constructed
    # on a cache hit.
    eri = ERI(os.path.join('experiments', '0_datasets', dataset_name))

    print('libkge...')
    kge_paths = [
        os.path.join(
            'experiments',
            f'{dataset_name}_{subsymbolic_name}_{i}',
            'test_results',
            'trace.yaml',
        )
        for i in kge_run_ids
    ]
    kge = format_libkge(kge_paths, eri)
    print(kge.head(10))
    print('libkge done')

    print('anyburl...')
    anyburl_folder = f'{dataset_name}_{symbolic_name}'
    anyburl_path = os.path.join('experiments', anyburl_folder, 'predictions', anyburl_run)
    anyburl = format_anyburl(anyburl_path, eri)
    print(anyburl.head(10))
    print('anyburl done')

    # Both frames have a 'rank_filtered' column; the suffixes keep them apart
    # as 'rank_filtered_anyburl' and 'rank_filtered_kge'.
    df = anyburl[['rank_filtered']].join(kge, lsuffix="_anyburl", rsuffix="_kge", how='outer')
    print(df.head(10))

    df.to_csv(filename)
    return df.reset_index()