In [2]:
%run utils.ipynb
import matplotlib.pyplot as plt
import matplotlib.colors
import numpy as np
import os
import pandas as pd

In [3]:
# Shared figure settings: every plot function in this notebook passes these two
# values to matplotlib as ``figsize=`` and ``dpi=`` when it creates its figure.
figsize = (20,15)
dpi = 100

# All Graphs

In [4]:
def show_all_graphs(df, dataset_name, bp_threshold, relation_class_threshold):
    """Draw every plot defined in this notebook, one after another.

    Parameters
    ----------
    df : DataFrame handed unchanged to each plot function; it has to provide
        the columns those functions document.
    dataset_name : forwarded to the plots that look up dataset-specific
        information (relation frequency, relation classes).
    bp_threshold : rank-difference threshold forwarded to the
        "better predicted" plots.
    relation_class_threshold : forwarded to the relation-class plot.
    """
    # (plot function, positional arguments) pairs, listed in drawing order.
    plot_steps = (
        (show_ranking_dist, (df,)),
        (show_ranking_dist_by_direction, (df,)),
        (show_performance_different_thresholds, (df,)),
        (show_relation_frequency, (dataset_name,)),
        (show_better_predicted_per_relation, (df, bp_threshold)),
        (show_better_predicted_per_relation_with_frequency,
         (df, dataset_name, bp_threshold)),
        (show_better_predicted_per_relation_with_relation_classes,
         (df, dataset_name, bp_threshold, relation_class_threshold)),
    )
    for plot, arguments in plot_steps:
        plot(*arguments)

# Helper Functions

In [5]:
def plt_show():
    """Save the current figure through ``pdf`` and then display it.

    ``pdf`` is not defined in this notebook; it must exist in the notebook
    namespace before any plot function is called (presumably a matplotlib
    ``PdfPages`` object opened by the calling notebook -- TODO confirm).
    """
    # Keep this order: the figure is written out before it is shown, so the
    # saved page does not depend on what plt.show() does to the current figure.
    pdf.savefig()
    plt.show()
    
def get_better_predicted_by(df, threshold):
    """Label each row with the model whose filtered rank is better.

    The rank difference is ``rank_filtered_anyburl - rank_filtered_kge``.
    The new column ``better_predicted_by_anyburl_t{threshold}`` holds

    *  ``1`` when the difference is below ``-threshold`` (AnyBURL's rank is
       lower by more than ``threshold``),
    * ``-1`` when the difference is above ``threshold`` (KGE's rank is lower
       by more than ``threshold``),
    *  ``0`` otherwise (including rows where the difference is NaN).

    Parameters
    ----------
    df : DataFrame with 'rank_filtered_anyburl' and 'rank_filtered_kge'.
    threshold : number; minimum rank gap for a row to count as "better".

    Returns
    -------
    A new DataFrame with the label column added (or overwritten) and without
    a 'rank_difference' column. The input frame is left untouched, so calling
    this repeatedly never leaks helper columns into the caller's data.
    """
    rank_difference = df['rank_filtered_anyburl'] - df['rank_filtered_kge']

    # Vectorised form of the former row-wise lambda; the outer condition wins
    # first, exactly like the original chained conditional expression.
    labels = np.where(
        rank_difference < -threshold, 1,
        np.where(rank_difference > threshold, -1, 0),
    )

    result = df.assign(**{f'better_predicted_by_anyburl_t{threshold}': labels})
    # The result never carries 'rank_difference', even if the input
    # already contained such a column.
    if 'rank_difference' in result.columns:
        result = result.drop(columns=['rank_difference'])
    return result

def group_df_by_r_id(df, bp_threshold=None):
    """Average the "better predicted by AnyBURL" label per relation.

    Parameters
    ----------
    df : DataFrame with 'r_id' and 'better_predicted_by_anyburl_t{bp_threshold}'.
    bp_threshold : threshold that selects the label column. Pass it
        explicitly. When omitted, the function falls back to a notebook-level
        ``bp_threshold`` variable, which is what the previous one-argument
        version read implicitly.

    Returns
    -------
    DataFrame indexed by 'r_id' with the mean label as its single column.

    Raises
    ------
    ValueError
        If no threshold is passed and no notebook-level ``bp_threshold`` exists.
    """
    if bp_threshold is None:
        # Legacy behaviour for existing one-argument callers.
        bp_threshold = globals().get('bp_threshold')
        if bp_threshold is None:
            raise ValueError(
                "bp_threshold was not passed and no notebook-level "
                "'bp_threshold' is defined"
            )

    label_column = f'better_predicted_by_anyburl_t{bp_threshold}'
    return df[['r_id', label_column]].groupby('r_id').mean()

# Boxplots of ranking distribution

In [6]:
def show_ranking_dist(df):
    """Box plot of the filtered ranks, one box per model.

    Expects a df with 'rank_filtered_anyburl' and 'rank_filtered_kge'.
    Box labels and the title come from the notebook-level names
    ``symbolic_name``, ``subsymbolic_name`` and ``dataset_name``.
    """
    rank_columns = ['rank_filtered_anyburl', 'rank_filtered_kge']
    model_labels = [symbolic_name, subsymbolic_name]
    heading = f'({dataset_name}) Comparison of Rankings per Model'

    plt.figure(figsize=figsize, dpi=dpi)
    plt.boxplot(df[rank_columns], labels=model_labels)
    plt.title(heading)
    plt.ylabel('Rank (filtered)')

    plt_show()

# Boxplots of ranking distribution by direction

In [7]:
def show_ranking_dist_by_direction(df):
    """Side-by-side box plots of the filtered ranks, split by prediction direction.

    Expects a df with 'rank_filtered_anyburl', 'rank_filtered_kge' and
    'predicted_head'. No other columns are needed: only the two rank columns
    are plotted, so the id columns are no longer selected.

    The left panel shows rows with ``predicted_head == True``, the right panel
    rows with ``predicted_head == False``. Box labels and the title come from
    the notebook-level names ``symbolic_name``, ``subsymbolic_name`` and
    ``dataset_name``.
    """
    rank_columns = ['rank_filtered_anyburl', 'rank_filtered_kge']
    model_labels = [symbolic_name, subsymbolic_name]
    panels = (('Predicted Heads', True), ('Predicted Tails', False))

    fig, axes = plt.subplots(1, 2, figsize=figsize, dpi=dpi)
    for ax, (panel_title, is_head) in zip(axes, panels):
        # Explicit equality (not ~mask) so rows with a missing flag land in
        # neither panel, as before.
        ranks = df.loc[df['predicted_head'] == is_head, rank_columns]
        ax.boxplot(ranks, labels=model_labels)
        ax.set_title(panel_title)
        ax.set_ylabel('rank (filtered)')

    fig.suptitle(f'({dataset_name}) Comparison of Rankings per Model and Prediction Direction')

    plt_show()

# Performance with different comparison Thresholds

In [8]:
def show_performance_different_thresholds(df, rank_thresholds=(0, 5, 10, 25, 50, 100)):
    """Grouped bar chart: per threshold, how many rows each model predicts better.

    Expects a df with 'rank_filtered_anyburl' and 'rank_filtered_kge'.

    Parameters
    ----------
    df : DataFrame with the two rank columns.
    rank_thresholds : sequence of rank-gap thresholds; one bar group is drawn
        per entry. Any length is accepted.

    For every threshold the rows are labelled via ``get_better_predicted_by``
    and counted per label (1 = symbolic model, -1 = sub-symbolic model,
    0 = equal).
    """
    rank_thresholds = list(rank_thresholds)
    for t in rank_thresholds:
        df = get_better_predicted_by(df, t)

    def count_label(label):
        # Number of rows carrying `label`, one entry per threshold.
        return [
            (df[f'better_predicted_by_anyburl_t{t}'] == label).sum()
            for t in rank_thresholds
        ]

    # One group per threshold; this used to be a hard-coded 6, which broke for
    # any threshold list of a different length.
    x_positions = np.arange(len(rank_thresholds))
    bar_width = 0.2

    plt.figure(figsize=figsize, dpi=dpi)
    plt.bar(x_positions - bar_width, count_label(1), bar_width, label=symbolic_name)
    plt.bar(x_positions, count_label(-1), bar_width, label=subsymbolic_name)
    plt.bar(x_positions + bar_width, count_label(0), bar_width, label='Equal')

    plt.xticks(x_positions, rank_thresholds)
    plt.xlabel("Threshold")
    plt.ylabel("Amount of better Predictions")
    plt.title(f'({dataset_name}) Comparison of Rankings for different Thresholds')
    plt.legend()

    plt_show()

# Relation Frequency

In [9]:
def show_relation_frequency(dataset_name):
    """Bar chart of how often each relation occurs in the training data.

    The frequencies come from ``get_relation_frequency_in_training_data``
    (not defined in this notebook); its result is read through the keys
    'r_id' (bar positions and x ticks) and 'freq' (bar heights).
    """
    frequencies = get_relation_frequency_in_training_data(dataset_name)
    relation_ids = frequencies['r_id']

    plt.figure(figsize=figsize, dpi=dpi)
    plt.bar(relation_ids, frequencies['freq'])
    plt.xticks(relation_ids)
    plt.title(f'({dataset_name}) Relation Frequency in Trainings Data')

    plt_show()

# Better Predicted by AnyBURL per Relation

In [10]:
def show_better_predicted_per_relation(df, bp_threshold):
    """Horizontal bar chart of the mean "better predicted by AnyBURL" label per relation.

    Expects a df with 'r_id', 'rank_filtered_anyburl' and 'rank_filtered_kge'.

    Parameters
    ----------
    df : DataFrame with the columns listed above.
    bp_threshold : rank-gap threshold used to label the rows.

    Bars are sorted by their value. The title reads the notebook-level
    ``dataset_name`` (it is not a parameter of this function).
    """
    score_column = f'better_predicted_by_anyburl_t{bp_threshold}'
    df = get_better_predicted_by(df, bp_threshold)

    # Aggregate with the threshold passed to this function. The shared
    # one-argument grouping helper reads a notebook-level `bp_threshold`
    # instead, which can differ from (or be missing next to) the local one.
    df_by_r = (
        df[['r_id', score_column]]
        .groupby('r_id')
        .mean()
        .sort_values(score_column)
        .reset_index()
    )
    positions = range(len(df_by_r))

    plt.figure(figsize=figsize, dpi=dpi)
    plt.barh(positions, df_by_r[score_column])

    plt.yticks(positions, df_by_r['r_id'])
    plt.xlabel('Better predicted by AnyBURL')
    plt.ylabel('Relation Id')
    plt.title(f'({dataset_name}) Better Predicted by AnyBURL per Relation')

    plt_show()

# Better Predicted by AnyBURL per Relation with Frequency

In [11]:
def show_better_predicted_per_relation_with_frequency(df, dataset_name, bp_threshold):
    """Bar chart of the mean "better predicted by AnyBURL" label per relation,
    coloured by the relation's normalized training frequency.

    Expects a df with 'r_id', 'rank_filtered_anyburl' and 'rank_filtered_kge'.

    Parameters
    ----------
    df : DataFrame with the columns listed above.
    dataset_name : passed to ``get_relation_frequency_in_training_data`` and
        shown in the title.
    bp_threshold : rank-gap threshold used to label the rows.

    The frequency table (from a helper not defined in this notebook) is read
    through its 'r_id', 'freq' and 'norm_freq' columns. Rows containing NaN
    after the join are dropped, which removes relations that have no
    aggregated label.
    """
    score_column = f'better_predicted_by_anyburl_t{bp_threshold}'
    df = get_better_predicted_by(df, bp_threshold)
    r_freq = get_relation_frequency_in_training_data(dataset_name)

    # Aggregate with the threshold passed to this function rather than a
    # notebook-level `bp_threshold` read by the shared grouping helper.
    df_by_r = df[['r_id', score_column]].groupby('r_id').mean()
    df_by_r_w_freq = (
        r_freq.join(df_by_r, on='r_id')
        .sort_values('freq')
        .reset_index()
        .dropna()
        .sort_values(score_column)
    )

    norm_freq = df_by_r_w_freq['norm_freq']
    if len(norm_freq) == 0:
        # Nothing to colour; any valid range keeps the colour bar drawable.
        vmin, vmax = 0.0, 1.0
    else:
        vmin = norm_freq.min()
        # The colour scale is capped at the second-largest frequency
        # (presumably so one dominant relation does not flatten the colours
        # of all others -- TODO confirm). With a single relation this falls
        # back to the only value instead of raising IndexError.
        vmax = norm_freq.nlargest(2).iloc[-1]

    cmap = plt.cm.viridis
    norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
    positions = range(len(df_by_r_w_freq))

    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    ax.bar(positions, df_by_r_w_freq[score_column], color=cmap(norm(norm_freq.values)))
    ax.set_xticks(positions, df_by_r_w_freq['r_id'])

    # The mappable is not attached to any artist, so the axes to take space
    # from must be given explicitly; recent matplotlib rejects the call otherwise.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    clb = fig.colorbar(sm, ax=ax)
    clb.set_label('normalized relation frequency')

    # Label the data axes explicitly instead of relying on the "current axes".
    ax.set_title(f'({dataset_name}) Better predicted by AnyBURL per Relation with Relation Frequency in Trainings Data')
    ax.set_xlabel('Relation Id')
    ax.set_ylabel('Better predicted by AnyBURL')

    plt_show()

# Better Predicted by AnyBURL per Relation with Relation Class

In [12]:
def show_better_predicted_per_relation_with_relation_classes(df, dataset_name, bp_threshold, relation_class_threshold):
    """Horizontal bar chart of the mean "better predicted by AnyBURL" label per
    relation, coloured by relation class (1to1, 1toM, Nto1, NtoM).

    Expects a df with 'r_id', 'rank_filtered_anyburl' and 'rank_filtered_kge'.

    Parameters
    ----------
    df : DataFrame with the columns listed above.
    dataset_name : passed to ``get_relation_classes`` and shown in the title.
    bp_threshold : rank-gap threshold used to label the rows.
    relation_class_threshold : passed to ``get_relation_classes``.

    ``get_relation_classes`` is not defined in this notebook. Its result is
    joined on its index and must provide a 'relationClass' column.
    NOTE(review): the join assumes that index identifies the relation id --
    confirm against the helper.
    """
    # Local import: only this plot needs legend proxy patches.
    from matplotlib.patches import Patch

    score_column = f'better_predicted_by_anyburl_t{bp_threshold}'
    df = get_better_predicted_by(df, bp_threshold)

    # Aggregate with the threshold passed to this function rather than a
    # notebook-level `bp_threshold` read by the shared grouping helper.
    df_by_r = df[['r_id', score_column]].groupby('r_id').mean().sort_values(score_column)
    relation_classes = get_relation_classes(dataset_name, relation_class_threshold)
    df_by_r_w_classes = df_by_r.join(relation_classes)

    # After the group-by (and the index-based left join) the relation id is the
    # index of the frame, so the tick labels are taken from there; this always
    # matches the plotted bars and does not require an 'r_id' column.
    relation_ids = df_by_r_w_classes.index
    positions = range(len(relation_ids))

    class_colors = {'1to1': 'red', '1toM': 'green', 'Nto1': 'blue', 'NtoM': 'orange'}
    fallback_color = 'grey'
    # Relations without a (known) class get a neutral colour instead of
    # aborting the whole plot with a KeyError.
    bar_colors = [
        class_colors.get(relation_class, fallback_color)
        for relation_class in df_by_r_w_classes['relationClass']
    ]

    plt.figure(figsize=figsize, dpi=dpi)
    plt.barh(positions, df_by_r_w_classes[score_column], color=bar_colors)

    # Proxy patches give one legend entry per class without drawing extra bars.
    legend_handles = [Patch(color=color, label=name) for name, color in class_colors.items()]
    if fallback_color in bar_colors:
        legend_handles.append(Patch(color=fallback_color, label='unknown'))

    plt.yticks(positions, relation_ids)
    plt.title(f'({dataset_name}) Better predicted by AnyBURL per Relation with Relation Class')
    # Horizontal bars: the value runs along x, the relations along y.
    plt.xlabel('Better predicted by AnyBURL')
    plt.ylabel('Relation Id')

    plt.legend(handles=legend_handles)
    plt_show()