In [1]:
import os
from typing import List, Optional

import pandas as pd

from kbc_pul.amie.amie_output_rule_extraction import get_amie_rules_from_rule_tsv_file
from kbc_pul.data_structures.pandas_kb import PandasKnowledgeBaseWrapper
from kbc_pul.data_structures.rule_wrapper import RuleWrapper
from kbc_pul.experiments_utils.load_df_ground_truth import get_df_ground_truth
from kbc_pul.project_info import data_dir

%load_ext autoreload
%autoreload 2



# How to apply AMIE's rules to a KB

## Read in the mined rules

In [2]:
# Name of the dataset; it selects the sub-directory of `data_dir` that is read from.
dataset_name: str = 'yago3_10'


# The AMIE rule file is looked up at
# <data_dir>/<dataset_name>/amie_output_dir/amie_output_rules.tsv
amie_data_dir: str = os.path.join(data_dir, dataset_name, "amie_output_dir")
amie_rule_tsv_filename: str = os.path.join(
    amie_data_dir,
    "amie_output_rules.tsv"  # plain literal: nothing is interpolated, so no f-prefix
)


# Load the mined rules from the TSV file as a list of RuleWrapper objects.
rule_wrapper_list: List[RuleWrapper] = get_amie_rules_from_rule_tsv_file(
    amie_rules_tsv_filename=amie_rule_tsv_filename)


## Select the first rule as an example

In [15]:
from pprint import pprint

# Use the first loaded rule as the running example for the remaining cells.
rule_wrapper: RuleWrapper = rule_wrapper_list[0]
# Pretty-print the wrapper to inspect the values stored alongside the rule.
pprint(rule_wrapper)

{'Rule': 'dealswith(?a,?b)  => hasneighbor(?a,?b)', 'Head Coverage': 0.295495495, 'Std Confidence': 0.125960061, 'PCA Confidence': 0.16449348, 'Positive Examples': 164, 'Body size': 1302, 'PCA Body size': 997, 'Functional variable': '?b'}


## Load in a KB

In [16]:
# The training split is read with a tab separator (even though the file is named *.csv).
separator_ground_truth_dataset = "\t"
filename_ground_truth_dataset: str = os.path.join(
    data_dir, dataset_name, 'cleaned_csv', 'train.csv'
)

# Read the ground-truth data into a DataFrame, then wrap it in the pandas-backed KB
# that the rule is evaluated against in the following cells.
df_ground_truth: pd.DataFrame = get_df_ground_truth(
    filename_ground_truth_dataset,
    separator_ground_truth_dataset,
)
pandas_kb_wrapper = PandasKnowledgeBaseWrapper.create_from_full_data(
    df_full_data=df_ground_truth
)

## Calculate the prediction cache for that rule on the KB

In [17]:
# Evaluate the example rule against the KB and keep the result as a "prediction cache".
# Judging by the rendered output, the cache has one row per (Subject, Object) pair plus
# the boolean columns is_supported, exists_lits_same_subject and exists_lits_same_object.
# NOTE(review): the annotation is Optional, so this call can apparently return None;
# the following cells use the result without a None check — confirm when that can happen.
o_df_cached_predictions: Optional[pd.DataFrame] = pandas_kb_wrapper.calculate_prediction_cache_for_rule(
    rule=rule_wrapper.rule
)
o_df_cached_predictions

Unnamed: 0,Subject,Object,is_supported,exists_lits_same_subject,exists_lits_same_object
0,'eAbkhazia','eGeorgia_(country)',False,False,True
1,'eAfghanistan','eChina',True,True,True
2,'eAfghanistan','eGermany',False,True,True
3,'eAfghanistan','eIndia',False,True,True
4,'eAfghanistan','eKazakhstan',False,True,True
...,...,...,...,...,...
1297,'eZambia','eUnited_Arab_Emirates',False,True,True
1298,'eZimbabwe','eBotswana',True,True,True
1299,'eZimbabwe','eChina',False,True,True
1300,'eZimbabwe','eDemocratic_Republic_of_the_Congo',False,True,True


In [18]:
# (rows, columns) of the prediction cache.
o_df_cached_predictions.shape

(1302, 5)

In [19]:
# Ask the KB wrapper for the rule's predictions directly; the rendered preview shows
# only the Subject and Object columns.
df_based_rule_predictions: pd.DataFrame = pandas_kb_wrapper.get_predictions_for_rule(
    rule=rule_wrapper.rule)
# Preview the first rows instead of rendering the whole frame.
df_based_rule_predictions.head()

Unnamed: 0,Subject,Object
0,'eAbkhazia','eGeorgia_(country)'
1,'eAfghanistan','eChina'
2,'eAfghanistan','eGermany'
3,'eAfghanistan','eIndia'
4,'eAfghanistan','eKazakhstan'


In [20]:
# (rows, columns) of the rule's predictions.
df_based_rule_predictions.shape


(1302, 2)

# Rule metrics

## CWA-based (standard) confidence and PCA-based confidence

In [21]:
from kbc_pul.rule_metrics.prediction_cache_rule_metrics.rule_cwa_and_pca_confidences_from_cached_predictions import \
    calculate_cwa_confidence_from_df_cache, calculate_pca_confidence_s_to_o_from_df_cache, \
    calculate_pca_confidence_o_to_s_from_df_cache


def _format_confidence(o_confidence: Optional[float]) -> str:
    """Format a confidence value for printing.

    :param o_confidence: the confidence to show, or None when no value is available.
    :return: the value with three decimals, or "n/a" for None. Applying the ``0.3f``
        format spec to None directly would raise a TypeError.
    """
    if o_confidence is None:
        return "n/a"
    return f"{o_confidence:0.3f}"


print(f"Rule: {rule_wrapper.rule}")

# All three metrics are computed from the prediction cache built earlier in the notebook.
# Their results are annotated as Optional, so None is handled when printing.
cwa_conf: Optional[float] = calculate_cwa_confidence_from_df_cache(o_df_cached_predictions)
print(f"CWA-based conf: {_format_confidence(cwa_conf)}")
pca_conf_s_to_o: Optional[float] = calculate_pca_confidence_s_to_o_from_df_cache(o_df_cached_predictions)
print(f"PCA-based conf (subject-to-object): {_format_confidence(pca_conf_s_to_o)}")
pca_conf_o_to_s: Optional[float] = calculate_pca_confidence_o_to_s_from_df_cache(o_df_cached_predictions)
print(f"PCA-based conf (object-to-subject): {_format_confidence(pca_conf_o_to_s)}")


Rule: hasneighbor(A,B) :- dealswith(A,B)
CWA-based conf: 0.126
PCA-based conf (subject-to-object): 0.172
PCA-based conf (object-to-subject): 0.164


In [None]:
# NOTE(review): disabled draft for the "true" confidence metrics. It cannot be enabled
# as-is: `df_cached_predictions`, `df_ground_truth_target_relation`, `true_entity_sets`
# and the two `get_true_*` helpers are neither defined nor imported in the visible
# notebook cells (the cache computed earlier is named `o_df_cached_predictions`).
# Also, unlike the first two results, the object-to-subject value is never stored on
# `rule_wrapper`. TODO: wire these up or remove the draft.
#
# # TRUE CONF
# true_conf: float = get_true_confidence_on_observed_data_using_cached_predictions(
#     df_cached_predictions=df_cached_predictions,
#     df_ground_truth_target_relation=df_ground_truth_target_relation,
#
# )
# rule_wrapper.o_true_confidence = true_conf
#
# # TRUE pair-positive confidence ('conf*') S->O
# true_pca_conf_subject_to_object: float = get_true_pca_confidence_on_observed_data_using_cached_predictions(
#     df_cached_predictions=df_cached_predictions,
#     true_entity_str_tuple_set=true_entity_sets.entity_pairs,
#     true_pca_non_target_entity_set=true_entity_sets.pca_subjects,
#     predict_object_entity=True,
# )
# rule_wrapper.o_true_pca_confidence_subject_to_object = true_pca_conf_subject_to_object
#
# # TRUE pair-positive confidence ('conf*') O->S
# true_pca_conf_object_to_subject: float = get_true_pca_confidence_on_observed_data_using_cached_predictions(
#     df_cached_predictions=df_cached_predictions,
#     true_entity_str_tuple_set=true_entity_sets.entity_pairs,
#     true_pca_non_target_entity_set=true_entity_sets.pca_objects,
#     predict_object_entity=False
# )


In [None]:
# Same shape check as in an earlier cell; this cell has no recorded output.
df_based_rule_predictions.shape


In [None]:
# NOTE(review): verbatim copy of the commented-out "TRUE CONF" draft that already
# appears in an earlier cell of this notebook. TODO: keep a single copy.
#
# # TRUE CONF
# true_conf: float = get_true_confidence_on_observed_data_using_cached_predictions(
#     df_cached_predictions=df_cached_predictions,
#     df_ground_truth_target_relation=df_ground_truth_target_relation,
#
# )
# rule_wrapper.o_true_confidence = true_conf
#
# # TRUE pair-positive confidence ('conf*') S->O
# true_pca_conf_subject_to_object: float = get_true_pca_confidence_on_observed_data_using_cached_predictions(
#     df_cached_predictions=df_cached_predictions,
#     true_entity_str_tuple_set=true_entity_sets.entity_pairs,
#     true_pca_non_target_entity_set=true_entity_sets.pca_subjects,
#     predict_object_entity=True,
# )
# rule_wrapper.o_true_pca_confidence_subject_to_object = true_pca_conf_subject_to_object
#
# # TRUE pair-positive confidence ('conf*') O->S
# true_pca_conf_object_to_subject: float = get_true_pca_confidence_on_observed_data_using_cached_predictions(
#     df_cached_predictions=df_cached_predictions,
#     true_entity_str_tuple_set=true_entity_sets.entity_pairs,
#     true_pca_non_target_entity_set=true_entity_sets.pca_objects,
#     predict_object_entity=False
# )


In [2]:
# NOTE(review): verbatim copy of the commented-out "TRUE CONF" draft that already
# appears in earlier cells of this notebook. TODO: keep a single copy.
#
# # TRUE CONF
# true_conf: float = get_true_confidence_on_observed_data_using_cached_predictions(
#     df_cached_predictions=df_cached_predictions,
#     df_ground_truth_target_relation=df_ground_truth_target_relation,
#
# )
# rule_wrapper.o_true_confidence = true_conf
#
# # TRUE pair-positive confidence ('conf*') S->O
# true_pca_conf_subject_to_object: float = get_true_pca_confidence_on_observed_data_using_cached_predictions(
#     df_cached_predictions=df_cached_predictions,
#     true_entity_str_tuple_set=true_entity_sets.entity_pairs,
#     true_pca_non_target_entity_set=true_entity_sets.pca_subjects,
#     predict_object_entity=True,
# )
# rule_wrapper.o_true_pca_confidence_subject_to_object = true_pca_conf_subject_to_object
#
# # TRUE pair-positive confidence ('conf*') O->S
# true_pca_conf_object_to_subject: float = get_true_pca_confidence_on_observed_data_using_cached_predictions(
#     df_cached_predictions=df_cached_predictions,
#     true_entity_str_tuple_set=true_entity_sets.entity_pairs,
#     true_pca_non_target_entity_set=true_entity_sets.pca_objects,
#     predict_object_entity=False
# )
