In [266]:
import numpy as np
import config
import pandas as pd
import pickle
from rule_mining import rule_mining
from kb_extension import extend_kb
from ampligraph.latent_features import restore_model
from rule_comparison import plot_pie_chart, get_common_rules, display_comparison
from operator import itemgetter

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
import copy

# Rule mining comparison

In [263]:
# original knowledge base, read with object dtype
original_kb = np.loadtxt(fname="family_subset.txt", dtype=object)

In [264]:
# mine rules from original knowledge base
original_rules = rule_mining(original_kb)

# convert metrics to correct datatype: each value is expected to be a string
# using ',' as the decimal separator, which is swapped for '.' before parsing
for metric in ('PCA Confidence', 'Head Coverage'):
    original_rules[metric] = original_rules[metric].apply(
        lambda text: float(text.replace(',', '.'))
    )

In [215]:
# median of each quality metric over the rules mined from the original knowledge base
original_rules_median_PCA, original_rules_median_HC = (
    original_rules[metric].median() for metric in ("PCA Confidence", "Head Coverage")
)

In [265]:
# save rules mined from original kb
pd.to_pickle(original_rules, "./original_rules.pkl")

# load saved rules
# original_rules = pd.read_pickle("./original_rules.pkl")

In [256]:
# for testing
#original_kb = np.loadtxt("/testing/family_subset_test.txt", dtype = 'object')

# load pretrained knowledge graph embeddings (restored in the order the paths are listed)
randomBaseline_model, transE_model, distMult_model, complEx_model = map(
    restore_model,
    ('./RandomBaseline.pkl', './TransE.pkl', './DistMult.pkl', './ComplEx.pkl'),
)



In [257]:
# parameters: every model is combined with every entity selection method
# and every candidate admittance criterion
models = [
    complEx_model,
    distMult_model,
    transE_model,
    randomBaseline_model,
]
entity_selection_methods = [
    "random",
    "probabilistic",
    "most_frequent",
    "least_frequent",
]
# rank cut-offs followed by percentages, both taken from the config module
candidate_admittance_criteria = config.rank_cutoffs + config.percents

In [258]:
# one row per (model, entity selection method, admittance criterion) combination,
# enumerated in the same nesting order as the mining loop
parameter_combinations = pd.DataFrame(
    [
        [model.name, method, criteria]
        for model in models
        for method in entity_selection_methods
        for criteria in candidate_admittance_criteria
    ],
    columns=["Model", "Entity_selection", "Candidate_criteria"],
)

In [261]:
# save parameter combinations to file
with open("parameter_combinations.pkl", "wb") as combinations_file:
    pickle.Pickler(combinations_file).dump(parameter_combinations)

In [9]:
# For every parameter combination: extend the original knowledge base, mine rules
# from the extended version, and keep both the admitted candidates and the rules.
# Results are stored in iteration order (models outermost, criteria innermost).
mined_rules, kb_extensions = [], []
for model, method, criteria in (
    (m, s, c)
    for m in models
    for s in entity_selection_methods
    for c in candidate_admittance_criteria
):
    extended_kb, admitted_candidates = extend_kb(original_kb, model, method, criteria)
    rules = rule_mining(extended_kb)
    kb_extensions.append(admitted_candidates)
    mined_rules.append(rules)

  if corruption_entities == 'all':
100%|██████████| 600/600 [00:00<00:00, 1589.12it/s]
100%|██████████| 600/600 [00:00<00:00, 1621.91it/s]
100%|██████████| 600/600 [00:00<00:00, 1624.08it/s]
100%|██████████| 600/600 [00:00<00:00, 1589.63it/s]
100%|██████████| 600/600 [00:00<00:00, 1617.71it/s]
100%|██████████| 600/600 [00:00<00:00, 1659.07it/s]
100%|██████████| 600/600 [00:00<00:00, 1612.64it/s]
100%|██████████| 600/600 [00:00<00:00, 1622.59it/s]
100%|██████████| 600/600 [00:00<00:00, 1580.97it/s]
100%|██████████| 600/600 [00:00<00:00, 1576.18it/s]
100%|██████████| 600/600 [00:00<00:00, 1592.91it/s]
100%|██████████| 600/600 [00:00<00:00, 1585.07it/s]
100%|██████████| 600/600 [00:00<00:00, 1563.14it/s]
100%|██████████| 600/600 [00:00<00:00, 1592.24it/s]
100%|██████████| 600/600 [00:00<00:00, 1637.10it/s]
100%|██████████| 600/600 [00:00<00:00, 1584.94it/s]
100%|██████████| 600/600 [00:00<00:00, 1626.25it/s]
100%|██████████| 600/600 [00:00<00:00, 1516.35it/s]
100%|██████████| 600/600 [00:

In [10]:
# convert metrics to correct datatype
# String values are expected to use ',' as the decimal separator and are parsed
# after swapping it for '.'. Values that are not strings (for example floats left
# by an earlier run of this cell) go straight through float(), so re-running the
# cell no longer fails with AttributeError on `.replace`.
for rule_set in mined_rules:
    for metric in ('PCA Confidence', 'Head Coverage'):
        rule_set[metric] = rule_set[metric].apply(
            lambda value: float(value.replace(',', '.')) if isinstance(value, str) else float(value)
        )

In [11]:
# sanity check: show the column dtypes of the first mined rule set
mined_rules[0].dtypes

Rule                    object
Head Coverage          float64
PCA Confidence         float64
Positive Examples       object
PCA Body size           object
Functional variable     object
dtype: object

In [12]:
# save mined rules to file
with open("mined_rules.pkl", "wb") as rules_file:
    pickle.Pickler(rules_file).dump(mined_rules)

In [149]:
# load the mined rule sets back from the pickle file
# NOTE(review): unpickling executes arbitrary code; only load files produced by this notebook
mined_rules = pd.read_pickle("./mined_rules.pkl")

## Combine to single dataframe
Combine the list of rule-set dataframes into a single large dataframe, adding columns that record the parameter values used to mine each rule.

In [150]:
# generate dataframe that adds information about the parameters used to each row containing a rule
# Rule set k in mined_rules is paired with the k-th row of parameter_combinations,
# so both must have the same length; a mismatch would label rule sets with the wrong parameters.
if len(mined_rules) != len(parameter_combinations):
    # The lengths are formatted with an f-string: concatenating the ints returned by
    # len() onto a str raised TypeError and hid the intended message.
    raise ValueError(
        f"ERROR: number of given parameter combinations, {len(parameter_combinations)}"
        f" is not equal to those actually used: {len(mined_rules)}"
    )
# enumerate() supplies the list position, independent of the dataframe's index labels
for position, (row_label, parameter_row) in enumerate(parameter_combinations.iterrows()):
    # assign() broadcasts each parameter value over every row of the rule set, so the
    # new columns cannot be misaligned by the rule set's index, and running the cell
    # again overwrites the columns instead of adding duplicates
    mined_rules[position] = mined_rules[position].assign(**parameter_row.to_dict())

In [151]:
# add original rules to dataframe
# The rules mined from the original knowledge base get the placeholder value
# "Original rules" in all three parameter columns.
parameter_full_df = pd.DataFrame(
    [["Original rules"] * 3] * len(original_rules),
    columns=["Model", "Entity_selection", "Candidate_criteria"],
)
original_rules_parameters = pd.concat([original_rules, parameter_full_df], axis=1)

# NOTE: this appends to mined_rules in place, so running the cell twice adds the
# original rules twice
mined_rules.append(original_rules_parameters)
mined_rules_parameters = pd.concat(mined_rules)

In [160]:
# combine rule sets into one large dataframe and
# change the datatype of the admittance criteria column to string
mined_rules_parameters = pd.concat(mined_rules).astype({'Candidate_criteria': str})

In [260]:
# save dataframe to file
with open("mined_rules_parameters.pkl", "wb") as combined_file:
    pickle.Pickler(combined_file).dump(mined_rules_parameters)