In [1]:
import numpy as np
import config
import models # load pretrained_models
import pandas as pd
import pickle
from rule_mining import rule_mining
from kb_extension import extend_kb
from rule_comparison import plot_pie_chart, get_common_rules, display_comparison
from operator import itemgetter



# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
import copy



# Rule mining comparison

In [2]:
# original knowlege base
original_kb = np.loadtxt("family_subset.txt", dtype = 'object')
original_kb.shape

(258235, 3)

In [3]:
# mine rules from original knowledge base
original_rules = rule_mining(original_kb)

# convert metrics to correct datatype
original_rules['PCA Confidence'] = original_rules['PCA Confidence'].apply(lambda x: float(x.replace(',','.')))
original_rules['Head Coverage'] = original_rules['Head Coverage'].apply(lambda x: float(x.replace(',','.')))

In [4]:
original_rules_median_PCA = original_rules["PCA Confidence"].median()
original_rules_median_HC = original_rules["Head Coverage"].median()

In [5]:
# save rules mined from original kb
original_rules.to_pickle("./original_rules.pkl")

# load saved rules
# original_rules = pd.read_pickle("./original_rules.pkl")

In [6]:
# for testing
#original_kb = np.loadtxt("/testing/family_subset_test.txt", dtype = 'object')

In [7]:
# parameters
loaded_models = [models.complEx, models.distMult, models.transE, models.randomBaseline]
entity_selection_methods = ["probabilistic"]#, "random", "most_frequent", "least_frequent"]
candidate_admittance_criteria = config.rank_cutoffs# + config.percents

In [8]:
parameter_combinations= []
for model in loaded_models:
    for method in entity_selection_methods:
        for criteria in candidate_admittance_criteria:
            model_name = model.name
            parameter_combinations.append([model_name, method, criteria])
parameter_combinations = pd.DataFrame(parameter_combinations, columns=["Model", "Entity_selection", "Candidate_criteria"])

In [9]:
parameter_combinations.shape

(4, 3)

In [10]:
# save parameter combinations to file
with open("parameter_combinations.pkl", "wb") as file:
    pickle.dump(parameter_combinations, file)

In [11]:
mined_rules = []
kb_extensions = []
extension_sizes = pd.DataFrame([], columns=["Extension", "Model", "Entity_selection", "Candidate_criteria"])
for model in loaded_models:
    for method in entity_selection_methods:
        for criteria in candidate_admittance_criteria:
            extended_kb, admitted_candidates = extend_kb(original_kb, model, method, criteria, max_entities=config.max_entities)
            admitted_w_parameters = pd.DataFrame([[len(admitted_candidates), model.name, method, criteria]], columns=["Extension", "Model", "Entity_selection", "Candidate_criteria"])
            extension_sizes = extension_sizes.append(admitted_w_parameters)
            rules = rule_mining(extended_kb)
            kb_extensions.append(admitted_candidates)
            mined_rules.append(rules)

  if corruption_entities == 'all':
100%|██████████| 600/600 [00:00<00:00, 1567.70it/s]


   Extension    Model Entity_selection Candidate_criteria
0         39  ComplEx    probabilistic   (rank_cutoff, 1)
  Extension    Model Entity_selection Candidate_criteria
0        39  ComplEx    probabilistic   (rank_cutoff, 1)


  if corruption_entities == 'all':
100%|██████████| 600/600 [00:00<00:00, 1681.45it/s]


   Extension     Model Entity_selection Candidate_criteria
0         33  DistMult    probabilistic   (rank_cutoff, 1)
  Extension     Model Entity_selection Candidate_criteria
0        39   ComplEx    probabilistic   (rank_cutoff, 1)
0        33  DistMult    probabilistic   (rank_cutoff, 1)


  if corruption_entities == 'all':


   Extension   Model Entity_selection Candidate_criteria
0         49  TransE    probabilistic   (rank_cutoff, 1)
  Extension     Model Entity_selection Candidate_criteria
0        39   ComplEx    probabilistic   (rank_cutoff, 1)
0        33  DistMult    probabilistic   (rank_cutoff, 1)
0        49    TransE    probabilistic   (rank_cutoff, 1)


  if corruption_entities == 'all':


   Extension           Model Entity_selection Candidate_criteria
0         29  RandomBaseline    probabilistic   (rank_cutoff, 1)
  Extension           Model Entity_selection Candidate_criteria
0        39         ComplEx    probabilistic   (rank_cutoff, 1)
0        33        DistMult    probabilistic   (rank_cutoff, 1)
0        49          TransE    probabilistic   (rank_cutoff, 1)
0        29  RandomBaseline    probabilistic   (rank_cutoff, 1)


In [13]:
extension_sizes

Unnamed: 0,Extension,Model,Entity_selection,Candidate_criteria
0,39,ComplEx,probabilistic,"(rank_cutoff, 1)"
0,33,DistMult,probabilistic,"(rank_cutoff, 1)"
0,49,TransE,probabilistic,"(rank_cutoff, 1)"
0,29,RandomBaseline,probabilistic,"(rank_cutoff, 1)"


In [None]:
# convert metrics to correct datatype
for rule_set in mined_rules:
    rule_set['PCA Confidence'] = rule_set['PCA Confidence'].apply(lambda x: float(x.replace(',','.')))
    rule_set['Head Coverage'] = rule_set['Head Coverage'].apply(lambda x: float(x.replace(',','.')))

In [None]:
mined_rules[0].dtypes

In [None]:
# save mined rules to file
with open("mined_rules.pkl", "wb") as file:
    pickle.dump(mined_rules, file)

In [None]:
# save mined rules to file
with open("kb_extensions.pkl", "wb") as file:
    pickle.dump(kb_extensions, file)

In [None]:
mined_rules = pd.read_pickle("./mined_rules.pkl")

## Combine to single dataframe
Combine the list of rule set dataframes to a single large dataframe. Add columns for parameter values used to mine rules.

In [None]:
# generate dataframe that adds information about the parameters used to each row containing a rule
if len(mined_rules) != len(parameter_combinations):
    print("ERROR: number of given parameter combinaitons, " + len(parameter_combinations) + " is not equal to those actually used: " + len(mined_rules))
for i, parameter_row in parameter_combinations.iterrows():
    number_of_rules = len(mined_rules[i])
    parameter_list = parameter_row.values.tolist()
    parameter_full = [copy.deepcopy(parameter_list) for j in range(number_of_rules)]
    parameter_full_df = pd.DataFrame(parameter_full, columns=["Model", "Entity_selection", "Candidate_criteria"])
    mined_rules[i] = pd.concat([mined_rules[i], parameter_full_df], axis=1)

In [None]:
# add original rules to dataframe
number_of_rules = len(original_rules)
parameter_list = ["Original rules","Original rules","Original rules"]
parameter_full = [parameter_list for j in range(number_of_rules)]
parameter_full_df = pd.DataFrame(parameter_full, columns=["Model", "Entity_selection", "Candidate_criteria"])
original_rules_parameters= pd.concat([original_rules, parameter_full_df], axis=1)
mined_rules.append(original_rules_parameters)
mined_rules_parameters = pd.concat(mined_rules)

In [None]:

# add rules mined from randomly selected candidates
extended_kb, admitted_candidates = extend_kb(original_kb, models.complEx, "probabilistic", ("random", 0), max_entities=config.max_entities)
rules = rule_mining(extended_kb)
number_of_rules = len(rules)
parameter_list = ["Rand cand","Rand cand","Rand cand"]
parameter_full = [parameter_list for j in range(number_of_rules)]
parameter_full_df = pd.DataFrame(parameter_full, columns=["Model", "Entity_selection", "Candidate_criteria"])
original_rules_parameters= pd.concat([rules, parameter_full_df], axis=1)
original_rules_parameters['PCA Confidence'] = original_rules_parameters['PCA Confidence'].apply(lambda x: float(x.replace(',','.')))
original_rules_parameters['Head Coverage'] = original_rules_parameters['Head Coverage'].apply(lambda x: float(x.replace(',','.')))
mined_rules.append(original_rules_parameters)
mined_rules_parameters = pd.concat(mined_rules)


In [None]:
# combine rule sets into one large dataframe
mined_rules_parameters = pd.concat(mined_rules)

# change datatype to string
mined_rules_parameters['Candidate_criteria'] =  mined_rules_parameters.Candidate_criteria.astype(str)

In [None]:
# save dataframe to file
with open("mined_rules_parameters.pkl", "wb") as file:
    pickle.dump(mined_rules_parameters, file)

In [None]:
original_kb.shape

In [None]:
config.percents

In [None]:
config.percents

In [None]:
for ex in kb_extensions:
    print(len(ex))

In [14]:
mined_rules_parameters

NameError: name 'mined_rules_parameters' is not defined