# Model selection for knowledge graph embeddings

For hyperparameter optimisation, random search is more efficient than grid search as the search space grows: *James Bergstra and Yoshua Bengio. Random search for hyper-parameter optimization. Journal of Machine Learning Research, 13(Feb):281–305, 2012.*

While this approach is not optimal, it is a strong baseline against other, more advanced methods such as Bayesian optimisation: *Lisha Li, Kevin Jamieson, et al. Hyperband: a novel bandit-based approach to hyperparameter optimization. Journal of Machine Learning Research, 18:1–52, 2018.*

In [2]:
import tensorflow as tf
import numpy as np
np.random.seed(0)

from ampligraph.evaluation import train_test_split_no_unseen 
from ampligraph.latent_features import RandomBaseline, TransE, DistMult, ComplEx, HolE, ConvE, ConvKB
from ampligraph.latent_features import save_model
from ampligraph.evaluation import evaluate_performance, select_best_model_ranking, mr_score, mrr_score, hits_at_n_score
from ampligraph.latent_features import save_model, restore_model

## Data retrieval

In [2]:
data = np.loadtxt("family_subset_test.txt", dtype = 'object')

In [3]:
data.shape

(454, 3)

In [28]:
# Load every triple from disk, then carve fixed row ranges out of it for each split.
data = np.loadtxt("X_train.txt", dtype='object')
X_train, X_val, X_test = data[:1000], data[1000:1500], data[1600:2000]
# NOTE(review): rows 1500-1599 are not assigned to any split -- confirm this gap is intended.

## Metrics

In [51]:
# borrowed from https://github.com/Accenture/AmpliGraph/blob/master/ampligraph/evaluation/protocol.py
def evaluation(ranks):
    """Summarise ``ranks`` as a dictionary of ranking metrics.

    Parameters
    ----------
    ranks
        Passed unchanged to ``mrr_score``, ``mr_score`` and ``hits_at_n_score``.

    Returns
    -------
    dict
        Keys ``"mrr"``, ``"mr"``, ``"hits_1"``, ``"hits_3"`` and ``"hits_10"``,
        in that order.
    """
    test_evaluation = {"mrr": mrr_score(ranks), "mr": mr_score(ranks)}
    # Hits@N for the three cut-offs reported throughout the notebook.
    for cutoff in (1, 3, 10):
        test_evaluation["hits_{}".format(cutoff)] = hits_at_n_score(ranks, n=cutoff)
    return test_evaluation
    
def get_metrics(model, test_data, complete_data):
    """Rank the triples in ``test_data`` with ``model`` and summarise the ranks.

    Parameters
    ----------
    model
        A fitted embedding model accepted by ``evaluate_performance``.
    test_data
        The triples to rank (the held-out split).
    complete_data
        All known triples; passed as ``filter_triples`` only, so that known
        positives are filtered out of the corruptions.

    Returns
    -------
    dict
        The metric dictionary produced by ``evaluation``.
    """
    # Rank only the held-out triples. The original call ranked ``complete_data``
    # itself and ignored ``test_data``, so metrics were computed on training triples.
    ranks = evaluate_performance(
        test_data,
        model=model,
        filter_triples=complete_data,
        verbose=False,
        entities_subset=None,
        use_default_protocol=False,
        corrupt_side='s,o',
    )
    return evaluation(ranks)

In [55]:
test_metrics = {}

## Random Baseline
Random baseline requires no hyperparameter search as it assigns a pseudo-random score to triples.

In [18]:
X_val

array([['Q1000505', 'spouse', 'Q540597']], dtype=object)

In [19]:
X_test

array([['Q1000596', 'father', 'Q701504'],
       ['Q1000596', 'sibling', 'Q718827']], dtype=object)

In [56]:
# Fit the baseline on the training and validation triples concatenated together.
model = RandomBaseline()
model.fit(np.concatenate((X_train, X_val)))
# Store the baseline's metric dictionary under its model name.
test_metrics["RandomBaseline"] = get_metrics(model, X_test, data)
# Saving is disabled for the baseline; uncomment to persist it.
#save_model(model, './trained_models/RandomBaseline.pkl')
# Drop the reference so the name `model` is free for the next experiment.
del model



## TransE

## Distmult

## ComplEx

In [29]:
# Search space handed to select_best_model_ranking for ComplEx.
# Lists hold the candidate options for a hyperparameter; "seed" and "verbose"
# are given as single fixed values.
model_class = ComplEx
param_grid = {
    "batches_count": [50],
    "seed": 0,
    "epochs": [10],
    "k": [100, 200],
    "eta": [5, 10, 15],
    "loss": ["pairwise", "nll"],
    "loss_params": {"margin": [2]},
    "embedding_model_params": {},
    "regularizer": ["LP", None],
    "regularizer_params": {"p": [1, 3], "lambda": [1e-4, 1e-5]},
    "optimizer": ["adagrad", "adam"],
    # Callable instead of a list: each call draws a learning rate uniformly
    # from [0.0001, 0.01).
    "optimizer_params": {"lr": lambda: np.random.uniform(0.0001, 0.01)},
    "verbose": True,
}

In [32]:
# Model selection over param_grid, capped at 6 hyperparameter combinations,
# without early stopping.
(best_model,
 best_params,
 best_mrr_train,
 ranks_test,
 mrr_test,
 experimental_history) = select_best_model_ranking(
    model_class, X_train, X_val, X_test, param_grid,
    max_combinations=6,
    use_filter=True,
    verbose=False,
    early_stopping=False,
)

Average ComplEx Loss:   0.001314: 100%|█████| 10/10 [00:02<00:00,  3.40epoch/s]
100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 36.20it/s]
Average ComplEx Loss:   1.990262: 100%|█████| 10/10 [00:02<00:00,  4.39epoch/s]
100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 16.03it/s]
Average ComplEx Loss:   0.060085: 100%|█████| 10/10 [00:06<00:00,  1.59epoch/s]
100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 22.32it/s]
Average ComplEx Loss:   0.117074: 100%|█████| 10/10 [00:09<00:00,  1.07epoch/s]
100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 23.79it/s]
Average ComplEx Loss:   0.000000: 100%|█████| 10/10 [00:05<00:00,  1.99epoch/s]
100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 25.00it/s]
Average ComplEx Loss:   0.059909: 100%|█████| 10/10 [00:09<00:00,  1.11epoch/s]
100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 25.78it/s]
100%|███████████████████████████████████

In [None]:
# Persist the selected model. `best_model` is deliberately NOT deleted here:
# the following cell calls best_model.get_hyperparameter_dict(), which would
# raise NameError on a top-to-bottom run if the name were removed.
save_model(best_model, './trained_models/ComplEx.pkl')

In [34]:
best_model.get_hyperparameter_dict()

{'k': 200,
 'eta': 5,
 'epochs': 10,
 'batches_count': 50,
 'seed': 0,
 'embedding_model_params': {},
 'optimizer': 'adam',
 'optimizer_params': {'lr': 0.00957583607363516},
 'loss': 'nll',
 'loss_params': {},
 'regularizer': 'LP',
 'regularizer_params': {'p': 3, 'lambda': 0.0001},
 'initializer': 'xavier',
 'initializer_params': {'uniform': False},
 'verbose': True}

In [35]:
best_params

{'batches_count': 50,
 'seed': 0,
 'epochs': 10,
 'k': 200,
 'eta': 5,
 'loss': 'nll',
 'loss_params': {},
 'embedding_model_params': {},
 'regularizer': 'LP',
 'regularizer_params': {'p': 3, 'lambda': 0.0001},
 'optimizer': 'adam',
 'optimizer_params': {'lr': 0.00957583607363516},
 'verbose': True}

In [36]:
best_mrr_train

0.013912616006013234

In [37]:
ranks_test

array([[ 541,  388],
       [1076,  415],
       [1296, 1364],
       [1584, 1098]])

In [38]:
mrr_test

{'mrr': 0.0013514447148697825,
 'mr': 970.25,
 'hits_1': 0.0,
 'hits_3': 0.0,
 'hits_10': 0.0}

In [39]:
experimental_history

[{'model_name': 'ComplEx',
  'model_params': {'batches_count': 50,
   'seed': 0,
   'epochs': 10,
   'k': 100,
   'eta': 10,
   'loss': 'pairwise',
   'loss_params': {'margin': 2},
   'embedding_model_params': {},
   'regularizer': None,
   'regularizer_params': {},
   'optimizer': 'adam',
   'optimizer_params': {'lr': 0.004294182513455157},
   'verbose': True},
  'results': {'mrr': 0.0025149009856549644,
   'mr': 765.3,
   'hits_1': 0.0,
   'hits_3': 0.0,
   'hits_10': 0.0}},
 {'model_name': 'ComplEx',
  'model_params': {'batches_count': 50,
   'seed': 0,
   'epochs': 10,
   'k': 200,
   'eta': 15,
   'loss': 'pairwise',
   'loss_params': {'margin': 2},
   'embedding_model_params': {},
   'regularizer': None,
   'regularizer_params': {},
   'optimizer': 'adagrad',
   'optimizer_params': {'lr': 0.0027992973163431206},
   'verbose': True},
  'results': {'mrr': 0.0015205850240877599,
   'mr': 1110.0,
   'hits_1': 0.0,
   'hits_3': 0.0,
   'hits_10': 0.0}},
 {'model_name': 'ComplEx',
  'm

---
# KG embedding using Ampligraph

**THIS SECTION CONTAINS CODE THAT WOULD HAVE BEEN USED IF NOT FOR ISSUES WITH THE AMPLIGRAPH LIBRARY**

AmpliGraph implements [several Knowledge Graph Embedding models](https://docs.ampligraph.org/en/latest/ampligraph.latent_features.html#knowledge-graph-embedding-models) (TransE, ComplEx, DistMult, HolE), but for this project we will only use the [ComplEx](https://docs.ampligraph.org/en/latest/generated/ampligraph.latent_features.ComplEx.html#ampligraph.latent_features.ComplEx) model, with the hyperparameters set explicitly in the cell below.

In [6]:
# Initialize the model only. Training is done once, in the "Fitting the model"
# cell below; fitting here as well would repeat the multi-hour training run.
model = ComplEx(batches_count=100,
                seed=0,
                epochs=100,
                k=150,
                eta=5,
                optimizer='adam',
                optimizer_params={'lr':1e-3},
                loss='multiclass_nll',
                regularizer='LP',
                regularizer_params={'p':3, 'lambda':1e-5},
                verbose=True)

## Fitting the model

In [7]:
# Restrict TensorFlow logging to the ERROR level before training.
tf.logging.set_verbosity(tf.logging.ERROR)

# Train on the training split only, with early stopping disabled.
model.fit(X_train, early_stopping = False)

Average ComplEx Loss:   0.000690: 100%|█| 100/100 [2:16:44<00:00, 82.05s/epoch]


In [8]:
save_model(model, './Wikidata_family_subset_100_epocs.pkl')

Uncomment to save new model

In [5]:
model = restore_model('./Wikidata_family_subset_100_epocs.pkl')

## Evaluate model

In [10]:
# Triples passed to evaluate_performance as `filter_triples`.
# NOTE(review): `family_subset` is not defined in any cell visible above this
# one -- confirm it is loaded before this cell runs on a fresh kernel.
positives_filter = family_subset
ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=positives_filter,   # triples used to filter the corruptions
                             use_default_protocol=True, # corrupt subj and obj separately while evaluating
                             verbose=True)

    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.


    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.
100%|████████████████████████████████████████| 100/100 [01:33<00:00,  1.07it/s]


In [11]:
# Aggregate the ranks into MRR and Hits@N, then report each to two decimals.
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

for label, value in (("MRR", mrr), ("Hits@10", hits_10), ("Hits@3", hits_3), ("Hits@1", hits_1)):
    print("{}: {:.2f}".format(label, value))

MRR: 0.95
Hits@10: 0.99
Hits@3: 0.98
Hits@1: 0.92


## Generate new triples

This is where the error is encountered :(

In [6]:
# NOTE(review): neither `discover_facts` nor `entities_subset` is imported or
# defined in the visible cells -- confirm both exist before this cell runs, and
# that the installed discover_facts accepts an `entities_subset` argument.
discoveries, discovery_ranks = discover_facts(family_subset, model, top_n = 500, max_candidates = 2500, entities_subset = entities_subset, seed = 2)



  if corruption_entities == 'all':
 17%|██████▌                                | 421/2500 [00:43<03:32,  9.78it/s]


KeyboardInterrupt: 

In [56]:
# Example triples grouped by relation: child, father, mother, relative, sibling.
a = np.array(
    [['Q828550', 'child', 'Q2426845']],
    dtype=object,
)
b = np.array(
    [['Q1774982', 'father', 'Q215546']],
    dtype=object,
)
c = np.array(
    [['Q6845092', 'mother', 'Q16840232']],
    dtype=object,
)
d = np.array(
    [['Q5649896', 'relative', 'Q2426845']],
    dtype=object,
)
e = np.array(
    [
        ['Q65428', 'sibling', 'Q110374'],
        ['Q380341', 'sibling', 'Q313219'],
        ['Q24082781', 'sibling', 'Q5543457'],
    ],
    dtype=object,
)
# Two empty one-dimensional object arrays (distinct objects, not aliases).
f = np.empty(0, dtype=object)
g = np.empty(0, dtype=object)

In [57]:
disc = [a,b,c,d,e,f,g]

In [72]:
disc = [a.tolist(),b.tolist(),c.tolist(),d.tolist(),e.tolist(),f.tolist(),g.tolist()]

In [74]:
flat_list = [item for sublist in disc for item in sublist]

In [75]:
flat_list

[['Q828550', 'child', 'Q2426845'],
 ['Q1774982', 'father', 'Q215546'],
 ['Q6845092', 'mother', 'Q16840232'],
 ['Q5649896', 'relative', 'Q2426845'],
 ['Q65428', 'sibling', 'Q110374'],
 ['Q380341', 'sibling', 'Q313219'],
 ['Q24082781', 'sibling', 'Q5543457']]

In [13]:
family_subset

array([['Q1000366', 'child', 'Q1701445'],
       ['Q1000366', 'child', 'Q6776382'],
       ['Q1000505', 'spouse', 'Q268177'],
       ...,
       ['Q913574', 'spouse', 'Q235629'],
       ['Q953878', 'relative', 'Q314514'],
       ['Q9749', 'sibling', 'Q706559']], dtype=object)

In [14]:
family_subset.shape

(258341, 3)