# Model selection for knowledge graph embeddings

In [1]:
import tensorflow as tf
import numpy as np
np.random.seed(0)

from ampligraph.evaluation import train_test_split_no_unseen 
from ampligraph.latent_features import RandomBaseline, TransE, DistMult, ComplEx, HolE, ConvE, ConvKB
from ampligraph.latent_features import save_model
from ampligraph.evaluation import evaluate_performance, select_best_model_ranking, mr_score, mrr_score, hits_at_n_score
from ampligraph.latent_features import save_model, restore_model

## Data retrieval

In [3]:
family_subset = np.loadtxt("family_subset.txt", dtype = 'object')

In [17]:
X_train = np.loadtxt("X_train.txt", dtype = 'object')
X_val = X_train[100:200]
X_train = X_train[:1000]
X_test = np.loadtxt("X_test.txt", dtype = 'object')

In [4]:
entities_subset = np.loadtxt("entities_subset.txt", dtype = 'object')

In [18]:
model_class = ComplEx
param_grid = {
    "batches_count": [50],
    "seed": 0,
    "epochs": [100],
    "k": [100, 200],
    "eta": [5, 10, 15],
    "loss": ["pairwise", "nll"],
    "loss_params": {
        "margin": [2]
    },
    "embedding_model_params": {
        
    },
    "regularizer": ["LP", None],
    "regularizer_params": {
        "p": [1, 3],
        "lambda": [1e-4, 1e-5]
    },
    "optimizer": ["adagrad", "adam"],
    "optimizer_params": {
        "lr": lambda: np.random.uniform(0.0001, 0.01)
    },
    "verbose": True
}

  0%|                                                    | 0/2 [00:00<?, ?it/s]
  0%|                                               | 0/100 [00:00<?, ?epoch/s][A
Average ComplEx Loss:   1.999950:   0%|             | 0/100 [00:00<?, ?epoch/s][A
Average ComplEx Loss:   1.999950:   1%|     | 1/100 [00:00<00:51,  1.92epoch/s][A
Average ComplEx Loss:   1.919796:   1%|     | 1/100 [00:00<00:51,  1.92epoch/s][A
Average ComplEx Loss:   1.919796:   2%|     | 2/100 [00:00<00:44,  2.22epoch/s][A
Average ComplEx Loss:   1.410806:   2%|     | 2/100 [00:01<00:44,  2.22epoch/s][A
Average ComplEx Loss:   1.410806:   3%|▏    | 3/100 [00:01<00:39,  2.47epoch/s][A
Average ComplEx Loss:   0.148293:   3%|▏    | 3/100 [00:01<00:39,  2.47epoch/s][A
Average ComplEx Loss:   0.148293:   4%|▏    | 4/100 [00:01<00:36,  2.64epoch/s][A
Average ComplEx Loss:   0.001036:   4%|▏    | 4/100 [00:01<00:36,  2.64epoch/s][A
Average ComplEx Loss:   0.001036:   5%|▎    | 5/100 [00:01<00:34,  2.73epoch/s][A
Average

(<ampligraph.latent_features.models.ComplEx.ComplEx at 0x1b142ebdf88>,
 {'batches_count': 50,
  'seed': 0,
  'epochs': 100,
  'k': 100,
  'eta': 10,
  'loss': 'pairwise',
  'loss_params': {'margin': 2},
  'embedding_model_params': {},
  'regularizer': None,
  'regularizer_params': {},
  'optimizer': 'adam',
  'optimizer_params': {'lr': 0.004294182513455157},
  'verbose': True},
 1.0,
 array([[495, 444],
        [364, 430],
        [799, 697]]),
 {'mrr': 0.0020052621833634263,
  'mr': 538.1666666666666,
  'hits_1': 0.0,
  'hits_3': 0.0,
  'hits_10': 0.0},
 [{'model_name': 'ComplEx',
   'model_params': {'batches_count': 50,
    'seed': 0,
    'epochs': 100,
    'k': 100,
    'eta': 10,
    'loss': 'pairwise',
    'loss_params': {'margin': 2},
    'embedding_model_params': {},
    'regularizer': None,
    'regularizer_params': {},
    'optimizer': 'adam',
    'optimizer_params': {'lr': 0.004294182513455157},
    'verbose': True},
   'results': {'mrr': 1.0,
    'mr': 1.0,
    'hits_1': 1.0

In [None]:
results = select_best_model_ranking(model_class, X_train, X_val, X_test,
                          param_grid,
                          max_combinations=2,
                          use_filter=True,
                          verbose=False,
                          early_stopping=True)

---
# KG embedding using Ampligraph

**THIS SECTION CONTAINS CODE THAT WOULD HAVE BEEN USED IF NOT FOR ISSUES WITH THE APLIGRAPH LIBRARY**

AmpliGraph has implemented [several Knoweldge Graph Embedding models](https://docs.ampligraph.org/en/latest/ampligraph.latent_features.html#knowledge-graph-embedding-models) (TransE, ComplEx, DistMult, HolE), but for this project we will only use the [ComplEx](https://docs.ampligraph.org/en/latest/generated/ampligraph.latent_features.ComplEx.html#ampligraph.latent_features.ComplEx) model (with  default values).

In [6]:
# initialize model
model = ComplEx(batches_count=100, 
                seed=0, 
                epochs=100, 
                k=150, 
                eta=5,
                optimizer='adam', 
                optimizer_params={'lr':1e-3},
                loss='multiclass_nll', 
                regularizer='LP', 
                regularizer_params={'p':3, 'lambda':1e-5}, 
                verbose=True)

tf.logging.set_verbosity(tf.logging.ERROR)

model.fit(X_train, early_stopping = False)

## Fitting the model

In [7]:
tf.logging.set_verbosity(tf.logging.ERROR)

model.fit(X_train, early_stopping = False)

Average ComplEx Loss:   0.000690: 100%|█| 100/100 [2:16:44<00:00, 82.05s/epoch]


In [8]:
save_model(model, './Wikidata_family_subset_100_epocs.pkl')

Uncomment to save new model

In [5]:
model = restore_model('./Wikidata_family_subset_100_epocs.pkl')

## Evaluate model

In [10]:
positives_filter = family_subset
ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=positives_filter,   # Corruption strategy filter defined above 
                             use_default_protocol=True, # corrupt subj and obj separately while evaluating
                             verbose=True)

    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.


    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.
100%|████████████████████████████████████████| 100/100 [01:33<00:00,  1.07it/s]


In [11]:
mrr = mrr_score(ranks)
print("MRR: %.2f" % (mrr))

hits_10 = hits_at_n_score(ranks, n=10)
print("Hits@10: %.2f" % (hits_10))
hits_3 = hits_at_n_score(ranks, n=3)
print("Hits@3: %.2f" % (hits_3))
hits_1 = hits_at_n_score(ranks, n=1)
print("Hits@1: %.2f" % (hits_1))

MRR: 0.95
Hits@10: 0.99
Hits@3: 0.98
Hits@1: 0.92


## Generate new triples

This is where the error is enountered :(

In [6]:
discoveries, discovery_ranks = discover_facts(family_subset, model, top_n = 500, max_candidates = 2500, entities_subset = entities_subset, seed = 2)



  if corruption_entities == 'all':
 17%|██████▌                                | 421/2500 [00:43<03:32,  9.78it/s]


KeyboardInterrupt: 

In [56]:
a = np.array([['Q828550', 'child', 'Q2426845']], dtype=object)
b = np.array([['Q1774982', 'father', 'Q215546']], dtype=object)
c = np.array([['Q6845092', 'mother', 'Q16840232']], dtype=object)
d = np.array([['Q5649896', 'relative', 'Q2426845']], dtype=object)
e = np.array([['Q65428', 'sibling', 'Q110374'],['Q380341', 'sibling', 'Q313219'],['Q24082781', 'sibling', 'Q5543457']], dtype=object)
f = np.array([], dtype=object)
g = np.array([], dtype=object)

In [57]:
disc = [a,b,c,d,e,f,g]

In [72]:
disc = [a.tolist(),b.tolist(),c.tolist(),d.tolist(),e.tolist(),f.tolist(),g.tolist()]

In [74]:
flat_list = [item for sublist in disc for item in sublist]

In [75]:
flat_list

[['Q828550', 'child', 'Q2426845'],
 ['Q1774982', 'father', 'Q215546'],
 ['Q6845092', 'mother', 'Q16840232'],
 ['Q5649896', 'relative', 'Q2426845'],
 ['Q65428', 'sibling', 'Q110374'],
 ['Q380341', 'sibling', 'Q313219'],
 ['Q24082781', 'sibling', 'Q5543457']]

In [13]:
family_subset

array([['Q1000366', 'child', 'Q1701445'],
       ['Q1000366', 'child', 'Q6776382'],
       ['Q1000505', 'spouse', 'Q268177'],
       ...,
       ['Q913574', 'spouse', 'Q235629'],
       ['Q953878', 'relative', 'Q314514'],
       ['Q9749', 'sibling', 'Q706559']], dtype=object)

In [14]:
family_subset.shape

(258341, 3)