# Installing libraries

In [None]:
!pip install openbiolink

Collecting openbiolink
[?25l  Downloading https://files.pythonhosted.org/packages/36/7a/9cdbcb0f679a2e06be3daee5c4e383bce447f8b54bf64e6d556e1a4ebb35/openbiolink-0.1.3-py3-none-any.whl (237kB)
[K     |█▍                              | 10kB 20.6MB/s eta 0:00:01[K     |██▊                             | 20kB 27.4MB/s eta 0:00:01[K     |████▏                           | 30kB 22.4MB/s eta 0:00:01[K     |█████▌                          | 40kB 25.9MB/s eta 0:00:01[K     |██████▉                         | 51kB 26.5MB/s eta 0:00:01[K     |████████▎                       | 61kB 28.9MB/s eta 0:00:01[K     |█████████▋                      | 71kB 23.1MB/s eta 0:00:01[K     |███████████                     | 81kB 24.3MB/s eta 0:00:01[K     |████████████▍                   | 92kB 23.0MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 22.7MB/s eta 0:00:01[K     |███████████████▏                | 112kB 22.7MB/s eta 0:00:01[K     |████████████████▌               | 12

In [None]:
%%capture 
!pip install ampligraph;

In [None]:
# Select the TensorFlow 1.x runtime ahead of the imports below.
%tensorflow_version 1.x 
import numpy as np
import pandas as pd
import ampligraph

# Display the installed AmpliGraph version for the record.
ampligraph.__version__

TensorFlow 1.x selected.


'1.4.0'

# Loading OpenBioLink dataset

In [None]:
from openbiolink.evaluation.dataLoader import DataLoader

# Name of the dataset; possible values: HQ_DIR, HQ_UNDIR, ALL_DIR, ALL_UNDIR (default: HQ_DIR).
# As the cell output shows, the archive is downloaded when it is not found locally.
dl = DataLoader("HQ_DIR")

HQ_DIR.zip: 0.00B [00:00, ?B/s]

Dataset not found in, downloading to /content/datasets/HQ_DIR ...


HQ_DIR.zip: 104MB [00:14, 7.70MB/s]                           


Loading dataset ...
Done!


In [None]:
# Collect every node label that contains "compound", ignoring case.
cmp = [
    label
    for label in dl.mappings['nodes']['label2id']
    if 'compound' in label.lower()
]

In [None]:
# Number of node labels that matched "compound".
len(cmp)

77709

In [None]:
# Distinct values found under key 1 of the positive training data
# (presumably the relation of each triple — confirm against the loader).
rels = set(dl.data['train_positive'][1])
len(rels)

28

In [None]:
# Number of entries in the node label -> id mapping.
len(dl.mappings['nodes']['label2id'])

184732

In [None]:
# Show the relation label -> id mapping.
dl.mappings['relations']['label2id']

{'DIS_DRUG': 25,
 'DIS_PHENOTYPE': 4,
 'DRUG_ACTIVATION_GENE': 9,
 'DRUG_BINDACT_GENE': 14,
 'DRUG_BINDING_GENE': 22,
 'DRUG_BINDINH_GENE': 5,
 'DRUG_CATALYSIS_GENE': 18,
 'DRUG_INHIBITION_GENE': 21,
 'DRUG_PHENOTYPE': 27,
 'DRUG_REACTION_GENE': 20,
 'GENE_ACTIVATION_GENE': 23,
 'GENE_BINDING_GENE': 12,
 'GENE_CATALYSIS_GENE': 24,
 'GENE_DIS': 10,
 'GENE_DRUG': 2,
 'GENE_EXPRESSED_ANATOMY': 15,
 'GENE_EXPRESSION_GENE': 19,
 'GENE_GENE': 0,
 'GENE_GO': 8,
 'GENE_INHIBITION_GENE': 16,
 'GENE_OVEREXPRESSED_ANATOMY': 13,
 'GENE_PATHWAY': 1,
 'GENE_PHENOTYPE': 7,
 'GENE_PTMOD_GENE': 26,
 'GENE_REACTION_GENE': 6,
 'GENE_UNDEREXPRESSED_ANATOMY': 3,
 'IS_A': 17,
 'PART_OF': 11}

In [None]:
# Convert every split provided by the loader into a NumPy array.
# Positive and negative samples are kept in separate arrays for each of the
# train / test / validation splits.
train_positive = dl.data['train_positive'].to_numpy()
train_negative = dl.data['train_negative'].to_numpy()
test_positive = dl.data['test_positive'].to_numpy()
test_negative = dl.data['test_negative'].to_numpy()
validation_positive = dl.data['valid_positive'].to_numpy()
# Name spelled consistently with validation_positive (was misspelt "vakidation_negative").
validation_negative = dl.data['valid_negative'].to_numpy()

# Model creation

In [None]:
from ampligraph.latent_features import ComplEx

# ComplEx model configuration: embedding size k=200, 50 epochs split into 250 batches,
# eta=30 corruptions per positive triple, Adam optimizer, multiclass NLL loss and
# an LP (p=3) regularizer.
model = ComplEx(batches_count=250,
                seed=0,
                epochs=50,
                k=200,
                eta=30,
                # 'negative_corruption_entities' accepts 'all', 'batch', a list of entities
                # or an int. An array of negative triples (e.g. train_negative) is none of
                # these and triggers warnings on the `negative_corruption_entities == ...`
                # comparisons during fit(), so the documented default 'all' is used.
                # NOTE(review): this may change training compared with earlier runs that
                # passed the triples array — confirm the intended corruption strategy.
                embedding_model_params={'negative_corruption_entities': 'all'},
                optimizer='adam',
                optimizer_params={'lr': 0.001},
                loss='multiclass_nll',
                regularizer='LP',
                regularizer_params={'p': 3, 'lambda': 0.001},
                verbose=True)

# Model training

In [None]:
import tensorflow as tf
# Restrict TensorFlow logging to the ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)
# Train the model on the positive training triples, with early stopping turned off.
model.fit(train_positive, early_stopping = False)

  if negative_corruption_entities == 'all':
  elif negative_corruption_entities == 'batch':
Average ComplEx Loss:   0.840990: 100%|██████████| 50/50 [1:12:33<00:00, 87.07s/epoch]


In [None]:
from ampligraph.latent_features import save_model, restore_model

# Persist the trained model to a file in the current working directory.
save_model(model, './complex_model_OpenBioLink.pkl')

# Model evaluation

In [None]:
# Stack the positive triples of all three splits row-wise (axis 0 is the default).
X_positive = np.concatenate([train_positive, test_positive, validation_positive])
len(X_positive)

4563405

In [None]:
from ampligraph.evaluation import evaluate_performance

# Rank the positive test triples with the trained model.
# X_positive (all known positive triples, built above) is passed as the filter
# for the generated corruptions.
ranks = evaluate_performance(
    test_positive,
    model=model,
    filter_triples=X_positive,
    verbose=True,
)

    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.


    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.
 62%|██████▏   | 111593/180964 [2:37:36<1:38:09, 11.78it/s]

In [None]:
from numpy import asarray
from numpy import savetxt

# Turn the ranks into an array and write them to a comma-separated file.
data = asarray(ranks)
savetxt('ranks_complex.csv', data, delimiter=',')

In [None]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Compute all ranking metrics from the evaluation ranks first ...
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

# ... then report them, rounded to two decimals.
print("MRR: %.2f" % (mrr))
print("Hits@10: %.2f" % (hits_10))
print("Hits@3: %.2f" % (hits_3))
print("Hits@1: %.2f" % (hits_1))