# 1. Preliminaries

In [1]:
# Colab-only cell: run it when the notebook is hosted on Google Colab.

# Select the TensorFlow 1.x runtime on Colab; the rest of the notebook
# (TF 1.15 / AmpliGraph 1.x, per the outputs below) is written against it.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
import tensorflow as tf 

# Show which TensorFlow build the runtime actually loaded.
print('TensorFlow  version: {}'.format(tf.__version__))

# Ask TensorFlow for its GPU device name. Anything other than the first GPU
# is treated as "no GPU attached" and stops the notebook right here.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [3]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [4]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks, hits_at=(1, 10, 100)):
    """Print aggregate ranking metrics for a set of evaluation ranks.

    Prints the mean rank, the mean reciprocal rank and one ``Hits@N`` line per
    requested cut-off. Each value is computed by the matching helper imported
    from ``ampligraph.evaluation`` (``mr_score``, ``mrr_score``,
    ``hits_at_n_score``).

    Parameters
    ----------
    ranks : array-like
        Ranks to summarise; passed unchanged to the metric helpers.
    hits_at : iterable of int, optional
        Cut-offs ``N`` for which ``Hits@N`` is printed, in the given order.
        Defaults to ``(1, 10, 100)``, the three values that used to be
        hard-coded, so existing calls print exactly what they did before.

    Returns
    -------
    None
        The metrics are only printed.
    """
    print('Mean Rank:', mr_score(ranks))
    print('Mean Reciprocal Rank:', mrr_score(ranks))
    # One line per cut-off instead of three copy-pasted print statements.
    for n in hits_at:
        print('Hits@{}:'.format(n), hits_at_n_score(ranks, n))

# Record the AmpliGraph version in the cell output so the run can be reproduced.
print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.3.2


# 2. Loading a Knowledge Graph dataset

In [5]:
from ampligraph.datasets import load_fb15k_237, load_wn18rr, load_yago3_10

In [6]:
import pandas as pd

# Read the extracted triples from the Colab workspace and list the columns
# exactly as they are stored in the CSV.
dataset = pd.read_csv('/content/final_triples_with_predefined_relations.csv')
print(dataset.columns)

# Discard the 'Unnamed: 0' column, then put the remaining columns in
# (subject, relation, object) order, which is the layout used downstream.
dataset = dataset.drop(columns='Unnamed: 0')[['subject', 'new_relation', 'object']]

# Preview the first rows.
dataset.head(5)

Index(['Unnamed: 0', 'subject', 'object', 'new_relation'], dtype='object')


Unnamed: 0,subject,new_relation,object
0,aerosols,be_in,guinea pig
1,UK,transmit,influenza
2,MPs,be_related_with,SARS-CoV infection
3,Obesity,be_in,China
4,%,be_related_with,EBV


In [7]:
# `dataset.shape` is (rows, columns): one row per triple, and the three
# columns selected above (subject, new_relation, object).
print('Total triples in the KG:', dataset.shape)

Total triples in the KG: (11605, 3)


## 2.1 Create training, validation and test splits

In [8]:
from ampligraph.evaluation import train_test_split_no_unseen
# First split: hold out 500 triples for validation; `test_train` keeps
# everything else (the future train + test material).
test_train, X_valid = train_test_split_no_unseen(
    dataset.values,
    test_size=500,
    seed=0,
)

# Second split: carve a 1000-triple test set out of what is left; the
# remainder becomes the training set.
X_train, X_test = train_test_split_no_unseen(
    test_train,
    test_size=1000,
    seed=0,
)

# Report the size of every split next to the full dataset.
print('Total triples:', dataset.shape)
print('Size of train:', X_train.shape)
print('Size of valid:', X_valid.shape)
print('Size of test:', X_test.shape)

Total triples: (11605, 3)
Size of train: (10105, 3)
Size of valid: (500, 3)
Size of test: (1000, 3)


# 3. Model Training

In [9]:
from ampligraph.latent_features import TransE

# Train a KGE model: TransE with the hyperparameters listed one per line.
model = TransE(
    k=300,
    epochs=100,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.0001},
    seed=0,
    batches_count=10,
    verbose=True,
)

model.fit(X_train)
# ----------------------
# Evaluate:
# Filtered evaluation with ranking strategy assigning worst rank to break ties

# Write the fitted model to disk and load it straight back, so later cells
# can restore 'TransE.pkl' instead of training again.
from ampligraph.utils import save_model, restore_model
save_model(model, 'TransE.pkl')
model = restore_model('TransE.pkl')

# The filter is the union of all three splits; it is handed to the evaluation
# as `filter_triples`.
X_filter = np.concatenate((X_train, X_valid, X_test), axis=0)

# Rank every test triple with the restored model.
ranks = evaluate_performance(X_test, model=model, filter_triples=X_filter)

# Ranks are produced per test triple; compare both shapes.
print('Test set:', X_test.shape)
print('Size of ranks:', ranks.shape)

# Collapse the ranks into single-number summaries (MR, MRR, Hits@N).
display_aggregate_metrics(ranks)
# ----------------------

Average Loss:   0.112582: 100%|██████████| 100/100 [00:03<00:00, 29.52epoch/s]
100%|██████████| 1000/1000 [00:02<00:00, 333.92it/s]


Test set: (1000, 3)
Size of ranks: (1000, 2)
Mean Rank: 96.462
Mean Reciprocal Rank: 0.2340034681281025
Hits@1: 0.153
Hits@10: 0.4125
Hits@100: 0.802


# 4. Knowledge Discovery

## 4.1 Triple completion

``` 
    <head, relation, ?> 
    <head, ?,        tail>
    <?,    relation, tail>
```

In [10]:
# Predict tail: complete <coronavirus, be_in, ?>

from ampligraph.discovery import query_topn

# Reload the model saved during training rather than fitting it again.
model = restore_model('TransE.pkl')

# Head and relation are fixed and the tail is left open; no restriction is
# placed on the candidate entities or relations.
triples, scores = query_topn(
    model,
    top_n=10,
    head='coronavirus',
    relation='be_in',
    tail=None,
    ents_to_consider=None,
    rels_to_consider=None,
)

# Print each returned triple next to its score.
for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -15.520257949829102 	 ['coronavirus' 'be_in' 'China'] 
Score: -15.616787910461426 	 ['coronavirus' 'be_in' 'United States'] 
Score: -15.692081451416016 	 ['coronavirus' 'be_in' 'India'] 
Score: -15.784998893737793 	 ['coronavirus' 'be_in' 'Africa'] 
Score: -15.860442161560059 	 ['coronavirus' 'be_in' 'Malaysia'] 
Score: -15.952444076538086 	 ['coronavirus' 'be_in' 'USA'] 
Score: -16.00545883178711 	 ['coronavirus' 'be_in' 'Taiwan'] 
Score: -16.015079498291016 	 ['coronavirus' 'be_in' 'UK'] 
Score: -16.031898498535156 	 ['coronavirus' 'be_in' 'Italy'] 
Score: -16.12082290649414 	 ['coronavirus' 'be_in' 'Beijing'] 


In [13]:
# Predict head: complete <?, treat, coronavirus>
# (relation and tail are fixed, the head is the open slot)

triples, scores = query_topn(
    model,
    top_n=10,
    head=None,
    relation='treat',
    tail='coronavirus',
    ents_to_consider=None,
    rels_to_consider=None,
)

# Print each returned triple next to its score.
for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -16.83261489868164 	 ['ILD' 'treat' 'coronavirus'] 
Score: -17.153030395507812 	 ['Hyperoxia' 'treat' 'coronavirus'] 
Score: -17.154539108276367 	 ['Viruses' 'treat' 'coronavirus'] 
Score: -17.17650604248047 	 ['SLC11A1 locus' 'treat' 'coronavirus'] 
Score: -17.256359100341797 	 ['H' 'treat' 'coronavirus'] 
Score: -17.33230972290039 	 ['Influenza viruses' 'treat' 'coronavirus'] 
Score: -17.36236572265625 	 ['gelatinases' 'treat' 'coronavirus'] 
Score: -17.365161895751953 	 ['silica' 'treat' 'coronavirus'] 
Score: -17.376819610595703 	 ['Americas' 'treat' 'coronavirus'] 
Score: -17.393280029296875 	 ['U1' 'treat' 'coronavirus'] 


In [21]:
# Predict head: complete <?, be_in, coronavirus>
# (relation and tail are fixed, the head is the open slot)

triples, scores = query_topn(
    model,
    top_n=20,
    head=None,
    relation='be_in',
    tail='coronavirus',
    ents_to_consider=None,
    rels_to_consider=None,
)

# Print each returned triple next to its score.
for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -17.366352081298828 	 ['coronavirus' 'be_in' 'coronavirus'] 
Score: -17.57384490966797 	 ['participants' 'be_in' 'coronavirus'] 
Score: -17.641658782958984 	 ['Centers' 'be_in' 'coronavirus'] 
Score: -17.719989776611328 	 ['target' 'be_in' 'coronavirus'] 
Score: -17.84402084350586 	 ['SARS' 'be_in' 'coronavirus'] 
Score: -17.88368034362793 	 ['burden' 'be_in' 'coronavirus'] 
Score: -17.911575317382812 	 ['COVID-19' 'be_in' 'coronavirus'] 
Score: -17.955989837646484 	 ['protein' 'be_in' 'coronavirus'] 
Score: -17.956199645996094 	 ['lockdown' 'be_in' 'coronavirus'] 
Score: -17.965545654296875 	 ['Prevalence' 'be_in' 'coronavirus'] 
Score: -17.98221206665039 	 ['pDCs' 'be_in' 'coronavirus'] 
Score: -17.99897575378418 	 ['complications' 'be_in' 'coronavirus'] 
Score: -18.000625610351562 	 ['outbreaks' 'be_in' 'coronavirus'] 
Score: -18.056129455566406 	 ['Ministry' 'be_in' 'coronavirus'] 
Score: -18.064979553222656 	 ['A/H1N1' 'be_in' 'coronavirus'] 
Score: -18.103973388671875 	 ['

In [22]:
# Predict head: complete <?, transmit, COVID-19>
# (relation and tail are fixed, the head is the open slot)

triples, scores = query_topn(
    model,
    top_n=30,
    head=None,
    relation='transmit',
    tail='COVID-19',
    ents_to_consider=None,
    rels_to_consider=None,
)

# Print each returned triple next to its score.
for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -16.82418441772461 	 ['Infection' 'transmit' 'COVID-19'] 
Score: -17.130191802978516 	 ['COVID-19' 'transmit' 'COVID-19'] 
Score: -17.139598846435547 	 ['UDA' 'transmit' 'COVID-19'] 
Score: -17.200986862182617 	 ['virus' 'transmit' 'COVID-19'] 
Score: -17.20121955871582 	 ['person' 'transmit' 'COVID-19'] 
Score: -17.215730667114258 	 ['RT-PCR/ESI-MS' 'transmit' 'COVID-19'] 
Score: -17.21701431274414 	 ['I' 'transmit' 'COVID-19'] 
Score: -17.355167388916016 	 ['inflection' 'transmit' 'COVID-19'] 
Score: -17.417526245117188 	 ['Feline infectious peritonitis' 'transmit' 'COVID-19'] 
Score: -17.445676803588867 	 ['us' 'transmit' 'COVID-19'] 
Score: -17.47604751586914 	 ['mouse' 'transmit' 'COVID-19'] 
Score: -17.522369384765625 	 ['ACE2' 'transmit' 'COVID-19'] 
Score: -17.549999237060547 	 ['antibody' 'transmit' 'COVID-19'] 
Score: -17.605812072753906 	 ['Obesity' 'transmit' 'COVID-19'] 
Score: -17.63262176513672 	 ['cell' 'transmit' 'COVID-19'] 
Score: -17.687114715576172 	 ['infec

In [23]:

# Predict head: complete <?, be_related_with, COVID-19>
# (relation and tail are fixed, the head is the open slot)
triples, scores = query_topn(
    model,
    top_n=30,
    head=None,
    relation='be_related_with',
    tail='COVID-19',
    ents_to_consider=None,
    rels_to_consider=None,
)

# Print each returned triple next to its score.
for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -16.16590118408203 	 ['virus' 'be_related_with' 'COVID-19'] 
Score: -16.233020782470703 	 ['patient' 'be_related_with' 'COVID-19'] 
Score: -16.431358337402344 	 ['symptoms' 'be_related_with' 'COVID-19'] 
Score: -16.470134735107422 	 ['mouse' 'be_related_with' 'COVID-19'] 
Score: -16.526222229003906 	 ['COVID-19' 'be_related_with' 'COVID-19'] 
Score: -16.60723876953125 	 ['child' 'be_related_with' 'COVID-19'] 
Score: -16.850372314453125 	 ['person' 'be_related_with' 'COVID-19'] 
Score: -16.895915985107422 	 ['inflection' 'be_related_with' 'COVID-19'] 
Score: -16.99717140197754 	 ['%' 'be_related_with' 'COVID-19'] 
Score: -17.10623550415039 	 ['antibody' 'be_related_with' 'COVID-19'] 
Score: -17.176666259765625 	 ['woman' 'be_related_with' 'COVID-19'] 
Score: -17.36921501159668 	 ['Covid-19' 'be_related_with' 'COVID-19'] 
Score: -17.417644500732422 	 ['pDCs' 'be_related_with' 'COVID-19'] 
Score: -17.516820907592773 	 ['Infection' 'be_related_with' 'COVID-19'] 
Score: -17.551189422

In [31]:
# Compare a direct lookup in the triples table with the embedding-based
# ranking for the 'symptoms' query.

## raw call: rows of the table whose subject is exactly 'symptoms' (first 20 at most)
dataset[dataset['subject'] == 'symptoms'].head(20)

# Observation: the result of the raw call seems to be better than the
# graph-embedding result for this query.

Unnamed: 0,subject,new_relation,object
16,symptoms,be_related_with,pulmonary edema
47,symptoms,be_related_with,cough
49,symptoms,be_related_with,nasal congestion
53,symptoms,transmit,SARS
65,symptoms,be_related_with,headache
68,symptoms,be_related_with,acute respiratory distress syndrome
95,symptoms,be_related_with,influenza-like illness
139,symptoms,be_related_with,sore throat
148,symptoms,be_related_with,respiratory symptoms
164,symptoms,be_related_with,congestion


In [32]:
## graph embedding: complete <symptoms, be_related_with, ?>
# (head and relation are fixed, the tail is the open slot)
triples, scores = query_topn(
    model,
    top_n=20,
    head='symptoms',
    relation='be_related_with',
    tail=None,
    ents_to_consider=None,
    rels_to_consider=None,
)

# Print each returned triple next to its score.
for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -12.373523712158203 	 ['symptoms' 'be_related_with' 'respiratory distress syndrome'] 
Score: -12.417543411254883 	 ['symptoms' 'be_related_with' 'major'] 
Score: -12.428304672241211 	 ['symptoms' 'be_related_with' 'viruses'] 
Score: -12.55955696105957 	 ['symptoms' 'be_related_with' 'critical'] 
Score: -12.591981887817383 	 ['symptoms' 'be_related_with' 'high'] 
Score: -12.617071151733398 	 ['symptoms' 'be_related_with' 'severe'] 
Score: -12.641407012939453 	 ['symptoms' 'be_related_with' 'cough'] 
Score: -12.652081489562988 	 ['symptoms' 'be_related_with' 'distress syndrome'] 
Score: -12.681467056274414 	 ['symptoms' 'be_related_with' 'CNS'] 
Score: -12.709613800048828 	 ['symptoms' 'be_related_with' 'pandemic'] 
Score: -12.776214599609375 	 ['symptoms' 'be_related_with' 'blood'] 
Score: -12.796131134033203 	 ['symptoms' 'be_related_with' 'infection'] 
Score: -12.796436309814453 	 ['symptoms' 'be_related_with' 'lung'] 
Score: -12.894087791442871 	 ['symptoms' 'be_related_with' 