<a href="https://colab.research.google.com/github/HuyenNguyenHelen/CORD-19-KG/blob/master/covid_19_kg_mining_MetamapEnt_only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preliminaries

In [3]:
# If using Google Colab run this cell 

# select tensorflow version for colab 
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [4]:
import tensorflow as tf 

print('TensorFlow  version: {}'.format(tf.__version__))

# Get the GPU name
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [5]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [6]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    print('Mean Rank:', mr_score(ranks)) 
    print('Mean Reciprocal Rank:', mrr_score(ranks)) 
    print('Hits@1:', hits_at_n_score(ranks, 1))
    print('Hits@10:', hits_at_n_score(ranks, 10))
    print('Hits@100:', hits_at_n_score(ranks, 100))

print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.4.0


# 2. Loading a Knowledge Graph dataset

In [7]:
from ampligraph.datasets import load_fb15k_237, load_wn18rr, load_yago3_10

In [8]:
import pandas as pd
pmc = pd.read_csv('/content/new_triples_with_predefined_relations_pmc_June10_meta-only.csv')
pdf = pd.read_csv('/content/new_triples_with_predefined_relations_pdf_June10_meta-only.csv')
dataset = pd.concat([pmc, pdf])
print(dataset.columns)
dataset.drop(columns='Unnamed: 0', inplace=True)
#dataset.columns = ['subject',  'object', 'predicate',]
dataset = dataset[['subject','new_relation', 'object' ]]
dataset.head(5)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'subject', 'object', 'new_relation'], dtype='object')


Unnamed: 0,subject,new_relation,object
0,step,gene_gene,endosomes
1,disease,disease_disease,pneumonia
2,prophylaxis,treat_procedure_gene,end
3,prophylaxis,treat_procedure_gene,end
4,impact,gene_gene,past


In [9]:
print('Total triples in the KG:', dataset.shape)

Total triples in the KG: (59070, 3)


In [10]:
print('the number of relations: \n', dataset['new_relation'].value_counts())

the number of relations: 
 disease_disease                    18995
treat_procedure_treat_procedure     5827
gene_gene                           4366
disease_treat_procedure             4218
treat_procedure_disease             4047
symptom_symptom                     3772
disease_gene                        3397
disease_symptom                     2742
gene_disease                        2573
gene_treat_procedure                2317
treat_procedure_gene                2269
symptom_disease                     2260
gene_symptom                         526
treat_procedure_symptom              474
symptom_treat_procedure              408
disease_species                      348
symptom_gene                         235
gene_species                         149
treat_procedure_species              127
symptom_species                       20
Name: new_relation, dtype: int64


## 2.1 Create training, validation and test splits

In [11]:
from ampligraph.evaluation import train_test_split_no_unseen
# get the validation set of size 500
test_train, X_valid = train_test_split_no_unseen(dataset.values, 500, seed=0)

# get the test set of size 1000 from the remaining triples
X_train, X_test = train_test_split_no_unseen(test_train, 1000, seed=0)

print('Total triples:', dataset.shape)
print('Size of train:', X_train.shape)
print('Size of valid:', X_valid.shape)
print('Size of test:', X_test.shape)

Total triples: (59070, 3)
Size of train: (57570, 3)
Size of valid: (500, 3)
Size of test: (1000, 3)


# 3. Model Training

In [12]:
from ampligraph.latent_features import TransE

# Train a KGE model
model = TransE(k=300, 
               epochs=100, 
               eta=1, 
               loss='multiclass_nll', 
               initializer='xavier', initializer_params={'uniform': False},
               regularizer='LP', regularizer_params= {'lambda': 0.001, 'p': 3},
               optimizer= 'adam', optimizer_params= {'lr': 0.0001}, 
               seed= 0, batches_count= 10, verbose=True)

model.fit(X_train)
# ----------------------
# Evaluate: 
# Filtered evaluation with ranking strategy assigning worst rank to break ties

from ampligraph.utils import save_model, restore_model
save_model(model, 'TransE.pkl')
model = restore_model('TransE.pkl')

# create the filter 
X_filter = np.concatenate([X_train, X_valid, X_test], 0)

# compute ranks
ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=X_filter)

# ranks are computed per triple
print('Test set:', X_test.shape)
print('Size of ranks:', ranks.shape)

# Aggregate metrics show the aggregate performance of the model on the test set using a single number
display_aggregate_metrics(ranks)
# ----------------------

Average TransE Loss:   0.046611: 100%|██████████| 100/100 [00:05<00:00, 18.76epoch/s]
100%|██████████| 1000/1000 [00:03<00:00, 271.96it/s]


Test set: (1000, 3)
Size of ranks: (1000, 2)
Mean Rank: 396.955
Mean Reciprocal Rank: 0.22485473737262798
Hits@1: 0.153
Hits@10: 0.37
Hits@100: 0.651


# 4. Knowledge Discovery

### Relations

In [13]:
# Relations
'''
disease_disease                    87479
gene_gene                          25101
disease_species                    21152
gene_disease                       17414
disease_gene                       17383
treat_procedure_disease            15221
disease_treat_procedure            14696
treat_procedure_treat_procedure    13533
treat_procedure_gene                8913
gene_treat_procedure                8913
treat_procedure_species             5307
gene_species                        3894
disease_symptom                     2501
symptom_disease                     1409
treat_procedure_symptom              391
gene_symptom                         310
symptom_species                      253
symptom_treat_procedure              211
symptom_gene                         170
symptom_symptom                      141
'''

'\ndisease_disease                    87479\ngene_gene                          25101\ndisease_species                    21152\ngene_disease                       17414\ndisease_gene                       17383\ntreat_procedure_disease            15221\ndisease_treat_procedure            14696\ntreat_procedure_treat_procedure    13533\ntreat_procedure_gene                8913\ngene_treat_procedure                8913\ntreat_procedure_species             5307\ngene_species                        3894\ndisease_symptom                     2501\nsymptom_disease                     1409\ntreat_procedure_symptom              391\ngene_symptom                         310\nsymptom_species                      253\nsymptom_treat_procedure              211\nsymptom_gene                         170\nsymptom_symptom                      141\n'

##  Question - Answering

``` 
    <head, relation, ?> 
    <head, ?,        tail>
    <?,    relation, tail>
```

### Query: what are the possible therapeutic procedure of covid-19??

In [16]:
# Query: what are the possible therapeutic procedure of covid-19??

from ampligraph.discovery import query_topn

# restore the previously saved model to save time
model = restore_model('TransE.pkl')

triples, scores = query_topn(model, top_n=20, 
                             head='coronavirus', 
                             relation='disease_treat_procedure', 
                             tail=None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -10.234280586242676 	 ['coronavirus' 'disease_treat_procedure' 'detected'] 
Score: -10.975214004516602 	 ['coronavirus' 'disease_treat_procedure' 'destruction'] 
Score: -11.161733627319336 	 ['coronavirus' 'disease_treat_procedure' 'presence'] 
Score: -11.162944793701172 	 ['coronavirus' 'disease_treat_procedure' 'release'] 
Score: -11.18162727355957 	 ['coronavirus' 'disease_treat_procedure' 'mechanical ventilation'] 
Score: -11.225645065307617 	 ['coronavirus' 'disease_treat_procedure' 'detection'] 
Score: -11.257482528686523 	 ['coronavirus' 'disease_treat_procedure' 'elevation'] 
Score: -11.316635131835938 	 ['coronavirus' 'disease_treat_procedure' 'clearance'] 
Score: -11.34953498840332 	 ['coronavirus' 'disease_treat_procedure' 'open'] 
Score: -11.365729331970215 	 ['coronavirus' 'disease_treat_procedure' 'expression'] 
Score: -11.493637084960938 	 ['coronavirus' 'disease_treat_procedure' 'removal'] 
Score: -11.511768341064453 	 ['coronavirus' 'disease_treat_procedure' 'tr

In [17]:
# Query: what are the possible therapeutic procedure of covid-19??

from ampligraph.discovery import query_topn

# restore the previously saved model to save time
model = restore_model('TransE.pkl')

triples, scores = query_topn(model, top_n=20, 
                             head=None, 
                             relation='treat_procedure_disease', 
                             tail='coronavirus', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -11.309013366699219 	 ['inoculation' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.460874557495117 	 ['administration' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.652515411376953 	 ['drug' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.703194618225098 	 ['monolayers' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.73873519897461 	 ['influenza vaccination' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.76041030883789 	 ['reduction' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.783241271972656 	 ['presence' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.800429344177246 	 ['isolation' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.803807258605957 	 ['immunization' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.809280395507812 	 ['application' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.844008445739746 	 ['expression' 'treat_procedure_disease' 'coronavirus'] 
Score: -11.856630325317383 	 ['infiltration' 'treat_proced

### Query: What are related diseases of Covid-19

In [18]:
# Predict relation

triples, scores = query_topn(model, top_n=20, 
                             head='coronavirus', 
                             relation='disease_disease', 
                             tail= None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -13.433341026306152 	 ['coronavirus' 'disease_disease' 'bat coronavirus'] 
Score: -13.51054859161377 	 ['coronavirus' 'disease_disease' 'human metapneumovirus'] 
Score: -13.628095626831055 	 ['coronavirus' 'disease_disease' 'acute respiratory tract infection'] 
Score: -13.65103816986084 	 ['coronavirus' 'disease_disease' 'febrile illness'] 
Score: -13.753948211669922 	 ['coronavirus' 'disease_disease' 'hepatic necrosis'] 
Score: -13.865894317626953 	 ['coronavirus' 'disease_disease' 'respiratory tract disease'] 
Score: -14.011213302612305 	 ['coronavirus' 'disease_disease' 'contagious disease'] 
Score: -14.047826766967773 	 ['coronavirus' 'disease_disease' 'kidney failure'] 
Score: -14.047842025756836 	 ['coronavirus' 'disease_disease' 'coagulopathies'] 
Score: -14.105731010437012 	 ['coronavirus' 'disease_disease' 'endemic disease'] 
Score: -14.141969680786133 	 ['coronavirus' 'disease_disease' 'canine adenovirus 1'] 
Score: -14.213666915893555 	 ['coronavirus' 'disease_disease

In [26]:
# Predict relation

triples, scores = query_topn(model, top_n=20, 
                             head=None, 
                             relation='disease_disease', 
                             tail= 'coronavirus', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -13.730649948120117 	 ['calicivirus' 'disease_disease' 'coronavirus'] 
Score: -13.76961898803711 	 ['reovirus' 'disease_disease' 'coronavirus'] 
Score: -13.893882751464844 	 ['mucopurulent conjunctivitis' 'disease_disease' 'coronavirus'] 
Score: -13.895874977111816 	 ['he' 'disease_disease' 'coronavirus'] 
Score: -14.037752151489258 	 ['deltacoronavirus' 'disease_disease' 'coronavirus'] 
Score: -14.046636581420898 	 ['nasopharyngitis' 'disease_disease' 'coronavirus'] 
Score: -14.089031219482422 	 ['autoimmune hemolytic anemia' 'disease_disease' 'coronavirus'] 
Score: -14.232604026794434 	 ['normocytic anemia' 'disease_disease' 'coronavirus'] 
Score: -14.25552749633789 	 ['septicaemia' 'disease_disease' 'coronavirus'] 
Score: -14.280397415161133 	 ['pulmonary pathology' 'disease_disease' 'coronavirus'] 
Score: -14.316299438476562 	 ['virus infection' 'disease_disease' 'coronavirus'] 
Score: -14.383748054504395 	 ['bronchiolitis obliterans' 'disease_disease' 'coronavirus'] 
Score:

### Query: What are the genes/proteins related to Covid-19

In [19]:
# Predict relation

triples, scores = query_topn(model, top_n=25, 
                             head='coronavirus', 
                             relation='disease_gene', 
                             tail=None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -12.008208274841309 	 ['coronavirus' 'disease_gene' 'body'] 
Score: -12.088303565979004 	 ['coronavirus' 'disease_gene' 'cell'] 
Score: -12.123513221740723 	 ['coronavirus' 'disease_gene' 'rare'] 
Score: -12.12818717956543 	 ['coronavirus' 'disease_gene' 'host cell'] 
Score: -12.157024383544922 	 ['coronavirus' 'disease_gene' 'large'] 
Score: -12.200387954711914 	 ['coronavirus' 'disease_gene' 'hepatocytes'] 
Score: -12.2405366897583 	 ['coronavirus' 'disease_gene' '16'] 
Score: -12.269770622253418 	 ['coronavirus' 'disease_gene' 'mask'] 
Score: -12.296977996826172 	 ['coronavirus' 'disease_gene' 'set'] 
Score: -12.316703796386719 	 ['coronavirus' 'disease_gene' 'air'] 
Score: -12.338523864746094 	 ['coronavirus' 'disease_gene' 'genome'] 
Score: -12.382718086242676 	 ['coronavirus' 'disease_gene' 'damage'] 
Score: -12.432119369506836 	 ['coronavirus' 'disease_gene' 'end'] 
Score: -12.534629821777344 	 ['coronavirus' 'disease_gene' 'type'] 
Score: -12.552530288696289 	 ['coronavi

In [24]:
# Predict relation

triples, scores = query_topn(model, top_n=20, 
                             head= None, 
                             relation='gene_disease', 
                             tail='coronavirus', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -11.019047737121582 	 ['all' 'gene_disease' 'coronavirus'] 
Score: -11.169663429260254 	 ['cat' 'gene_disease' 'coronavirus'] 
Score: -11.176797866821289 	 ['aim' 'gene_disease' 'coronavirus'] 
Score: -11.355566024780273 	 ['she' 'gene_disease' 'coronavirus'] 
Score: -11.378961563110352 	 ['light' 'gene_disease' 'coronavirus'] 
Score: -11.392069816589355 	 ['sex' 'gene_disease' 'coronavirus'] 
Score: -11.52822494506836 	 ['gene' 'gene_disease' 'coronavirus'] 
Score: -11.587860107421875 	 ['impact' 'gene_disease' 'coronavirus'] 
Score: -11.678779602050781 	 ['rest' 'gene_disease' 'coronavirus'] 
Score: -11.848551750183105 	 ['pig' 'gene_disease' 'coronavirus'] 
Score: -11.853300094604492 	 ['bat' 'gene_disease' 'coronavirus'] 
Score: -11.876259803771973 	 ['protein' 'gene_disease' 'coronavirus'] 
Score: -11.883123397827148 	 ['antibody' 'gene_disease' 'coronavirus'] 
Score: -11.908979415893555 	 ['type' 'gene_disease' 'coronavirus'] 
Score: -11.928110122680664 	 ['mix' 'gene_dise

### Query: What species transmit the covid-19

In [21]:
# Predict relation
triples, scores = query_topn(model, top_n=20, 
                             tail=None, 
                             relation='disease_species', 
                             head='coronavirus', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -9.148772239685059 	 ['coronavirus' 'disease_species' 'specie'] 
Score: -9.719999313354492 	 ['coronavirus' 'disease_species' 'livestock'] 
Score: -10.14963150024414 	 ['coronavirus' 'disease_species' 'vector'] 
Score: -10.255745887756348 	 ['coronavirus' 'disease_species' 'animal'] 
Score: -10.488275527954102 	 ['coronavirus' 'disease_species' 'animal model'] 
Score: -11.005115509033203 	 ['coronavirus' 'disease_species' 'working'] 
Score: -11.378012657165527 	 ['coronavirus' 'disease_species' 'meat'] 
Score: -11.800975799560547 	 ['coronavirus' 'disease_species' 'show'] 
Score: -12.31668472290039 	 ['coronavirus' 'disease_species' 'pet'] 
Score: -12.491943359375 	 ['coronavirus' 'disease_species' 'cardiopulmonary resuscitation'] 
Score: -12.516268730163574 	 ['coronavirus' 'disease_species' 'dissection'] 
Score: -12.610631942749023 	 ['coronavirus' 'disease_species' 'experimental animal model'] 
Score: -12.61493968963623 	 ['coronavirus' 'disease_species' 'animal study'] 
Scor

### Query: what are symptoms of the covid-19

In [22]:

triples, scores = query_topn(model, top_n=20, 
                             head=None, 
                             relation='symptom_disease', 
                             tail='coronavirus', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -11.40304183959961 	 ['fever' 'symptom_disease' 'coronavirus'] 
Score: -11.583837509155273 	 ['diarrhoea' 'symptom_disease' 'coronavirus'] 
Score: -11.613479614257812 	 ['diarrhea' 'symptom_disease' 'coronavirus'] 
Score: -11.663475036621094 	 ['illness' 'symptom_disease' 'coronavirus'] 
Score: -11.725469589233398 	 ['pain' 'symptom_disease' 'coronavirus'] 
Score: -11.739606857299805 	 ['finding' 'symptom_disease' 'coronavirus'] 
Score: -11.766240119934082 	 ['headache' 'symptom_disease' 'coronavirus'] 
Score: -11.966055870056152 	 ['vomiting' 'symptom_disease' 'coronavirus'] 
Score: -11.974838256835938 	 ['weakness' 'symptom_disease' 'coronavirus'] 
Score: -11.9857816696167 	 ['cough' 'symptom_disease' 'coronavirus'] 
Score: -12.02140998840332 	 ['watery diarrhea' 'symptom_disease' 'coronavirus'] 
Score: -12.023786544799805 	 ['dyspnea' 'symptom_disease' 'coronavirus'] 
Score: -12.024237632751465 	 ['discharge' 'symptom_disease' 'coronavirus'] 
Score: -12.107635498046875 	 ['le

In [23]:
triples, scores = query_topn(model, top_n=20, 
                             tail=None, 
                             relation='disease_symptom', 
                             head='coronavirus', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -10.531105041503906 	 ['coronavirus' 'disease_symptom' 'illness'] 
Score: -10.567334175109863 	 ['coronavirus' 'disease_symptom' 'diarrhea'] 
Score: -10.943293571472168 	 ['coronavirus' 'disease_symptom' 'organ failure'] 
Score: -11.011301040649414 	 ['coronavirus' 'disease_symptom' 'severe diarrhea'] 
Score: -11.080653190612793 	 ['coronavirus' 'disease_symptom' 'fever'] 
Score: -11.167813301086426 	 ['coronavirus' 'disease_symptom' 'respiratory distress'] 
Score: -11.169528007507324 	 ['coronavirus' 'disease_symptom' 'coughing'] 
Score: -11.178520202636719 	 ['coronavirus' 'disease_symptom' 'cough'] 
Score: -11.181768417358398 	 ['coronavirus' 'disease_symptom' 'symptom'] 
Score: -11.194692611694336 	 ['coronavirus' 'disease_symptom' 'nasal discharge'] 
Score: -11.199895858764648 	 ['coronavirus' 'disease_symptom' 'nausea'] 
Score: -11.250325202941895 	 ['coronavirus' 'disease_symptom' 'rash'] 
Score: -11.370763778686523 	 ['coronavirus' 'disease_symptom' 'dyspnea'] 
Score: -1

In [None]:
# Compare the raw call and graph embedding results for the symptoms query

## raw call
#dataset.loc[(dataset['subject']=='symptom')].head(20)

#####==> the result of raw call seems to be better

In [None]:
##graph embedding
triples, scores = query_topn(model, top_n=20, 
                             head=, 
                             relation='be_related_with', 
                             tail=None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))