<a href="https://colab.research.google.com/github/HuyenNguyenHelen/CORD-19-KG/blob/master/covid_19_kg_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preliminaries

In [None]:
# Run this cell only when the notebook is executed on Google Colab.

# Ask Colab for its TensorFlow 1.x runtime (the cell output confirms
# "TensorFlow 1.x selected."). Keep this ahead of any `import tensorflow`.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
import tensorflow as tf

# Report which TensorFlow build the kernel picked up.
print(f'TensorFlow  version: {tf.__version__}')

# Ask TensorFlow for the name of the GPU device it can see, and stop the
# notebook early unless that name is exactly '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print(f'Found GPU at: {device_name}')
else:
    raise SystemError('GPU device not found')

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [None]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [None]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    """Print the aggregate ranking metrics for ``ranks``.

    Prints, in this order, the mean rank, the mean reciprocal rank and
    Hits@N for N = 1, 10 and 100, each computed by the corresponding
    AmpliGraph helper (``mr_score``, ``mrr_score``, ``hits_at_n_score``).

    Parameters
    ----------
    ranks
        Ranks in whatever form the AmpliGraph metric helpers accept; in this
        notebook it is the array returned by ``evaluate_performance``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    print('Mean Rank:', mr_score(ranks))
    print('Mean Reciprocal Rank:', mrr_score(ranks))
    for cutoff in (1, 10, 100):
        print('Hits@{}:'.format(cutoff), hits_at_n_score(ranks, cutoff))


# Record the library version next to the results for reproducibility.
print(f'Ampligraph version: {ampligraph.__version__}')

Ampligraph version: 1.4.0


# 2. Loading a Knowledge Graph dataset

In [None]:
from ampligraph.datasets import load_fb15k_237, load_wn18rr, load_yago3_10

In [None]:
import pandas as pd

# Load the two triple files (PMC and PDF extractions, going by the file names).
pmc = pd.read_csv('/content/new_triples_with_predefined_relations_pmc_June10_.csv')
pdf = pd.read_csv('/content/new_triples_with_predefined_relations_pdf_June10_.csv')

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Each CSV is
# read with its own 0-based row labels, so a plain concat would leave duplicate
# labels and make label-based lookups ambiguous.
dataset = pd.concat([pmc, pdf], ignore_index=True)
print(dataset.columns)

# Keep only the triple columns, in (subject, relation, object) order. This also
# discards the leftover 'Unnamed: ...' index columns, so no separate in-place
# drop is needed (and a missing 'Unnamed: 0' column can no longer raise).
# NOTE(review): exact duplicate triples are kept (the preview below shows the
# same row twice). Copies of one triple may end up on both sides of the
# train/test split -- consider drop_duplicates(); TODO confirm intent.
dataset = dataset[['subject', 'new_relation', 'object']]
dataset.head(5)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'subject', 'object', 'new_relation'], dtype='object')


Unnamed: 0,subject,new_relation,object
0,rsv,disease_disease,virus
1,urgency,disease_disease,virus
2,urgency,disease_disease,virus
3,virus,disease_disease,virus
4,step,gene_gene,endosomes


In [None]:
# Shape of the triple table: (rows, columns).
print(f'Total triples in the KG: {dataset.shape}')

Total triples in the KG: (175728, 3)


In [None]:
# Number of triples carrying each relation label.
relation_counts = dataset['new_relation'].value_counts()
print('the number of relations: \n', relation_counts)

the number of relations: 
 disease_disease            87479
gene_gene                  25101
gene_disease               17414
disease_gene               17383
disease_treat_procedure    14696
gene_treat_procedure        8913
disease_symptom             2501
symptom_disease             1409
gene_symptom                 310
symptom_treat_procedure      211
symptom_gene                 170
symptom_symptom              141
Name: new_relation, dtype: int64


## 2.1 Create training, validation and test splits

In [None]:
from ampligraph.evaluation import train_test_split_no_unseen

# First carve a 500-triple validation set out of the full triple array ...
test_train, X_valid = train_test_split_no_unseen(dataset.values, test_size=500, seed=0)

# ... then take a 1000-triple test set from what is left; the remainder is
# the training set.
X_train, X_test = train_test_split_no_unseen(test_train, test_size=1000, seed=0)

# Report the size of the full table and of every split.
print('Total triples:', dataset.shape)
for _split_name, _split_triples in (('train', X_train),
                                    ('valid', X_valid),
                                    ('test', X_test)):
    print('Size of {}:'.format(_split_name), _split_triples.shape)

Total triples: (175728, 3)
Size of train: (174228, 3)
Size of valid: (500, 3)
Size of test: (1000, 3)


# 3. Model Training

In [None]:
from ampligraph.latent_features import TransE
from ampligraph.utils import save_model, restore_model

# ----------------------
# Train: TransE knowledge-graph embedding model.
# All hyperparameters are collected in one place so they are easy to tune.
transe_params = dict(
    k=300,
    epochs=100,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.0001},
    seed=0,
    batches_count=10,
    verbose=True,
)
model = TransE(**transe_params)
model.fit(X_train)

# Persist the fitted model and continue with the reloaded copy; the query
# cells further down restore it from the same file.
save_model(model, 'TransE.pkl')
model = restore_model('TransE.pkl')

# ----------------------
# Evaluate:
# Filtered evaluation with ranking strategy assigning worst rank to break ties.
# The filter holds every known triple (train + valid + test).
X_filter = np.concatenate([X_train, X_valid, X_test], axis=0)

# Ranks are computed per test triple.
ranks = evaluate_performance(X_test, model=model, filter_triples=X_filter)
print('Test set:', X_test.shape)
print('Size of ranks:', ranks.shape)

# Collapse the per-triple ranks into single-number summary metrics.
display_aggregate_metrics(ranks)
# ----------------------

Average TransE Loss:   0.086463: 100%|██████████| 100/100 [00:10<00:00,  9.97epoch/s]
100%|██████████| 1000/1000 [00:05<00:00, 173.27it/s]


Test set: (1000, 3)
Size of ranks: (1000, 2)
Mean Rank: 270.848
Mean Reciprocal Rank: 0.15688141687115245
Hits@1: 0.0845
Hits@10: 0.29
Hits@100: 0.639


# 4. Knowledge Discovery

### Relations

In [None]:
# Reference only: the relation labels found in dataset['new_relation'] with
# their triple counts, pasted from the value_counts() output shown earlier in
# the notebook. Kept at hand for choosing the `relation=` argument of the
# queries in this section.
'''
disease_disease            87479
gene_gene                  25101
gene_disease               17414
disease_gene               17383
disease_treat_procedure    14696
gene_treat_procedure        8913
disease_symptom             2501
symptom_disease             1409
gene_symptom                 310
symptom_treat_procedure      211
symptom_gene                 170
symptom_symptom              141
Name: new_relation, dtype: int64
'''

## Question Answering

``` 
    <head, relation, ?> 
    <head, ?,        tail>
    <?,    relation, tail>
```

### Query: What are the possible therapeutic procedures for COVID-19?

In [None]:
# Query <covid-19, disease_treat_procedure, ?>:
# which treatment procedures does the model score highest for covid-19?
# Head and relation are fixed; the tail is left open (tail=None).

from ampligraph.discovery import query_topn

# Reload the model saved after training instead of fitting it again.
model = restore_model('TransE.pkl')

treat_tail_query = dict(head='covid-19',
                        relation='disease_treat_procedure',
                        tail=None)
triples, scores = query_topn(model, top_n=20,
                             ents_to_consider=None,
                             rels_to_consider=None,
                             **treat_tail_query)

# One line per candidate triple, with its score.
for triple, score in zip(triples, scores):
    print(f'Score: {score} \t {triple} ')

Score: -12.528743743896484 	 ['covid-19' 'disease_treat_procedure' 'inoculation'] 
Score: -12.528912544250488 	 ['covid-19' 'disease_treat_procedure' 'infiltration'] 
Score: -12.54183578491211 	 ['covid-19' 'disease_treat_procedure' 'presence'] 
Score: -12.562738418579102 	 ['covid-19' 'disease_treat_procedure' 'vaccination'] 
Score: -12.579742431640625 	 ['covid-19' 'disease_treat_procedure' 'detected'] 
Score: -12.590995788574219 	 ['covid-19' 'disease_treat_procedure' 'treatment'] 
Score: -12.625072479248047 	 ['covid-19' 'disease_treat_procedure' 'extracted'] 
Score: -12.647272109985352 	 ['covid-19' 'disease_treat_procedure' '5'] 
Score: -12.651243209838867 	 ['covid-19' 'disease_treat_procedure' 'mechanical ventilation'] 
Score: -12.663618087768555 	 ['covid-19' 'disease_treat_procedure' 'surgery'] 
Score: -12.679913520812988 	 ['covid-19' 'disease_treat_procedure' 'therapy'] 
Score: -12.689506530761719 	 ['covid-19' 'disease_treat_procedure' '2'] 
Score: -12.692441940307617 	 ['

In [None]:
# Reverse direction of the previous query: <?, disease_treat_procedure, covid-19>.
# Relation and tail are fixed; the head is left open (head=None), so the model
# ranks candidate heads for which covid-19 would be the tail.

from ampligraph.discovery import query_topn

# Reload the model saved after training instead of fitting it again.
model = restore_model('TransE.pkl')

treat_head_query = {
    'head': None,
    'relation': 'disease_treat_procedure',
    'tail': 'covid-19',
    'ents_to_consider': None,
    'rels_to_consider': None,
}
triples, scores = query_topn(model, top_n=20, **treat_head_query)

# One line per candidate triple, with its score.
for triple, score in zip(triples, scores):
    print(f'Score: {score} \t {triple} ')

Score: -14.198332786560059 	 ['rna' 'disease_treat_procedure' 'covid-19'] 
Score: -14.571569442749023 	 ['y' 'disease_treat_procedure' 'covid-19'] 
Score: -14.58637809753418 	 ['infection' 'disease_treat_procedure' 'covid-19'] 
Score: -14.610550880432129 	 ['vaccine' 'disease_treat_procedure' 'covid-19'] 
Score: -14.61115550994873 	 ['virus' 'disease_treat_procedure' 'covid-19'] 
Score: -14.833165168762207 	 ['r' 'disease_treat_procedure' 'covid-19'] 
Score: -14.864860534667969 	 ['respiratory virus' 'disease_treat_procedure' 'covid-19'] 
Score: -14.87765884399414 	 ['influenza virus' 'disease_treat_procedure' 'covid-19'] 
Score: -14.911301612854004 	 ['viral infection' 'disease_treat_procedure' 'covid-19'] 
Score: -14.933761596679688 	 ['type' 'disease_treat_procedure' 'covid-19'] 
Score: -14.93657112121582 	 ['hrv' 'disease_treat_procedure' 'covid-19'] 
Score: -14.961681365966797 	 ['covid pandemic' 'disease_treat_procedure' 'covid-19'] 
Score: -14.96232795715332 	 ['s' 'disease_trea

### Query: What diseases are related to COVID-19?

In [None]:
# Tail prediction <covid-19, disease_disease, ?>: the relation is fixed and the
# model ranks candidate tails, i.e. diseases related to covid-19.
related_disease_tail_query = {
    'head': 'covid-19',
    'relation': 'disease_disease',
    'tail': None,
    'ents_to_consider': None,
    'rels_to_consider': None,
}
triples, scores = query_topn(model, top_n=20, **related_disease_tail_query)

# One line per candidate triple, with its score.
for triple, score in zip(triples, scores):
    print(f'Score: {score} \t {triple} ')

Score: -13.92523193359375 	 ['covid-19' 'disease_disease' 'covid-19'] 
Score: -15.167387008666992 	 ['covid-19' 'disease_disease' 'infectious'] 
Score: -15.199078559875488 	 ['covid-19' 'disease_disease' '2009'] 
Score: -15.316337585449219 	 ['covid-19' 'disease_disease' 'severe'] 
Score: -15.414501190185547 	 ['covid-19' 'disease_disease' 'contagious'] 
Score: -15.552555084228516 	 ['covid-19' 'disease_disease' 'major'] 
Score: -15.597020149230957 	 ['covid-19' 'disease_disease' 'high'] 
Score: -15.612503051757812 	 ['covid-19' 'disease_disease' 'fatal'] 
Score: -15.613870620727539 	 ['covid-19' 'disease_disease' 'transmissible'] 
Score: -15.625604629516602 	 ['covid-19' 'disease_disease' 'acute respiratory syndrome coronavirus 2'] 
Score: -15.714166641235352 	 ['covid-19' 'disease_disease' 'respiratory'] 
Score: -15.72925853729248 	 ['covid-19' 'disease_disease' 'chronic'] 
Score: -15.75595474243164 	 ['covid-19' 'disease_disease' 'lower'] 
Score: -15.871280670166016 	 ['covid-19' 'd

In [None]:
# Head prediction <?, disease_disease, covid-19>: the relation is fixed and the
# model ranks candidate heads that stand in disease_disease to covid-19.
related_disease_head_query = dict(head=None,
                                  relation='disease_disease',
                                  tail='covid-19')
triples, scores = query_topn(model, top_n=20,
                             ents_to_consider=None,
                             rels_to_consider=None,
                             **related_disease_head_query)

# One line per candidate triple, with its score.
for triple, score in zip(triples, scores):
    print(f'Score: {score} \t {triple} ')

Score: -13.92523193359375 	 ['covid-19' 'disease_disease' 'covid-19'] 
Score: -15.9541015625 	 ['he' 'disease_disease' 'covid-19'] 
Score: -16.78826904296875 	 ['blepharitis' 'disease_disease' 'covid-19'] 
Score: -16.802663803100586 	 ['covid' 'disease_disease' 'covid-19'] 
Score: -16.832122802734375 	 ['pneumoniae' 'disease_disease' 'covid-19'] 
Score: -16.835886001586914 	 ['alzheimer' 'disease_disease' 'covid-19'] 
Score: -16.861398696899414 	 ['tuberculosis' 'disease_disease' 'covid-19'] 
Score: -16.86189079284668 	 ['sars cov infection' 'disease_disease' 'covid-19'] 
Score: -16.866025924682617 	 ['fungal infection' 'disease_disease' 'covid-19'] 
Score: -16.890899658203125 	 ['ped' 'disease_disease' 'covid-19'] 
Score: -16.903242111206055 	 ['coronaviruses' 'disease_disease' 'covid-19'] 
Score: -16.91048240661621 	 ['coinfection' 'disease_disease' 'covid-19'] 
Score: -16.92896842956543 	 ['hbov' 'disease_disease' 'covid-19'] 
Score: -16.942283630371094 	 ['leishmaniasis' 'disease_d

### Query: What are the genes/proteins related to COVID-19?

In [None]:
# Tail prediction <covid-19, disease_gene, ?>: the relation is fixed and the
# model ranks the 25 best-scoring candidate tails (genes/proteins).
disease_gene_query = {
    'head': 'covid-19',
    'relation': 'disease_gene',
    'tail': None,
}
triples, scores = query_topn(model,
                             top_n=25,
                             ents_to_consider=None,
                             rels_to_consider=None,
                             **disease_gene_query)

# One line per candidate triple, with its score.
for triple, score in zip(triples, scores):
    print(f'Score: {score} \t {triple} ')

Score: -13.10327434539795 	 ['covid-19' 'disease_gene' 'rare'] 
Score: -13.399755477905273 	 ['covid-19' 'disease_gene' 'human'] 
Score: -13.443669319152832 	 ['covid-19' 'disease_gene' 'hong kong'] 
Score: -13.675713539123535 	 ['covid-19' 'disease_gene' 'end'] 
Score: -13.727230072021484 	 ['covid-19' 'disease_gene' 'past'] 
Score: -13.736001968383789 	 ['covid-19' 'disease_gene' 'impact'] 
Score: -13.736896514892578 	 ['covid-19' 'disease_gene' 'damage'] 
Score: -13.835280418395996 	 ['covid-19' 'disease_gene' 'host'] 
Score: -13.876059532165527 	 ['covid-19' 'disease_gene' 'set'] 
Score: -13.899578094482422 	 ['covid-19' 'disease_gene' 'cell'] 
Score: -13.906440734863281 	 ['covid-19' 'disease_gene' 'large'] 
Score: -13.910594940185547 	 ['covid-19' 'disease_gene' 'ace2 receptor'] 
Score: -13.949365615844727 	 ['covid-19' 'disease_gene' 'airway'] 
Score: -13.978567123413086 	 ['covid-19' 'disease_gene' 'spring'] 
Score: -13.995262145996094 	 ['covid-19' 'disease_gene' 'small'] 
Sco

In [None]:
# Head prediction <?, transmit, covid-19>, restricted to a hand-picked list of
# candidate entities (ents_to_consider) instead of the whole vocabulary.
# NOTE(review): 'transmit' does not appear in the relation table listed earlier
# in this notebook -- confirm the loaded model was trained with this relation.
ent_control_list = ['cat', 'human', 'bat', 'monkey', 'dog']

transmit_query = dict(head=None,
                      relation='transmit',
                      tail='covid-19',
                      ents_to_consider=ent_control_list,
                      rels_to_consider=None)
triples, scores = query_topn(model, top_n=30, **transmit_query)

# One line per candidate triple, with its score.
for triple, score in zip(triples, scores):
    print(f'Score: {score} \t {triple} ')

Score: -14.479377746582031 	 ['human' 'transmit' 'covid-19'] 
Score: -15.203999519348145 	 ['bat' 'transmit' 'covid-19'] 
Score: -15.406490325927734 	 ['dog' 'transmit' 'covid-19'] 
Score: -17.068496704101562 	 ['monkey' 'transmit' 'covid-19'] 
Score: -17.196857452392578 	 ['cat' 'transmit' 'covid-19'] 


In [None]:

# Head prediction <?, be_related_with, covid-19> over the whole entity vocabulary.
# NOTE(review): 'be_related_with' does not appear in the relation table listed
# earlier in this notebook -- confirm the loaded model was trained with it.
related_with_query = {
    'head': None,
    'relation': 'be_related_with',
    'tail': 'covid-19',
    'ents_to_consider': None,
    'rels_to_consider': None,
}
triples, scores = query_topn(model, top_n=30, **related_with_query)

# One line per candidate triple, with its score.
for triple, score in zip(triples, scores):
    print(f'Score: {score} \t {triple} ')

Score: -16.407957077026367 	 ['covid-19' 'be_related_with' 'covid-19'] 
Score: -16.447853088378906 	 ['patient' 'be_related_with' 'covid-19'] 
Score: -16.633697509765625 	 ['virus' 'be_related_with' 'covid-19'] 
Score: -16.792278289794922 	 ['%' 'be_related_with' 'covid-19'] 
Score: -16.93667221069336 	 ['symptom' 'be_related_with' 'covid-19'] 
Score: -16.963205337524414 	 ['inflection' 'be_related_with' 'covid-19'] 
Score: -17.318044662475586 	 ['data' 'be_related_with' 'covid-19'] 
Score: -17.333995819091797 	 ['child' 'be_related_with' 'covid-19'] 
Score: -17.363147735595703 	 ['antibody' 'be_related_with' 'covid-19'] 
Score: -17.38707733154297 	 ['person' 'be_related_with' 'covid-19'] 
Score: -17.523738861083984 	 ['coronavirus' 'be_related_with' 'covid-19'] 
Score: -17.556407928466797 	 ['influenza' 'be_related_with' 'covid-19'] 
Score: -17.590547561645508 	 ['sars' 'be_related_with' 'covid-19'] 
Score: -17.609272003173828 	 ['influenza virus' 'be_related_with' 'covid-19'] 
Score:

In [None]:
# Compare a raw lookup in the triple table with the graph-embedding query
# for the symptom entity.

## Raw lookup: the first 20 triples whose subject is exactly 'symptom'.
is_symptom_subject = dataset['subject'] == 'symptom'
dataset[is_symptom_subject].head(20)

# Author's observation: the result of the raw lookup seems to be better.

Unnamed: 0,subject,new_relation,object
17,symptom,be_related_with,pulmonary edema
51,symptom,be_related_with,cough
53,symptom,be_related_with,nasal congestion
57,symptom,transmit,sars
72,symptom,be_related_with,headache
75,symptom,be_related_with,acute respiratory distress syndrome
108,symptom,be_related_with,influenza-like illness
158,symptom,be_related_with,sore throat
168,symptom,be_related_with,respiratory symptom
188,symptom,be_related_with,congestion


In [None]:
## Graph-embedding counterpart of the raw lookup on subject == 'symptom'.
# The head is 'symptom' (singular) so that both halves of the comparison look
# at the same entity: the raw lookup filters on exactly that string, whereas
# this cell previously queried 'symptoms'.
# NOTE(review): 'be_related_with' does not appear in the relation table listed
# earlier in this notebook -- confirm the loaded model was trained with it.
triples, scores = query_topn(model, top_n=20,
                             head='symptom',
                             relation='be_related_with',
                             tail=None,
                             ents_to_consider=None,
                             rels_to_consider=None)

# One line per candidate triple, with its score.
for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))