<a href="https://colab.research.google.com/github/HuyenNguyenHelen/CORD-19-KG/blob/master/covid_19_kg_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preliminaries

In [1]:
# If using Google Colab run this cell 

# select tensorflow version for colab 
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
import tensorflow as tf 

print('TensorFlow  version: {}'.format(tf.__version__))

# Get the GPU name
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [3]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [4]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    print('Mean Rank:', mr_score(ranks)) 
    print('Mean Reciprocal Rank:', mrr_score(ranks)) 
    print('Hits@1:', hits_at_n_score(ranks, 1))
    print('Hits@10:', hits_at_n_score(ranks, 10))
    print('Hits@100:', hits_at_n_score(ranks, 100))

print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.4.0


# 2. Loading a Knowledge Graph dataset

In [5]:
from ampligraph.datasets import load_fb15k_237, load_wn18rr, load_yago3_10
import pandas as pd
import requests
import io

In [6]:
# Downloading the csv file from your GitHub account
re_pdf = requests.get('https://raw.githubusercontent.com/HuyenNguyenHelen/CORD-19-KG/master/new_triples_with_predefined_relations_pdf_June20_.csv').content
re_pmc = requests.get('https://raw.githubusercontent.com/HuyenNguyenHelen/CORD-19-KG/master/new_triples_with_predefined_relations_pmc_June20_.csv').content
pdf = pd.read_csv(io.StringIO(re_pdf.decode('utf-8')))
pmc =  pd.read_csv(io.StringIO(re_pmc.decode('utf-8')))
dataset = pd.concat([pmc, pdf])
print(dataset.columns)
dataset.drop(columns='Unnamed: 0', inplace=True)
dataset = dataset[['subject','new_relation', 'object' ]]
dataset.head(5)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'subject', 'object', 'new_relation'], dtype='object')


Unnamed: 0,subject,new_relation,object
0,dysfunction,disease_species,child
1,et dysfunction,disease_species,child
2,et dysfunction,disease_species,child
3,rsv,disease_disease,virus
4,treatment,treat_procedure_species,child


In [7]:
print('Total triples in the KG:', dataset.shape)

Total triples in the KG: (230330, 3)


In [8]:
print('the number of relations: \n', dataset['new_relation'].value_counts())

the number of relations: 
 disease_disease                    63192
gene_gene                          24172
disease_species                    22370
gene_disease                       14220
disease_gene                       13347
disease_treat_procedure            10184
treat_procedure_disease             8223
drug_disease                        7250
gene_treat_procedure                6142
treat_procedure_treat_procedure     5625
gene_species                        5298
drug_gene                           5270
drug_drug                           4613
treat_procedure_gene                4602
disease_symptom                     4354
treat_procedure_species             4263
disease_drug                        4044
drug_species                        3680
gene_drug                           3603
symptom_disease                     3411
symptom_symptom                     3266
drug_treat_procedure                3229
treat_procedure_drug                2578
symptom_species               

## 2.1 Create training, validation and test splits

In [9]:
from ampligraph.evaluation import train_test_split_no_unseen
# get the validation set of size 500
test_train, X_valid = train_test_split_no_unseen(dataset.values, 500, seed=0)

# get the test set of size 1000 from the remaining triples
X_train, X_test = train_test_split_no_unseen(test_train, 1000, seed=0)

print('Total triples:', dataset.shape)
print('Size of train:', X_train.shape)
print('Size of valid:', X_valid.shape)
print('Size of test:', X_test.shape)

Total triples: (230330, 3)
Size of train: (228830, 3)
Size of valid: (500, 3)
Size of test: (1000, 3)


# 3. Model Training

In [10]:
from ampligraph.latent_features import TransE

# Train a KGE model
model = TransE(k=300, 
               epochs=100, 
               eta=1, 
               loss='multiclass_nll', 
               initializer='xavier', initializer_params={'uniform': False},
               regularizer='LP', regularizer_params= {'lambda': 0.001, 'p': 3},
               optimizer= 'adam', optimizer_params= {'lr': 0.0001}, 
               seed= 0, batches_count= 10, verbose=True)

model.fit(X_train)
# ----------------------
# Evaluate: 
# Filtered evaluation with ranking strategy assigning worst rank to break ties

from ampligraph.utils import save_model, restore_model
save_model(model, 'TransE.pkl')
model = restore_model('TransE.pkl')

# create the filter 
X_filter = np.concatenate([X_train, X_valid, X_test], 0)

# compute ranks
ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=X_filter)

# ranks are computed per triple
print('Test set:', X_test.shape)
print('Size of ranks:', ranks.shape)

# Aggregate metrics show the aggregate performance of the model on the test set using a single number
display_aggregate_metrics(ranks)
# ----------------------

Average TransE Loss:   0.070291: 100%|██████████| 100/100 [00:12<00:00,  8.16epoch/s]
100%|██████████| 1000/1000 [00:05<00:00, 176.16it/s]


Test set: (1000, 3)
Size of ranks: (1000, 2)
Mean Rank: 266.9585
Mean Reciprocal Rank: 0.1970958051101608
Hits@1: 0.1235
Hits@10: 0.3445
Hits@100: 0.6635


# 4. Knowledge Discovery

### Relations

In [28]:
print('the number of relations: \n', dataset['new_relation'].value_counts())

the number of relations: 
 disease_disease                    63192
gene_gene                          24172
disease_species                    22370
gene_disease                       14220
disease_gene                       13347
disease_treat_procedure            10184
treat_procedure_disease             8223
drug_disease                        7250
gene_treat_procedure                6142
treat_procedure_treat_procedure     5625
gene_species                        5298
drug_gene                           5270
drug_drug                           4613
treat_procedure_gene                4602
disease_symptom                     4354
treat_procedure_species             4263
disease_drug                        4044
drug_species                        3680
gene_drug                           3603
symptom_disease                     3411
symptom_symptom                     3266
drug_treat_procedure                3229
treat_procedure_drug                2578
symptom_species               

##  Question - Answering

``` 
    <head, relation, ?> 
    <head, ?,        tail>
    <?,    relation, tail>
```

### Query: what are the possible therapeutic procedure of covid-19??

In [12]:
# Query: what are the possible therapeutic procedure of covid-19??

from ampligraph.discovery import query_topn

# restore the previously saved model to save time
model = restore_model('TransE.pkl')

triples, scores = query_topn(model, top_n=20, 
                             head='covid-19', 
                             relation='disease_treat_procedure', 
                             tail=None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -10.239635467529297 	 ['covid-19' 'disease_treat_procedure' 'extracted'] 
Score: -10.519999504089355 	 ['covid-19' 'disease_treat_procedure' 'detected'] 
Score: -10.678789138793945 	 ['covid-19' 'disease_treat_procedure' 'destruction'] 
Score: -10.81458854675293 	 ['covid-19' 'disease_treat_procedure' 'induction'] 
Score: -10.815917015075684 	 ['covid-19' 'disease_treat_procedure' 'adjustment'] 
Score: -10.87232780456543 	 ['covid-19' 'disease_treat_procedure' 'expression'] 
Score: -10.888626098632812 	 ['covid-19' 'disease_treat_procedure' 'infiltration'] 
Score: -10.890941619873047 	 ['covid-19' 'disease_treat_procedure' 'diet'] 
Score: -10.910229682922363 	 ['covid-19' 'disease_treat_procedure' 'release'] 
Score: -10.937187194824219 	 ['covid-19' 'disease_treat_procedure' 'reduction'] 
Score: -10.941555976867676 	 ['covid-19' 'disease_treat_procedure' 'mechanical ventilation'] 
Score: -10.951570510864258 	 ['covid-19' 'disease_treat_procedure' 'isolation'] 
Score: -10.9551572

In [13]:
# Query: what are the possible therapeutic procedure of covid-19??

from ampligraph.discovery import query_topn

# restore the previously saved model to save time
model = restore_model('TransE.pkl')

triples, scores = query_topn(model, top_n=20, 
                             head=None, 
                             relation='treat_procedure_disease', 
                             tail='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -11.22701358795166 	 ['vaccination' 'treat_procedure_disease' 'covid-19'] 
Score: -11.275487899780273 	 ['administration' 'treat_procedure_disease' 'covid-19'] 
Score: -11.320928573608398 	 ['reduction' 'treat_procedure_disease' 'covid-19'] 
Score: -11.335749626159668 	 ['detection' 'treat_procedure_disease' 'covid-19'] 
Score: -11.370582580566406 	 ['treatment' 'treat_procedure_disease' 'covid-19'] 
Score: -11.393146514892578 	 ['inoculation' 'treat_procedure_disease' 'covid-19'] 
Score: -11.401979446411133 	 ['adoptive transfer' 'treat_procedure_disease' 'covid-19'] 
Score: -11.411062240600586 	 ['injection' 'treat_procedure_disease' 'covid-19'] 
Score: -11.428948402404785 	 ['isolation' 'treat_procedure_disease' 'covid-19'] 
Score: -11.433673858642578 	 ['coagulation' 'treat_procedure_disease' 'covid-19'] 
Score: -11.468090057373047 	 ['delivery' 'treat_procedure_disease' 'covid-19'] 
Score: -11.468165397644043 	 ['change' 'treat_procedure_disease' 'covid-19'] 
Score: -11.490

### Query: What are related diseases of Covid-19

In [14]:
# Predict relation

triples, scores = query_topn(model, top_n=20, 
                             head='covid-19', 
                             relation='disease_disease', 
                             tail= None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -12.314218521118164 	 ['covid-19' 'disease_disease' 'covid-19'] 
Score: -13.747383117675781 	 ['covid-19' 'disease_disease' '2009'] 
Score: -13.805061340332031 	 ['covid-19' 'disease_disease' 'infectious'] 
Score: -13.832538604736328 	 ['covid-19' 'disease_disease' 'severe'] 
Score: -13.853185653686523 	 ['covid-19' 'disease_disease' 'fatal'] 
Score: -13.881415367126465 	 ['covid-19' 'disease_disease' 'major'] 
Score: -13.927454948425293 	 ['covid-19' 'disease_disease' 'contagious'] 
Score: -14.063346862792969 	 ['covid-19' 'disease_disease' 'chronic'] 
Score: -14.13708782196045 	 ['covid-19' 'disease_disease' 'transmissible'] 
Score: -14.186688423156738 	 ['covid-19' 'disease_disease' 'childhood'] 
Score: -14.204536437988281 	 ['covid-19' 'disease_disease' 'acute respiratory syndrome coronavirus 2'] 
Score: -14.283987998962402 	 ['covid-19' 'disease_disease' 'lower'] 
Score: -14.322210311889648 	 ['covid-19' 'disease_disease' 'multi-organ failure'] 
Score: -14.443628311157227 	

In [15]:
# Predict relation

triples, scores = query_topn(model, top_n=20, 
                             head=None, 
                             relation='disease_disease', 
                             tail= 'covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -12.314218521118164 	 ['covid-19' 'disease_disease' 'covid-19'] 
Score: -14.252700805664062 	 ['he' 'disease_disease' 'covid-19'] 
Score: -14.604111671447754 	 ['coronaviruses' 'disease_disease' 'covid-19'] 
Score: -14.629844665527344 	 ['alzheimer' 'disease_disease' 'covid-19'] 
Score: -14.666099548339844 	 ['noroviruses' 'disease_disease' 'covid-19'] 
Score: -14.688876152038574 	 ['sars cov infection' 'disease_disease' 'covid-19'] 
Score: -14.689584732055664 	 ['pneumoniae' 'disease_disease' 'covid-19'] 
Score: -14.724080085754395 	 ['fungal infection' 'disease_disease' 'covid-19'] 
Score: -14.724721908569336 	 ['rabies' 'disease_disease' 'covid-19'] 
Score: -14.733174324035645 	 ['blepharitis' 'disease_disease' 'covid-19'] 
Score: -14.773030281066895 	 ['dad' 'disease_disease' 'covid-19'] 
Score: -14.79124641418457 	 ['covid' 'disease_disease' 'covid-19'] 
Score: -14.80870532989502 	 ['hbov' 'disease_disease' 'covid-19'] 
Score: -14.832033157348633 	 ['cholera' 'disease_disea

### Query: What are the genes/proteins related to Covid-19

In [16]:
# Predict relation

triples, scores = query_topn(model, top_n=25, 
                             head='covid-19', 
                             relation='disease_gene', 
                             tail=None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -11.301939010620117 	 ['covid-19' 'disease_gene' 'rare'] 
Score: -11.593245506286621 	 ['covid-19' 'disease_gene' 'hong kong'] 
Score: -11.721550941467285 	 ['covid-19' 'disease_gene' 'spring'] 
Score: -11.920463562011719 	 ['covid-19' 'disease_gene' 'lobe'] 
Score: -12.016401290893555 	 ['covid-19' 'disease_gene' 'airway'] 
Score: -12.056889533996582 	 ['covid-19' 'disease_gene' 'wild'] 
Score: -12.132339477539062 	 ['covid-19' 'disease_gene' 'damage'] 
Score: -12.144424438476562 	 ['covid-19' 'disease_gene' 'small'] 
Score: -12.155984878540039 	 ['covid-19' 'disease_gene' 'end'] 
Score: -12.161270141601562 	 ['covid-19' 'disease_gene' 'cell'] 
Score: -12.18165397644043 	 ['covid-19' 'disease_gene' 'past'] 
Score: -12.194324493408203 	 ['covid-19' 'disease_gene' 'impact'] 
Score: -12.207731246948242 	 ['covid-19' 'disease_gene' 'host'] 
Score: -12.241068840026855 	 ['covid-19' 'disease_gene' 'bite'] 
Score: -12.254423141479492 	 ['covid-19' 'disease_gene' 'genome'] 
Score: -12.

In [17]:
# Predict relation

triples, scores = query_topn(model, top_n=25, 
                             head= None, 
                             relation='gene_disease', 
                             tail='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -9.742734909057617 	 ['she' 'gene_disease' 'covid-19'] 
Score: -9.76704216003418 	 ['il' 'gene_disease' 'covid-19'] 
Score: -9.95561408996582 	 ['pattern' 'gene_disease' 'covid-19'] 
Score: -9.9683837890625 	 ['sex' 'gene_disease' 'covid-19'] 
Score: -9.998523712158203 	 ['damage' 'gene_disease' 'covid-19'] 
Score: -10.009273529052734 	 ['impact' 'gene_disease' 'covid-19'] 
Score: -10.01913070678711 	 ['ace2' 'gene_disease' 'covid-19'] 
Score: -10.035797119140625 	 ['protein' 'gene_disease' 'covid-19'] 
Score: -10.038834571838379 	 ['antibody' 'gene_disease' 'covid-19'] 
Score: -10.039408683776855 	 ['all' 'gene_disease' 'covid-19'] 
Score: -10.101898193359375 	 ['csf' 'gene_disease' 'covid-19'] 
Score: -10.102152824401855 	 ['autophagy' 'gene_disease' 'covid-19'] 
Score: -10.107024192810059 	 ['gene' 'gene_disease' 'covid-19'] 
Score: -10.127727508544922 	 ['mabs' 'gene_disease' 'covid-19'] 
Score: -10.131169319152832 	 ['ifn' 'gene_disease' 'covid-19'] 
Score: -10.149798393249

### Query: What species transmit the covid-19

In [18]:
# Predict relation
triples, scores = query_topn(model, top_n=20, 
                             tail=None, 
                             relation='disease_species', 
                             head='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -11.424055099487305 	 ['covid-19' 'disease_species' 'child'] 
Score: -11.514713287353516 	 ['covid-19' 'disease_species' 'dog'] 
Score: -11.559194564819336 	 ['covid-19' 'disease_species' 'cattle'] 
Score: -11.57503890991211 	 ['covid-19' 'disease_species' 'patient'] 
Score: -11.654004096984863 	 ['covid-19' 'disease_species' 'rabbit'] 
Score: -11.669891357421875 	 ['covid-19' 'disease_species' 'specie'] 
Score: -11.695438385009766 	 ['covid-19' 'disease_species' 'woman'] 
Score: -11.699715614318848 	 ['covid-19' 'disease_species' 'piglet'] 
Score: -11.706042289733887 	 ['covid-19' 'disease_species' 'horse'] 
Score: -11.741148948669434 	 ['covid-19' 'disease_species' 'swine'] 
Score: -11.748517990112305 	 ['covid-19' 'disease_species' 'calf'] 
Score: -11.773248672485352 	 ['covid-19' 'disease_species' 'sheep'] 
Score: -11.791015625 	 ['covid-19' 'disease_species' 'human'] 
Score: -11.791841506958008 	 ['covid-19' 'disease_species' 'people'] 
Score: -11.792166709899902 	 ['covid-

### Query: what are symptoms of the covid-19

In [25]:

triples, scores = query_topn(model, top_n=25, 
                             head=None, 
                             relation='symptom_disease', 
                             tail='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -9.744991302490234 	 ['finding' 'symptom_disease' 'covid-19'] 
Score: -9.843525886535645 	 ['severe diarrhea' 'symptom_disease' 'covid-19'] 
Score: -9.850793838500977 	 ['imbalance' 'symptom_disease' 'covid-19'] 
Score: -9.889549255371094 	 ['discharge' 'symptom_disease' 'covid-19'] 
Score: -9.985763549804688 	 ['symptom' 'symptom_disease' 'covid-19'] 
Score: -10.038444519042969 	 ['discomfort' 'symptom_disease' 'covid-19'] 
Score: -10.122014999389648 	 ['weakness' 'symptom_disease' 'covid-19'] 
Score: -10.127248764038086 	 ['joint pain' 'symptom_disease' 'covid-19'] 
Score: -10.127302169799805 	 ['cyanosis' 'symptom_disease' 'covid-19'] 
Score: -10.161859512329102 	 ['nasal congestion' 'symptom_disease' 'covid-19'] 
Score: -10.248138427734375 	 ['syncope' 'symptom_disease' 'covid-19'] 
Score: -10.281612396240234 	 ['watery diarrhea' 'symptom_disease' 'covid-19'] 
Score: -10.328786849975586 	 ['pyrexia' 'symptom_disease' 'covid-19'] 
Score: -10.328922271728516 	 ['anesthesia' 's

In [26]:
triples, scores = query_topn(model, top_n=25, 
                             tail=None, 
                             relation='disease_symptom', 
                             head='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -10.660646438598633 	 ['covid-19' 'disease_symptom' 'discharge'] 
Score: -10.675355911254883 	 ['covid-19' 'disease_symptom' 'discomfort'] 
Score: -10.728038787841797 	 ['covid-19' 'disease_symptom' 'severe diarrhea'] 
Score: -10.75567626953125 	 ['covid-19' 'disease_symptom' 'finding'] 
Score: -10.75689697265625 	 ['covid-19' 'disease_symptom' 'imbalance'] 
Score: -10.820416450500488 	 ['covid-19' 'disease_symptom' 'organ failure'] 
Score: -10.840065002441406 	 ['covid-19' 'disease_symptom' 'symptom'] 
Score: -10.903345108032227 	 ['covid-19' 'disease_symptom' 'insomnia'] 
Score: -10.933250427246094 	 ['covid-19' 'disease_symptom' 'flaccid paralysis'] 
Score: -10.991533279418945 	 ['covid-19' 'disease_symptom' 'emaciation'] 
Score: -11.03813362121582 	 ['covid-19' 'disease_symptom' 'illness'] 
Score: -11.041977882385254 	 ['covid-19' 'disease_symptom' 'febrile'] 
Score: -11.09693717956543 	 ['covid-19' 'disease_symptom' 'ill'] 
Score: -11.105957984924316 	 ['covid-19' 'disease_

### What are potential drugs to treat COVID-19?

In [32]:

triples, scores = query_topn(model, top_n=30, 
                             head=None, 
                             relation='drug_disease', 
                             tail='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -9.394088745117188 	 ['i' 'drug_disease' 'covid-19'] 
Score: -9.472557067871094 	 ['metformin' 'drug_disease' 'covid-19'] 
Score: -9.535256385803223 	 ['5' 'drug_disease' 'covid-19'] 
Score: -9.555984497070312 	 ['ribavirin' 'drug_disease' 'covid-19'] 
Score: -9.582306861877441 	 ['vitamin d' 'drug_disease' 'covid-19'] 
Score: -9.585634231567383 	 ['melatonin' 'drug_disease' 'covid-19'] 
Score: -9.606701850891113 	 ['chloroquine' 'drug_disease' 'covid-19'] 
Score: -9.611522674560547 	 ['drug' 'drug_disease' 'covid-19'] 
Score: -9.615221977233887 	 ['2' 'drug_disease' 'covid-19'] 
Score: -9.621654510498047 	 ['glycyrrhizin' 'drug_disease' 'covid-19'] 
Score: -9.665054321289062 	 ['β' 'drug_disease' 'covid-19'] 
Score: -9.678360939025879 	 ['curcumin' 'drug_disease' 'covid-19'] 
Score: -9.69880485534668 	 ['corticosteroid' 'drug_disease' 'covid-19'] 
Score: -9.712905883789062 	 ['3' 'drug_disease' 'covid-19'] 
Score: -9.743366241455078 	 ['α' 'drug_disease' 'covid-19'] 
Score: -9.

In [23]:
# Compare the raw call and graph embedding results for the symptoms query

## raw call
#dataset.loc[(dataset['subject']=='symptom')].head(20)

#####==> the result of raw call seems to be better