<a href="https://colab.research.google.com/github/HuyenNguyenHelen/CORD-19-KG/blob/master/covid_19_kg_mining_Stage1_Aug22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preliminaries

In [None]:
# Run this cell only when using Google Colab.

# Select the TensorFlow 1.x line for the Colab runtime
# (the recorded output of the next cell reports version 1.15.2).
%tensorflow_version 1.x

In [None]:
import tensorflow as tf

# Report which TensorFlow version the runtime picked up.
print('TensorFlow  version: {}'.format(tf.__version__))

# Look up the GPU device name; anything other than the first GPU
# ('/device:GPU:0') aborts the cell with an error.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [None]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [None]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    """Print the aggregate ranking metrics for a set of ranks.

    ``ranks`` is handed unchanged to ``mr_score``, ``mrr_score`` and
    ``hits_at_n_score``; one line is printed for the mean rank, one for the
    mean reciprocal rank, and one for Hits@N at each of N = 1, 10 and 100.
    Nothing is returned.
    """
    print('Mean Rank:', mr_score(ranks))
    print('Mean Reciprocal Rank:', mrr_score(ranks))
    # Same three cut-offs as before, reported in ascending order.
    for cutoff in (1, 10, 100):
        print('Hits@{}:'.format(cutoff), hits_at_n_score(ranks, cutoff))

print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.4.0


# 2. Loading a Knowledge Graph dataset

In [None]:
from ampligraph.datasets import load_fb15k_237, load_wn18rr, load_yago3_10
import io
import requests

In [None]:
# Download the triples CSV from GitHub and keep only the three triple columns.

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() makes an HTTP error (e.g. 404) fail here instead of the
# error page being parsed as CSV and surfacing later as a missing-column error.
response = requests.get('https://raw.githubusercontent.com/HuyenNguyenHelen/CORD-19-KG/master/final_triples_with_predefined_relations_cleaned_stage1-Aug22.csv', timeout=60)
response.raise_for_status()
re_pdf = response.content
dataset = pd.read_csv(io.StringIO(re_pdf.decode('utf-8')))
dataset = dataset[['subject','new_relation', 'object' ]]
dataset.head(5)

Unnamed: 0,subject,new_relation,object
0,japan,species_species,human virus
1,serum,species_species,dog
2,apoptosis,gene_gene,anti-spike protein antibody
3,symptom,species_species,patient
4,aerosol,species_species,guinea pig


## 2.1. Pre-processing data

In [None]:
# Report the dataset size and the number of missing cells, drop every row that
# has a missing value, then report the size again.
# The first message used to be labelled "after" although it runs before dropna().
print('before dropping null, length of dataset: ', len(dataset))
print('# of null values: ', dataset.isnull().sum().sum())
dataset = dataset.dropna()
print('after dropping null, length of dataset: ', len(dataset))

after droppingg null, length of dataset:  14165
# of null values:  1062
after droppingg null, length of dataset:  13137


In [None]:
# Show the triples whose object is exactly 'coronavirus'.
dataset.loc[dataset['object'] == 'coronavirus']

Unnamed: 0,subject,new_relation,object
200,discovery,species_disease,coronavirus
246,recognition,species_disease,coronavirus
331,bat,species_disease,coronavirus
387,advance,species_disease,coronavirus
484,covs,species_disease,coronavirus
...,...,...,...
13635,syndrome,species_disease,coronavirus
13746,tgev,species_disease,coronavirus
13823,peplomers,species_disease,coronavirus
14015,rna virus,species_disease,coronavirus


In [None]:
# Rename the entity 'coronavirus' to 'covid-19' wherever it is the whole
# subject or object value; every other value is left untouched.
for column in ('subject', 'object'):
    dataset[column] = dataset[column].replace('coronavirus', 'covid-19')

In [None]:
# dataset.shape is (rows, columns); each row holds one subject/relation/object triple.
kg_shape = dataset.shape
print('Total triples in the KG:', kg_shape)

Total triples in the KG: (13137, 3)


In [None]:
# Count how many triples use each value of the 'new_relation' column.
relation_counts = dataset['new_relation'].value_counts()
print('the number of relations: \n', relation_counts)

the number of relations: 
 disease_disease    5924
gene_gene          2814
species_species    1954
species_disease     759
gene_disease        728
drug_drug           605
species_gene        315
gene_drug            38
Name: new_relation, dtype: int64


## 2.2 Create training, validation and test splits

In [None]:
from ampligraph.evaluation import train_test_split_no_unseen

# Step 1: hold out 500 triples as the validation set; test_train keeps the rest.
test_train, X_valid = train_test_split_no_unseen(dataset.values, 500, seed=0)

# Step 2: hold out 1000 of the remaining triples as the test set;
# whatever is left becomes the training set.
X_train, X_test = train_test_split_no_unseen(test_train, 1000, seed=0)

# Report the shape of the full dataset and of each split.
for label, split in (('Total triples:', dataset),
                     ('Size of train:', X_train),
                     ('Size of valid:', X_valid),
                     ('Size of test:', X_test)):
    print(label, split.shape)

Total triples: (13137, 3)
Size of train: (11637, 3)
Size of valid: (500, 3)
Size of test: (1000, 3)


# 3. Model Training

In [None]:
# This cell trains a TransE model on the training split, saves it to disk and
# reloads it, then ranks the test triples and prints aggregate metrics.
from ampligraph.latent_features import TransE

# Train a KGE model
# NOTE(review): per the AmpliGraph API, k is presumably the embedding size and
# eta the number of negatives generated per positive — confirm against the
# documentation of the installed version.
model = TransE(k=300, 
               epochs=100, 
               eta=1, 
               loss='multiclass_nll', 
               initializer='xavier', initializer_params={'uniform': False},
               regularizer='LP', regularizer_params= {'lambda': 0.001, 'p': 3},
               optimizer= 'adam', optimizer_params= {'lr': 0.0001}, 
               seed= 0, batches_count= 10, verbose=True)

# Only X_train is passed to fit(); X_valid is not used during training in this
# cell (it only contributes to the evaluation filter below).
model.fit(X_train)
# ----------------------
# Evaluate: 
# Filtered evaluation with ranking strategy assigning worst rank to break ties

# Persist the trained model to 'TransE.pkl' and immediately reload it, so the
# evaluation below runs on the restored copy.
from ampligraph.utils import save_model, restore_model
save_model(model, 'TransE.pkl')
model = restore_model('TransE.pkl')

# create the filter 
# The filter stacks all three splits row-wise (axis 0), presumably so that
# corruptions which are themselves known triples are not ranked as negatives — confirm.
X_filter = np.concatenate([X_train, X_valid, X_test], 0)

# compute ranks
ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=X_filter)

# ranks are computed per triple
# The recorded output shows shape (1000, 2) for 1000 test triples — presumably
# one rank per corruption side (subject and object); confirm.
print('Test set:', X_test.shape)
print('Size of ranks:', ranks.shape)

# Aggregate metrics show the aggregate performance of the model on the test set using a single number
display_aggregate_metrics(ranks)
# ----------------------

Average TransE Loss:   0.061970: 100%|██████████| 100/100 [00:03<00:00, 28.01epoch/s]
100%|██████████| 1000/1000 [00:03<00:00, 332.88it/s]


Test set: (1000, 3)
Size of ranks: (1000, 2)
Mean Rank: 263.0615
Mean Reciprocal Rank: 0.18231403467250384
Hits@1: 0.116
Hits@10: 0.327
Hits@100: 0.6075


# 4. Knowledge Discovery

### Relations

In [None]:
# Relations
# NOTE(review): this list does not match the relations reported by
# value_counts() on `new_relation` earlier in the notebook (eight relations such
# as 'species_disease' and 'drug_drug'); names like 'disease_species' and
# 'treat_procedure_disease' appear only here. Presumably it is left over from a
# different version of the dataset — confirm before relying on it for queries.
'''
disease_disease                    87479
gene_gene                          25101
disease_species                    21152
gene_disease                       17414
disease_gene                       17383
treat_procedure_disease            15221
disease_treat_procedure            14696
treat_procedure_treat_procedure    13533
treat_procedure_gene                8913
gene_treat_procedure                8913
treat_procedure_species             5307
gene_species                        3894
disease_symptom                     2501
symptom_disease                     1409
treat_procedure_symptom              391
gene_symptom                         310
symptom_species                      253
symptom_treat_procedure              211
symptom_gene                         170
symptom_symptom                      141
'''

##  Question - Answering

``` 
    <head, relation, ?> 
    <head, ?,        tail>
    <?,    relation, tail>
```

### Query: What are the possible therapeutic procedures for Covid-19? (not supported)

### Query: What are related diseases of Covid-19

In [None]:
# Query pattern <covid-19, disease_disease, ?>: head and relation are fixed and
# the tail is left open; top_n=15 completions are printed with their scores.
triples, scores = query_topn(
    model,
    top_n=15,
    head='covid-19',
    relation='disease_disease',
    tail=None,
    ents_to_consider=None,
    rels_to_consider=None,
)

row_format = 'Score: {} \t {} '
for triple, score in zip(triples, scores):
    print(row_format.format(score, triple))

Score: -13.649116516113281 	 ['covid-19' 'disease_disease' 'major'] 
Score: -13.85350227355957 	 ['covid-19' 'disease_disease' 'contagious'] 
Score: -13.855691909790039 	 ['covid-19' 'disease_disease' 'respiratory distress syndrome'] 
Score: -14.060480117797852 	 ['covid-19' 'disease_disease' 'critical'] 
Score: -14.180242538452148 	 ['covid-19' 'disease_disease' 'emerging'] 
Score: -14.296860694885254 	 ['covid-19' 'disease_disease'
 'severe acute respiratory syndrome coronavirus 2'] 
Score: -14.348396301269531 	 ['covid-19' 'disease_disease' 'cold'] 
Score: -14.434900283813477 	 ['covid-19' 'disease_disease' 'blood'] 
Score: -14.478034019470215 	 ['covid-19' 'disease_disease' 'distress syndrome'] 
Score: -14.654759407043457 	 ['covid-19' 'disease_disease' 'lung injury'] 
Score: -14.72463607788086 	 ['covid-19' 'disease_disease' 'shock'] 
Score: -14.751083374023438 	 ['covid-19' 'disease_disease' 'transmissible'] 
Score: -14.76009750366211 	 ['covid-19' 'disease_disease' 'autoimmunity

In [None]:
# Query pattern <?, disease_disease, covid-19>: relation and tail are fixed and
# the head entity is left open (the relation itself is not being predicted);
# top_n=15 completions are printed with their scores.
triples, scores = query_topn(
    model,
    top_n=15,
    head=None,
    relation='disease_disease',
    tail='covid-19',
    ents_to_consider=None,
    rels_to_consider=None,
)

row_format = 'Score: {} \t {} '
for triple, score in zip(triples, scores):
    print(row_format.format(score, triple))

Score: -16.97625732421875 	 ['palivizumab' 'disease_disease' 'covid-19'] 
Score: -17.022220611572266 	 ['parainfluenza virus' 'disease_disease' 'covid-19'] 
Score: -17.377275466918945 	 ['sars' 'disease_disease' 'covid-19'] 
Score: -17.41254425048828 	 ['fowl typhoid' 'disease_disease' 'covid-19'] 
Score: -17.469900131225586 	 ['world health organization' 'disease_disease' 'covid-19'] 
Score: -17.508962631225586 	 ['covid-19' 'disease_disease' 'covid-19'] 
Score: -17.531511306762695 	 ['mhv-2' 'disease_disease' 'covid-19'] 
Score: -17.576393127441406 	 ['model' 'disease_disease' 'covid-19'] 
Score: -17.576770782470703 	 ['age' 'disease_disease' 'covid-19'] 
Score: -17.673561096191406 	 ['syk' 'disease_disease' 'covid-19'] 
Score: -17.684120178222656 	 ['using' 'disease_disease' 'covid-19'] 
Score: -17.710796356201172 	 ['route' 'disease_disease' 'covid-19'] 
Score: -17.713205337524414 	 ['birnavirus' 'disease_disease' 'covid-19'] 
Score: -17.722515106201172 	 ['pathogen' 'disease_disea

### Query: What are the genes/proteins related to Covid-19

In [None]:
# Not supported
# triples, scores = query_topn(model, top_n=25, 
#                              head='covid-19', 
#                              relation='disease_gene', 
#                              tail=None, 
#                              ents_to_consider=None, 
#                              rels_to_consider=None)

# for triple, score in zip(triples, scores):
#     print('Score: {} \t {} '.format(score, triple))

In [None]:
# Query pattern <?, gene_disease, covid-19>: relation and tail are fixed and
# the head entity is left open (the relation itself is not being predicted);
# top_n=30 completions are printed with their scores.
triples, scores = query_topn(
    model,
    top_n=30,
    head=None,
    relation='gene_disease',
    tail='covid-19',
    ents_to_consider=None,
    rels_to_consider=None,
)

row_format = 'Score: {} \t {} '
for triple, score in zip(triples, scores):
    print(row_format.format(score, triple))

Score: -15.425344467163086 	 ['pcr' 'gene_disease' 'covid-19'] 
Score: -15.43449592590332 	 ['covid-19' 'gene_disease' 'covid-19'] 
Score: -15.639074325561523 	 ['gp5' 'gene_disease' 'covid-19'] 
Score: -15.814078330993652 	 ['apobec3g' 'gene_disease' 'covid-19'] 
Score: -15.967984199523926 	 ['ccl3' 'gene_disease' 'covid-19'] 
Score: -16.112394332885742 	 ['sv40' 'gene_disease' 'covid-19'] 
Score: -16.16383934020996 	 ['influenza' 'gene_disease' 'covid-19'] 
Score: -16.17230987548828 	 ['seasonal influenza' 'gene_disease' 'covid-19'] 
Score: -16.204219818115234 	 ['necrosis' 'gene_disease' 'covid-19'] 
Score: -16.228910446166992 	 ['mif' 'gene_disease' 'covid-19'] 
Score: -16.266868591308594 	 ['camel' 'gene_disease' 'covid-19'] 
Score: -16.316211700439453 	 ['pyvs' 'gene_disease' 'covid-19'] 
Score: -16.334028244018555 	 ['triangle' 'gene_disease' 'covid-19'] 
Score: -16.365985870361328 	 ['vzv' 'gene_disease' 'covid-19'] 
Score: -16.38062286376953 	 ['disulfiram' 'gene_disease' 'cov

### Query: What species transmit the covid-19

In [None]:
# Not supported
# triples, scores = query_topn(model, top_n=20, 
#                              tail=None, 
#                              relation='disease_species', 
#                              head='covid-19', 
#                              ents_to_consider=None, 
#                              rels_to_consider=None)

# for triple, score in zip(triples, scores):
#     print('Score: {} \t {} '.format(score, triple))

In [None]:
# Query pattern <?, species_disease, covid-19>: relation and tail are fixed and
# the head entity is left open (the relation itself is not being predicted);
# top_n=30 completions are printed with their scores.
triples, scores = query_topn(
    model,
    top_n=30,
    head=None,
    relation='species_disease',
    tail='covid-19',
    ents_to_consider=None,
    rels_to_consider=None,
)

row_format = 'Score: {} \t {} '
for triple, score in zip(triples, scores):
    print(row_format.format(score, triple))

Score: -12.61575698852539 	 ['zbcov' 'species_disease' 'covid-19'] 
Score: -12.619173049926758 	 ['tgev' 'species_disease' 'covid-19'] 
Score: -13.056673049926758 	 ['peritonitis' 'species_disease' 'covid-19'] 
Score: -13.075456619262695 	 ['recognition' 'species_disease' 'covid-19'] 
Score: -13.133610725402832 	 ['advance' 'species_disease' 'covid-19'] 
Score: -13.256117820739746 	 ['nsp8 protein' 'species_disease' 'covid-19'] 
Score: -13.354997634887695 	 ['rt-pcr/esi-ms' 'species_disease' 'covid-19'] 
Score: -13.385965347290039 	 ['nendou' 'species_disease' 'covid-19'] 
Score: -13.399580001831055 	 ['peplomers' 'species_disease' 'covid-19'] 
Score: -13.437256813049316 	 ['specimen' 'species_disease' 'covid-19'] 
Score: -13.584897994995117 	 ['smith' 'species_disease' 'covid-19'] 
Score: -13.590124130249023 	 ['rna virus' 'species_disease' 'covid-19'] 
Score: -13.628859519958496 	 ['fifo orf' 'species_disease' 'covid-19'] 
Score: -13.648971557617188 	 ['replicase polyproteins' 'speci

### Query: what are symptoms of the covid-19

In [None]:
## ==> Not supported
# triples, scores = query_topn(model, top_n=20, 
#                              head=None, 
#                              relation='symptom_disease', 
#                              tail='coronavirus', 
#                              ents_to_consider=None, 
#                              rels_to_consider=None)

# for triple, score in zip(triples, scores):
#     print('Score: {} \t {} '.format(score, triple))

In [None]:
# triples, scores = query_topn(model, top_n=20, 
#                              tail=None, 
#                              relation='disease_symptom', 
#                              head='coronavirus', 
#                              ents_to_consider=None, 
#                              rels_to_consider=None)

# for triple, score in zip(triples, scores):
#     print('Score: {} \t {} '.format(score, triple))

### Query: Potential Drug

In [None]:
# `drug_disease` is not supported
# triples, scores = query_topn(model, top_n=20, 
#                              head=None, 
#                              relation='drug_disease', 
#                              tail='covid-19', 
#                              ents_to_consider=None, 
#                              rels_to_consider=None)

# for triple, score in zip(triples, scores):
#     print('Score: {} \t {} '.format(score, triple))

In [None]:
# `disease_drug` is not supported
# triples, scores = query_topn(model, top_n=20, 
#                              tail=None, 
#                              relation='disease_drug', 
#                              head='covid-19', 
#                              ents_to_consider=None, 
#                              rels_to_consider=None)

# for triple, score in zip(triples, scores):
#     print('Score: {} \t {} '.format(score, triple))