<a href="https://colab.research.google.com/github/Kate5-7-2021/Kate_INFO5731_Spring2021/blob/main/covid_19_kg_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preliminaries

In [None]:
# If using Google Colab run this cell 

# select tensorflow version for colab 
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
import tensorflow as tf 

print('TensorFlow  version: {}'.format(tf.__version__))

# Get the GPU name
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [None]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [None]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    print('Mean Rank:', mr_score(ranks)) 
    print('Mean Reciprocal Rank:', mrr_score(ranks)) 
    print('Hits@1:', hits_at_n_score(ranks, 1))
    print('Hits@10:', hits_at_n_score(ranks, 10))
    print('Hits@100:', hits_at_n_score(ranks, 100))

print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.4.0


# 2. Loading a Knowledge Graph dataset

In [None]:
from ampligraph.datasets import load_fb15k_237, load_wn18rr, load_yago3_10

In [None]:
import pandas as pd
pmc = pd.read_csv('/content/new_triples_with_predefined_relations_pmc_June10_.csv')
pdf = pd.read_csv('/content/new_triples_with_predefined_relations_pdf_June10_.csv')
dataset = pd.concat([pmc, pdf])
print(dataset.columns)
dataset.drop(columns='Unnamed: 0', inplace=True)
#dataset.columns = ['subject',  'object', 'predicate',]
dataset = dataset[['subject','new_relation', 'object' ]]
dataset.head(5)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'subject', 'object', 'new_relation'], dtype='object')


Unnamed: 0,subject,new_relation,object
0,dysfunction,disease_species,child
1,et dysfunction,disease_species,child
2,et dysfunction,disease_species,child
3,rsv,disease_disease,virus
4,treatment,treat_procedure_species,child


In [None]:
print('Total triples in the KG:', dataset.shape)

Total triples in the KG: (244392, 3)


In [None]:
print('the number of relations: \n', dataset['new_relation'].value_counts())

the number of relations: 
 disease_disease                    87479
gene_gene                          25101
disease_species                    21152
gene_disease                       17414
disease_gene                       17383
treat_procedure_disease            15221
disease_treat_procedure            14696
treat_procedure_treat_procedure    13533
treat_procedure_gene                8913
gene_treat_procedure                8913
treat_procedure_species             5307
gene_species                        3894
disease_symptom                     2501
symptom_disease                     1409
treat_procedure_symptom              391
gene_symptom                         310
symptom_species                      253
symptom_treat_procedure              211
symptom_gene                         170
symptom_symptom                      141
Name: new_relation, dtype: int64


## 2.1 Create training, validation and test splits

In [None]:
from ampligraph.evaluation import train_test_split_no_unseen
# get the validation set of size 500
test_train, X_valid = train_test_split_no_unseen(dataset.values, 500, seed=0)

# get the test set of size 1000 from the remaining triples
X_train, X_test = train_test_split_no_unseen(test_train, 1000, seed=0)

print('Total triples:', dataset.shape)
print('Size of train:', X_train.shape)
print('Size of valid:', X_valid.shape)
print('Size of test:', X_test.shape)

Total triples: (244392, 3)
Size of train: (242892, 3)
Size of valid: (500, 3)
Size of test: (1000, 3)


# 3. Model Training

In [None]:
from ampligraph.latent_features import TransE

# Train a KGE model
model = TransE(k=300, 
               epochs=100, 
               eta=1, 
               loss='multiclass_nll', 
               initializer='xavier', initializer_params={'uniform': False},
               regularizer='LP', regularizer_params= {'lambda': 0.001, 'p': 3},
               optimizer= 'adam', optimizer_params= {'lr': 0.0001}, 
               seed= 0, batches_count= 10, verbose=True)

model.fit(X_train)
# ----------------------
# Evaluate: 
# Filtered evaluation with ranking strategy assigning worst rank to break ties

from ampligraph.utils import save_model, restore_model
save_model(model, 'TransE.pkl')
model = restore_model('TransE.pkl')

# create the filter 
X_filter = np.concatenate([X_train, X_valid, X_test], 0)

# compute ranks
ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=X_filter)

# ranks are computed per triple
print('Test set:', X_test.shape)
print('Size of ranks:', ranks.shape)

# Aggregate metrics show the aggregate performance of the model on the test set using a single number
display_aggregate_metrics(ranks)
# ----------------------

Average TransE Loss:   0.045511: 100%|██████████| 100/100 [00:12<00:00,  7.96epoch/s]
100%|██████████| 1000/1000 [00:06<00:00, 162.91it/s]


Test set: (1000, 3)
Size of ranks: (1000, 2)
Mean Rank: 2313.499
Mean Reciprocal Rank: 0.06883078223695022
Hits@1: 0.039
Hits@10: 0.12
Hits@100: 0.3025


# 4. Knowledge Discovery

### Relations

In [None]:
# Relations
'''
disease_disease                    87479
gene_gene                          25101
disease_species                    21152
gene_disease                       17414
disease_gene                       17383
treat_procedure_disease            15221
disease_treat_procedure            14696
treat_procedure_treat_procedure    13533
treat_procedure_gene                8913
gene_treat_procedure                8913
treat_procedure_species             5307
gene_species                        3894
disease_symptom                     2501
symptom_disease                     1409
treat_procedure_symptom              391
gene_symptom                         310
symptom_species                      253
symptom_treat_procedure              211
symptom_gene                         170
symptom_symptom                      141
'''

'\ndisease_disease            87479\ngene_gene                  25101\ngene_disease               17414\ndisease_gene               17383\ndisease_treat_procedure    14696\ngene_treat_procedure        8913\ndisease_symptom             2501\nsymptom_disease             1409\ngene_symptom                 310\nsymptom_treat_procedure      211\nsymptom_gene                 170\nsymptom_symptom              141\nName: new_relation, dtype: int64\n'

##  Question - Answering

``` 
    <head, relation, ?> 
    <head, ?,        tail>
    <?,    relation, tail>
```

### Query: what are the possible therapeutic procedure of covid-19??

In [None]:
# Query: what are the possible therapeutic procedure of covid-19??

from ampligraph.discovery import query_topn

# restore the previously saved model to save time
model = restore_model('TransE.pkl')

triples, scores = query_topn(model, top_n=20, 
                             head='covid-19', 
                             relation='disease_treat_procedure', 
                             tail=None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -11.44215202331543 	 ['covid-19' 'disease_treat_procedure' 'detected'] 
Score: -11.461198806762695 	 ['covid-19' 'disease_treat_procedure' 'extracted'] 
Score: -11.561827659606934 	 ['covid-19' 'disease_treat_procedure' 'transfusion'] 
Score: -11.609912872314453 	 ['covid-19' 'disease_treat_procedure' 'adjustment'] 
Score: -11.703495979309082 	 ['covid-19' 'disease_treat_procedure' 'stem cell transplantation'] 
Score: -11.722541809082031 	 ['covid-19' 'disease_treat_procedure' 'mechanical ventilation'] 
Score: -11.766433715820312 	 ['covid-19' 'disease_treat_procedure' 'infiltration'] 
Score: -11.784860610961914 	 ['covid-19' 'disease_treat_procedure' '5'] 
Score: -11.820601463317871 	 ['covid-19' 'disease_treat_procedure' 'sialic acid'] 
Score: -11.857441902160645 	 ['covid-19' 'disease_treat_procedure' 'corticosteroid therapy'] 
Score: -11.859466552734375 	 ['covid-19' 'disease_treat_procedure' 'lead'] 
Score: -11.876775741577148 	 ['covid-19' 'disease_treat_procedure' 'oxygen

In [None]:
# Query: what are the possible therapeutic procedure of covid-19??

from ampligraph.discovery import query_topn

# restore the previously saved model to save time
model = restore_model('TransE.pkl')

triples, scores = query_topn(model, top_n=20, 
                             head=None, 
                             relation='treat_procedure_disease', 
                             tail='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -13.19991683959961 	 ['glycyrrhizin' 'treat_procedure_disease' 'covid-19'] 
Score: -13.286637306213379 	 ['nippv' 'treat_procedure_disease' 'covid-19'] 
Score: -13.29110336303711 	 ['combined therapy' 'treat_procedure_disease' 'covid-19'] 
Score: -13.318893432617188 	 ['melatonin' 'treat_procedure_disease' 'covid-19'] 
Score: -13.361358642578125 	 ['cr3022' 'treat_procedure_disease' 'covid-19'] 
Score: -13.365716934204102 	 ['hcov' 'treat_procedure_disease' 'covid-19'] 
Score: -13.443920135498047 	 ['ablation' 'treat_procedure_disease' 'covid-19'] 
Score: -13.482229232788086 	 ['ci' 'treat_procedure_disease' 'covid-19'] 
Score: -13.483879089355469 	 ['pct' 'treat_procedure_disease' 'covid-19'] 
Score: -13.50535774230957 	 ['aspiration' 'treat_procedure_disease' 'covid-19'] 
Score: -13.521445274353027 	 ['influenza vaccination' 'treat_procedure_disease' 'covid-19'] 
Score: -13.527620315551758 	 ['rimantadine' 'treat_procedure_disease' 'covid-19'] 
Score: -13.532459259033203 	 ['v

### Query: What are related diseases of Covid-19

In [None]:
# Predict relation

triples, scores = query_topn(model, top_n=20, 
                             head='covid-19', 
                             relation='disease_disease', 
                             tail= None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -14.012584686279297 	 ['covid-19' 'disease_disease' 'bat coronavirus'] 
Score: -14.041549682617188 	 ['covid-19' 'disease_disease' 'childhood'] 
Score: -14.055595397949219 	 ['covid-19' 'disease_disease' 'fatal'] 
Score: -14.07307243347168 	 ['covid-19' 'disease_disease' 'shortness of breath'] 
Score: -14.091279983520508 	 ['covid-19' 'disease_disease' 'healthcare associated infection'] 
Score: -14.094403266906738 	 ['covid-19' 'disease_disease' 'long'] 
Score: -14.111207962036133 	 ['covid-19' 'disease_disease' 'tract illness'] 
Score: -14.276697158813477 	 ['covid-19' 'disease_disease' 'bovine leukemia virus'] 
Score: -14.285184860229492 	 ['covid-19' 'disease_disease' 'acute lower respiratory tract infection'] 
Score: -14.370065689086914 	 ['covid-19' 'disease_disease' 'arti'] 
Score: -14.377974510192871 	 ['covid-19' 'disease_disease' 'liver transplantation'] 
Score: -14.383066177368164 	 ['covid-19' 'disease_disease' 'bronchointerstitial pneumonia'] 
Score: -14.407201766967

In [None]:
# Predict relation

triples, scores = query_topn(model, top_n=20, 
                             head=None, 
                             relation='disease_disease', 
                             tail= 'covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -18.146244049072266 	 ['covid-19' 'disease_disease' 'covid-19'] 
Score: -18.577428817749023 	 ['canine rabies' 'disease_disease' 'covid-19'] 
Score: -18.679847717285156 	 ['flea' 'disease_disease' 'covid-19'] 
Score: -18.714982986450195 	 ['coli' 'disease_disease' 'covid-19'] 
Score: -18.718536376953125 	 ['ebola fever' 'disease_disease' 'covid-19'] 
Score: -18.72646713256836 	 ['denvs' 'disease_disease' 'covid-19'] 
Score: -18.733434677124023 	 ['hypogonadism' 'disease_disease' 'covid-19'] 
Score: -18.772445678710938 	 ['nasal stuffiness' 'disease_disease' 'covid-19'] 
Score: -18.787410736083984 	 ['facial paralysis' 'disease_disease' 'covid-19'] 
Score: -18.807798385620117 	 ['frailty' 'disease_disease' 'covid-19'] 
Score: -18.869657516479492 	 ['adenoviral pneumonia' 'disease_disease' 'covid-19'] 
Score: -18.890518188476562 	 ['intestinal metaplasia' 'disease_disease' 'covid-19'] 
Score: -18.92822265625 	 ['avian influenza a virus' 'disease_disease' 'covid-19'] 
Score: -18.92

### Query: What are the genes/proteins related to Covid-19

In [None]:
# Predict relation

triples, scores = query_topn(model, top_n=25, 
                             head='covid-19', 
                             relation='disease_gene', 
                             tail=None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -12.417631149291992 	 ['covid-19' 'disease_gene' 'spring'] 
Score: -12.54544734954834 	 ['covid-19' 'disease_gene' 'lobe'] 
Score: -12.692500114440918 	 ['covid-19' 'disease_gene' 'end'] 
Score: -12.694246292114258 	 ['covid-19' 'disease_gene' 'host cell'] 
Score: -12.70892333984375 	 ['covid-19' 'disease_gene' 'past'] 
Score: -12.710649490356445 	 ['covid-19' 'disease_gene' 'rare'] 
Score: -12.768430709838867 	 ['covid-19' 'disease_gene' 'enterocytes'] 
Score: -12.786975860595703 	 ['covid-19' 'disease_gene' 'small'] 
Score: -12.788283348083496 	 ['covid-19' 'disease_gene' 'airway'] 
Score: -12.85934066772461 	 ['covid-19' 'disease_gene' 'open'] 
Score: -12.885322570800781 	 ['covid-19' 'disease_gene' '12'] 
Score: -12.907169342041016 	 ['covid-19' 'disease_gene' 'large'] 
Score: -12.921226501464844 	 ['covid-19' 'disease_gene' 'killer'] 
Score: -12.998534202575684 	 ['covid-19' 'disease_gene' 'surface glycoprotein'] 
Score: -13.003454208374023 	 ['covid-19' 'disease_gene' 'gen

In [None]:
# Predict relation

triples, scores = query_topn(model, top_n=25, 
                             head= None, 
                             relation='gene_disease', 
                             tail='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -13.549598693847656 	 ['nl63' 'gene_disease' 'covid-19'] 
Score: -13.551021575927734 	 ['aim' 'gene_disease' 'covid-19'] 
Score: -13.60186767578125 	 ['rnase l' 'gene_disease' 'covid-19'] 
Score: -13.632120132446289 	 ['z' 'gene_disease' 'covid-19'] 
Score: -13.664467811584473 	 ['mbl' 'gene_disease' 'covid-19'] 
Score: -13.690570831298828 	 ['trpv4' 'gene_disease' 'covid-19'] 
Score: -13.731022834777832 	 ['cxcl10' 'gene_disease' 'covid-19'] 
Score: -13.74756145477295 	 ['sv40' 'gene_disease' 'covid-19'] 
Score: -13.79036808013916 	 ['n protein' 'gene_disease' 'covid-19'] 
Score: -13.862615585327148 	 ['ifit1' 'gene_disease' 'covid-19'] 
Score: -13.862825393676758 	 ['tnf' 'gene_disease' 'covid-19'] 
Score: -13.8679780960083 	 ['sirna' 'gene_disease' 'covid-19'] 
Score: -13.878103256225586 	 ['cxcl13' 'gene_disease' 'covid-19'] 
Score: -13.882623672485352 	 ['mabs' 'gene_disease' 'covid-19'] 
Score: -13.885960578918457 	 ['cd4' 'gene_disease' 'covid-19'] 
Score: -13.90187644958

### Query: What species transmit the covid-19

In [None]:
# Predict relation
triples, scores = query_topn(model, top_n=20, 
                             tail=None, 
                             relation='disease_species', 
                             head='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -11.919681549072266 	 ['covid-19' 'disease_species' 'cattle'] 
Score: -11.977020263671875 	 ['covid-19' 'disease_species' 'calf'] 
Score: -12.107694625854492 	 ['covid-19' 'disease_species' 'person'] 
Score: -12.145956993103027 	 ['covid-19' 'disease_species' 'infant'] 
Score: -12.186727523803711 	 ['covid-19' 'disease_species' 'sheep'] 
Score: -12.196048736572266 	 ['covid-19' 'disease_species' 'man'] 
Score: -12.223149299621582 	 ['covid-19' 'disease_species' 'rabbit'] 
Score: -12.22695255279541 	 ['covid-19' 'disease_species' 'child'] 
Score: -12.234914779663086 	 ['covid-19' 'disease_species' 'hamster'] 
Score: -12.249801635742188 	 ['covid-19' 'disease_species' 'patient'] 
Score: -12.26108169555664 	 ['covid-19' 'disease_species' 'duck'] 
Score: -12.308273315429688 	 ['covid-19' 'disease_species' 'vector'] 
Score: -12.311029434204102 	 ['covid-19' 'disease_species' 'macaque'] 
Score: -12.330013275146484 	 ['covid-19' 'disease_species' 'woman'] 
Score: -12.344823837280273 	 

### Query: what are symptoms of the covid-19

In [None]:

triples, scores = query_topn(model, top_n=20, 
                             head=None, 
                             relation='symptom_disease', 
                             tail='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -11.760704040527344 	 ['chest tightness' 'symptom_disease' 'covid-19'] 
Score: -11.777921676635742 	 ['severe diarrhea' 'symptom_disease' 'covid-19'] 
Score: -11.908426284790039 	 ['abdominal discomfort' 'symptom_disease' 'covid-19'] 
Score: -11.936378479003906 	 ['hoarseness' 'symptom_disease' 'covid-19'] 
Score: -12.027870178222656 	 ['halitosis' 'symptom_disease' 'covid-19'] 
Score: -12.094209671020508 	 ['rigor' 'symptom_disease' 'covid-19'] 
Score: -12.115575790405273 	 ['agitation' 'symptom_disease' 'covid-19'] 
Score: -12.142049789428711 	 ['intermittent fever' 'symptom_disease' 'covid-19'] 
Score: -12.16540813446045 	 ['prostration' 'symptom_disease' 'covid-19'] 
Score: -12.186314582824707 	 ['hyposmia' 'symptom_disease' 'covid-19'] 
Score: -12.281648635864258 	 ['persistent cough' 'symptom_disease' 'covid-19'] 
Score: -12.295584678649902 	 ['non-productive cough' 'symptom_disease' 'covid-19'] 
Score: -12.311198234558105 	 ['anesthesia' 'symptom_disease' 'covid-19'] 
Sco

In [None]:
triples, scores = query_topn(model, top_n=20, 
                             tail=None, 
                             relation='disease_symptom', 
                             head='covid-19', 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))

Score: -9.864557266235352 	 ['covid-19' 'disease_symptom' 'symptom'] 
Score: -9.908804893493652 	 ['covid-19' 'disease_symptom' 'discharge'] 
Score: -10.240966796875 	 ['covid-19' 'disease_symptom' 'cyanosis'] 
Score: -10.352489471435547 	 ['covid-19' 'disease_symptom' 'discomfort'] 
Score: -10.36842155456543 	 ['covid-19' 'disease_symptom' 'emaciation'] 
Score: -10.401368141174316 	 ['covid-19' 'disease_symptom' 'illness'] 
Score: -10.485641479492188 	 ['covid-19' 'disease_symptom' 'disorientation'] 
Score: -10.514070510864258 	 ['covid-19' 'disease_symptom' 'imbalance'] 
Score: -10.541074752807617 	 ['covid-19' 'disease_symptom' 'ill'] 
Score: -10.54233455657959 	 ['covid-19' 'disease_symptom' 'finding'] 
Score: -10.548297882080078 	 ['covid-19' 'disease_symptom' 'ageusia'] 
Score: -10.575578689575195 	 ['covid-19' 'disease_symptom' 'insomnia'] 
Score: -10.577934265136719 	 ['covid-19' 'disease_symptom' 'muscle pain'] 
Score: -10.705485343933105 	 ['covid-19' 'disease_symptom' 'sever

In [None]:
# Compare the raw call and graph embedding results for the symptoms query

## raw call
#dataset.loc[(dataset['subject']=='symptom')].head(20)

#####==> the result of raw call seems to be better

In [None]:
##graph embedding
triples, scores = query_topn(model, top_n=20, 
                             head=, 
                             relation='be_related_with', 
                             tail=None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))