<a href="https://colab.research.google.com/github/HuyenNguyenHelen/CORD-19-KG/blob/master/covid_19_kg_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preliminaries

In [None]:
# If using Google Colab run this cell (it relies on a Colab-specific magic).

# Select the TensorFlow 1.x runtime for Colab; the outputs recorded in this
# notebook were produced with TensorFlow 1.15.2.
%tensorflow_version 1.x

In [None]:
import tensorflow as tf 

# Report which TensorFlow build the runtime actually loaded.
print('TensorFlow  version: {}'.format(tf.__version__))

# Ask TensorFlow for the GPU device name and stop the notebook early
# unless it is exactly the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [None]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [None]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    """Print the aggregate ranking metrics for a set of evaluation ranks.

    Prints, in this order: the mean rank, the mean reciprocal rank and
    Hits@N for N = 1, 10 and 100. Each value is computed by the matching
    AmpliGraph helper (mr_score, mrr_score, hits_at_n_score).

    Parameters
    ----------
    ranks
        Ranks forwarded unchanged to the scoring helpers; in this notebook
        it is the array returned by evaluate_performance.

    Returns
    -------
    None
        The metrics are only printed.
    """
    print('Mean Rank:', mr_score(ranks))
    print('Mean Reciprocal Rank:', mrr_score(ranks))
    # Label / cutoff pairs for the Hits@N family, reported in increasing N.
    hits_report = (('Hits@1:', 1), ('Hits@10:', 10), ('Hits@100:', 100))
    for hits_label, cutoff in hits_report:
        print(hits_label, hits_at_n_score(ranks, cutoff))

print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.4.0


# 2. Loading a Knowledge Graph dataset

In [None]:
from ampligraph.datasets import load_fb15k_237, load_wn18rr, load_yago3_10

In [None]:
import pandas as pd

# Location of the triples CSV inside the Colab runtime.
TRIPLES_CSV = '/content/final_triples_with_predefined_relations_cleaned_stage1.csv'

# Keep the frame exactly as read from disk under its own name and show which
# columns the file provides (it also carries leftover index columns).
raw_triples = pd.read_csv(TRIPLES_CSV)
print(raw_triples.columns)

# Build the knowledge-graph frame by selecting only the three triple columns,
# in (subject, relation, object) order. Selecting the needed columns already
# discards the leftover index columns, so no separate in-place drop is needed
# (the drop would raise KeyError if the CSV had been saved without an index).
dataset = raw_triples[['subject', 'new_relation', 'object']]
dataset.head(5)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'subject', 'object', 'new_relation'], dtype='object')


Unnamed: 0,subject,new_relation,object
0,aerosol,be_in,guinea pig
1,uk,transmit,influenza
2,mp,be_related_with,sars-cov infection
3,obesity,be_in,china
4,%,be_related_with,ebv


In [None]:
# `shape` is the (rows, columns) tuple of the triples frame, so the first
# number printed is the triple count and the second the number of columns.
print('Total triples in the KG:', dataset.shape)

Total triples in the KG: (12302, 3)


## 2.1 Create training, validation and test splits

In [None]:
from ampligraph.evaluation import train_test_split_no_unseen

# Two-stage hold-out with a fixed seed:
#   1. carve 500 triples out of the full KG as the validation set,
#   2. carve 1000 triples out of what remains as the test set;
# whatever is left after both steps is the training set.
test_train, X_valid = train_test_split_no_unseen(dataset.values, test_size=500, seed=0)
X_train, X_test = train_test_split_no_unseen(test_train, test_size=1000, seed=0)

# Report the shape of the full KG and of every split.
for split_label, split_data in (
    ('Total triples:', dataset),
    ('Size of train:', X_train),
    ('Size of valid:', X_valid),
    ('Size of test:', X_test),
):
    print(split_label, split_data.shape)

Total triples: (12302, 3)
Size of train: (10802, 3)
Size of valid: (500, 3)
Size of test: (1000, 3)


# 3. Model Training

In [None]:
from ampligraph.latent_features import TransE
from ampligraph.utils import save_model, restore_model

# ----------------------
# Train: a TransE knowledge-graph-embedding model on the training split
transe_hyperparams = dict(
    k=300,
    epochs=100,
    eta=1,
    loss='multiclass_nll',
    initializer='xavier',
    initializer_params={'uniform': False},
    regularizer='LP',
    regularizer_params={'lambda': 0.001, 'p': 3},
    optimizer='adam',
    optimizer_params={'lr': 0.0001},
    seed=0,
    batches_count=10,
    verbose=True,
)
model = TransE(**transe_hyperparams)
model.fit(X_train)

# ----------------------
# Persist: write the fitted model to disk and load it back, so the rest of
# the notebook works with the same artefact that later cells restore
save_model(model, 'TransE.pkl')
model = restore_model('TransE.pkl')

# ----------------------
# Evaluate:
# Filtered evaluation with ranking strategy assigning worst rank to break ties

# The filter is the union of all known triples (train + valid + test)
X_filter = np.concatenate((X_train, X_valid, X_test), axis=0)

# Ranks are computed per test triple
ranks = evaluate_performance(X_test,
                             model=model,
                             filter_triples=X_filter)

print('Test set:', X_test.shape)
print('Size of ranks:', ranks.shape)

# Summarise the performance on the test set with single-number metrics
display_aggregate_metrics(ranks)
# ----------------------

Average TransE Loss:   0.112852: 100%|██████████| 100/100 [00:03<00:00, 26.00epoch/s]
100%|██████████| 1000/1000 [00:03<00:00, 304.20it/s]


Test set: (1000, 3)
Size of ranks: (1000, 2)
Mean Rank: 107.834
Mean Reciprocal Rank: 0.2297196173963971
Hits@1: 0.148
Hits@10: 0.402
Hits@100: 0.792


# 4. Knowledge Discovery

## 4.1 Triple completion

``` 
    <head, relation, ?> 
    <head, ?,        tail>
    <?,    relation, tail>
```

In [None]:
# Tail prediction: rank candidate objects for <covid-19, be_in, ?>

from ampligraph.discovery import query_topn

# Reload the persisted TransE model instead of training it again
model = restore_model('TransE.pkl')

triples, scores = query_topn(
    model,
    head='covid-19',
    relation='be_in',
    tail=None,               # the slot being predicted
    top_n=30,
    ents_to_consider=None,
    rels_to_consider=None,
)

# Print every returned triple next to its score, in the returned order
for ranked_score, ranked_triple in zip(scores, triples):
    print('Score: {} \t {} '.format(ranked_score, ranked_triple))

Score: -14.355669975280762 	 ['covid-19' 'be_in' 'china'] 
Score: -14.779093742370605 	 ['covid-19' 'be_in' 'usa'] 
Score: -14.923473358154297 	 ['covid-19' 'be_in' 'australia'] 
Score: -14.94718074798584 	 ['covid-19' 'be_in' 'beijing'] 
Score: -15.050729751586914 	 ['covid-19' 'be_in' 'united state'] 
Score: -15.139759063720703 	 ['covid-19' 'be_in' 'africa'] 
Score: -15.151171684265137 	 ['covid-19' 'be_in' 'malaysia'] 
Score: -15.169055938720703 	 ['covid-19' 'be_in' 'india'] 
Score: -15.224349975585938 	 ['covid-19' 'be_in' 'mexico'] 
Score: -15.250919342041016 	 ['covid-19' 'be_in' 'france'] 
Score: -15.344208717346191 	 ['covid-19' 'be_in' 'sweden'] 
Score: -15.367685317993164 	 ['covid-19' 'be_in' 'nasopharyngeal swab'] 
Score: -15.39233684539795 	 ['covid-19' 'be_in' 'taiwan'] 
Score: -15.394640922546387 	 ['covid-19' 'be_in' 'italy'] 
Score: -15.429180145263672 	 ['covid-19' 'be_in' 'uk'] 
Score: -15.541807174682617 	 ['covid-19' 'be_in' 'u'] 
Score: -15.546154022216797 	 ['c

In [None]:
# Head prediction: rank candidate subjects for <?, treat, covid-19>
# (the relation is fixed here; the head is the slot left as None)

triples, scores = query_topn(
    model,
    head=None,               # the slot being predicted
    relation='treat',
    tail='covid-19',
    top_n=15,
    ents_to_consider=None,
    rels_to_consider=None,
)

# Print every returned triple next to its score, in the returned order
for ranked_score, ranked_triple in zip(scores, triples):
    print('Score: {} \t {} '.format(ranked_score, ranked_triple))

Score: -17.283233642578125 	 ['covid-19' 'treat' 'covid-19'] 
Score: -17.303394317626953 	 ['gene' 'treat' 'covid-19'] 
Score: -17.369487762451172 	 ['membrane potential depolarization' 'treat' 'covid-19'] 
Score: -17.422821044921875 	 ['silica' 'treat' 'covid-19'] 
Score: -17.484542846679688 	 ['h' 'treat' 'covid-19'] 
Score: -17.495582580566406 	 ['t7 promoter' 'treat' 'covid-19'] 
Score: -17.534015655517578 	 ['man' 'treat' 'covid-19'] 
Score: -17.540334701538086 	 ['rpl11' 'treat' 'covid-19'] 
Score: -17.556020736694336 	 ['grippe' 'treat' 'covid-19'] 
Score: -17.644805908203125 	 ['america' 'treat' 'covid-19'] 
Score: -17.66366195678711 	 ['np' 'treat' 'covid-19'] 
Score: -17.69957160949707 	 ['hyperoxia' 'treat' 'covid-19'] 
Score: -17.750743865966797 	 ['influenza' 'treat' 'covid-19'] 
Score: -17.78937530517578 	 ['antibody' 'treat' 'covid-19'] 
Score: -17.811866760253906 	 ['poliovirus' 'treat' 'covid-19'] 


In [None]:
# Head prediction: rank candidate subjects for <?, be_in, covid-19>
# (the relation is fixed here; the head is the slot left as None)

triples, scores = query_topn(
    model,
    head=None,               # the slot being predicted
    relation='be_in',
    tail='covid-19',
    top_n=20,
    ents_to_consider=None,
    rels_to_consider=None,
)

# Print every returned triple next to its score, in the returned order
for ranked_score, ranked_triple in zip(scores, triples):
    print('Score: {} \t {} '.format(ranked_score, ranked_triple))

Score: -17.29572296142578 	 ['covid-19' 'be_in' 'covid-19'] 
Score: -17.888343811035156 	 ['outbreak' 'be_in' 'covid-19'] 
Score: -17.909421920776367 	 ['%' 'be_in' 'covid-19'] 
Score: -18.13340187072754 	 ['sars' 'be_in' 'covid-19'] 
Score: -18.158374786376953 	 ['hcov-nl63 inflection' 'be_in' 'covid-19'] 
Score: -18.24796485900879 	 ['target' 'be_in' 'covid-19'] 
Score: -18.357162475585938 	 ['time' 'be_in' 'covid-19'] 
Score: -18.362089157104492 	 ['lockdown' 'be_in' 'covid-19'] 
Score: -18.43448257446289 	 ['participant' 'be_in' 'covid-19'] 
Score: -18.437740325927734 	 ['measles' 'be_in' 'covid-19'] 
Score: -18.439647674560547 	 ['pandemic influenza' 'be_in' 'covid-19'] 
Score: -18.451114654541016 	 ['influenza' 'be_in' 'covid-19'] 
Score: -18.47132110595703 	 ['s. suis' 'be_in' 'covid-19'] 
Score: -18.488479614257812 	 ['prevalence' 'be_in' 'covid-19'] 
Score: -18.49022674560547 	 ['burden' 'be_in' 'covid-19'] 
Score: -18.498666763305664 	 ['complication' 'be_in' 'covid-19'] 
Sco

In [None]:
# Head prediction restricted to a hand-picked candidate list:
# rank these entities as subjects for <?, transmit, covid-19>
ent_control_list = ['cat', 'human','bat', 'monkey',  'dog' ]
triples, scores = query_topn(
    model,
    head=None,                           # the slot being predicted
    relation='transmit',
    tail='covid-19',
    top_n=30,
    ents_to_consider=ent_control_list,   # candidate heads passed to query_topn
    rels_to_consider=None,
)

# Print every returned triple next to its score, in the returned order
for ranked_score, ranked_triple in zip(scores, triples):
    print('Score: {} \t {} '.format(ranked_score, ranked_triple))

Score: -14.479377746582031 	 ['human' 'transmit' 'covid-19'] 
Score: -15.203999519348145 	 ['bat' 'transmit' 'covid-19'] 
Score: -15.406490325927734 	 ['dog' 'transmit' 'covid-19'] 
Score: -17.068496704101562 	 ['monkey' 'transmit' 'covid-19'] 
Score: -17.196857452392578 	 ['cat' 'transmit' 'covid-19'] 


In [None]:

# Head prediction: rank candidate subjects for <?, be_related_with, covid-19>
triples, scores = query_topn(
    model,
    head=None,               # the slot being predicted
    relation='be_related_with',
    tail='covid-19',
    top_n=30,
    ents_to_consider=None,
    rels_to_consider=None,
)

# Print every returned triple next to its score, in the returned order
for ranked_score, ranked_triple in zip(scores, triples):
    print('Score: {} \t {} '.format(ranked_score, ranked_triple))

Score: -16.407957077026367 	 ['covid-19' 'be_related_with' 'covid-19'] 
Score: -16.447853088378906 	 ['patient' 'be_related_with' 'covid-19'] 
Score: -16.633697509765625 	 ['virus' 'be_related_with' 'covid-19'] 
Score: -16.792278289794922 	 ['%' 'be_related_with' 'covid-19'] 
Score: -16.93667221069336 	 ['symptom' 'be_related_with' 'covid-19'] 
Score: -16.963205337524414 	 ['inflection' 'be_related_with' 'covid-19'] 
Score: -17.318044662475586 	 ['data' 'be_related_with' 'covid-19'] 
Score: -17.333995819091797 	 ['child' 'be_related_with' 'covid-19'] 
Score: -17.363147735595703 	 ['antibody' 'be_related_with' 'covid-19'] 
Score: -17.38707733154297 	 ['person' 'be_related_with' 'covid-19'] 
Score: -17.523738861083984 	 ['coronavirus' 'be_related_with' 'covid-19'] 
Score: -17.556407928466797 	 ['influenza' 'be_related_with' 'covid-19'] 
Score: -17.590547561645508 	 ['sars' 'be_related_with' 'covid-19'] 
Score: -17.609272003173828 	 ['influenza virus' 'be_related_with' 'covid-19'] 
Score:

In [None]:
# Compare the raw call and graph embedding results for the symptoms query

## raw call: the first 20 stored triples whose subject is exactly 'symptom'
dataset[dataset['subject'].eq('symptom')].head(20)

# Observation: the result of the raw call seems to be better

Unnamed: 0,subject,new_relation,object
17,symptom,be_related_with,pulmonary edema
51,symptom,be_related_with,cough
53,symptom,be_related_with,nasal congestion
57,symptom,transmit,sars
72,symptom,be_related_with,headache
75,symptom,be_related_with,acute respiratory distress syndrome
108,symptom,be_related_with,influenza-like illness
158,symptom,be_related_with,sore throat
168,symptom,be_related_with,respiratory symptom
188,symptom,be_related_with,congestion


In [None]:
## graph embedding: tail prediction for <symptom, be_related_with, ?>
# The head must be the same entity string as in the raw lookup ('symptom',
# the form that appears in the dataset) so the two results are comparable.
# The previous value 'symptoms' is a different string and is not among the
# entities shown in the data or in the earlier query results.
triples, scores = query_topn(model, top_n=20, 
                             head='symptom', 
                             relation='be_related_with', 
                             tail=None, 
                             ents_to_consider=None, 
                             rels_to_consider=None)

# Print every returned triple next to its score
for triple, score in zip(triples, scores):
    print('Score: {} \t {} '.format(score, triple))