<a href="https://colab.research.google.com/github/HuyenNguyenHelen/CORD-19-KG/blob/master/QA-models/covid_19_kg_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preliminaries

In [1]:
# If using Google Colab run this cell 

# select tensorflow version for colab 
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
import tensorflow as tf 

print('TensorFlow  version: {}'.format(tf.__version__))

# Get the GPU name
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [3]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [4]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    print('Mean Rank:', mr_score(ranks)) 
    print('Mean Reciprocal Rank:', mrr_score(ranks)) 
    print('Hits@1:', hits_at_n_score(ranks, 1))
    print('Hits@10:', hits_at_n_score(ranks, 10))
    print('Hits@100:', hits_at_n_score(ranks, 100))

print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.4.0


In [5]:
from ampligraph.datasets import load_fb15k_237, load_wn18rr, load_yago3_10
import pandas as pd
import requests
import io

## Loading Data 
(a triples dataset)

In [6]:
# Downloading the csv file from your GitHub account
re_pdf = requests.get('https://raw.githubusercontent.com/HuyenNguyenHelen/CORD-19-KG/master/Data/new_triples_with_predefined_relations_pdf_June20_.csv').content
re_pmc = requests.get('https://raw.githubusercontent.com/HuyenNguyenHelen/CORD-19-KG/master/Data/new_triples_with_predefined_relations_pmc_June20_.csv').content
pdf = pd.read_csv(io.StringIO(re_pdf.decode('utf-8')))
pmc =  pd.read_csv(io.StringIO(re_pmc.decode('utf-8')))
dataset = pd.concat([pmc, pdf])
print(dataset.columns)
dataset.drop(columns='Unnamed: 0', inplace=True)
dataset = dataset[['subject','new_relation', 'object' ]]
dataset.head(5)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'subject', 'object', 'new_relation'], dtype='object')


Unnamed: 0,subject,new_relation,object
0,dysfunction,disease_species,child
1,et dysfunction,disease_species,child
2,et dysfunction,disease_species,child
3,rsv,disease_disease,virus
4,treatment,treat_procedure_species,child


In [7]:
print('Total triples in the KG:', dataset.shape)

Total triples in the KG: (230330, 3)


In [8]:
print('the number of relations: \n', dataset['new_relation'].value_counts())

the number of relations: 
 disease_disease                    63192
gene_gene                          24172
disease_species                    22370
gene_disease                       14220
disease_gene                       13347
disease_treat_procedure            10184
treat_procedure_disease             8223
drug_disease                        7250
gene_treat_procedure                6142
treat_procedure_treat_procedure     5625
gene_species                        5298
drug_gene                           5270
drug_drug                           4613
treat_procedure_gene                4602
disease_symptom                     4354
treat_procedure_species             4263
disease_drug                        4044
drug_species                        3680
gene_drug                           3603
symptom_disease                     3411
symptom_symptom                     3266
drug_treat_procedure                3229
treat_procedure_drug                2578
symptom_species               

# Data Processing
Partition the triples into 10 portions that will be used for buiding QA systems

In [9]:
# Shuffle the dataset
dataset = dataset.sample(frac=1, axis=1).reset_index(drop=True)
subsets = np.array_split(dataset, 10)  
len(subsets)


10

# Developing QA system

## Developing model

### Create training, validation and test splits

In [10]:
from ampligraph.evaluation import train_test_split_no_unseen
def train_set_split (ds, test_frac = 200, val_frac = 100 ):
  # get the validation set of size 500
  test_train, X_test = train_test_split_no_unseen(ds.values, test_frac, seed=0)

  # get the test set of size 1000 from the remaining triples
  X_train, X_val = train_test_split_no_unseen(test_train, val_frac, seed=0)

  print('Total triples:', dataset.shape)
  print('Size of train:', X_train.shape)
  print('Size of valid:', X_val.shape)
  print('Size of test:', X_test.shape)
  return X_train, X_val, X_test

### Model Training

In [11]:
from ampligraph.latent_features import TransE

def training (train_ds, val_ds, test_ds):
  # Train a KGE model
  model = TransE(k=300, 
                epochs=100, 
                eta=1, 
                loss='multiclass_nll', 
                initializer='xavier', initializer_params={'uniform': False},
                regularizer='LP', regularizer_params= {'lambda': 0.001, 'p': 3},
                optimizer= 'adam', optimizer_params= {'lr': 0.0001}, 
                seed= 0, batches_count= 100, verbose=True)

  model.fit(train_ds)
  # ----------------------
  # Evaluate: 
  # Filtered evaluation with ranking strategy assigning worst rank to break ties
  from ampligraph.utils import save_model, restore_model
  save_model(model, 'TransE.pkl')
  model = restore_model('TransE.pkl')

  # create the filter 
  X_filter = np.concatenate([train_ds, val_ds, test_ds], 0)

  # compute ranks
  ranks = evaluate_performance(test_ds, 
                              model=model, 
                              filter_triples=X_filter)

  # ranks are computed per triple
  print('Test set:', X_test.shape)
  print('Size of ranks:', ranks.shape)

  # Aggregate metrics show the aggregate performance of the model on the test set using a single number
  display_aggregate_metrics(ranks)
  # ----------------------
  
  return display_aggregate_metrics(ranks)

X_train, X_val, X_test = train_set_split(dataset)
training (X_train, X_val, X_test) #dataset.to_numpy()

Total triples: (230330, 3)
Size of train: (230030, 3)
Size of valid: (100, 3)
Size of test: (200, 3)


Average TransE Loss:   0.023402: 100%|██████████| 100/100 [01:25<00:00,  1.16epoch/s]
100%|██████████| 200/200 [00:02<00:00, 92.39it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 145.925
Mean Reciprocal Rank: 0.3346296712975534
Hits@1: 0.2375
Hits@10: 0.505
Hits@100: 0.7875
Mean Rank: 145.925
Mean Reciprocal Rank: 0.3346296712975534
Hits@1: 0.2375
Hits@10: 0.505
Hits@100: 0.7875


## Knowledge Discovery

In [12]:
from ampligraph.discovery import query_topn

def QA_retrieve (input, top_n ):
  head, relation1, tail = input[0], input[1], input[2]
  if len(relation1.split('_')) <= 2:
    relation2 = '{}_{}'.format(relation1.split('_')[1], relation1.split('_')[0])
  elif len(relation1.split('_')) == 3:
     relation2 = '{}_{}_{}'.format(relation1.split('_')[1], relation1.split('_')[2], relation1.split('_')[0])
  else:
    raise ValueError('error: relation length issue')
  # restore the previously saved model to save time
  model = restore_model('TransE.pkl')
  if relation1  in dataset['new_relation'].tolist():
    triples1, scores1 = query_topn(model, top_n=top_n, 
                                head=head, 
                                relation=relation1, 
                                tail=tail, 
                                ents_to_consider=None, 
                                rels_to_consider=None)
  else:
    triples1, scores1 = [], []

  if relation2  in dataset['new_relation'].tolist():
    triples2, scores2 = query_topn(model, top_n=top_n, 
                                head=tail, 
                                relation=relation2, 
                                tail=head, 
                                ents_to_consider=None, 
                                rels_to_consider=None)
  else:
    triples2, scores2 = [], []

  if len(triples1) + len(triples2) == top_n:
    if len(triples1) ==0: 
      triples = triples1 + triples2.tolist()
      scores = scores1+ scores2.tolist()
    else:
      triples = triples1.tolist() + triples2
      scores = scores1.tolist() + scores2
  else:
    end_idx = round(top_n/2)
    triples = triples1.tolist()[:end_idx] + triples2.tolist()[:end_idx]
    scores = scores1.tolist()[:end_idx] + scores2.tolist()[:end_idx]
   
  for triple, score in zip(triples, scores):
      print('Score: {} \t {} '.format(score, triple))
  return triples, scores
 


In [13]:

# Query2: What are related diseases of Covid-19
print ('------------------------------------------------------------')
print('What are related diseases of Covid-19')
question2 = QA_retrieve (['covid-19', 'disease_disease', None], top_n = 20)

# Query3: What are the genes/proteins related to Covid-19
print ('------------------------------------------------------------')
print('What are the genes/proteins related to Covid-19')
question3 = QA_retrieve (['covid-19', 'disease_gene', None], top_n = 20)

# Query4: What species transmit the covid-19
print ('------------------------------------------------------------')
print('What species transmit the covid-19?')
question4 = QA_retrieve (['covid-19', 'disease_species', None], top_n = 20)

# Query: what are symptoms of the covid-19
print ('------------------------------------------------------------')
print('what are symptoms of the covid-19?')
question5 = QA_retrieve (['covid-19', 'disease_symptom', None], top_n = 20)

# What are potential drugs to treat COVID-19?
print ('------------------------------------------------------------')
print('What are potential drugs to treat COVID-19?')
question6 = QA_retrieve (['covid-19', 'disease_drug', None], top_n = 20)

# Query1: what are the possible therapeutic procedure of covid-19??
print ('------------------------------------------------------------')
print('what are the possible therapeutic procedure of covid-19?')
question1 = QA_retrieve (['covid-19', 'disease_treat_procedure', None], top_n = 20)

------------------------------------------------------------
What are related diseases of Covid-19
Score: -8.52704906463623 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -12.436001777648926 	 ['covid-19', 'disease_disease', '2009'] 
Score: -12.915486335754395 	 ['covid-19', 'disease_disease', 'infectious'] 
Score: -12.93936538696289 	 ['covid-19', 'disease_disease', 'major'] 
Score: -13.162408828735352 	 ['covid-19', 'disease_disease', 'severe'] 
Score: -13.30875015258789 	 ['covid-19', 'disease_disease', 'brain'] 
Score: -13.311796188354492 	 ['covid-19', 'disease_disease', 'same'] 
Score: -13.328737258911133 	 ['covid-19', 'disease_disease', 'lower'] 
Score: -13.533295631408691 	 ['covid-19', 'disease_disease', 'cns'] 
Score: -13.549384117126465 	 ['covid-19', 'disease_disease', 'fatal'] 
Score: -8.52704906463623 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -13.223125457763672 	 ['he', 'disease_disease', 'covid-19'] 
Score: -13.476348876953125 	 ['same', 'disease_dis

In [14]:
# Compare the raw call and graph embedding results for the symptoms query

## raw call
#dataset.loc[(dataset['subject']=='symptom')].head(20)

#####==> the result of raw call seems to be better

# Train model, and Knowledge mining with different % of dataset

In [16]:
ALL = []

for idx in range(len(subsets)):
  outputs = {} 
  outputs['subset_%s' %(idx+1)] = []
  data = subsets[:idx+1]
  data_concat = pd.concat(data)
  print ('i: {} -- len: {}'.format(idx, len(data_concat)))

  # Spliting dataset for training
  X_train, X_val, X_test = train_set_split(data_concat)
  # Training...
  print('Training..........')
  training (X_train, X_val, X_test)

  # Retrieving
  # Query2: What are related diseases of Covid-19
  print ('------------------------------------------------------------')
  print('What are related diseases of Covid-19')
  question2 = QA_retrieve (['covid-19', 'disease_disease', None], top_n = 20)
  df2 = pd.DataFrame(zip(question2[0],question2[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df2)

  # Query3: What are the genes/proteins related to Covid-19
  print ('------------------------------------------------------------')
  print('What are the genes/proteins related to Covid-19')
  question3 = QA_retrieve (['covid-19', 'disease_gene', None], top_n = 20)
  df3 = pd.DataFrame(zip(question3[0],question3[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df3)

  # Query4: What species transmit the covid-19
  print ('------------------------------------------------------------')
  print('What species transmit the covid-19?')
  question4 = QA_retrieve (['covid-19', 'disease_species', None], top_n = 20)
  df4 = pd.DataFrame(zip(question4[0],question4[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df4)

  # Query: what are symptoms of the covid-19
  print ('------------------------------------------------------------')
  print('what are symptoms of the covid-19?')
  question5 = QA_retrieve (['covid-19', 'disease_symptom', None], top_n = 20)
  df5 = pd.DataFrame(zip(question5[0],question5[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df5)

  # What are potential drugs to treat COVID-19?
  print ('------------------------------------------------------------')
  print('What are potential drugs to treat COVID-19?')
  question6 = QA_retrieve (['covid-19', 'disease_drug', None], top_n = 20)
  df6 = pd.DataFrame(zip(question6[0],question6[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df6)

  # Query1: what are the possible therapeutic procedure of covid-19??
  print ('------------------------------------------------------------')
  print('what are the possible therapeutic procedure of covid-19?')
  question1 = QA_retrieve (['covid-19', 'disease_treat_procedure', None], top_n = 20)
  df1 = pd.DataFrame(zip(question1[0],question1[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df1)
  ALL.append(outputs)
                     
                     

i: 0 -- len: 23033
Total triples: (230330, 3)
Size of train: (22733, 3)
Size of valid: (100, 3)
Size of test: (200, 3)
Training..........


Average TransE Loss:   0.014766: 100%|██████████| 100/100 [00:49<00:00,  2.01epoch/s]
100%|██████████| 200/200 [00:01<00:00, 168.06it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 444.3525
Mean Reciprocal Rank: 0.20265291612421327
Hits@1: 0.15
Hits@10: 0.295
Hits@100: 0.5675
Mean Rank: 444.3525
Mean Reciprocal Rank: 0.20265291612421327
Hits@1: 0.15
Hits@10: 0.295
Hits@100: 0.5675
------------------------------------------------------------
What are related diseases of Covid-19
Score: -12.617729187011719 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -15.769609451293945 	 ['covid-19', 'disease_disease', 'feline herpesvirus'] 
Score: -16.083181381225586 	 ['covid-19', 'disease_disease', 'shortness of breath'] 
Score: -16.10369873046875 	 ['covid-19', 'disease_disease', 'dhf'] 
Score: -16.149524688720703 	 ['covid-19', 'disease_disease', 'adenopathy'] 
Score: -16.183517456054688 	 ['covid-19', 'disease_disease', 'infectious'] 
Score: -16.230884552001953 	 ['covid-19', 'disease_disease', 'coryza'] 
Score: -16.25443458557129 	 ['covid-19', 'disease_disease', '1918 influenza pandemic'] 
Score: -16.2624912261

Average TransE Loss:   0.023757: 100%|██████████| 100/100 [00:54<00:00,  1.85epoch/s]
100%|██████████| 200/200 [00:01<00:00, 151.20it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 160.1475
Mean Reciprocal Rank: 0.37041283940912656
Hits@1: 0.2325
Hits@10: 0.585
Hits@100: 0.8175
Mean Rank: 160.1475
Mean Reciprocal Rank: 0.37041283940912656
Hits@1: 0.2325
Hits@10: 0.585
Hits@100: 0.8175
------------------------------------------------------------
What are related diseases of Covid-19
Score: -5.180383205413818 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -10.44300651550293 	 ['covid-19', 'disease_disease', '2009'] 
Score: -10.55663776397705 	 ['covid-19', 'disease_disease', 'severe'] 
Score: -10.649063110351562 	 ['covid-19', 'disease_disease', 'lower'] 
Score: -10.75851058959961 	 ['covid-19', 'disease_disease', 'lung disease'] 
Score: -10.885147094726562 	 ['covid-19', 'disease_disease', 'respiratory distress'] 
Score: -10.947751998901367 	 ['covid-19', 'disease_disease', 'brain'] 
Score: -10.965715408325195 	 ['covid-19', 'disease_disease', 'injury'] 
Score: -10.987531661987305 	 ['covid-19', 'disease

Average TransE Loss:   0.014850: 100%|██████████| 100/100 [01:02<00:00,  1.60epoch/s]
100%|██████████| 200/200 [00:01<00:00, 139.77it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 1300.885
Mean Reciprocal Rank: 0.08945390387753367
Hits@1: 0.0525
Hits@10: 0.16
Hits@100: 0.3125
Mean Rank: 1300.885
Mean Reciprocal Rank: 0.08945390387753367
Hits@1: 0.0525
Hits@10: 0.16
Hits@100: 0.3125
------------------------------------------------------------
What are related diseases of Covid-19
Score: -14.507125854492188 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -16.800987243652344 	 ['covid-19', 'disease_disease', 'acute disorder'] 
Score: -17.436843872070312 	 ['covid-19', 'disease_disease', 'transmissible'] 
Score: -17.454185485839844 	 ['covid-19', 'disease_disease', 'dementia'] 
Score: -17.472007751464844 	 ['covid-19', 'disease_disease', 'bronchospasm'] 
Score: -17.47338104248047 	 ['covid-19', 'disease_disease', 'human health'] 
Score: -17.486398696899414 	 ['covid-19', 'disease_disease', 'disease caused by virus'] 
Score: -17.566509246826172 	 ['covid-19', 'disease_disease', 'gastrointestinal complaint'] 

Average TransE Loss:   0.024302: 100%|██████████| 100/100 [01:05<00:00,  1.54epoch/s]
100%|██████████| 200/200 [00:01<00:00, 127.17it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 123.68
Mean Reciprocal Rank: 0.3509471814267057
Hits@1: 0.2325
Hits@10: 0.565
Hits@100: 0.8225
Mean Rank: 123.68
Mean Reciprocal Rank: 0.3509471814267057
Hits@1: 0.2325
Hits@10: 0.565
Hits@100: 0.8225
------------------------------------------------------------
What are related diseases of Covid-19
Score: -6.125833034515381 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -10.825271606445312 	 ['covid-19', 'disease_disease', 'major'] 
Score: -11.40636920928955 	 ['covid-19', 'disease_disease', '2009'] 
Score: -11.407394409179688 	 ['covid-19', 'disease_disease', 'severe'] 
Score: -11.433307647705078 	 ['covid-19', 'disease_disease', 'inflammatory'] 
Score: -11.469444274902344 	 ['covid-19', 'disease_disease', 'lower'] 
Score: -11.536663055419922 	 ['covid-19', 'disease_disease', 'acute'] 
Score: -11.613351821899414 	 ['covid-19', 'disease_disease', 'infectious'] 
Score: -11.799468040466309 	 ['covid-19', 'disease_disease', 'cri

Average TransE Loss:   0.014512: 100%|██████████| 100/100 [01:09<00:00,  1.44epoch/s]
100%|██████████| 200/200 [00:01<00:00, 115.60it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 1709.7825
Mean Reciprocal Rank: 0.07419045686688941
Hits@1: 0.0475
Hits@10: 0.115
Hits@100: 0.3075
Mean Rank: 1709.7825
Mean Reciprocal Rank: 0.07419045686688941
Hits@1: 0.0475
Hits@10: 0.115
Hits@100: 0.3075
------------------------------------------------------------
What are related diseases of Covid-19
Score: -15.741315841674805 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -18.55698013305664 	 ['covid-19', 'disease_disease', 'healthcare associated infection'] 
Score: -18.787097930908203 	 ['covid-19', 'disease_disease', 'bronchospasm'] 
Score: -18.890666961669922 	 ['covid-19', 'disease_disease', 'dilated ventricle'] 
Score: -18.95281982421875 	 ['covid-19', 'disease_disease', 'alris'] 
Score: -18.95684242248535 	 ['covid-19', 'disease_disease', 'prediabetes'] 
Score: -19.026182174682617 	 ['covid-19', 'disease_disease', 'metabolic disorder'] 
Score: -19.04698371887207 	 ['covid-19', 'disease_disease', 'encephalitic'] 


Average TransE Loss:   0.014286: 100%|██████████| 100/100 [01:13<00:00,  1.36epoch/s]
100%|██████████| 200/200 [00:01<00:00, 110.69it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 1608.05
Mean Reciprocal Rank: 0.0956298637104393
Hits@1: 0.0575
Hits@10: 0.1675
Hits@100: 0.34
Mean Rank: 1608.05
Mean Reciprocal Rank: 0.0956298637104393
Hits@1: 0.0575
Hits@10: 0.1675
Hits@100: 0.34
------------------------------------------------------------
What are related diseases of Covid-19
Score: -16.14837646484375 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -18.23921012878418 	 ['covid-19', 'disease_disease', 'sir'] 
Score: -18.4324893951416 	 ['covid-19', 'disease_disease', 'transmissible'] 
Score: -18.646465301513672 	 ['covid-19', 'disease_disease', 'disease caused by virus'] 
Score: -18.749019622802734 	 ['covid-19', 'disease_disease', 'atherosclerotic plaque'] 
Score: -18.825416564941406 	 ['covid-19', 'disease_disease', 'bat coronavirus'] 
Score: -18.827144622802734 	 ['covid-19', 'disease_disease', 'parainfluenza 1'] 
Score: -18.856441497802734 	 ['covid-19', 'disease_disease', 'tissue necrosis'] 
Score: -

Average TransE Loss:   0.013876: 100%|██████████| 100/100 [01:16<00:00,  1.30epoch/s]
100%|██████████| 200/200 [00:02<00:00, 98.58it/s] 


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 1653.3325
Mean Reciprocal Rank: 0.09936003890795962
Hits@1: 0.07
Hits@10: 0.1625
Hits@100: 0.3625
Mean Rank: 1653.3325
Mean Reciprocal Rank: 0.09936003890795962
Hits@1: 0.07
Hits@10: 0.1625
Hits@100: 0.3625
------------------------------------------------------------
What are related diseases of Covid-19
Score: -10.328426361083984 	 ['covid-19', 'disease_disease', 'transmissible'] 
Score: -10.894635200500488 	 ['covid-19', 'disease_disease', 'contagious viral disease'] 
Score: -11.314693450927734 	 ['covid-19', 'disease_disease', 'alveolar edema'] 
Score: -11.317959785461426 	 ['covid-19', 'disease_disease', 'healthcare associated infection'] 
Score: -11.363054275512695 	 ['covid-19', 'disease_disease', 'sickle cell anaemia'] 
Score: -11.457233428955078 	 ['covid-19', 'disease_disease', 'bronchospasm'] 
Score: -11.468083381652832 	 ['covid-19', 'disease_disease', 'diarrhoeal disease'] 
Score: -11.49835205078125 	 ['covid-19', 'disea

Average TransE Loss:   0.023421: 100%|██████████| 100/100 [01:19<00:00,  1.25epoch/s]
100%|██████████| 200/200 [00:02<00:00, 94.98it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 140.6625
Mean Reciprocal Rank: 0.4133833624840358
Hits@1: 0.315
Hits@10: 0.595
Hits@100: 0.8
Mean Rank: 140.6625
Mean Reciprocal Rank: 0.4133833624840358
Hits@1: 0.315
Hits@10: 0.595
Hits@100: 0.8
------------------------------------------------------------
What are related diseases of Covid-19
Score: -7.649994373321533 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -13.206191062927246 	 ['covid-19', 'disease_disease', '2009'] 
Score: -13.228767395019531 	 ['covid-19', 'disease_disease', 'infectious'] 
Score: -13.321197509765625 	 ['covid-19', 'disease_disease', 'major'] 
Score: -13.436370849609375 	 ['covid-19', 'disease_disease', 'inflammatory'] 
Score: -13.452075958251953 	 ['covid-19', 'disease_disease', 'severe'] 
Score: -13.572145462036133 	 ['covid-19', 'disease_disease', 'inflammatory disease'] 
Score: -13.686674118041992 	 ['covid-19', 'disease_disease', 'lower'] 
Score: -13.799914360046387 	 ['covid-19', 'disease_di

Average TransE Loss:   0.013772: 100%|██████████| 100/100 [01:21<00:00,  1.23epoch/s]
100%|██████████| 200/200 [00:02<00:00, 92.92it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 1944.93
Mean Reciprocal Rank: 0.10894323444063556
Hits@1: 0.07
Hits@10: 0.175
Hits@100: 0.3075
Mean Rank: 1944.93
Mean Reciprocal Rank: 0.10894323444063556
Hits@1: 0.07
Hits@10: 0.175
Hits@100: 0.3075
------------------------------------------------------------
What are related diseases of Covid-19
Score: -17.08624839782715 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -17.193843841552734 	 ['covid-19', 'disease_disease', 'obstructive lung disease'] 
Score: -17.274696350097656 	 ['covid-19', 'disease_disease', 'systemic sclerosis'] 
Score: -17.28112030029297 	 ['covid-19', 'disease_disease', 'fibrinous pleuritis'] 
Score: -17.29165267944336 	 ['covid-19', 'disease_disease', 'diffuse alveolar hemorrhage'] 
Score: -17.366825103759766 	 ['covid-19', 'disease_disease', 'endocrine'] 
Score: -17.36727523803711 	 ['covid-19', 'disease_disease', 'leukoerythroblastosis'] 
Score: -17.37020492553711 	 ['covid-19', 'disease_disease', 'l

Average TransE Loss:   0.023431: 100%|██████████| 100/100 [01:25<00:00,  1.18epoch/s]
100%|██████████| 200/200 [00:02<00:00, 91.62it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 147.2
Mean Reciprocal Rank: 0.33460937781339256
Hits@1: 0.2375
Hits@10: 0.495
Hits@100: 0.7875
Mean Rank: 147.2
Mean Reciprocal Rank: 0.33460937781339256
Hits@1: 0.2375
Hits@10: 0.495
Hits@100: 0.7875
------------------------------------------------------------
What are related diseases of Covid-19
Score: -8.577434539794922 	 ['covid-19', 'disease_disease', 'covid-19'] 
Score: -12.51475715637207 	 ['covid-19', 'disease_disease', '2009'] 
Score: -12.98316764831543 	 ['covid-19', 'disease_disease', 'major'] 
Score: -13.074155807495117 	 ['covid-19', 'disease_disease', 'infectious'] 
Score: -13.209430694580078 	 ['covid-19', 'disease_disease', 'brain'] 
Score: -13.312311172485352 	 ['covid-19', 'disease_disease', 'severe'] 
Score: -13.329235076904297 	 ['covid-19', 'disease_disease', 'same'] 
Score: -13.381096839904785 	 ['covid-19', 'disease_disease', 'lower'] 
Score: -13.640403747558594 	 ['covid-19', 'disease_disease', 'cns'] 
Score

In [17]:
ALL

[{'subset_1': [                                               triple      score
   0               [covid-19, disease_disease, covid-19] -12.617729
   1     [covid-19, disease_disease, feline herpesvirus] -15.769609
   2    [covid-19, disease_disease, shortness of breath] -16.083181
   3                    [covid-19, disease_disease, dhf] -16.103699
   4             [covid-19, disease_disease, adenopathy] -16.149525
   5             [covid-19, disease_disease, infectious] -16.183517
   6                 [covid-19, disease_disease, coryza] -16.230885
   7   [covid-19, disease_disease, 1918 influenza pan... -16.254435
   8                [covid-19, disease_disease, relapse] -16.262491
   9                 [covid-19, disease_disease, uremia] -16.326002
   10            [zika virus, disease_disease, covid-19] -11.198883
   11             [hyperemia, disease_disease, covid-19] -11.275060
   12         [remyelination, disease_disease, covid-19] -11.416758
   13   [chronic hepatitis c, diseas

In [45]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.makedirs('/content/drive/MyDrive/KG-output-2')

for i in range(len(ALL)): 
  print('i----------%s'%i)
  dir = os.makedirs('/content/drive/MyDrive/KG-output-2/subset_%s'%str(i+1))
  for j  in range(len(list(ALL[i].values())[0])):
    print('j----------%s'%j)
    path = '/content/drive/MyDrive/KG-output-2/subset_%s'%str(i+1) +'/query_' + str(j) + '.csv'
    print(path)
    print((list(ALL[i].values()))[0][j])
    with open(path, 'w') as file:
      (list(ALL[i].values()))[0][j].to_csv(file)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
i----------0
j----------0
/content/drive/MyDrive/KG-output-2/subset_1/query_0.csv
                                               triple      score
0               [covid-19, disease_disease, covid-19] -12.617729
1     [covid-19, disease_disease, feline herpesvirus] -15.769609
2    [covid-19, disease_disease, shortness of breath] -16.083181
3                    [covid-19, disease_disease, dhf] -16.103699
4             [covid-19, disease_disease, adenopathy] -16.149525
5             [covid-19, disease_disease, infectious] -16.183517
6                 [covid-19, disease_disease, coryza] -16.230885
7   [covid-19, disease_disease, 1918 influenza pan... -16.254435
8                [covid-19, disease_disease, relapse] -16.262491
9                 [covid-19, disease_disease, uremia] -16.326002
10            [zika virus, disease_disease, covid-19] -11.198883
11       

# QA EVALUATION

In [None]:
# Load groundtruth file
