<a href="https://colab.research.google.com/github/HuyenNguyenHelen/CORD-19-KG/blob/master/QA-models/covid_19_kg_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preliminaries

In [1]:
# If using Google Colab run this cell 

# select tensorflow version for colab 
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
import tensorflow as tf 

print('TensorFlow  version: {}'.format(tf.__version__))

# Get the GPU name
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [3]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [4]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    print('Mean Rank:', mr_score(ranks)) 
    print('Mean Reciprocal Rank:', mrr_score(ranks)) 
    print('Hits@1:', hits_at_n_score(ranks, 1))
    print('Hits@10:', hits_at_n_score(ranks, 10))
    print('Hits@100:', hits_at_n_score(ranks, 100))

print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.4.0


In [5]:
from ampligraph.datasets import load_fb15k_237, load_wn18rr, load_yago3_10
import pandas as pd
import requests
import io

## Loading Data 
(a triples dataset)

In [14]:
# Downloading the csv file from your GitHub account
# re_pdf = requests.get('https://raw.githubusercontent.com/HuyenNguyenHelen/CORD-19-KG/master/Data/new_triples_with_predefined_relations_pdf_June20_.csv').content
# re_pmc = requests.get('https://raw.githubusercontent.com/HuyenNguyenHelen/CORD-19-KG/master/Data/new_triples_with_predefined_relations_pmc_June20_.csv').content
# pdf = pd.read_csv(io.StringIO(re_pdf.decode('utf-8')))
# pmc =  pd.read_csv(io.StringIO(re_pmc.decode('utf-8')))
# dataset = pd.concat([pmc, pdf])

re_dataset = requests.get('https://raw.githubusercontent.com/HuyenNguyenHelen/CORD-19-KG/master/Data/all-final-cleaned-triples_2.csv').content
dataset = pd.read_csv(io.StringIO(re_dataset.decode('utf-8')))

print(dataset.columns)
dataset.drop(columns='Unnamed: 0', inplace=True)
# dataset = dataset[['subject','new_relation', 'object' ]]
dataset.head(5)

Index(['Unnamed: 0', 'subject', 'new_relation', 'object'], dtype='object')


Unnamed: 0,subject,new_relation,object
0,dysfunction,disease_species,child
1,et dysfunction,disease_species,child
2,rsv,disease_disease,virus
3,treatment,treat_procedure_species,child
4,urgency,disease_disease,virus


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [15]:
print('Total triples in the KG:', dataset.shape)

Total triples in the KG: (86275, 3)


In [16]:
print('the number of relations: \n', dataset['new_relation'].value_counts())

the number of relations: 
 disease_disease                    22991
gene_gene                          10985
gene_disease                        6347
disease_species                     5956
disease_gene                        5684
treat_procedure_disease             3757
disease_treat_procedure             3471
gene_treat_procedure                2618
drug_disease                        2420
treat_procedure_gene                2287
gene_species                        2140
treat_procedure_treat_procedure     2115
drug_gene                           2040
gene_drug                           1539
drug_drug                           1505
disease_symptom                     1475
disease_drug                        1407
symptom_disease                     1296
treat_procedure_species             1214
drug_treat_procedure                1124
treat_procedure_drug                1094
drug_species                         892
symptom_symptom                      528
gene_symptom                  

## Data exploration

# Data Processing
Partition the triples into 10 portions that will be used for buiding QA systems

In [17]:
# Shuffle the dataset
dataset = dataset.sample(frac=1, axis=1).reset_index(drop=True)
subsets = np.array_split(dataset, 10)  
len(subsets)


10

# Developing QA system

## Developing model

### Create training, validation and test splits

In [18]:
from ampligraph.evaluation import train_test_split_no_unseen
def train_set_split (ds, test_frac = 200, val_frac = 100 ):
  # get the validation set of size 500
  test_train, X_test = train_test_split_no_unseen(ds.values, test_frac, seed=0)

  # get the test set of size 1000 from the remaining triples
  X_train, X_val = train_test_split_no_unseen(test_train, val_frac, seed=0)

  print('Total triples:', dataset.shape)
  print('Size of train:', X_train.shape)
  print('Size of valid:', X_val.shape)
  print('Size of test:', X_test.shape)
  return X_train, X_val, X_test

### Model Training

In [19]:
from ampligraph.latent_features import TransE

def training (train_ds, val_ds, test_ds):
  # Train a KGE model
  model = TransE(k=300, 
                epochs=100, 
                eta=1, 
                loss='multiclass_nll', 
                initializer='xavier', initializer_params={'uniform': False},
                regularizer='LP', regularizer_params= {'lambda': 0.001, 'p': 3},
                optimizer= 'adam', optimizer_params= {'lr': 0.0001}, 
                seed= 0, batches_count= 100, verbose=True)

  model.fit(train_ds)
  # ----------------------
  # Evaluate: 
  # Filtered evaluation with ranking strategy assigning worst rank to break ties
  from ampligraph.utils import save_model, restore_model
  save_model(model, 'TransE.pkl')
  model = restore_model('TransE.pkl')

  # create the filter 
  X_filter = np.concatenate([train_ds, val_ds, test_ds], 0)

  # compute ranks
  ranks = evaluate_performance(test_ds, 
                              model=model, 
                              filter_triples=X_filter)

  # ranks are computed per triple
  print('Test set:', X_test.shape)
  print('Size of ranks:', ranks.shape)

  # Aggregate metrics show the aggregate performance of the model on the test set using a single number
  display_aggregate_metrics(ranks)
  # ----------------------
  
  return display_aggregate_metrics(ranks)

X_train, X_val, X_test = train_set_split(dataset)
scores = training (X_train, X_val, X_test) #dataset.to_numpy()
print('........score: \n {}'.format(scores))

Total triples: (86275, 3)
Size of train: (85975, 3)
Size of valid: (100, 3)
Size of test: (200, 3)


Average TransE Loss:   0.040553: 100%|██████████| 100/100 [01:17<00:00,  1.29epoch/s]
100%|██████████| 200/200 [00:01<00:00, 110.92it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 529.9825
Mean Reciprocal Rank: 0.09701796544116059
Hits@1: 0.0425
Hits@10: 0.1875
Hits@100: 0.505
Mean Rank: 529.9825
Mean Reciprocal Rank: 0.09701796544116059
Hits@1: 0.0425
Hits@10: 0.1875
Hits@100: 0.505
........score: 
 None


## Knowledge Discovery

In [20]:
from ampligraph.discovery import query_topn

def QA_retrieve (input, top_n ):
  head, relation1, tail = input[0], input[1], input[2]
  if len(relation1.split('_')) <= 2:
    relation2 = '{}_{}'.format(relation1.split('_')[1], relation1.split('_')[0])
  elif len(relation1.split('_')) == 3:
     relation2 = '{}_{}_{}'.format(relation1.split('_')[1], relation1.split('_')[2], relation1.split('_')[0])
  else:
    raise ValueError('error: relation length issue')
  # restore the previously saved model to save time
  model = restore_model('TransE.pkl')
  if relation1  in dataset['new_relation'].tolist():
    triples1, scores1 = query_topn(model, top_n=top_n, 
                                head=head, 
                                relation=relation1, 
                                tail=tail, 
                                ents_to_consider=None, 
                                rels_to_consider=None)
  else:
    triples1, scores1 = [], []

  if relation2  in dataset['new_relation'].tolist():
    triples2, scores2 = query_topn(model, top_n=top_n, 
                                head=tail, 
                                relation=relation2, 
                                tail=head, 
                                ents_to_consider=None, 
                                rels_to_consider=None)
  else:
    triples2, scores2 = [], []

  if len(triples1) + len(triples2) == top_n:
    if len(triples1) ==0: 
      triples = triples1 + triples2.tolist()
      scores = scores1+ scores2.tolist()
    else:
      triples = triples1.tolist() + triples2
      scores = scores1.tolist() + scores2
  else:
    end_idx = round(top_n/2)
    triples = triples1.tolist()[:end_idx] + triples2.tolist()[:end_idx]
    scores = scores1.tolist()[:end_idx] + scores2.tolist()[:end_idx]
   
  for triple, score in zip(triples, scores):
      print('Score: {} \t {} '.format(score, triple))
  return triples, scores
 


In [21]:


# Query1: What are the genes/proteins related to Covid-19
print ('------------------------------------------------------------')
print('What are the genes/proteins related to Covid-19')
question3 = QA_retrieve (['covid-19', 'disease_gene', None], top_n = 20)

# Query2: What species transmit the covid-19
print ('------------------------------------------------------------')
print('What species transmit the covid-19?')
question4 = QA_retrieve (['covid-19', 'disease_species', None], top_n = 20)

# Query3: what are symptoms of the covid-19
print ('------------------------------------------------------------')
print('what are symptoms of the covid-19?')
question5 = QA_retrieve (['covid-19', 'disease_symptom', None], top_n = 20)

# Query4: what are the possible therapeutic procedure of covid-19??
print ('------------------------------------------------------------')
print('what are the possible therapeutic procedure of covid-19?')
question1 = QA_retrieve (['covid-19', 'disease_treat_procedure', None], top_n = 20)

# Query5: What are potential drugs to treat COVID-19?
print ('------------------------------------------------------------')
print('What are potential drugs to treat COVID-19?')
question6 = QA_retrieve (['covid-19', 'disease_drug', None], top_n = 20)

# Query6: What are related diseases of Covid-19
print ('------------------------------------------------------------')
print('What are related diseases of Covid-19')
question2 = QA_retrieve (['covid-19', 'disease_disease', None], top_n = 20)


------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -15.625078201293945 	 ['covid-19', 'disease_gene', 'rna'] 
Score: -15.880897521972656 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -15.975225448608398 	 ['covid-19', 'disease_gene', 'rbd'] 
Score: -16.344764709472656 	 ['covid-19', 'disease_gene', 'rna virus'] 
Score: -16.443073272705078 	 ['covid-19', 'disease_gene', 'vaccine'] 
Score: -16.652219772338867 	 ['covid-19', 'disease_gene', 'pdcov'] 
Score: -16.673877716064453 	 ['covid-19', 'disease_gene', 'recombinant virus'] 
Score: -16.686025619506836 	 ['covid-19', 'disease_gene', 'ctd'] 
Score: -16.717275619506836 	 ['covid-19', 'disease_gene', 'recruitment'] 
Score: -16.71895408630371 	 ['covid-19', 'disease_gene', 'pandemic'] 
Score: -16.455123901367188 	 ['rna', 'gene_disease', 'covid-19'] 
Score: -16.68697166442871 	 ['2009', 'gene_disease', 'covid-19'] 
Score: -16.90829086303711 	 ['critical', 'gene_disease', 'c

# Train model, and Knowledge mining with different % of dataset

In [22]:
ALL = []

for idx in range(len(subsets)):
  outputs = {} 
  outputs['subset_%s' %(idx+1)] = []
  data = subsets[:idx+1]
  data_concat = pd.concat(data)
  print ('i: {} -- len: {}'.format(idx, len(data_concat)))

  # Spliting dataset for training
  X_train, X_val, X_test = train_set_split(data_concat)
  # Training...
  print('Training..........')
  training (X_train, X_val, X_test)

  # Retrieving
  # Query1: What are the genes/proteins related to Covid-19
  print ('------------------------------------------------------------')
  print('What are the genes/proteins related to Covid-19')
  question3 = QA_retrieve (['covid-19', 'disease_gene', None], top_n = 20)
  df3 = pd.DataFrame(zip(question3[0],question3[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df3)

  # Query2: What species transmit the covid-19
  print ('------------------------------------------------------------')
  print('What species transmit the covid-19?')
  question4 = QA_retrieve (['covid-19', 'disease_species', None], top_n = 20)
  df4 = pd.DataFrame(zip(question4[0],question4[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df4)

  # Query3: what are symptoms of the covid-19
  print ('------------------------------------------------------------')
  print('what are symptoms of the covid-19?')
  question5 = QA_retrieve (['covid-19', 'disease_symptom', None], top_n = 20)
  df5 = pd.DataFrame(zip(question5[0],question5[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df5)

  # Query4: what are the possible therapeutic procedure of covid-19??
  print ('------------------------------------------------------------')
  print('what are the possible therapeutic procedure of covid-19?')
  question1 = QA_retrieve (['covid-19', 'disease_treat_procedure', None], top_n = 20)
  df1 = pd.DataFrame(zip(question1[0],question1[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df1)

  #Query5: What are potential drugs to treat COVID-19?
  print ('------------------------------------------------------------')
  print('What are potential drugs to treat COVID-19?')
  question6 = QA_retrieve (['covid-19', 'disease_drug', None], top_n = 20)
  df6 = pd.DataFrame(zip(question6[0],question6[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df6)

  # Query6: What are related diseases of Covid-19
  print ('------------------------------------------------------------')
  print('What are related diseases of Covid-19')
  question2 = QA_retrieve (['covid-19', 'disease_disease', None], top_n = 20)
  df2 = pd.DataFrame(zip(question2[0],question2[1]), columns = ['triple', 'score'])
  outputs['subset_%s' %(idx+1)].append(df2)

  ALL.append(outputs)
                     
                     

i: 0 -- len: 8628
Total triples: (86275, 3)
Size of train: (8328, 3)
Size of valid: (100, 3)
Size of test: (200, 3)
Training..........


Average TransE Loss:   0.018197: 100%|██████████| 100/100 [00:46<00:00,  2.15epoch/s]
100%|██████████| 200/200 [00:01<00:00, 181.34it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 556.3
Mean Reciprocal Rank: 0.06937704003756455
Hits@1: 0.0375
Hits@10: 0.1175
Hits@100: 0.395
Mean Rank: 556.3
Mean Reciprocal Rank: 0.06937704003756455
Hits@1: 0.0375
Hits@10: 0.1175
Hits@100: 0.395
------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -16.992660522460938 	 ['covid-19', 'disease_gene', 'rna'] 
Score: -17.315692901611328 	 ['covid-19', 'disease_gene', 'sinv'] 
Score: -17.399818420410156 	 ['covid-19', 'disease_gene', 'aav'] 
Score: -17.408527374267578 	 ['covid-19', 'disease_gene', 'rbd'] 
Score: -17.525470733642578 	 ['covid-19', 'disease_gene', 'sars cov'] 
Score: -17.531782150268555 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -17.569530487060547 	 ['covid-19', 'disease_gene', 'hiv'] 
Score: -17.85011863708496 	 ['covid-19', 'disease_gene', 'cov'] 
Score: -17.890954971313477 	 ['covid-19', 'disease_gene', 'hcv'] 
Score: -17.897302627563477 	 ['covi

Average TransE Loss:   0.022131: 100%|██████████| 100/100 [00:54<00:00,  1.83epoch/s]
100%|██████████| 200/200 [00:01<00:00, 153.17it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 971.9025
Mean Reciprocal Rank: 0.060993422818477666
Hits@1: 0.0325
Hits@10: 0.11
Hits@100: 0.3125
Mean Rank: 971.9025
Mean Reciprocal Rank: 0.060993422818477666
Hits@1: 0.0325
Hits@10: 0.11
Hits@100: 0.3125
------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -16.512775421142578 	 ['covid-19', 'disease_gene', 'rna'] 
Score: -16.85662078857422 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -17.04883575439453 	 ['covid-19', 'disease_gene', 'ctd'] 
Score: -17.135082244873047 	 ['covid-19', 'disease_gene', 'hcv'] 
Score: -17.294879913330078 	 ['covid-19', 'disease_gene', 'vrs'] 
Score: -17.332229614257812 	 ['covid-19', 'disease_gene', 'sars cov'] 
Score: -17.360843658447266 	 ['covid-19', 'disease_gene', 'coronaviruses'] 
Score: -17.4549560546875 	 ['covid-19', 'disease_gene', 'iav'] 
Score: -17.464805603027344 	 ['covid-19', 'disease_gene', 'ebov'] 
Score: -17.54400634765

Average TransE Loss:   0.035085: 100%|██████████| 100/100 [00:57<00:00,  1.75epoch/s]
100%|██████████| 200/200 [00:01<00:00, 141.24it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 306.3575
Mean Reciprocal Rank: 0.12988072987901111
Hits@1: 0.06
Hits@10: 0.27
Hits@100: 0.5825
Mean Rank: 306.3575
Mean Reciprocal Rank: 0.12988072987901111
Hits@1: 0.06
Hits@10: 0.27
Hits@100: 0.5825
------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -14.250892639160156 	 ['covid-19', 'disease_gene', 'rna'] 
Score: -14.777214050292969 	 ['covid-19', 'disease_gene', 'coronaviruses'] 
Score: -14.807819366455078 	 ['covid-19', 'disease_gene', 'mhv'] 
Score: -14.878227233886719 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -15.32345962524414 	 ['covid-19', 'disease_gene', 'influenza virus'] 
Score: -15.328792572021484 	 ['covid-19', 'disease_gene', 'ctd'] 
Score: -15.351289749145508 	 ['covid-19', 'disease_gene', 'sars cov'] 
Score: -15.488044738769531 	 ['covid-19', 'disease_gene', 'rbd'] 
Score: -15.531342506408691 	 ['covid-19', 'disease_gene', 'iav'] 
Score: -15.606

Average TransE Loss:   0.023994: 100%|██████████| 100/100 [01:01<00:00,  1.62epoch/s]
100%|██████████| 200/200 [00:01<00:00, 139.03it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 1866.0125
Mean Reciprocal Rank: 0.02535577512918856
Hits@1: 0.0075
Hits@10: 0.0575
Hits@100: 0.2
Mean Rank: 1866.0125
Mean Reciprocal Rank: 0.02535577512918856
Hits@1: 0.0075
Hits@10: 0.0575
Hits@100: 0.2
------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -17.31427764892578 	 ['covid-19', 'disease_gene', 'scv'] 
Score: -17.77655029296875 	 ['covid-19', 'disease_gene', 'rna'] 
Score: -17.813369750976562 	 ['covid-19', 'disease_gene', 'rbd'] 
Score: -17.878761291503906 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -18.034626007080078 	 ['covid-19', 'disease_gene', 'cholesterol depletion'] 
Score: -18.23752784729004 	 ['covid-19', 'disease_gene', 'eb'] 
Score: -18.24419403076172 	 ['covid-19', 'disease_gene', 'vaccine'] 
Score: -18.329879760742188 	 ['covid-19', 'disease_gene', 'viroplasms'] 
Score: -18.36844253540039 	 ['covid-19', 'disease_gene', 'paramyxovirus'] 
Sco

Average TransE Loss:   0.023711: 100%|██████████| 100/100 [01:03<00:00,  1.57epoch/s]
100%|██████████| 200/200 [00:01<00:00, 128.09it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 1959.4775
Mean Reciprocal Rank: 0.04000631943715649
Hits@1: 0.02
Hits@10: 0.0775
Hits@100: 0.23
Mean Rank: 1959.4775
Mean Reciprocal Rank: 0.04000631943715649
Hits@1: 0.02
Hits@10: 0.0775
Hits@100: 0.23
------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -18.794063568115234 	 ['covid-19', 'disease_gene', 'ctd'] 
Score: -18.858598709106445 	 ['covid-19', 'disease_gene', 'mhv'] 
Score: -18.86144256591797 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -18.974933624267578 	 ['covid-19', 'disease_gene', 'vaccine'] 
Score: -19.193103790283203 	 ['covid-19', 'disease_gene', 'rbds'] 
Score: -19.196659088134766 	 ['covid-19', 'disease_gene', 'veev'] 
Score: -19.201875686645508 	 ['covid-19', 'disease_gene', 'rna'] 
Score: -19.257387161254883 	 ['covid-19', 'disease_gene', 'np'] 
Score: -19.464431762695312 	 ['covid-19', 'disease_gene', 'pandemic'] 
Score: -19.472126007080078 	 

Average TransE Loss:   0.038125: 100%|██████████| 100/100 [01:06<00:00,  1.51epoch/s]
100%|██████████| 200/200 [00:01<00:00, 125.94it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 507.985
Mean Reciprocal Rank: 0.0952236102962825
Hits@1: 0.04
Hits@10: 0.2
Hits@100: 0.5775
Mean Rank: 507.985
Mean Reciprocal Rank: 0.0952236102962825
Hits@1: 0.04
Hits@10: 0.2
Hits@100: 0.5775
------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -15.054226875305176 	 ['covid-19', 'disease_gene', 'rna'] 
Score: -15.448773384094238 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -15.615264892578125 	 ['covid-19', 'disease_gene', 'rbd'] 
Score: -15.822732925415039 	 ['covid-19', 'disease_gene', 'ctd'] 
Score: -15.988676071166992 	 ['covid-19', 'disease_gene', 'np'] 
Score: -16.08625030517578 	 ['covid-19', 'disease_gene', 'vaccine'] 
Score: -16.12884521484375 	 ['covid-19', 'disease_gene', 'paramyxovirus'] 
Score: -16.240314483642578 	 ['covid-19', 'disease_gene', 'rna virus'] 
Score: -16.248159408569336 	 ['covid-19', 'disease_gene', 'recombinant virus'] 
Score: -16.2706

Average TransE Loss:   0.024386: 100%|██████████| 100/100 [01:09<00:00,  1.45epoch/s]
100%|██████████| 200/200 [00:01<00:00, 122.83it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 2345.055
Mean Reciprocal Rank: 0.030812081480885406
Hits@1: 0.0125
Hits@10: 0.06
Hits@100: 0.225
Mean Rank: 2345.055
Mean Reciprocal Rank: 0.030812081480885406
Hits@1: 0.0125
Hits@10: 0.06
Hits@100: 0.225
------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -17.56232452392578 	 ['covid-19', 'disease_gene', 'rna'] 
Score: -17.872711181640625 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -18.133346557617188 	 ['covid-19', 'disease_gene', 'recombinant virus'] 
Score: -18.208608627319336 	 ['covid-19', 'disease_gene', 'vaccine'] 
Score: -18.32511329650879 	 ['covid-19', 'disease_gene', 'hdv'] 
Score: -18.482139587402344 	 ['covid-19', 'disease_gene', 'scv'] 
Score: -18.497039794921875 	 ['covid-19', 'disease_gene', 'dmvs'] 
Score: -18.53898811340332 	 ['covid-19', 'disease_gene', 'apn'] 
Score: -18.544384002685547 	 ['covid-19', 'disease_gene', 'nw arenavirus'] 
Score: -18

Average TransE Loss:   0.024299: 100%|██████████| 100/100 [01:12<00:00,  1.38epoch/s]
100%|██████████| 200/200 [00:01<00:00, 117.70it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 2360.4225
Mean Reciprocal Rank: 0.039317425383731494
Hits@1: 0.015
Hits@10: 0.0925
Hits@100: 0.23
Mean Rank: 2360.4225
Mean Reciprocal Rank: 0.039317425383731494
Hits@1: 0.015
Hits@10: 0.0925
Hits@100: 0.23
------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -18.683218002319336 	 ['covid-19', 'disease_gene', 'eb'] 
Score: -18.690147399902344 	 ['covid-19', 'disease_gene', 'vaccine'] 
Score: -18.73745346069336 	 ['covid-19', 'disease_gene', 'scv'] 
Score: -18.795608520507812 	 ['covid-19', 'disease_gene', 'rbds'] 
Score: -18.82703399658203 	 ['covid-19', 'disease_gene', 'fp'] 
Score: -18.828813552856445 	 ['covid-19', 'disease_gene', 'recombinant virus'] 
Score: -18.83747100830078 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -18.99185562133789 	 ['covid-19', 'disease_gene', 'rna virus'] 
Score: -19.05899429321289 	 ['covid-19', 'disease_gene', 'lyme neuroborreliosis']

Average TransE Loss:   0.025186: 100%|██████████| 100/100 [01:14<00:00,  1.34epoch/s]
100%|██████████| 200/200 [00:01<00:00, 111.20it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 2730.945
Mean Reciprocal Rank: 0.04129539045236935
Hits@1: 0.0225
Hits@10: 0.075
Hits@100: 0.1825
Mean Rank: 2730.945
Mean Reciprocal Rank: 0.04129539045236935
Hits@1: 0.0225
Hits@10: 0.075
Hits@100: 0.1825
------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -19.659103393554688 	 ['covid-19', 'disease_gene', 'rna'] 
Score: -19.661136627197266 	 ['covid-19', 'disease_gene', 'scv'] 
Score: -19.671628952026367 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -19.817495346069336 	 ['covid-19', 'disease_gene', 'rbd'] 
Score: -20.035140991210938 	 ['covid-19', 'disease_gene', 'tgev infection'] 
Score: -20.11060333251953 	 ['covid-19', 'disease_gene', 'vaccine'] 
Score: -20.149614334106445 	 ['covid-19', 'disease_gene', 'eb'] 
Score: -20.193748474121094 	 ['covid-19', 'disease_gene', 'ctd'] 
Score: -20.200366973876953 	 ['covid-19', 'disease_gene', 'er'] 
Score: -20.33269119262

Average TransE Loss:   0.040552: 100%|██████████| 100/100 [01:17<00:00,  1.30epoch/s]
100%|██████████| 200/200 [00:01<00:00, 112.09it/s]


Test set: (200, 3)
Size of ranks: (200, 2)
Mean Rank: 526.9875
Mean Reciprocal Rank: 0.09582543958143581
Hits@1: 0.0425
Hits@10: 0.1925
Hits@100: 0.505
Mean Rank: 526.9875
Mean Reciprocal Rank: 0.09582543958143581
Hits@1: 0.0425
Hits@10: 0.1925
Hits@100: 0.505
------------------------------------------------------------
What are the genes/proteins related to Covid-19
Score: -15.581380844116211 	 ['covid-19', 'disease_gene', 'rna'] 
Score: -15.82620620727539 	 ['covid-19', 'disease_gene', 'virus'] 
Score: -15.905933380126953 	 ['covid-19', 'disease_gene', 'rbd'] 
Score: -16.300006866455078 	 ['covid-19', 'disease_gene', 'rna virus'] 
Score: -16.378931045532227 	 ['covid-19', 'disease_gene', 'vaccine'] 
Score: -16.619203567504883 	 ['covid-19', 'disease_gene', 'recombinant virus'] 
Score: -16.622913360595703 	 ['covid-19', 'disease_gene', 'recruitment'] 
Score: -16.63533592224121 	 ['covid-19', 'disease_gene', 'pdcov'] 
Score: -16.668365478515625 	 ['covid-19', 'disease_gene', 'ctd'] 
Sc

In [23]:
ALL

[{'subset_1': [                                       triple      score
   0               [covid-19, disease_gene, rna] -16.992661
   1              [covid-19, disease_gene, sinv] -17.315693
   2               [covid-19, disease_gene, aav] -17.399818
   3               [covid-19, disease_gene, rbd] -17.408527
   4          [covid-19, disease_gene, sars cov] -17.525471
   5             [covid-19, disease_gene, virus] -17.531782
   6               [covid-19, disease_gene, hiv] -17.569530
   7               [covid-19, disease_gene, cov] -17.850119
   8               [covid-19, disease_gene, hcv] -17.890955
   9        [covid-19, disease_gene, sars cov s] -17.897303
   10            [lower, gene_disease, covid-19] -15.974579
   11            [virus, gene_disease, covid-19] -15.994793
   12              [cns, gene_disease, covid-19] -16.208183
   13          [vaccine, gene_disease, covid-19] -16.233412
   14  [prrsv infection, gene_disease, covid-19] -16.246607
   15              [ade, gen

In [24]:
from google.colab import drive
drive.mount('/content/drive')
import os
dir_path = '/content/drive/MyDrive/KG-output-2'
isExist = os.path.exists(dir_path)

if not isExist:
  os.makedirs(dir_path)

for i in range(len(ALL)): 
  print('i----------%s'%i)
  dir_s_path = '/content/drive/MyDrive/KG-output-2/subset_%s'%str(i+1)
  isExist_s = os.path.exists(dir_s_path)
  if not isExist_s:
    dir = os.makedirs(dir_s_path)
  for j  in range(len(list(ALL[i].values())[0])):
    print('j----------%s'%j)
    path = '/content/drive/MyDrive/KG-output-2/subset_%s'%str(i+1) +'/query_' + str(j) + '.csv'
    print(path)
    print((list(ALL[i].values()))[0][j])
    with open(path, 'w') as file:
      (list(ALL[i].values()))[0][j].to_csv(file)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
i----------0
j----------0
/content/drive/MyDrive/KG-output-2/subset_1/query_0.csv
                                       triple      score
0               [covid-19, disease_gene, rna] -16.992661
1              [covid-19, disease_gene, sinv] -17.315693
2               [covid-19, disease_gene, aav] -17.399818
3               [covid-19, disease_gene, rbd] -17.408527
4          [covid-19, disease_gene, sars cov] -17.525471
5             [covid-19, disease_gene, virus] -17.531782
6               [covid-19, disease_gene, hiv] -17.569530
7               [covid-19, disease_gene, cov] -17.850119
8               [covid-19, disease_gene, hcv] -17.890955
9        [covid-19, disease_gene, sars cov s] -17.897303
10            [lower, gene_disease, covid-19] -15.974579
11            [virus, gene_disease, covid-19] -15.994793
12              [cns, gene_disease, covid-19] -1

# QA EVALUATION

In [25]:
# Load groundtruth file
