In [26]:
import pandas as pd
import numpy as np
import requests
import json
import time
import os 
import gzip
import timeit

from scripts.vectology_functions import create_aaa_distances, create_pair_distances, embed_text, encode_traits, create_efo_nxo

from pandas_profiling import ProfileReport

import seaborn as sns

# Apply the default theme
sns.set_theme()

In [27]:
# ---- notebook-wide configuration ----

# Input files
ebi_data = 'data/UK_Biobank_master_file.tsv'
# Earlier EFO inputs, kept for reference only:
#   efo_nodes = 'data/efo-nodes.tsv'
#   efo_data  = 'data/efo_data.txt.gz'
efo_nodes = 'data/epigraphdb_efo_nodes.csv'
efo_rels = 'data/epigraphdb_efo_rels.csv'

# Key used to pick one value out of the nxontology similarity results
nxontology_measure = 'batet'

# Display name -> model identifier, in the order the models are processed
modelData = [
    {'name': name, 'model': model}
    for name, model in (
        ('BioSentVec', 'BioSentVec'),
        ('BioBERT', 'biobert_v1.1_pubmed'),
        ('BlueBERT', 'NCBI_BERT_pubmed_mimic_uncased_L-12_H-768_A-12'),
        ('GUSE', 'GUSEv4'),
        ('BERT-EFO', 'BERT-EFO'),
        ('Zooma', 'Zooma'),
    )
]

# Plot palette name and the directory that receives this notebook's outputs
pallete = "hls"
output = 'output/trait-trait'

In [28]:
# Build the nxontology network of EFO relationships from the child/parent table
efo_rel_df = pd.read_csv(efo_rels)
efo_nx = create_efo_nxo(
    df=efo_rel_df,
    child_col='efo.id',
    parent_col='parent_efo.id',
)
# Freeze the network before querying it (presumably makes it read-only so
# similarity lookups can be cached - confirm against nxontology docs)
efo_nx.freeze()

In [30]:
# Load the cleaned EBI/UKB mapping table (tab-separated) and preview the first rows
ebi_df = pd.read_table('output/ebi-ukb-cleaned.tsv')
print(ebi_df.head())

                                      query       MAPPED_TERM_LABEL  \
0           Vascular disorders of intestine        vascular disease   
1           Vascular disorders of intestine        vascular disease   
2                              Gonarthrosis  osteoarthritis || knee   
3  Psoriatic and enteropathic arthropathies     psoriatic arthritis   
4          Pain associated with micturition                 dysuria   

  MAPPING_TYPE           id                               full_id  
0        Broad  EFO_0004264  http://www.ebi.ac.uk/efo/EFO_0004264  
1        Broad  EFO_0009431  http://www.ebi.ac.uk/efo/EFO_0009431  
2        Broad  EFO_0004616  http://www.ebi.ac.uk/efo/EFO_0004616  
3      ? Broad  EFO_0003778  http://www.ebi.ac.uk/efo/EFO_0003778  
4      ? Broad  EFO_0003901  http://www.ebi.ac.uk/efo/EFO_0003901  


In [61]:
%%time

# create nx score for each full_id

f = f"{output}/nx-ebi-pairs.csv.gz"
if os.path.exists('nx-ebi-pairs.csv.gz'):
    print('nx for ebi done')
else:
    o = gzip.open(f,'wb')
    efos = list(ebi_df['full_id'])
    for i in range(0,len(efos)-1):
        if i % 100 == 0:
            print(i)
        for j in range(i,len(efos)):
            e1 = efos[i]
            e2 = efos[j]
            if e1 != e2:
                res = similarity = efo_nx.similarity(e1,e2).results()
                nx_val = res[nxontology_measure]
                #print(i,e1,e2,nx_val)
                o.write(f"{e1},{e2},{nx_val}\n".encode('utf-8'))
    o.close()
print('Done')

UsageError: Line magic function `%%time` not found.


In [33]:
%%time
# run all against all for EBI query data
m = modelData[0]
for m in modelData:
    name = m['name']
    f = f'output/{name}-ebi-encode.npy'
    if os.path.exists(f):
        print(m)
        dd = np.load(f'output/{name}-ebi-encode.npy')
        print(len(dd))
        aaa = create_aaa_distances(dd)
        np.save(f'{output}/{name}-ebi-aaa.npy',aaa)
        #print(len(aaa))
    else:
        print(f,'does not exist')


{'name': 'BioSentVec', 'model': 'BioSentVec'}
1303
Creating distances...
1303
1303
{'name': 'BioBERT', 'model': 'biobert_v1.1_pubmed'}
1303
Creating distances...
1303
1303
{'name': 'BlueBERT', 'model': 'NCBI_BERT_pubmed_mimic_uncased_L-12_H-768_A-12'}
1303
Creating distances...
1303
1303
{'name': 'GUSE', 'model': 'GUSEv4'}
1303
Creating distances...
1303
1303
output/BERT-EFO-ebi-encode.npy does not exist
output/Zooma-ebi-encode.npy does not exist


In [42]:
def write_to_file(model_name, pairwise_data, queries=None, out_dir=None):
    """Write every ordered pair of queries with its score to a gzipped TSV.

    The target is ``{out_dir}/{model_name}-ebi-query-pairwise.tsv.gz``. It gets
    a tab-separated ``q1 q2 score`` header followed by one row per ``(i, j)``
    pair (self-pairs included), where ``score = 1 - pairwise_data[i][j]``.
    If the target already exists it is left untouched.

    Parameters
    ----------
    model_name : str
        Model label used in the output file name.
    pairwise_data : 2-D indexable
        Values read as ``pairwise_data[i][j]``; rows and columns are expected
        to line up positionally with ``queries``.
    queries : sequence of str, optional
        Labels written in the ``q1``/``q2`` columns. Defaults to the notebook
        global ``ebi_df['query']``.
    out_dir : str, optional
        Directory to write into. Defaults to the notebook global ``output``.

    Returns
    -------
    None
    """
    print('writing',model_name)
    if out_dir is None:
        out_dir = output
    f = f'{out_dir}/{model_name}-ebi-query-pairwise.tsv.gz'
    if os.path.exists(f):
        print('Already done',f)
        return

    if queries is None:
        queries = ebi_df['query']
    # A plain list gives positional access even if the Series index is not 0..n-1.
    ebi_list = list(queries)

    # Write to a temporary file and rename once complete: the handle is always
    # closed and flushed, and an interrupted run cannot leave a truncated file
    # that a later call would report as "Already done".
    tmp = f'{f}.tmp'
    with gzip.open(tmp,'wb') as fo:
        fo.write("q1\tq2\tscore\n".encode('utf-8'))
        for i, q1 in enumerate(ebi_list):
            if i % 100 == 0:
                print(i)
            row = pairwise_data[i]
            for j, q2 in enumerate(ebi_list):
                score = 1-row[j]
                fo.write(f"{q1}\t{q2}\t{score}\n".encode('utf-8'))
    os.replace(tmp, f)

# Create pairwise files: one gzipped TSV per model whose saved matrix
# ({output}/{name}-ebi-aaa.npy) is present on disk.
for m in modelData:
    name = m['name']
    f = f'{output}/{name}-ebi-aaa.npy'
    if not os.path.exists(f):
        # No matrix was saved for this model; nothing to write.
        continue
    dd = np.load(f)
    print(len(dd))
    write_to_file(model_name=name, pairwise_data=dd)


1303
writing BioSentVec
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1303
writing BioBERT
Already done output/trait-trait/BioBERT-ebi-query-pairwise.tsv.gz
1303
writing BlueBERT
Already done output/trait-trait/BlueBERT-ebi-query-pairwise.tsv.gz
1303
writing GUSE
Already done output/trait-trait/GUSE-ebi-query-pairwise.tsv.gz


In [40]:
# Format BERT-EFO data: rename text_1/text_2 to q1/q2 and re-save the table
# as a gzipped, tab-separated file under `output`.
df = (
    pd.read_csv('data/BERT-EFO-ebi-query-pairwise.csv.gz')
    .rename(columns={'text_1': 'q1', 'text_2': 'q2'})
)
df.to_csv(
    f'{output}/BERT-EFO-ebi-query-pairwise.tsv.gz',
    sep='\t',
    index=False,
    compression='gzip',
)

In [57]:
%%time
# Scratch cell: assigns a value, sleeps for 10 seconds and prints the value,
# all under the %%time cell magic. NOTE(review): this looks like a sanity check
# that %%time reports wall time correctly - confirm, and consider removing it.
a=1
time.sleep(10)
print(a)

1
CPU times: user 1.07 ms, sys: 1.51 ms, total: 2.59 ms
Wall time: 10 s
