In [26]:
import pandas as pd
import numpy as np
import requests
import json
import time
import os 
import gzip
import timeit

from scripts.vectology_functions import create_aaa_distances, create_pair_distances, embed_text, encode_traits, create_efo_nxo

from pandas_profiling import ProfileReport

import seaborn as sns

# Apply the default theme
sns.set_theme()

In [27]:
# globals

# Input files.
ebi_data = 'data/UK_Biobank_master_file.tsv'
# Disabled alternatives kept for reference:
#   efo_nodes = 'data/efo-nodes.tsv'
#   efo_data = 'data/efo_data.txt.gz'
efo_nodes = 'data/epigraphdb_efo_nodes.csv'
efo_rels = 'data/epigraphdb_efo_rels.csv'

# Name of the nxontology similarity measure to report.
nxontology_measure = 'batet'

# One entry per embedding/mapping method: a display name and a model identifier.
modelData = [
    {'name': display_name, 'model': model_id}
    for display_name, model_id in (
        ('BioSentVec', 'BioSentVec'),
        ('BioBERT', 'biobert_v1.1_pubmed'),
        ('BlueBERT', 'NCBI_BERT_pubmed_mimic_uncased_L-12_H-768_A-12'),
        ('GUSE', 'GUSEv4'),
        ('BERT-EFO', 'BERT-EFO'),
        ('Zooma', 'Zooma'),
    )
]

# NOTE(review): presumably a seaborn colour palette name; the (misspelt)
# variable name is kept because other cells may reference it.
pallete = "hls"

# Directory that receives this notebook's result files.
output = 'output/trait-trait'

In [28]:
# Build the nxontology network of EFO relationships from the child/parent
# relationship table, then freeze it.
efo_rel_df = pd.read_csv(efo_rels)
efo_nx = create_efo_nxo(
    df=efo_rel_df,
    child_col='efo.id',
    parent_col='parent_efo.id',
)
efo_nx.freeze()

In [87]:
# Load the cleaned EBI table (tab-separated) and show a preview plus its shape.
ebi_df = pd.read_csv('output/ebi-ukb-cleaned.tsv', sep='\t')
print(ebi_df.head(), ebi_df.shape, sep='\n')

# Downstream steps need a one-to-one mapping of query to EFO term, so keep only
# the first row for each query string.
ebi_df_dedup = ebi_df.drop_duplicates(subset=['query'])
print(ebi_df_dedup.shape)

                                      query       MAPPED_TERM_LABEL  \
0           Vascular disorders of intestine        vascular disease   
1           Vascular disorders of intestine        vascular disease   
2                              Gonarthrosis  osteoarthritis || knee   
3  Psoriatic and enteropathic arthropathies     psoriatic arthritis   
4          Pain associated with micturition                 dysuria   

  MAPPING_TYPE           id                               full_id  
0        Broad  EFO_0004264  http://www.ebi.ac.uk/efo/EFO_0004264  
1        Broad  EFO_0009431  http://www.ebi.ac.uk/efo/EFO_0009431  
2        Broad  EFO_0004616  http://www.ebi.ac.uk/efo/EFO_0004616  
3      ? Broad  EFO_0003778  http://www.ebi.ac.uk/efo/EFO_0003778  
4      ? Broad  EFO_0003901  http://www.ebi.ac.uk/efo/EFO_0003901  
(1303, 5)
(1240, 5)


In [85]:
%%time

# Create the nxontology score for each pair of distinct full_id values and
# cache the result as a gzipped, tab-separated file (the name ends in .csv.gz
# but the content is TSV; readers must use sep='\t').

f = f"{output}/nx-ebi-pairs.csv.gz"
# Test the path that is actually written below. Checking a bare file name in
# the current directory never matched, so the scores were always recomputed.
if os.path.exists(f):
    print('nx for ebi done')
else:
    os.makedirs(output, exist_ok=True)
    efos = list(ebi_df_dedup['full_id'])
    # Write to a temporary file and rename on success, so an interrupted run
    # cannot leave a truncated file that the check above would accept.
    f_tmp = f + '.tmp'
    with gzip.open(f_tmp,'wb') as o:
        o.write("q1\tq2\tscore\n".encode('utf-8'))
        for i in range(0,len(efos)-1):
            if i % 100 == 0:
                print(i)
            e1 = efos[i]
            # Start at i+1: the j == i pair is always identical and was skipped anyway.
            for j in range(i+1,len(efos)):
                e2 = efos[j]
                # Different queries can map to the same EFO id; skip those self-pairs.
                if e1 != e2:
                    res = efo_nx.similarity(e1,e2).results()
                    nx_val = res[nxontology_measure]
                    o.write(f"{e1}\t{e2}\t{nx_val}\n".encode('utf-8'))
    os.replace(f_tmp, f)
print('Done')

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
Done
CPU times: user 1min 18s, sys: 983 ms, total: 1min 19s
Wall time: 1min 20s


In [33]:
%%time
# Run all-against-all distances for the EBI query encodings of every model.
# Models without a saved encoding file are reported and skipped.
# (The former `m = modelData[0]` was dead code: the loop rebinds `m`
# immediately, and it raised IndexError when modelData was empty.)
for m in modelData:
    name = m['name']
    f = f'output/{name}-ebi-encode.npy'
    if os.path.exists(f):
        print(m)
        # Reuse the path that was just checked instead of rebuilding it.
        dd = np.load(f)
        print(len(dd))
        aaa = create_aaa_distances(dd)
        np.save(f'{output}/{name}-ebi-aaa.npy',aaa)
    else:
        print(f,'does not exist')


{'name': 'BioSentVec', 'model': 'BioSentVec'}
1303
Creating distances...
1303
1303
{'name': 'BioBERT', 'model': 'biobert_v1.1_pubmed'}
1303
Creating distances...
1303
1303
{'name': 'BlueBERT', 'model': 'NCBI_BERT_pubmed_mimic_uncased_L-12_H-768_A-12'}
1303
Creating distances...
1303
1303
{'name': 'GUSE', 'model': 'GUSEv4'}
1303
Creating distances...
1303
1303
output/BERT-EFO-ebi-encode.npy does not exist
output/Zooma-ebi-encode.npy does not exist


In [89]:
def write_to_file(model_name,pairwise_data):
    """Write the pairwise query scores of one model to a gzipped TSV file.

    The target is ``{output}/{model_name}-ebi-query-pairwise.tsv.gz``. If it
    already exists nothing is written. Otherwise one row ``q1<TAB>q2<TAB>score``
    is written for every pair of row positions ``i <= j`` of ``ebi_df['query']``
    whose two query strings both occur in ``dedup_query_list``, with
    ``score = 1 - pairwise_data[i][j]``.

    Args:
        model_name: Name used to build the output file name.
        pairwise_data: Square, doubly indexable structure of numbers aligned
            with the row order of ``ebi_df``. Presumably a distance matrix, so
            the score is a similarity (TODO confirm against
            ``create_aaa_distances``).

    Reads the module-level ``output``, ``ebi_df`` and ``dedup_query_list``.
    """
    print('writing',model_name)
    f = f'{output}/{model_name}-ebi-query-pairwise.tsv.gz'
    if os.path.exists(f):
        print('Already done',f)
        return

    os.makedirs(output, exist_ok=True)
    # Plain list: position i must line up with row/column i of pairwise_data,
    # independent of the DataFrame's index labels.
    ebi_list = ebi_df['query'].tolist()
    # Resolve membership once per query instead of scanning a list for every pair.
    allowed = set(dedup_query_list)
    keep = [q in allowed for q in ebi_list]

    # Write to a temporary name and rename on success, so an interrupted run is
    # never mistaken for a finished file by the exists() check above.
    f_tmp = f + '.tmp'
    with gzip.open(f_tmp,'w') as fo:
        fo.write("q1\tq2\tscore\n".encode('utf-8'))
        for i, q1 in enumerate(ebi_list):
            if i % 100 == 0:
                print(i)
            if not keep[i]:
                continue
            row = pairwise_data[i]
            for j in range(i,len(ebi_list)):
                if keep[j]:
                    score = 1-row[j]
                    fo.write(f"{q1}\t{ebi_list[j]}\t{score}\n".encode('utf-8'))
    os.replace(f_tmp, f)



# Query strings that survive de-duplication; write_to_file only emits pairs
# whose two queries are both in this list.
dedup_query_list = list(ebi_df_dedup['query'])

# Create the pairwise file for every model that has a saved all-against-all matrix.
for m in modelData:
    name = m['name']
    f = f'{output}/{name}-ebi-aaa.npy'
    if not os.path.exists(f):
        continue
    dd = np.load(f)
    print(len(dd))
    write_to_file(model_name=name, pairwise_data=dd)


1303
writing BioSentVec
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1303
writing BioBERT
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1303
writing BlueBERT
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1303
writing GUSE
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300


In [90]:
# Format the BERT-EFO pairs like the other models: columns q1/q2, rows limited
# to the de-duplicated queries, written as a gzipped tab-separated file.
df = (
    pd.read_csv('data/BERT-EFO-ebi-query-pairwise.csv.gz')
    .rename(columns={'text_1': 'q1', 'text_2': 'q2'})
    .loc[lambda pairs: pairs['q1'].isin(dedup_query_list) & pairs['q2'].isin(dedup_query_list)]
)
df.to_csv(
    f'{output}/BERT-EFO-ebi-query-pairwise.tsv.gz',
    sep='\t',
    index=False,
    compression='gzip',
)

In [94]:
# test matrix: look at the nx scores among the first 10 de-duplicated EFO ids
print(ebi_df.shape)
e_list = list(ebi_df_dedup['full_id'].iloc[:10])
print(e_list)

# nx scores (tab-separated despite the .csv.gz name)
nx_df = pd.read_csv(f'{output}/nx-ebi-pairs.csv.gz', sep='\t')
print(nx_df.head())

# keep only the rows where both ids are in the sample
nx_df = nx_df.loc[nx_df['q1'].isin(e_list) & nx_df['q2'].isin(e_list)]
print(nx_df.shape)
print(nx_df.head())

# TODO: the plot is still disabled. The frame is in long form (q1, q2, score),
# so it presumably has to be pivoted into a numeric q1 x q2 matrix first:
#   nx_df = nx_df.pivot(index='q1',columns='q2')
#   sns.clustermap(nx_df)

(1303, 5)
['http://www.ebi.ac.uk/efo/EFO_0004264', 'http://www.ebi.ac.uk/efo/EFO_0004616', 'http://www.ebi.ac.uk/efo/EFO_0003778', 'http://www.ebi.ac.uk/efo/EFO_0003901', 'http://www.ebi.ac.uk/efo/EFO_0004247', 'http://www.ebi.ac.uk/efo/EFO_0003917', 'http://www.ebi.ac.uk/efo/EFO_0004143', 'http://www.ebi.ac.uk/efo/EFO_0004143', 'http://www.ebi.ac.uk/efo/EFO_0000217', 'http://www.ebi.ac.uk/efo/EFO_0000487']
                                     q1                                    q2  \
0  http://www.ebi.ac.uk/efo/EFO_0004264  http://www.ebi.ac.uk/efo/EFO_0004616   
1  http://www.ebi.ac.uk/efo/EFO_0004264  http://www.ebi.ac.uk/efo/EFO_0003778   
2  http://www.ebi.ac.uk/efo/EFO_0004264  http://www.ebi.ac.uk/efo/EFO_0003901   
3  http://www.ebi.ac.uk/efo/EFO_0004264  http://www.ebi.ac.uk/efo/EFO_0004247   
4  http://www.ebi.ac.uk/efo/EFO_0004264  http://www.ebi.ac.uk/efo/EFO_0003917   

      score  
0  0.263158  
1  0.172414  
2  0.181818  
3  0.384615  
4  0.200000  
(96, 3)
          

ValueError: could not convert string to float: 'http://www.ebi.ac.uk/efo/EFO_0004264'

In [74]:
# Scratch cell: load the dataset named "iris" through seaborn and print it.
# NOTE(review): nothing else in the visible cells uses `iris`; presumably it is
# a reference for the table layout seaborn expects. Confirm, or remove the cell.
iris = sns.load_dataset("iris")
print(iris)

     sepal_length  sepal_width  petal_length  petal_width    species
0             5.1          3.5           1.4          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica

[150 rows x 5 columns]
