In [64]:
import json
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from nxontology import NXOntology

from scripts.vectology_functions import create_aaa_distances, embed_text


In [49]:
# Download the EBI EFO-UKB mapping table (tab-separated) directly from GitHub
url='https://raw.githubusercontent.com/EBISPOT/EFO-UKB-mappings/master/UK_Biobank_master_file.tsv'
ebi_df = pd.read_table(url)
# Preview the first rows, followed by the (rows, columns) shape
print(ebi_df.head(), ebi_df.shape, sep='\n')

                                ZOOMA QUERY       MAPPED_TERM_LABEL  \
0           Vascular disorders of intestine        vascular disease   
1                              Gonarthrosis  osteoarthritis || knee   
2  Psoriatic and enteropathic arthropathies     psoriatic arthritis   
3          Pain associated with micturition                 dysuria   
4                                Other mood           mood disorder   

            MAPPED_TERM_URI MAPPING_TYPE  \
0  EFO_0004264, EFO_0009431        Broad   
1               EFO_0004616        Broad   
2               EFO_0003778      ? Broad   
3               EFO_0003901      ? Broad   
4               EFO_0004247      ? Broad   

  ICD10_CODE/SELF_REPORTED_TRAIT_FIELD_CODE COMMENTS/TICKET   AI  
0                                       K55            DONE  NaN  
1                                       M17            DONE  NaN  
2                                       M07            DONE  NaN  
3                                       

In [50]:
# Preprocess the 'ZOOMA QUERY' text with the Vectology /preprocess endpoint,
# sending 20 rows per request, and store the result in a new 'processed' column.
process_text=[]
for k,g in ebi_df.groupby(np.arange(len(ebi_df))//20):
    params={'text_list':list(g['ZOOMA QUERY'])}
    # A timeout stops a stalled request from hanging the notebook indefinitely.
    process_res = requests.post('http://vectology-api.mrcieu.ac.uk/preprocess',data=json.dumps(params),timeout=120)
    # Fail loudly on an HTTP error instead of trying to parse an error body as results.
    process_res.raise_for_status()
    # Drop 'unspecified' and 'nec' only when they are whole words: a plain
    # str.replace('nec','') also cut them out of words such as 'necrosis' or 'connective'.
    process_text.extend(re.sub(r'\b(?:unspecified|nec)\b','',d['result']) for d in process_res.json())
print(len(process_text))

ebi_df.loc[:, 'processed'] = process_text
print(ebi_df.head())

1565
                                ZOOMA QUERY       MAPPED_TERM_LABEL  \
0           Vascular disorders of intestine        vascular disease   
1                              Gonarthrosis  osteoarthritis || knee   
2  Psoriatic and enteropathic arthropathies     psoriatic arthritis   
3          Pain associated with micturition                 dysuria   
4                                Other mood           mood disorder   

            MAPPED_TERM_URI MAPPING_TYPE  \
0  EFO_0004264, EFO_0009431        Broad   
1               EFO_0004616        Broad   
2               EFO_0003778      ? Broad   
3               EFO_0003901      ? Broad   
4               EFO_0004247      ? Broad   

  ICD10_CODE/SELF_REPORTED_TRAIT_FIELD_CODE COMMENTS/TICKET   AI  \
0                                       K55            DONE  NaN   
1                                       M17            DONE  NaN   
2                                       M07            DONE  NaN   
3                              

In [55]:
# embed a text column in batches with the chosen model (e.g. BioSentVec)
def encode_traits(trait_df,col,name,model,batch_size=10):
    """Embed the text in one column of a DataFrame and store the vectors in a new column.

    Rows are passed to ``embed_text`` in consecutive batches, in row order, and
    the returned vectors are collected into one list that becomes column ``name``.

    Args:
        trait_df: DataFrame holding the text to embed. It is modified in place
            (column ``name`` is added or overwritten).
        col: Name of the column containing the text.
        name: Name of the column that receives the vectors.
        model: Model identifier, passed through unchanged to ``embed_text``.
        batch_size: Number of rows passed to ``embed_text`` per call. Defaults
            to 10, the value that used to be hard-coded.

    Returns:
        The same ``trait_df`` object, with column ``name`` set.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    progress_step = 1000  # report progress each time another 1000 rows are done
    total = trait_df.shape[0]
    texts = list(trait_df[col])

    vectorList=[]
    for start in range(0, total, batch_size):
        #get text for embedding
        textList = texts[start:start+batch_size]
        res = embed_text(textList,model)

        #add vectors to list (one result per input text, in order)
        vectorList.extend(res[i] for i in range(len(textList)))

        # Count the rows actually embedded, so a short final batch is not
        # reported as a full one.
        done = len(vectorList)
        if done // progress_step > (done - len(textList)) // progress_step:
            print(done,total)

    print(len(vectorList),'vectors created')
    trait_df[name] = vectorList
    return trait_df

In [56]:
%%time

# Embedding is slow, so the embedded table is cached on disk and reused.
# Delete the cache file to force re-embedding (e.g. after the preprocessing changes).
f='data/ebi-ukb-vec.tsv.gz'
if os.path.exists(f):
    print('Already done')
    # Reload the cache so that a fresh kernel also ends up with the 'BioSentVec'
    # column; index_col=0 restores the index that to_csv wrote as the first column.
    ebi_df = pd.read_csv(f,sep='\t',index_col=0)
    # The TSV holds each vector as text; decode it back into a list so the cached
    # path yields the same values as the freshly computed one.
    ebi_df['BioSentVec'] = [json.loads(v) if isinstance(v,str) else v for v in ebi_df['BioSentVec']]
else:
    ebi_df = encode_traits(trait_df=ebi_df,col='processed',name='BioSentVec',model='BioSentVec')
    ebi_df.to_csv(f,sep='\t',compression='gzip')
print(ebi_df.head())

100 1565
200 1565
300 1565
400 1565
500 1565
600 1565
700 1565
800 1565
900 1565
1000 1565
1100 1565
1200 1565
1300 1565
1400 1565
1500 1565
1565 vectors created
                                ZOOMA QUERY       MAPPED_TERM_LABEL  \
0           Vascular disorders of intestine        vascular disease   
1                              Gonarthrosis  osteoarthritis || knee   
2  Psoriatic and enteropathic arthropathies     psoriatic arthritis   
3          Pain associated with micturition                 dysuria   
4                                Other mood           mood disorder   

            MAPPED_TERM_URI MAPPING_TYPE  \
0  EFO_0004264, EFO_0009431        Broad   
1               EFO_0004616        Broad   
2               EFO_0003778      ? Broad   
3               EFO_0003901      ? Broad   
4               EFO_0004247      ? Broad   

  ICD10_CODE/SELF_REPORTED_TRAIT_FIELD_CODE COMMENTS/TICKET   AI  \
0                                       K55            DONE  NaN   
1         

In [60]:
%%time

# Load the EFO node table (tab-separated); the three column names are supplied
# explicitly rather than read from the file
efo_data = 'data/efo-nodes.tsv'
efo_df = pd.read_table(efo_data, names=['name','label','type'])
# Preview the first rows, followed by the (rows, columns) shape
print(efo_df.head(), efo_df.shape, sep='\n')

                                        name                          label  \
0   http://www.orpha.net/ORDO/Orphanet_90342  Xeroderma pigmentosum variant   
1     http://www.orpha.net/ORDO/Orphanet_910          Xeroderma pigmentosum   
2  http://purl.obolibrary.org/obo/HP_0002140                Ischemic stroke   
3  http://purl.obolibrary.org/obo/HP_0002637              Cerebral ischemia   
4       http://www.ebi.ac.uk/efo/EFO_0008524           small cell carcinoma   

            type  
0  typed-literal  
1  typed-literal  
2  typed-literal  
3  typed-literal  
4  typed-literal  
(25390, 3)
CPU times: user 43.2 ms, sys: 7.09 ms, total: 50.3 ms
Wall time: 48.7 ms


In [62]:
%%time

# Embed every EFO label once and cache the table on disk; later runs reuse the cache.
# NOTE(review): the reloaded frame carries an extra 'Unnamed: 0' column (the saved
# index), and its 'BioSentVec' values are presumably strings rather than lists after
# the TSV round trip - confirm that downstream code expects this.
f='data/efo-vec.tsv.gz'
if not os.path.exists(f):
    efo_df = encode_traits(trait_df=efo_df,col='label',name='BioSentVec',model='BioSentVec')
    efo_df.to_csv(f,sep='\t',compression='gzip')
else:
    print('Already done')
    efo_df = pd.read_csv(f,sep='\t')

print(efo_df.head())


Already done
   Unnamed: 0                                       name  \
0           0   http://www.orpha.net/ORDO/Orphanet_90342   
1           1     http://www.orpha.net/ORDO/Orphanet_910   
2           2  http://purl.obolibrary.org/obo/HP_0002140   
3           3  http://purl.obolibrary.org/obo/HP_0002637   
4           4       http://www.ebi.ac.uk/efo/EFO_0008524   

                           label           type  \
0  Xeroderma pigmentosum variant  typed-literal   
1          Xeroderma pigmentosum  typed-literal   
2                Ischemic stroke  typed-literal   
3              Cerebral ischemia  typed-literal   
4           small cell carcinoma  typed-literal   

                                          BioSentVec  
0  [-0.045969586819410324, 0.5196870565414429, -0...  
1  [0.01842655800282955, 0.7100013494491577, -0.7...  
2  [0.5974032282829285, -0.6174705028533936, -0.1...  
3  [0.46593865752220154, -0.6334501504898071, 0.0...  
4  [0.4248107373714447, 0.033654093742370605

In [66]:

def create_efo_nxo(efo_data='data/efo_data.txt.gz') -> NXOntology:
    """Build an NXOntology from the parent/child label pairs stored in ``efo_data``.

    Args:
        efo_data: Path of the JSON file read with ``pandas.read_json``. Every
            record needs ``parentLabel`` and ``childLabel`` entries, each a
            mapping with a ``'value'`` key. Defaults to the path that used to
            be hard-coded.

    Returns:
        A new NXOntology with one (parent label, child label) edge per record.
        The ontology is not frozen here; that is left to the caller.
    """
    nxo = NXOntology()

    efo_df=pd.read_json(efo_data)
    # Pair the two columns directly rather than building a Series per row with iterrows().
    edges = [
        (parent['value'], child['value'])
        for parent, child in zip(efo_df['parentLabel'], efo_df['childLabel'])
    ]
    print(edges[0:10])
    nxo.graph.add_edges_from(edges)
    return nxo

# Build the EFO ontology graph, then freeze it.
# NOTE(review): freeze() presumably makes the ontology read-only so that nxontology
# can cache computed properties - confirm against the nxontology documentation.
efo = create_efo_nxo()
efo.freeze()

[('Proteinuria', 'albuminuria'), ('Myopia', 'pathological myopia'), ('dementia', 'AIDS dementia'), ('abnormality of blood and blood-forming tissues', 'Menorrhagia'), ('abnormality of blood and blood-forming tissues', 'thrombocytopenia'), ('abnormality of blood and blood-forming tissues', 'aplastic anemia'), ('abnormality of blood and blood-forming tissues', 'Histiocytosis'), ('abnormality of blood and blood-forming tissues', 'leukopenia'), ('abnormality of blood and blood-forming tissues', 'anemia'), ('Nausea and vomiting', 'Nausea')]
