In [1]:
import pandas as pd
import numpy as np
import requests
import json
import time
import os 

from scripts.vectology_functions import create_aaa_distances, create_pair_distances, embed_text
from nxontology import NXOntology


Creating distances...
1 1
1


In [2]:
# get the EBI UKB data
#get ebi data
url='https://raw.githubusercontent.com/EBISPOT/EFO-UKB-mappings/master/UK_Biobank_master_file.tsv'
ebi_df = pd.read_csv(url,sep='\t')
#create test set
print(ebi_df.head())
print(ebi_df.shape)

                                ZOOMA QUERY       MAPPED_TERM_LABEL  \
0           Vascular disorders of intestine        vascular disease   
1                              Gonarthrosis  osteoarthritis || knee   
2  Psoriatic and enteropathic arthropathies     psoriatic arthritis   
3          Pain associated with micturition                 dysuria   
4                                Other mood           mood disorder   

            MAPPED_TERM_URI MAPPING_TYPE  \
0  EFO_0004264, EFO_0009431        Broad   
1               EFO_0004616        Broad   
2               EFO_0003778      ? Broad   
3               EFO_0003901      ? Broad   
4               EFO_0004247      ? Broad   

  ICD10_CODE/SELF_REPORTED_TRAIT_FIELD_CODE COMMENTS/TICKET   AI  
0                                       K55            DONE  NaN  
1                                       M17            DONE  NaN  
2                                       M07            DONE  NaN  
3                                       

In [3]:
# preprocess
process_text=[]
for k,g in ebi_df.groupby(np.arange(len(ebi_df))//20):
    params={'text_list':list(g['ZOOMA QUERY'])}
    process_res = requests.post('http://vectology-api.mrcieu.ac.uk/preprocess',data=json.dumps(params))
    process_text.extend([d['result'].replace('unspecified','').replace('nec','') for d in process_res.json()])
print(len(process_text))
    
ebi_df.loc[:, 'processed'] = process_text
print(ebi_df.head())

1565
                                ZOOMA QUERY       MAPPED_TERM_LABEL  \
0           Vascular disorders of intestine        vascular disease   
1                              Gonarthrosis  osteoarthritis || knee   
2  Psoriatic and enteropathic arthropathies     psoriatic arthritis   
3          Pain associated with micturition                 dysuria   
4                                Other mood           mood disorder   

            MAPPED_TERM_URI MAPPING_TYPE  \
0  EFO_0004264, EFO_0009431        Broad   
1               EFO_0004616        Broad   
2               EFO_0003778      ? Broad   
3               EFO_0003901      ? Broad   
4               EFO_0004247      ? Broad   

  ICD10_CODE/SELF_REPORTED_TRAIT_FIELD_CODE COMMENTS/TICKET   AI  \
0                                       K55            DONE  NaN   
1                                       M17            DONE  NaN   
2                                       M07            DONE  NaN   
3                              

In [4]:
# embed all the variables using BioSentVec
def encode_traits(trait_df,col,name,model):

    vectorList=[]
    count = 0
    #loop through 10 rows at a time
    for k,g in trait_df.groupby(np.arange(len(trait_df))//10):
        #get text for embedding
        textList=list(g[col])
        res = embed_text(textList,model)
        
        #add vectors to list
        for i in range(0,len(textList)):
            vectorList.append(res[i])
            
        count+=10
        if count % 1000 == 0:
            print(count,trait_df.shape[0])

    print(len(vectorList),'vectors created')        
    trait_df[name] = vectorList
    return trait_df

In [5]:
%%time

f='data/ebi-ukb-vec.tsv.gz'
if os.path.exists(f):
    print('Already done')
    ebi_df = pd.read_csv(f,sep='\t')
    #issues with reading vectors from CSV 
    ebi_df['BioSentVec'] = ebi_df['BioSentVec'].apply(lambda x: [float(y) for y in x.replace('[','').replace(']','').split(',')])
else:
    ebi_df = encode_traits(trait_df=ebi_df,col='processed',name='BioSentVec',model='BioSentVec')
    ebi_df.to_csv(f,sep='\t',compression='gzip')
print(ebi_df.head())

Already done
   Unnamed: 0                               ZOOMA QUERY  \
0           0           Vascular disorders of intestine   
1           1                              Gonarthrosis   
2           2  Psoriatic and enteropathic arthropathies   
3           3          Pain associated with micturition   
4           4                                Other mood   

        MAPPED_TERM_LABEL           MAPPED_TERM_URI MAPPING_TYPE  \
0        vascular disease  EFO_0004264, EFO_0009431        Broad   
1  osteoarthritis || knee               EFO_0004616        Broad   
2     psoriatic arthritis               EFO_0003778      ? Broad   
3                 dysuria               EFO_0003901      ? Broad   
4           mood disorder               EFO_0004247      ? Broad   

  ICD10_CODE/SELF_REPORTED_TRAIT_FIELD_CODE COMMENTS/TICKET   AI  \
0                                       K55            DONE  NaN   
1                                       M17            DONE  NaN   
2                  

In [34]:
%%time

#get EFO data
efo_data = 'data/efo-nodes.tsv'
efo_df=pd.read_csv(efo_data,sep='\t',names=['name','label','type'])
print(efo_df.head())
print(efo_df.shape)

#check all terms in EBI data set are here
efo_labels = list(efo_df['label'].str.lower())
ukb_labels = list(ebi_df['MAPPED_TERM_LABEL'].str.lower())
efo_check = list(set(ukb_labels) - set(efo_labels))
efo_check

                                        name                          label  \
0   http://www.orpha.net/ORDO/Orphanet_90342  Xeroderma pigmentosum variant   
1     http://www.orpha.net/ORDO/Orphanet_910          Xeroderma pigmentosum   
2  http://purl.obolibrary.org/obo/HP_0002140                Ischemic stroke   
3  http://purl.obolibrary.org/obo/HP_0002637              Cerebral ischemia   
4       http://www.ebi.ac.uk/efo/EFO_0008524           small cell carcinoma   

            type  
0  typed-literal  
1  typed-literal  
2  typed-literal  
3  typed-literal  
4  typed-literal  
(25390, 3)
CPU times: user 61.4 ms, sys: 6.32 ms, total: 67.7 ms
Wall time: 70.5 ms


[nan,
 'hepatitis b infection',
 'cardiac pacemaker',
 'hiv infection  || aids',
 'hearing assistance system/able to hear with hearing aids',
 'medical examination',
 "spinal fracture'",
 'first degree perineal laceration',
 'pilonidal asbcess',
 'anus disease||rectal disease',
 'need to sleep during day',
 'creatine clearance measurement',
 'gastroenteritis  || dysentery',
 'hepatitis a',
 'gallstones  || cholelithiasis',
 'ear pain||otitis media with effusion',
 'subsequent st elevation (stemi) and non-st elevation (nstemi) myocardian infarction',
 'hypothyroidism  || myxedema',
 'diarrhoea||gastroenteritis',
 'scleroderma  || systemic scleroderma',
 'obstetric labor complications',
 'uterus neoplasm',
 'abnormality of the renal pelvis  || abnormality of the ureter',
 'congenital ichthyosis',
 'staphylococcal skin infection',
 'liver disease  || biliary tract disease  || pancreas disease',
 'skull factures',
 "dementia  || alzheimer's disease  || cognitive impairment",
 'excessive da

In [7]:
%%time

f='data/efo-vec.tsv.gz'
if os.path.exists(f):
    print('Already done')
    efo_df = pd.read_csv(f,sep='\t')
    #issues with reading vectors from CSV 
    efo_df['BioSentVec'] = efo_df['BioSentVec'].apply(lambda x: [float(y) for y in x.replace('[','').replace(']','').split(',')])
else:
    efo_df = encode_traits(trait_df=efo_df,col='label',name='BioSentVec',model='BioSentVec')
    efo_df.to_csv(f,sep='\t',compression='gzip')
    
print(efo_df.head())


Already done
   Unnamed: 0                                       name  \
0           0   http://www.orpha.net/ORDO/Orphanet_90342   
1           1     http://www.orpha.net/ORDO/Orphanet_910   
2           2  http://purl.obolibrary.org/obo/HP_0002140   
3           3  http://purl.obolibrary.org/obo/HP_0002637   
4           4       http://www.ebi.ac.uk/efo/EFO_0008524   

                           label           type  \
0  Xeroderma pigmentosum variant  typed-literal   
1          Xeroderma pigmentosum  typed-literal   
2                Ischemic stroke  typed-literal   
3              Cerebral ischemia  typed-literal   
4           small cell carcinoma  typed-literal   

                                          BioSentVec  
0  [-0.045969586819410324, 0.5196870565414429, -0...  
1  [0.01842655800282955, 0.7100013494491577, -0.7...  
2  [0.5974032282829285, -0.6174705028533936, -0.1...  
3  [0.46593865752220154, -0.6334501504898071, 0.0...  
4  [0.4248107373714447, 0.033654093742370605

In [8]:

def create_efo_nxo() -> NXOntology:
    nxo = NXOntology()
    
    edges = []
    efo_data='data/efo_data.txt.gz'
    efo_df=pd.read_json(efo_data)
    for i,row in efo_df.iterrows():
        child = row['childLabel']['value']
        parent = row['parentLabel']['value']
        edges.append((parent,child))
    print(edges[0:10])
    nxo.graph.add_edges_from(edges)
    return nxo

efo_nx = create_efo_nxo()
efo_nx.freeze()

[('Proteinuria', 'albuminuria'), ('Myopia', 'pathological myopia'), ('dementia', 'AIDS dementia'), ('abnormality of blood and blood-forming tissues', 'Menorrhagia'), ('abnormality of blood and blood-forming tissues', 'thrombocytopenia'), ('abnormality of blood and blood-forming tissues', 'aplastic anemia'), ('abnormality of blood and blood-forming tissues', 'Histiocytosis'), ('abnormality of blood and blood-forming tissues', 'leukopenia'), ('abnormality of blood and blood-forming tissues', 'anemia'), ('Nausea and vomiting', 'Nausea')]


In [18]:
%%time

#ebi_df['BioSentVec']
v1 = list(ebi_df['BioSentVec'])

v2 = list(efo_df['BioSentVec'])


CPU times: user 2.24 ms, sys: 34 µs, total: 2.28 ms
Wall time: 2.29 ms


In [19]:
%%time
# cosine of all against all
dd = create_pair_distances(v1,v2)

Creating distances...
1565 25390
1565
CPU times: user 29.7 s, sys: 524 ms, total: 30.2 s
Wall time: 30.5 s


IndexError: index 100 is out of bounds for axis 0 with size 100