In [1]:
# Load the notebook extensions used throughout this session.
# NOTE(review): lab_black presumably auto-formats executed cells with black — confirm.
%load_ext lab_black
%load_ext autoreload
# Mode 2 reloads imported modules before each cell runs, so edits to the
# local `funcs` package are picked up without restarting the kernel.
%autoreload 2

In [2]:
from typing import Dict, Any, Tuple, List
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
from loguru import logger
from icecream import ic

from funcs import utils, paths, info, info_revised, info_stage2
from funcs.data_processing import stage1_processing, stage1_nx, mapping_routine
from funcs.plots import stage1_plots

In [3]:
# Resolve the project root and the data root through the project's `utils`
# helpers and print each one right after it is resolved.
project_root = utils.find_project_root()
print(project_root)
data_dir = utils.find_data_root()
print(data_dir)

/data/ik18445_cache/projects/vectology/stage2
/data/ik18445_cache/projects/vectology/stage2/data


In [4]:
# Model directories under <project_root>/models, keyed by a descriptive name.
MODELS = {
    "biosentvec_model_dir": project_root / "models" / "biosentvec",
    "bioconceptvec_model_dir": project_root / "models" / "bioconceptvec",
}
# Fail fast if a model directory is absent, and say which one: a bare assert
# would only raise an empty AssertionError with no hint about the missing path.
for k, v in MODELS.items():
    assert v.exists(), f"Model directory for {k!r} does not exist: {v}"

# Load data

## source data

In [5]:
# Read the tab-separated UKB master table from the registered source path.
UKB_MASTER = pd.read_csv(paths.init["ukb_master"], sep="\t")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits an extra "None" line.
UKB_MASTER.info()
# Last expression of the cell: rendered as the (truncated) rich table.
UKB_MASTER

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1565 entries, 0 to 1564
Data columns (total 7 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   ZOOMA QUERY                                1565 non-null   object
 1   MAPPED_TERM_LABEL                          1552 non-null   object
 2   MAPPED_TERM_URI                            1564 non-null   object
 3   MAPPING_TYPE                               1563 non-null   object
 4   ICD10_CODE/SELF_REPORTED_TRAIT_FIELD_CODE  1564 non-null   object
 5   COMMENTS/TICKET                            1564 non-null   object
 6   AI                                         1 non-null      object
dtypes: object(7)
memory usage: 85.7+ KB
None


Unnamed: 0,ZOOMA QUERY,MAPPED_TERM_LABEL,MAPPED_TERM_URI,MAPPING_TYPE,ICD10_CODE/SELF_REPORTED_TRAIT_FIELD_CODE,COMMENTS/TICKET,AI
0,Vascular disorders of intestine,vascular disease,"EFO_0004264, EFO_0009431",Broad,K55,DONE,
1,Gonarthrosis,osteoarthritis || knee,EFO_0004616,Broad,M17,DONE,
2,Psoriatic and enteropathic arthropathies,psoriatic arthritis,EFO_0003778,? Broad,M07,DONE,
3,Pain associated with micturition,dysuria,EFO_0003901,? Broad,R30,DONE,
4,Other mood,mood disorder,EFO_0004247,? Broad,F38,DONE,
...,...,...,...,...,...,...,...
1560,Candidiasis,"Candidiasis, Invasive",EFO_1001283,Narrow,B37,DONE,
1561,Other bacterial intestinal infections,intestinal disease||bacterial disease,EFO_0009431||EFO_0000771,Broad,A04,DONE,
1562,Viral and other specified intestinal infections,intestinal disease||viral disease,EFO_0009431||EFO_0000763,Broad,A08,DONE,
1563,Other predominantly sexually transmitted disea...,bacterial sexually transmitted disease,EFO_0003955,Narrow,A63,DONE,


## stage1 cache

In [6]:
# Fetch the EFO node table through the project's stage1 processing helper and
# display it. Per the cell output, the helper logs the frame's info() summary
# and the frame has the columns `efo_label` and `efo_id`.
EFO_NODES = stage1_processing.get_efo_nodes()
EFO_NODES

2022-10-06 22:05:58.871 | INFO     | funcs.data_processing.stage1_processing:get_efo_nodes:24 - <class 'pandas.core.frame.DataFrame'>
Int64Index: 25380 entries, 0 to 25389
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   efo_label  25380 non-null  object
 1   efo_id     25380 non-null  object
dtypes: object(2)
memory usage: 594.8+ KB



Unnamed: 0,efo_label,efo_id
0,xeroderma pigmentosum variant,http://www.orpha.net/ORDO/Orphanet_90342
1,xeroderma pigmentosum,http://www.orpha.net/ORDO/Orphanet_910
2,ischemic stroke,http://purl.obolibrary.org/obo/HP_0002140
3,cerebral ischemia,http://purl.obolibrary.org/obo/HP_0002637
4,small cell carcinoma,http://www.ebi.ac.uk/efo/EFO_0008524
...,...,...
25385,acetazolamide-responsive myotonia,http://www.orpha.net/ORDO/Orphanet_99736
25386,complete androgen insensitivity syndrome,http://www.orpha.net/ORDO/Orphanet_99429
25387,intermediate dend syndrome,http://www.orpha.net/ORDO/Orphanet_99989
25388,epiblepharon,http://www.orpha.net/ORDO/Orphanet_99169


In [7]:
# Read the EFO edge list (default comma-separated parsing) from the path
# registered under paths.init["efo_edges"] and display it. The displayed
# columns are `efo.id` and `parent_efo.id`.
# NOTE(review): presumably each row is a child -> parent link — confirm.
EFO_RELS = pd.read_csv(paths.init["efo_edges"])
EFO_RELS

Unnamed: 0,efo.id,parent_efo.id
0,http://www.orpha.net/ORDO/Orphanet_90342,http://www.orpha.net/ORDO/Orphanet_910
1,http://www.orpha.net/ORDO/Orphanet_910,http://www.orpha.net/ORDO/Orphanet_363245
2,http://www.orpha.net/ORDO/Orphanet_910,http://www.orpha.net/ORDO/Orphanet_183422
3,http://www.orpha.net/ORDO/Orphanet_910,http://www.orpha.net/ORDO/Orphanet_139027
4,http://www.orpha.net/ORDO/Orphanet_910,http://www.orpha.net/ORDO/Orphanet_98097
...,...,...
43127,http://www.orpha.net/ORDO/Orphanet_99736,http://www.orpha.net/ORDO/Orphanet_612
43128,http://www.orpha.net/ORDO/Orphanet_99429,http://www.orpha.net/ORDO/Orphanet_754
43129,http://www.orpha.net/ORDO/Orphanet_99989,http://www.orpha.net/ORDO/Orphanet_79134
43130,http://www.orpha.net/ORDO/Orphanet_99169,http://www.orpha.net/ORDO/Orphanet_98567


In [8]:
# Fetch the EBI mapping table through the project's stage1 processing helper
# and display it. Per the cell output, the helper logs the frame's info()
# summary and the value counts of the MAPPING_TYPE column.
EBI_DATA = stage1_processing.get_ebi_data()
EBI_DATA

2022-10-06 22:05:58.987 | INFO     | funcs.data_processing.stage1_processing:get_ebi_data:32 - <class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191 entries, 0 to 1190
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   query              1191 non-null   object
 1   MAPPED_TERM_LABEL  1191 non-null   object
 2   MAPPED_TERM_URI    1191 non-null   object
 3   MAPPING_TYPE       1191 non-null   object
 4   id                 1191 non-null   object
 5   full_id            1191 non-null   object
 6   mapping_id         1191 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 65.3+ KB

2022-10-06 22:05:58.989 | INFO     | funcs.data_processing.stage1_processing:get_ebi_data:33 - 
Exact       603
Broad       527
Narrow       47
?             7
? Broad       3
? Narrow      2
? Exact       1
Narrow?       1
Name: MAPPING_TYPE, dtype: int64


Unnamed: 0,query,MAPPED_TERM_LABEL,MAPPED_TERM_URI,MAPPING_TYPE,id,full_id,mapping_id
0,gonarthrosis,osteoarthritis || knee,EFO_0004616,Broad,EFO_0004616,http://www.ebi.ac.uk/efo/EFO_0004616,1
1,psoriatic and enteropathic arthropathies,psoriatic arthritis,EFO_0003778,? Broad,EFO_0003778,http://www.ebi.ac.uk/efo/EFO_0003778,2
2,pain associated with micturition,dysuria,EFO_0003901,? Broad,EFO_0003901,http://www.ebi.ac.uk/efo/EFO_0003901,3
3,other mood,mood disorder,EFO_0004247,? Broad,EFO_0004247,http://www.ebi.ac.uk/efo/EFO_0004247,4
4,preterm delivery,premature birth,EFO_0003917,? Exact,EFO_0003917,http://www.ebi.ac.uk/efo/EFO_0003917,5
...,...,...,...,...,...,...,...
1186,malignant neoplasm without specification of site,cancer,EFO_0000311,Broad,EFO_0000311,http://www.ebi.ac.uk/efo/EFO_0000311,1187
1187,other and unspecified types of non-hodgkin's l...,non-Hodgkins lymphoma,EFO_0005952,Exact,EFO_0005952,http://www.ebi.ac.uk/efo/EFO_0005952,1188
1188,candidiasis,"Candidiasis, Invasive",EFO_1001283,Narrow,EFO_1001283,http://www.ebi.ac.uk/efo/EFO_1001283,1189
1189,other predominantly sexually transmitted disea...,bacterial sexually transmitted disease,EFO_0003955,Narrow,EFO_0003955,http://www.ebi.ac.uk/efo/EFO_0003955,1190


In [9]:
# Load the pickled EFO ontology object from the stage2 cache and show its repr
# (the output shows an nxontology NXOntology instance).
# NOTE(review): nx.read_gpickle was deprecated in networkx 2.6 and removed in
# 3.0 — confirm the pinned networkx version, or migrate to the `pickle` module
# (taking care if the file is compressed).
# NOTE(review): unpickling can execute arbitrary code; only load this file from
# a trusted source.
EFO_NX = nx.read_gpickle(paths.stage2["efo_nx"])
EFO_NX

<nxontology.ontology.NXOntology at 0x7f35bf49c1f0>

In [10]:
# Read the two stage1 "wa_nx" result tables from their registered cache paths.
STAGE1_WA_NX_1_ALL = pd.read_csv(paths.stage1["stage1_wa_nx_1_all"])
STAGE1_WA_NX_10_ALL = pd.read_csv(paths.stage1["stage1_wa_nx_10_all"])

# Summarise both frames with one loop instead of a copy-pasted block per frame.
for _frame in (STAGE1_WA_NX_1_ALL, STAGE1_WA_NX_10_ALL):
    # DataFrame.info() prints its own summary and returns None, so it is called
    # directly; wrapping it in print() emits an extra "None" line.
    _frame.info()
    print(_frame.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10602 entries, 0 to 10601
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   efo     10602 non-null  object 
 1   Model   10602 non-null  object 
 2   value   10602 non-null  float64
dtypes: float64(1), object(2)
memory usage: 248.6+ KB
None
                                    efo         Model  value
0  http://www.ebi.ac.uk/efo/EFO_0004616  BLUEBERT-EFO  0.412
1  http://www.ebi.ac.uk/efo/EFO_0003778  BLUEBERT-EFO  1.000
2  http://www.ebi.ac.uk/efo/EFO_0003901  BLUEBERT-EFO  0.125
3  http://www.ebi.ac.uk/efo/EFO_0004247  BLUEBERT-EFO  0.529
4  http://www.ebi.ac.uk/efo/EFO_0003917  BLUEBERT-EFO  0.529
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9424 entries, 0 to 9423
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   efo     9424 non-null   object 
 1   Model   9424 non-null   object 
 2   value   9424 non-null  

In [11]:
# Read the tab-separated stage1 pairwise score table from its registered cache path.
STAGE1_COM_SCORES = pd.read_csv(paths.stage1["stage1_com_scores"], sep="\t")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits an extra "None" line.
STAGE1_COM_SCORES.info()
# Last expression of the cell: rendered as the (truncated) rich table.
STAGE1_COM_SCORES

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129795 entries, 0 to 129794
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   m1            129795 non-null  int64  
 1   m2            129795 non-null  int64  
 2   e1            129795 non-null  object 
 3   e2            129795 non-null  object 
 4   q1            129795 non-null  object 
 5   q2            129795 non-null  object 
 6   EFO-batet     129795 non-null  float64
 7   BLUEBERT-EFO  129795 non-null  float64
 8   BioBERT       129795 non-null  float64
 9   BioSentVec    129795 non-null  float64
 10  BlueBERT      129795 non-null  float64
 11  GUSE          129795 non-null  float64
 12  Spacy         129795 non-null  float64
 13  SciSpacy      129795 non-null  float64
 14  Levenshtein   129795 non-null  float64
dtypes: float64(9), int64(2), object(4)
memory usage: 14.9+ MB
None


Unnamed: 0,m1,m2,e1,e2,q1,q2,EFO-batet,BLUEBERT-EFO,BioBERT,BioSentVec,BlueBERT,GUSE,Spacy,SciSpacy,Levenshtein
0,84,85,http://www.ebi.ac.uk/efo/EFO_0000249,http://www.ebi.ac.uk/efo/EFO_0000465,alzheimer s disease,endocarditis valve unspecified,0.217391,-6.919846,0.902862,0.089074,0.589859,0.084499,0.333612,0.171020,0.326531
1,84,86,http://www.ebi.ac.uk/efo/EFO_0000249,http://www.ebi.ac.uk/efo/EFO_0000493,alzheimer s disease,family history of other conditions,0.055556,-8.149734,0.894628,0.125254,0.739749,0.165959,0.468795,0.174757,0.377358
2,84,87,http://www.ebi.ac.uk/efo/EFO_0000249,http://www.ebi.ac.uk/efo/EFO_0000668,alzheimer s disease,unspecified maternal hypertension,0.217391,-7.260237,0.882922,0.142996,0.718458,0.130364,0.558215,0.250431,0.307692
3,84,88,http://www.ebi.ac.uk/efo/EFO_0000249,http://www.ebi.ac.uk/efo/EFO_0000712,alzheimer s disease,stroke not specified as haemorrhage or infarction,0.380952,-5.399814,0.843176,0.219896,0.674608,0.057737,0.503609,0.269646,0.264706
4,84,89,http://www.ebi.ac.uk/efo/EFO_0000249,http://www.ebi.ac.uk/efo/EFO_0000713,alzheimer s disease,subarachnoid haemorrhage,0.380952,-3.965668,0.852716,0.188517,0.625172,0.197011,0.246184,0.037284,0.325581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129790,1181,1188,http://www.ebi.ac.uk/efo/EFO_0005803,http://www.ebi.ac.uk/efo/EFO_0005952,other diseases of blood and blood-forming organs,other and unspecified types of non-hodgkin's l...,0.300000,-5.626929,0.882949,0.248690,0.730082,0.344262,0.727008,0.427487,0.475248
129791,1181,1191,http://www.ebi.ac.uk/efo/EFO_0005803,http://www.ebi.ac.uk/efo/EFO_0005539,other diseases of blood and blood-forming organs,other disorders of adrenal gland,0.500000,-4.607293,0.937697,0.397043,0.781411,0.431633,0.743512,0.537745,0.575000
129792,1182,1188,http://www.ebi.ac.uk/efo/EFO_0000540,http://www.ebi.ac.uk/efo/EFO_0005952,other disorders involving the immune mechanism...,other and unspecified types of non-hodgkin's l...,0.238095,-5.535294,0.906498,0.241107,0.723806,0.474068,0.755674,0.448147,0.368000
129793,1182,1191,http://www.ebi.ac.uk/efo/EFO_0000540,http://www.ebi.ac.uk/efo/EFO_0005539,other disorders involving the immune mechanism...,other disorders of adrenal gland,0.500000,-5.313255,0.938729,0.240665,0.774076,0.440988,0.743499,0.502559,0.500000
