In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to helper modules take effect without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import re
import json
from pathlib import Path
from typing import Any, Dict, List

import requests
import pandas as pd
import janitor
import numpy as np
from loguru import logger

In [3]:
import sys
# Put the current working directory on the import path so the `fn` module
# imported below can be resolved.
# NOTE(review): presumably `fn` is a helper module kept next to this notebook — confirm.
sys.path.append(".")

import fn

In [4]:
# Read data/unmapped_to_EFO.csv into orig_df; fail early with the offending
# path if the file is not where the notebook expects it.
file_path = Path(".") / "data" / "unmapped_to_EFO.csv"
assert file_path.exists(), file_path

orig_df = pd.read_csv(file_path)
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None".
orig_df.info()
orig_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596 entries, 0 to 595
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Phecode        596 non-null    float64
 1   description    596 non-null    object 
 2   dataset        596 non-null    object 
 3   MVP_Cases      596 non-null    float64
 4   MVP_Controls   596 non-null    float64
 5   trait_type     596 non-null    object 
 6   UKBB_filename  371 non-null    object 
 7   UKBB_Cases     371 non-null    float64
 8   UKBB_Controls  371 non-null    float64
dtypes: float64(5), object(4)
memory usage: 42.0+ KB
None


Unnamed: 0,Phecode,description,dataset,MVP_Cases,MVP_Controls,trait_type,UKBB_filename,UKBB_Cases,UKBB_Controls
0,8.51,Intestinal e.coli,MVP,57.0,463932.0,Phecode,,,
1,31.0,Diseases due to other mycobacteria,MVP,790.0,463199.0,Phecode,,,
2,41.11,Methicillin sensitive Staphylococcus aureus,MVP,4227.0,459762.0,Phecode,,,
3,41.12,Methicillin resistant Staphylococcus aureus,MVP,4266.0,459723.0,Phecode,,,
4,41.8,H. pylori,MVP,2616.0,461373.0,Phecode,,,


In [5]:
# Duplicate check: print how many distinct Phecode values there are, then
# display every row whose Phecode occurs more than once.

print(len(orig_df["Phecode"].drop_duplicates()))
orig_df.loc[orig_df["Phecode"].duplicated(keep=False)]

595


Unnamed: 0,Phecode,description,dataset,MVP_Cases,MVP_Controls,trait_type,UKBB_filename,UKBB_Cases,UKBB_Controls
592,620.0,N89 Other noninflammatory disorders of vagina,UKBB+MVP,492.0,463497.0,icd10,icd10-N89-both_sexes.tsv.bgz,1692.0,418839.0
593,620.0,N90 Other noninflammatory disorders of vulva a...,UKBB+MVP,492.0,463497.0,icd10,icd10-N90-both_sexes.tsv.bgz,2175.0,418356.0


In [6]:
# Try fn.clean_text on a few hand-picked labels and print each result.
# NOTE(review): "Dieseases" looks misspelled — confirm whether that is deliberate.
TERMS = ["Y65 Other misadventures", "Dieseases due to other mycobacteria", "foo/bar", "foo / bar"]

# Use a real loop name: `_` is IPython's "last output" variable, and the value
# is actually used here rather than thrown away.
for term in TERMS:
    print(fn.clean_text(term))

other misadventures
dieseases due to other mycobacteria
foo bar
foo   bar


In [7]:
# Try fn.search_epigraphdb_efo on two sample labels and print the candidates
# returned for each one.
# NOTE(review): "Dieseases" looks misspelled — confirm whether that is deliberate.
TERMS = ["Y65 Other misadventures", "Dieseases due to other mycobacteria"]

# Use a real loop name: `_` is IPython's "last output" variable, and the value
# is actually used here rather than thrown away.
for idx, term in enumerate(TERMS):
    print(idx, fn.search_epigraphdb_efo(term), "\n")

0 [{'id': 'http://purl.obolibrary.org/obo/MONDO_0016677', 'name': 'toxic or drug-related embryofetopathy', 'score': 0.38665379999999994}, {'id': 'http://purl.obolibrary.org/obo/CHEBI_50177', 'name': 'dermatologic drug', 'score': 0.3778478999999999}, {'id': 'http://www.ebi.ac.uk/efo/EFO_0006405', 'name': 'G142', 'score': 0.358098}] 

1 [{'id': 'http://www.orpha.net/ORDO/Orphanet_183710', 'name': 'Genetic susceptibility to infections due to particular pathogens', 'score': 0.7555482}, {'id': 'http://purl.obolibrary.org/obo/MONDO_0020636', 'name': 'mendelian susceptibility to mycobacterial diseases due to a complete deficiency', 'score': 0.6545917000000001}, {'id': 'http://www.orpha.net/ORDO/Orphanet_99898', 'name': 'Mendelian susceptibility to mycobacterial diseases due to complete IFNgammaR1 deficiency', 'score': 0.6521646000000001}] 



In [8]:
# Keep only the Phecode and description columns (renamed to id / label) and
# add a clean_text column computed from label with fn.clean_text.
mapping_df = (
    orig_df
    .rename_column("Phecode", "id")
    .rename_column("description", "label")
    .select_columns(["id", "label"])
    .transform_column("label", fn.clean_text, "clean_text")
)
# DataFrame.info() prints its own summary and returns None; wrapping it in
# print() only appended a stray "None" to the output.
mapping_df.info()
mapping_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596 entries, 0 to 595
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          596 non-null    float64
 1   label       596 non-null    object 
 2   clean_text  596 non-null    object 
dtypes: float64(1), object(2)
memory usage: 14.1+ KB
None


Unnamed: 0,id,label,clean_text
0,8.51,Intestinal e.coli,intestinal e coli
1,31.00,Diseases due to other mycobacteria,diseases due to other mycobacteria
2,41.11,Methicillin sensitive Staphylococcus aureus,methicillin sensitive staphylococcus aureus
3,41.12,Methicillin resistant Staphylococcus aureus,methicillin resistant staphylococcus aureus
4,41.80,H. pylori,h pylori
...,...,...,...
591,704.12,L65 Other nonscarring hair loss,other nonscarring hair loss
592,620.00,N89 Other noninflammatory disorders of vagina,other noninflammatory disorders of vagina
593,620.00,N90 Other noninflammatory disorders of vulva a...,other noninflammatory disorders of vulva and p...
594,790.10,R70 Elevated erythrocyte sedimentation rate an...,elevated erythrocyte sedimentation rate and ab...


In [9]:
def search_epigraphdb_efo_wrapper(
    s: pd.Series, echo_step: int = 200
) -> List[List[Dict[str, Any]]]:
    """Run ``fn.search_epigraphdb_efo`` on every value of ``s``.

    Values are queried one at a time, in order, and a progress line is logged
    through ``logger.info`` along the way.

    Args:
        s: Series of query texts.
        echo_step: Log progress every ``echo_step`` items. Defaults to 200;
            pass 0 to disable progress logging.

    Returns:
        A list with one search result per input value, in input order.
    """
    total = len(s)
    res = []
    for idx, text in enumerate(s.tolist()):
        # A falsy echo_step skips the modulo entirely, so 0 cannot raise
        # ZeroDivisionError.
        if echo_step and idx % echo_step == 0:
            logger.info(f"#{idx}/{total}")
        res.append(fn.search_epigraphdb_efo(text=text))
    return res

# Attach an efo_cands column: elementwise=False passes the clean_text column
# to the wrapper as a single Series rather than value by value.
mapping_df = mapping_df.transform_column(
    column_name="clean_text",
    dest_column_name="efo_cands",
    function=search_epigraphdb_efo_wrapper,
    elementwise=False,
)

# Save the frame to mapping_results.json as a JSON list of row records.
cache_path = Path(".") / "mapping_results.json"
with open(cache_path, "w") as cache_file:
    cache_file.write(json.dumps(mapping_df.to_dict(orient="records")))

2022-06-08 14:46:18.522 | INFO     | __main__:search_epigraphdb_efo_wrapper:7 - #0/596
2022-06-08 14:46:37.805 | INFO     | __main__:search_epigraphdb_efo_wrapper:7 - #200/596
2022-06-08 14:46:56.132 | INFO     | __main__:search_epigraphdb_efo_wrapper:7 - #400/596


In [10]:
# Preview the first rows of mapping_df, which now carries the efo_cands column.
mapping_df.head()

Unnamed: 0,id,label,clean_text,efo_cands
0,8.51,Intestinal e.coli,intestinal e coli,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...
1,31.0,Diseases due to other mycobacteria,diseases due to other mycobacteria,[{'id': 'http://www.orpha.net/ORDO/Orphanet_18...
2,41.11,Methicillin sensitive Staphylococcus aureus,methicillin sensitive staphylococcus aureus,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...
3,41.12,Methicillin resistant Staphylococcus aureus,methicillin resistant staphylococcus aureus,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...
4,41.8,H. pylori,h pylori,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...


In [11]:
mapping_df_1 = (
    mapping_df
    # Score of the first candidate in each list; an empty list yields None.
    # NOTE(review): assumes candidates arrive best-first — confirm.
    .transform_column(
        column_name="efo_cands",
        dest_column_name="highest_score",
        function=lambda item: item[0]["score"] if len(item) > 0 else None,
    )
    # Bucket the scores into (0, 0.6], (0.6, 0.8], (0.8, 0.9], (0.9, 1.0].
    # pd.cut yields NaN for missing scores and for values outside the bins
    # (including exactly 0, since the first interval is left-open).
    .transform_column(
        column_name="highest_score",
        dest_column_name="score_bin",
        elementwise=False,
        function=lambda s: pd.cut(s, bins=[0, 0.6, 0.8, 0.9, 1.0]),
    )
    # Convert each interval to its string label. Missing bins are NaN, never
    # None, so the guard must be pd.isna: an `is None` test would let a NaN
    # through and stringify it to "nan", hiding it from later NA checks.
    .transform_column(
        column_name="score_bin",
        elementwise=True,
        function=lambda e: None if pd.isna(e) else str(e),
    )
)
mapping_df_1

Unnamed: 0,id,label,clean_text,efo_cands,highest_score,score_bin
0,8.51,Intestinal e.coli,intestinal e coli,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...,0.734050,"(0.6, 0.8]"
1,31.00,Diseases due to other mycobacteria,diseases due to other mycobacteria,[{'id': 'http://www.orpha.net/ORDO/Orphanet_18...,0.736293,"(0.6, 0.8]"
2,41.11,Methicillin sensitive Staphylococcus aureus,methicillin sensitive staphylococcus aureus,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...,0.938938,"(0.9, 1.0]"
3,41.12,Methicillin resistant Staphylococcus aureus,methicillin resistant staphylococcus aureus,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...,0.940465,"(0.9, 1.0]"
4,41.80,H. pylori,h pylori,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...,0.752023,"(0.6, 0.8]"
...,...,...,...,...,...,...
591,704.12,L65 Other nonscarring hair loss,other nonscarring hair loss,[{'id': 'http://purl.obolibrary.org/obo/HP_000...,0.693470,"(0.6, 0.8]"
592,620.00,N89 Other noninflammatory disorders of vagina,other noninflammatory disorders of vagina,[{'id': 'http://purl.obolibrary.org/obo/MONDO_...,0.696924,"(0.6, 0.8]"
593,620.00,N90 Other noninflammatory disorders of vulva a...,other noninflammatory disorders of vulva and p...,[{'id': 'http://purl.obolibrary.org/obo/MONDO_...,0.683544,"(0.6, 0.8]"
594,790.10,R70 Elevated erythrocyte sedimentation rate an...,elevated erythrocyte sedimentation rate and ab...,[{'id': 'http://www.ebi.ac.uk/efo/EFO_0005047'...,0.766044,"(0.6, 0.8]"


In [12]:
# Number of rows falling into each score bin.
mapping_df_1["score_bin"].value_counts()

(0.6, 0.8]    258
(0.8, 0.9]    182
(0.9, 1.0]    138
(0.0, 0.6]     18
Name: score_bin, dtype: int64

In [13]:
# Count the rows whose score_bin is missing.
int(mapping_df_1["score_bin"].isna().sum())

0

In [14]:
# Show the rows whose score_bin is None; the recorded output has no such rows.
mapping_df_1.filter_column_isin("score_bin", [None])

Unnamed: 0,id,label,clean_text,efo_cands,highest_score,score_bin
