In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import re
import json
from pathlib import Path
from typing import Any, Dict, List

import scispacy
import spacy

import requests
import pandas as pd
import janitor
import numpy as np
from loguru import logger

In [3]:
import sys
# Put the current working directory on the module search path.
# NOTE(review): presumably `fn` is a project-local module living next to this
# notebook — confirm; an installed package with the same name would win because
# "." is appended, not prepended.
sys.path.append(".")

import fn

In [4]:
# Directory of the unpacked `en_core_sci_md` 0.5.0 model, relative to the working directory.
model_path = Path(".").joinpath(
    "models",
    "en_core_sci_md-0.5.0",
    "en_core_sci_md",
    "en_core_sci_md-0.5.0",
)
# Stop here, showing the offending path, if the model directory is missing.
assert model_path.exists(), model_path

In [5]:
# Load the spaCy pipeline from the local model directory. `nlp` is read as a
# global by `get_ner` further down.
nlp = spacy.load(model_path)

In [6]:
# Smoke-test the NER pipeline on two sample labels before running it on the full table.
# NOTE(review): "Dieseases" looks like a typo for "Diseases"; it is left unchanged because
# the recorded outputs depend on it — confirm whether the misspelling is intentional.
TERMS = ["Y65 Other misadventures", "Dieseases due to other mycobacteria"]

# Use a descriptive loop variable: binding `_` at cell level hides IPython's
# automatic "last output" variable of the same name.
for idx, term in enumerate(TERMS):
    # `doc` deliberately stays in scope: the following cell inspects the last document.
    doc = nlp(term)
    print(idx)
    print(doc.ents)

0
(misadventures,)
1
(Dieseases, mycobacteria)


In [7]:
# Keep the entities of the last document processed by the loop above for ad-hoc inspection.
foo = doc.ents

In [8]:
# Text of the first entity; `foo` is already an indexable tuple of spans.
str(foo[0])

'Dieseases'

In [9]:
# Load the previously computed mapping results (columns: id, label, clean_text, efo_cands).
mapping_df_path = Path(".") / "mapping_results.json"
assert mapping_df_path.exists(), mapping_df_path

# JSON text is UTF-8; do not depend on the platform's default encoding when reading it.
with mapping_df_path.open(encoding="utf-8") as f:
    mapping_df = pd.json_normalize(json.load(f))

# `DataFrame.info()` prints its report itself and returns None, so wrapping it in
# `print()` only appended a stray "None" line to the cell output.
mapping_df.info()
mapping_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596 entries, 0 to 595
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          596 non-null    float64
 1   label       596 non-null    object 
 2   clean_text  596 non-null    object 
 3   efo_cands   596 non-null    object 
dtypes: float64(1), object(3)
memory usage: 18.8+ KB
None


Unnamed: 0,id,label,clean_text,efo_cands
0,8.51,Intestinal e.coli,intestinal e coli,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...
1,31.00,Diseases due to other mycobacteria,diseases due to other mycobacteria,[{'id': 'http://www.orpha.net/ORDO/Orphanet_18...
2,41.11,Methicillin sensitive Staphylococcus aureus,methicillin sensitive staphylococcus aureus,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...
3,41.12,Methicillin resistant Staphylococcus aureus,methicillin resistant staphylococcus aureus,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...
4,41.80,H. pylori,h pylori,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...
...,...,...,...,...
591,704.12,L65 Other nonscarring hair loss,other nonscarring hair loss,[{'id': 'http://purl.obolibrary.org/obo/HP_000...
592,620.00,N89 Other noninflammatory disorders of vagina,other noninflammatory disorders of vagina,[{'id': 'http://purl.obolibrary.org/obo/MONDO_...
593,620.00,N90 Other noninflammatory disorders of vulva a...,other noninflammatory disorders of vulva and p...,[{'id': 'http://purl.obolibrary.org/obo/MONDO_...
594,790.10,R70 Elevated erythrocyte sedimentation rate an...,elevated erythrocyte sedimentation rate and ab...,[{'id': 'http://www.ebi.ac.uk/efo/EFO_0005047'...


In [10]:
def get_ner(s: pd.Series) -> List:
    """Extract named-entity strings from every value of ``s``.

    Each value is passed through the module-level ``nlp`` pipeline and the
    resulting ``doc.ents`` are converted to plain strings.

    Args:
        s: Series whose values are the texts to analyse.

    Returns:
        A list with one entry per value of ``s`` (in order); each entry is the
        list of entity strings found in that value.
    """
    total = len(s)
    echo_step = 200  # emit a progress line every `echo_step` items
    entities_per_text = []
    for idx, text in enumerate(s.tolist()):
        if idx % echo_step == 0:
            logger.info(f"#{idx}/{total}")
        entities_per_text.append([str(ent) for ent in nlp(text).ents])
    return entities_per_text
    

# Add an `ents` column holding the entity strings found in each `label`.
# `get_ner` receives the whole column at once (elementwise=False).
ner_df = mapping_df.transform_column(
    column_name="label",
    function=get_ner,
    dest_column_name="ents",
    elementwise=False,
)

# `DataFrame.info()` prints directly and returns None; printing its return value
# added a stray "None" line to the output.
ner_df.info()
# Bare last expression so the notebook renders the frame instead of plain text.
ner_df.head()

2022-06-08 14:47:51.943 | INFO     | __main__:get_ner:7 - #0/596
2022-06-08 14:47:53.135 | INFO     | __main__:get_ner:7 - #200/596
2022-06-08 14:47:54.362 | INFO     | __main__:get_ner:7 - #400/596


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596 entries, 0 to 595
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          596 non-null    float64
 1   label       596 non-null    object 
 2   clean_text  596 non-null    object 
 3   efo_cands   596 non-null    object 
 4   ents        596 non-null    object 
dtypes: float64(1), object(4)
memory usage: 23.4+ KB
None
      id                                        label  \
0   8.51                            Intestinal e.coli   
1  31.00           Diseases due to other mycobacteria   
2  41.11  Methicillin sensitive Staphylococcus aureus   
3  41.12  Methicillin resistant Staphylococcus aureus   
4  41.80                                    H. pylori   

                                    clean_text  \
0                            intestinal e coli   
1           diseases due to other mycobacteria   
2  methicillin sensitive staphylococcus aureus   
3  methi

In [11]:
def search_epigraphdb_efo_wrapper(s: pd.Series) -> List[List[Dict[str, Any]]]:
    """Query EFO candidates for every entity of every row in ``s``.

    Args:
        s: Series whose values are iterables of entity strings; ``None``
            entries inside a row are skipped.

    Returns:
        One list per row of ``s`` (in order). Each inner list holds a dict per
        entity with the keys ``"ent"`` (the entity as given) and
        ``"efo_cands"`` (the result of ``fn.search_epigraphdb_efo`` for the
        lower-cased entity, requested with ``limit=3``).
    """
    total = len(s)
    echo_step = 200  # emit a progress line every `echo_step` rows
    rows = []
    for idx, ents in enumerate(s.tolist()):
        if idx % echo_step == 0:
            logger.info(f"#{idx}/{total}")
        row = []
        for ent in ents:
            if ent is None:
                continue
            cands = fn.search_epigraphdb_efo(text=ent.lower(), limit=3)
            row.append({"ent": ent, "efo_cands": cands})
        rows.append(row)
    return rows

# Look up EFO candidates for every extracted entity; the wrapper receives the
# whole `ents` column at once and stores its result in `ent_efo_cands`.
ner_df = ner_df.transform_column(
    column_name="ents",
    function=search_epigraphdb_efo_wrapper,
    dest_column_name="ent_efo_cands",
    elementwise=False,
)

# Persist the enriched table as a list of JSON records.
cache_path = Path(".") / "ner_results.json"
ner_records = ner_df.to_dict(orient="records")
with cache_path.open("w") as f:
    json.dump(ner_records, f)

2022-06-08 14:47:55.533 | INFO     | __main__:search_epigraphdb_efo_wrapper:7 - #0/596
2022-06-08 14:48:30.776 | INFO     | __main__:search_epigraphdb_efo_wrapper:7 - #200/596
2022-06-08 14:49:07.335 | INFO     | __main__:search_epigraphdb_efo_wrapper:7 - #400/596


In [12]:
def annotate_epigraphdb_url(item):
    """Return a shallow copy of ``item`` extended with an ``"epigraphdb_url"`` entry.

    The URL is the EpiGraphDB ontology page address with ``item["id"]``
    substituted as the ``id`` query parameter. ``item`` itself is not modified.

    Raises:
        KeyError: if ``item`` has no ``"id"`` key.
    """
    annotated = item.copy()
    url = "https://epigraphdb.org/ontology/efo?id={id}".format(id=annotated["id"])
    annotated["epigraphdb_url"] = url
    return annotated

def _annotate_cands(cands):
    """Return a new list in which every candidate dict carries its EpiGraphDB URL."""
    return [annotate_epigraphdb_url(cand) for cand in cands]


def _annotate_ent_cands(ent_items):
    """Rebuild each ``{"ent", "efo_cands"}`` dict with URL-annotated candidates."""
    return [
        {"ent": ent_item["ent"], "efo_cands": _annotate_cands(ent_item["efo_cands"])}
        for ent_item in ent_items
    ]


# Annotate both candidate columns in place (source and destination column are the same).
annotated_df = (
    ner_df
    .transform_column(
        column_name="efo_cands",
        dest_column_name="efo_cands",
        function=_annotate_cands,
    )
    .transform_column(
        column_name="ent_efo_cands",
        dest_column_name="ent_efo_cands",
        function=_annotate_ent_cands,
    )
)

# Persist the annotated table as a list of JSON records.
cache_path = Path(".") / "annotated_results.json"
annotated_records = annotated_df.to_dict(orient="records")
with cache_path.open("w") as f:
    json.dump(annotated_records, f)

In [13]:
# Derive, for every label, the score of its first EFO candidate and the bin it falls into.
# NOTE(review): treating `item[0]` as the "highest" score assumes the candidates arrive
# sorted by descending score — confirm against whatever produced `efo_cands`.
annotated_df_1 = (
    annotated_df
    .transform_column(
        column_name="efo_cands",
        dest_column_name="highest_score",
        # Rows without any candidate get a missing score.
        function=lambda item: item[0]["score"] if len(item) > 0 else None,
    )
    .transform_column(
        column_name="highest_score",
        dest_column_name="score_bin",
        elementwise=False,
        # Right-closed intervals: (0, 0.6], (0.6, 0.8], (0.8, 0.9], (0.9, 1.0].
        function=lambda s: pd.cut(s, bins=[0, 0.6, 0.8, 0.9, 1.0]),
    )
    .transform_column(
        column_name="score_bin",
        elementwise=True,
        # `pd.cut` marks missing or out-of-range scores with NaN, never None, so the
        # previous `e is None` guard could not fire and such a value could end up as
        # the string "nan". `pd.isna` covers both None and NaN.
        function=lambda e: None if pd.isna(e) else str(e),
    )
)
annotated_df_1

Unnamed: 0,id,label,clean_text,efo_cands,ents,ent_efo_cands,highest_score,score_bin
0,8.51,Intestinal e.coli,intestinal e coli,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...,[Intestinal e.coli],"[{'ent': 'Intestinal e.coli', 'efo_cands': [{'...",0.734050,"(0.6, 0.8]"
1,31.00,Diseases due to other mycobacteria,diseases due to other mycobacteria,[{'id': 'http://www.orpha.net/ORDO/Orphanet_18...,"[Diseases, mycobacteria]","[{'ent': 'Diseases', 'efo_cands': [{'id': 'htt...",0.736293,"(0.6, 0.8]"
2,41.11,Methicillin sensitive Staphylococcus aureus,methicillin sensitive staphylococcus aureus,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...,"[Methicillin, sensitive, Staphylococcus aureus]","[{'ent': 'Methicillin', 'efo_cands': [{'id': '...",0.938938,"(0.9, 1.0]"
3,41.12,Methicillin resistant Staphylococcus aureus,methicillin resistant staphylococcus aureus,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...,"[Methicillin, resistant, Staphylococcus aureus]","[{'ent': 'Methicillin', 'efo_cands': [{'id': '...",0.940465,"(0.9, 1.0]"
4,41.80,H. pylori,h pylori,[{'id': 'http://purl.obolibrary.org/obo/NCBITa...,[H. pylori],"[{'ent': 'H. pylori', 'efo_cands': [{'id': 'ht...",0.752023,"(0.6, 0.8]"
...,...,...,...,...,...,...,...,...
591,704.12,L65 Other nonscarring hair loss,other nonscarring hair loss,[{'id': 'http://purl.obolibrary.org/obo/HP_000...,[nonscarring hair],"[{'ent': 'nonscarring hair', 'efo_cands': [{'i...",0.693470,"(0.6, 0.8]"
592,620.00,N89 Other noninflammatory disorders of vagina,other noninflammatory disorders of vagina,[{'id': 'http://purl.obolibrary.org/obo/MONDO_...,"[N89, noninflammatory disorders, vagina]","[{'ent': 'N89', 'efo_cands': []}, {'ent': 'non...",0.696924,"(0.6, 0.8]"
593,620.00,N90 Other noninflammatory disorders of vulva a...,other noninflammatory disorders of vulva and p...,[{'id': 'http://purl.obolibrary.org/obo/MONDO_...,"[noninflammatory disorders, vulva, perineum]","[{'ent': 'noninflammatory disorders', 'efo_can...",0.683544,"(0.6, 0.8]"
594,790.10,R70 Elevated erythrocyte sedimentation rate an...,elevated erythrocyte sedimentation rate and ab...,[{'id': 'http://www.ebi.ac.uk/efo/EFO_0005047'...,"[R70, Elevated, erythrocyte, rate, abnormality...","[{'ent': 'R70', 'efo_cands': [{'id': 'http://w...",0.766044,"(0.6, 0.8]"


In [14]:
# Persist the complete scored table as a list of JSON records.
cache_path = Path(".") / "res_full.json"
full_records = annotated_df_1.to_dict(orient="records")
with cache_path.open("w") as f:
    json.dump(full_records, f)

In [15]:
# Number of labels per score bin.
annotated_df_1["score_bin"].value_counts()

(0.6, 0.8]    258
(0.8, 0.9]    182
(0.9, 1.0]    138
(0.0, 0.6]     18
Name: score_bin, dtype: int64

In [16]:
# Split the results into one JSON file per confidence tier.
# Keys are output file names; values are the `score_bin` labels (the string form of the
# score intervals) that belong to that tier. Insertion order fixes the processing order.
SCORE_TIERS = {
    "res_high.json": ["(0.9, 1.0]", "(0.8, 0.9]"],
    "res_medium.json": ["(0.6, 0.8]"],
    "res_low.json": ["(0.0, 0.6]"],
}

for file_name, score_bins in SCORE_TIERS.items():
    cache_path = Path(".") / file_name
    # Filter and serialise before opening the file, so a failure here cannot leave a
    # freshly truncated, empty output file behind.
    df = annotated_df_1.filter_column_isin(column_name="score_bin", iterable=score_bins)
    print(len(df))
    records = df.to_dict(orient="records")
    with cache_path.open("w") as f:
        json.dump(records, f)

320
258
18
