In [1]:
from typing import List
import json
from pathlib import Path
from pprint import pprint
import re

import pandas as pd
import janitor
import requests
from pydash import py_

from funcs import utils, paths

---
# Cleaning: load the PubTator Central sample and normalise entity terms

In [2]:
# Resolve the sample file under the project's data root.
data_root = utils.find_data_root()
data_dir = data_root / "bioconceptvec"
sample_file_path = data_dir / "bioconcepts2pubtatorcentral.sample"
# Fail fast if the file is missing; the assertion message is the path that was checked.
assert sample_file_path.exists(), sample_file_path

In [3]:
# Load the tab-separated sample, supplying the five column names explicitly.
# `ent_id` and `term` are forced to `str`: identifiers mix bare numbers
# (e.g. 9606) with prefixed codes (e.g. MESH:D012544), and leaving the dtype to
# inference can yield integers, which breaks the later
# `ent_id.replace(":", "_")` when documents are built for indexing.
df_raw = (
    pd.read_csv(
        sample_file_path,
        sep="\t",
        names=["idx", "ent_type", "ent_id", "term", "source"],
        dtype={"ent_id": str, "term": str},
    )
    # Rows with any missing field are dropped before further processing.
    .dropna()
    .reset_index(drop=True)
)
df_raw

Unnamed: 0,idx,ent_type,ent_id,term,source
0,3958000,Species,9606,patients,SR4GN
1,3958000,Disease,MESH:D012544,Scheuermann's kyphosis,TaggerOne
2,23574000,Chemical,MESH:D008012,Astrazeneca,TaggerOne
3,23574000,Species,9615,dogs,SR4GN
4,23574000,Disease,MESH:D007153,antibody omalizumab,TaggerOne
...,...,...,...,...,...
802,20635000,Mutation,rs779184767,C243A,tmVar
803,20635000,Gene,155030,Gag,GNormPlus
804,20635000,Gene,57379,AID,GNormPlus
805,20635000,Chemical,MESH:D009584,N,TaggerOne


In [4]:
def valid_p(term: str) -> bool:
    """
    Decide whether a candidate term should be kept.

    A term is accepted only when both hold:
    - it is longer than two characters
    - it contains no digit character
    """
    max_rejected_length = 2
    too_short = len(term) <= max_rejected_length
    has_digit = re.search(r"\d", term) is not None
    return not (too_short or has_digit)

def clean_term(term: str) -> List[str]:
    """
    Turn a "|"-separated term string into its distinct cleaned variants.

    Each variant is stripped of surrounding whitespace and lower-cased;
    variants rejected by `valid_p` are discarded and repeats are removed.
    """
    normalised = (piece.strip().lower() for piece in term.split("|"))
    kept = list(filter(valid_p, normalised))
    return py_.chain(kept).uniq().value()

In [5]:
# Expand every row into one row per cleaned term variant, then drop rows that
# ended up with missing values and repeated (ent_id, variant) pairs.
df = (
    df_raw
    .transform_column("term", clean_term, "expand_terms")
    .explode("expand_terms")
    .dropna()
    .drop_duplicates(subset=["ent_id", "expand_terms"])
)
df

Unnamed: 0,idx,ent_type,ent_id,term,source,expand_terms
0,3958000,Species,9606,patients,SR4GN,patients
1,3958000,Disease,MESH:D012544,Scheuermann's kyphosis,TaggerOne,scheuermann's kyphosis
2,23574000,Chemical,MESH:D008012,Astrazeneca,TaggerOne,astrazeneca
3,23574000,Species,9615,dogs,SR4GN,dogs
4,23574000,Disease,MESH:D007153,antibody omalizumab,TaggerOne,antibody omalizumab
...,...,...,...,...,...,...
800,20635000,Chemical,MESH:C032259,SDS|SDS-polyacrylamide,TaggerOne,sds-polyacrylamide
801,20635000,Gene,1123,CHn,GNormPlus,chn
803,20635000,Gene,155030,Gag,GNormPlus,gag
804,20635000,Gene,57379,AID,GNormPlus,aid


In [44]:
# How many rows share each cleaned term (ascending, so repeated terms come last).
(
    df
    .groupby("expand_terms")
    .size()
    .sort_values()
)

expand_terms
(c   o b                      1
nitrosating amines            1
non-hodgkin's lymphoma        1
non-small cell lung cancer    1
nontuberculous pneumonia      1
                             ..
formamide                     1
zno                           1
semet                         2
tuberculosis                  2
bacteria                      2
Length: 678, dtype: int64

In [47]:
# Show every row whose cleaned term also appears on another row.
df.loc[df.duplicated(subset=["expand_terms"], keep=False)]

Unnamed: 0,idx,ent_type,ent_id,term,source,expand_terms
66,20760000,Species,1773,TUBERCULOSIS,SR4GN,tuberculosis
145,3130000,Disease,MESH:D014376,tuberculosis|Tuberculosis,TaggerOne,tuberculosis
175,31134000,Species,2,Bacteria|bacteria|bacterial,SR4GN,bacteria
333,15356000,Species,562,Escherichia coli|bacteria,SR4GN,bacteria
497,19448000,Chemical,MESH:D012645,SeMet|Selenomethionine|selenomethionine,TaggerOne|CTD|MESH,semet
561,9853000,Chemical,MESH:C517785,SeMet|seleno-L-methionine,TaggerOne,semet


In [48]:
# Keep only the first row for each cleaned term; rows for other entities that
# share the same term are discarded here.
df1 = df.drop_duplicates(subset="expand_terms", keep="first")
df1

Unnamed: 0,idx,ent_type,ent_id,term,source,expand_terms
0,3958000,Species,9606,patients,SR4GN,patients
1,3958000,Disease,MESH:D012544,Scheuermann's kyphosis,TaggerOne,scheuermann's kyphosis
2,23574000,Chemical,MESH:D008012,Astrazeneca,TaggerOne,astrazeneca
3,23574000,Species,9615,dogs,SR4GN,dogs
4,23574000,Disease,MESH:D007153,antibody omalizumab,TaggerOne,antibody omalizumab
...,...,...,...,...,...,...
800,20635000,Chemical,MESH:C032259,SDS|SDS-polyacrylamide,TaggerOne,sds-polyacrylamide
801,20635000,Gene,1123,CHn,GNormPlus,chn
803,20635000,Gene,155030,Gag,GNormPlus,gag
804,20635000,Gene,57379,AID,GNormPlus,aid


In [49]:
# Sanity check: after de-duplication no cleaned term should repeat (expect an empty frame).
df1.loc[df1.duplicated(subset=["expand_terms"], keep=False)]

Unnamed: 0,idx,ent_type,ent_id,term,source,expand_terms


---
# Indexing: build a test Elasticsearch index from the cleaned terms

In [50]:
# Base URL of the Elasticsearch instance used by the requests in this notebook.
ES_URL = "http://ieu-mrbssd1.epi.bris.ac.uk:26550"

# Connectivity check: the root endpoint must answer with a successful status
# before any index is touched; its JSON body is printed for reference.
r = requests.get(ES_URL)
assert r.ok
print(r.json())

{'name': '53d4a935d550', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'cfbqZlymRuas-lywIWKaOg', 'version': {'number': '7.5.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '3ae9ac9a93c95bd0cdc054951cf95d88e1e18d96', 'build_date': '2019-12-16T22:57:37.835892Z', 'build_snapshot': False, 'lucene_version': '8.3.0', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}


In [51]:
# Name of the index that the following cells (re)create and populate.
# NOTE(review): the `search_term` definitions query "bioconcepts", not this index.
INDEX_NAME = "test-bioconcepts"

In [52]:
# NOTE(review): `index_url` is not referenced again in the cells visible here;
# the index-creation cell rebuilds the same URL under the name `url`.
index_url = ES_URL + "/" + INDEX_NAME

# Request body used to create the index: custom analysis settings plus field mappings.
put_data = {
    "settings": {
        "analysis": {
            "analyzer": {
                # "substring": standard tokenizer, then the shared lowercase /
                # apostrophe / kstem filters, then the custom "substring" filter
                # defined under "filter" below.
                "substring": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "apostrophe",
                        "kstem",
                        "substring"           
                    ]
                },
                # "exact": keyword tokenizer with the same three shared filters
                # and no shingle step.
                "exact": {
                    "type": "custom",
                    "tokenizer": "keyword",
                    "filter": [
                        "lowercase",
                        "apostrophe",
                        "kstem",
                    ]
                }
            },
            "filter": {
                # Custom token filter: shingles, with single tokens kept as well.
                "substring": {
                    "type":"shingle",
                    "output_unigrams" : True
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "ent_id": {
              "type": "keyword",
            },
            "ent_term": {
                "type": "text",
            },
            # Indexed with the "exact" analyzer but queried with the
            # "substring" analyzer.
            "ent_term_norm": {
                "type": "text",
                "analyzer": "exact",
                "search_analyzer": "substring",
            }
        }
    }
}
pprint(put_data)

{'mappings': {'properties': {'ent_id': {'type': 'keyword'},
                             'ent_term': {'type': 'text'},
                             'ent_term_norm': {'analyzer': 'exact',
                                               'search_analyzer': 'substring',
                                               'type': 'text'}}},
 'settings': {'analysis': {'analyzer': {'exact': {'filter': ['lowercase',
                                                             'apostrophe',
                                                             'kstem'],
                                                  'tokenizer': 'keyword',
                                                  'type': 'custom'},
                                        'substring': {'filter': ['lowercase',
                                                                 'apostrophe',
                                                                 'kstem',
                                                                 'substrin

In [54]:
# Recreate the index from scratch so that the settings and mappings in
# `put_data` are the ones in effect.
url = ES_URL + f"/{INDEX_NAME}"
r = requests.get(url)
if r.ok:
    print("Remove index")
    r = requests.delete(url)
    # Stop if the old index could not be removed rather than carrying on with stale state.
    r.raise_for_status()

print("Init index")
r = requests.put(url, json=put_data)
# Surface a rejected settings/mappings body immediately; previously a failed
# PUT went unnoticed and later cells ran against an index without this mapping.
r.raise_for_status()

In [55]:
# Index one document per row of `df1` (one row per distinct cleaned term).
url = ES_URL + f"/{INDEX_NAME}" + "/_doc/"  # loop-invariant, so built once
for record in df1.to_dict(orient="records"):
    # Document id is "<ent_type>_<ent_id>" with ":" replaced by "_";
    # str() guards against identifiers that were parsed as integers.
    ent_id = record["ent_type"] + "_" + str(record["ent_id"]).replace(":", "_")
    ent_term = record["expand_terms"]
    doc = {
        "ent_id": ent_id,
        "ent_term": ent_term,
        # Same text as `ent_term`; only the field mapping differs.
        "ent_term_norm": ent_term,
    }
    r = requests.post(url, json=doc)
    # Fail loudly on a rejected document instead of silently skipping it.
    r.raise_for_status()

In [95]:
def search_term(term: str, index_name: str = "bioconcepts", timeout: float = 30.0):
    """
    Match `term` against `ent_term_norm` and return one bucket per `ent_id`.

    NOTE: `search_term` is defined more than once in this notebook with
    different return shapes; the definition from the most recently executed
    cell is the one in effect.

    Args:
        term: free-text query.
        index_name: index to search. Defaults to "bioconcepts"; pass
            `INDEX_NAME` to query the index built in this notebook.
        timeout: seconds to wait for Elasticsearch before giving up.

    Returns:
        The list of buckets of the "dedup" terms aggregation. Each bucket
        carries the `ent_id` as "key", a "doc_count", and under "dedup_docs"
        a single top hit for that entity.

    Raises:
        requests.HTTPError: if Elasticsearch answers with an error status.
        requests.Timeout: if no response arrives within `timeout` seconds.
    """
    search_data = {
        "query": {
            "match": {
                "ent_term_norm": term
            }
        },
        "aggs": {
            # Group matching documents by entity and keep one hit per group.
            "dedup": {
                "terms": {"field": "ent_id"},
                "aggs": {"dedup_docs": {"top_hits": {"size": 1}}}
            }
        }
    }
    url = ES_URL + f"/{index_name}" + "/_search"
    r = requests.get(url, json=search_data, timeout=timeout)
    r.raise_for_status()
    return r.json()["aggregations"]["dedup"]["buckets"]

In [96]:
# Hand-picked queries for a quick look at what the search returns.
LABELS = [
    "lung disease",
    "aid",
    "Scheuermann's kyphosis"
]

# Print each label followed by the pretty-printed result of `search_term`.
for idx, label in enumerate(LABELS):
    print(f"#{idx} {label}")
    search_res = search_term(label)
    pprint(search_res)
    print("\n")

#0 lung disease
[{'dedup_docs': {'hits': {'hits': [{'_id': 'ObIcgoEBh99XIFUo0scP',
                                    '_index': 'bioconcepts',
                                    '_score': 10.015286,
                                    '_source': {'ent_id': 'Disease_MESH_D008171',
                                                'ent_term': 'lung disease',
                                                'ent_term_norm': 'lung '
                                                                 'disease'},
                                    '_type': '_doc'}],
                          'max_score': 10.015286,
                          'total': {'relation': 'eq', 'value': 551}}},
  'doc_count': 551,
  'key': 'Disease_MESH_D008171'},
 {'dedup_docs': {'hits': {'hits': [{'_id': 've-xhYEBh99XIFUoKG_2',
                                    '_index': 'bioconcepts',
                                    '_score': 12.41745,
                                    '_source': {'ent_id': 'Disease_MESH_D0031

In [69]:
# Query the first 50 raw (uncleaned) terms and print whatever comes back.
# `head(50)` avoids materialising every row of `df_raw` just to slice off 50.
for idx, row in df_raw.head(50).iterrows():
    print(f"#{idx} {row['term']}")
    search_res = search_term(row["term"])
    print(search_res)
    print("\n")

#0 patients
[{'_index': 'bioconcepts', '_type': '_doc', '_id': '8bENgoEBh99XIFUoPNaa', '_score': 8.672219, '_source': {'ent_id': 'Species_9606', 'ent_term': 'patient', 'ent_term_norm': 'patient'}}, {'_index': 'bioconcepts', '_type': '_doc', '_id': '3bENgoEBh99XIFUoO9ZM', '_score': 8.672219, '_source': {'ent_id': 'Species_9606', 'ent_term': 'patients', 'ent_term_norm': 'patients'}}, {'_index': 'bioconcepts', '_type': '_doc', '_id': '_rISgoEBh99XIFUonjEH', '_score': 8.672219, '_source': {'ent_id': 'Species_9606', 'ent_term': 'patients', 'ent_term_norm': 'patients'}}, {'_index': 'bioconcepts', '_type': '_doc', '_id': 'W7ISgoEBh99XIFUoozKG', '_score': 8.672219, '_source': {'ent_id': 'Species_9606', 'ent_term': 'patient', 'ent_term_norm': 'patient'}}]


#1 Scheuermann's kyphosis
[{'_index': 'bioconcepts', '_type': '_doc', '_id': '3rENgoEBh99XIFUoO9Zs', '_score': 9.770832, '_source': {'ent_id': 'Disease_MESH_D012544', 'ent_term': "scheuermann's kyphosis", 'ent_term_norm': "scheuermann's kyph

---

In [76]:
from funcs.data_processing.stage1_processing import get_ebi_data

In [77]:
ebi_data = get_ebi_data(verbose=False)
# `DataFrame.info()` prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
ebi_data.info()
SAMPLE_SIZE = 50
# Take the first SAMPLE_SIZE exact mappings and then drop repeated rows, so the
# sample can contain fewer than SAMPLE_SIZE rows.
ebi_sample = (
    ebi_data.loc[
        ebi_data["MAPPING_TYPE"] == "Exact",
        ["query", "MAPPED_TERM_LABEL", "MAPPED_TERM_URI"],
    ]
    .head(SAMPLE_SIZE)
    .drop_duplicates()
    .reset_index(drop=True)
)
ebi_sample

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191 entries, 0 to 1190
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   query              1191 non-null   object
 1   MAPPED_TERM_LABEL  1191 non-null   object
 2   MAPPED_TERM_URI    1191 non-null   object
 3   MAPPING_TYPE       1191 non-null   object
 4   id                 1191 non-null   object
 5   full_id            1191 non-null   object
 6   mapping_id         1191 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 65.3+ KB
None


Unnamed: 0,query,MAPPED_TERM_LABEL,MAPPED_TERM_URI
0,alzheimer s disease,Alzheimer's disease,EFO_0000249
1,endocarditis valve unspecified,endocarditis,EFO_0000465
2,family history of other conditions,family history,EFO_0000493
3,unspecified maternal hypertension,preeclampsia,EFO_0000668
4,stroke not specified as haemorrhage or infarction,stroke,EFO_0000712
5,subarachnoid haemorrhage,subarachnoid hemorrhage,EFO_0000713
6,unspecified human immunodeficiency virus,HIV infection,EFO_0000764
7,other endocrine disorders,endocrine system disease,EFO_0001379
8,other diseases of liver,liver disease,EFO_0001421
9,primary disorders of muscles,muscular disease,EFO_0002970


In [78]:
def search_term(term: str, index_name: str = "bioconcepts", timeout: float = 30.0):
    """
    Match `term` against `ent_term_norm` and return the raw hits.

    NOTE: `search_term` is defined more than once in this notebook with
    different return shapes; the definition from the most recently executed
    cell is the one in effect.

    Args:
        term: free-text query.
        index_name: index to search. Defaults to "bioconcepts"; pass
            `INDEX_NAME` to query the index built in this notebook.
        timeout: seconds to wait for Elasticsearch before giving up.

    Returns:
        The list found under ["hits"]["hits"] in the search response.

    Raises:
        requests.HTTPError: if Elasticsearch answers with an error status.
        requests.Timeout: if no response arrives within `timeout` seconds.
    """
    search_data = {
        "query": {
            "match": {
                "ent_term_norm": term
            }
        }
    }
    url = ES_URL + f"/{index_name}" + "/_search"
    r = requests.get(url, json=search_data, timeout=timeout)
    r.raise_for_status()
    return r.json()["hits"]["hits"]

In [80]:
# Run every sampled EBI query string through `search_term` and pretty-print the result.
for idx, row in ebi_sample.iterrows():
    query = row['query']
    print(f"#{idx} {query}")
    search_res = search_term(term=query)
    pprint(search_res)
    # Visual separator between consecutive queries.
    print("\n---\n")

#0 alzheimer s disease
[{'_id': 'irIYgoEBh99XIFUoE3gL',
  '_index': 'bioconcepts',
  '_score': 7.263795,
  '_source': {'ent_id': 'Disease_MESH_D000544',
              'ent_term': 'alzheimer',
              'ent_term_norm': 'alzheimer'},
  '_type': '_doc'},
 {'_id': 'FrIYgoEBh99XIFUoOHvC',
  '_index': 'bioconcepts',
  '_score': 7.263795,
  '_source': {'ent_id': 'Disease_MESH_D000544',
              'ent_term': "alzheimer's type dementia",
              'ent_term_norm': "alzheimer's type dementia"},
  '_type': '_doc'},
 {'_id': 'p7IZgoEBh99XIFUoNIv4',
  '_index': 'bioconcepts',
  '_score': 7.263795,
  '_source': {'ent_id': 'Disease_MESH_D000544',
              'ent_term': "alzheimer's brains",
              'ent_term_norm': "alzheimer's brains"},
  '_type': '_doc'},
 {'_id': 'nrIZgoEBh99XIFUoo5JQ',
  '_index': 'bioconcepts',
  '_score': 7.263795,
  '_source': {'ent_id': 'Disease_MESH_D000544',
              'ent_term': "alzheimer's dementia",
              'ent_term_norm': "alzheimer's d

---

In [87]:
def search_term(term: str, index_name: str = "bioconcepts", timeout: float = 30.0):
    """
    Match `term` against `ent_term_norm`, excluding a fixed list of filler
    terms, and return the complete search response.

    NOTE: `search_term` is defined more than once in this notebook with
    different return shapes; the definition from the most recently executed
    cell is the one in effect.

    Args:
        term: free-text query.
        index_name: index to search. Defaults to "bioconcepts"; pass
            `INDEX_NAME` to query the index built in this notebook.
        timeout: seconds to wait for Elasticsearch before giving up.

    Returns:
        The full decoded JSON response, including both "hits" and the
        "dedup" aggregation (one top hit per `ent_id`).

    Raises:
        requests.HTTPError: if Elasticsearch answers with an error status.
        requests.Timeout: if no response arrives within `timeout` seconds.
    """
    exclude_terms = ["and", "ands", "not", "with", "other"]
    search_data = {
        "query": {
            "bool": {
                "should": {
                    "match": {
                        "ent_term_norm": term
                    }
                },
                # One `term` clause per excluded word.
                "must_not": [
                    {"term": {"ent_term_norm": excluded}}
                    for excluded in exclude_terms
                ]
            }
        },
        "aggs": {
            # Group matching documents by entity and keep one hit per group.
            "dedup": {
                "terms": {
                    "field": "ent_id"
                },
                "aggs": {
                    "dedup_docs": {
                        "top_hits": {
                            "size": 1
                        }
                    }
                }
            }
        }
    }
    url = ES_URL + f"/{index_name}" + "/_search"
    r = requests.get(url, json=search_data, timeout=timeout)
    r.raise_for_status()
    return r.json()

In [88]:
# Hand-picked query strings; several contain the words that `search_term`
# lists in `exclude_terms` ("and", "with", "other").
terms = [
    "gonarthrosis",
    "psoriatic and enteropathic arthropathies",
    "pain associated with micturition",
    "other mood",
    "preterm delivery",
    "injury of nerves at forearm level",
    "injury of nerves at wrist and hand level",
    "gastritis and duodenitis",
    "problems related to physical environment",
]
# `_` is the loop variable holding the current query string.
for idx, _ in enumerate(terms):
    search_res = search_term(_)
    print(f"#{idx} {_}")
    pprint(search_res)
    # Visual separator between consecutive queries.
    print("\n---\n")

#0 gonarthrosis
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'aggregations': {'dedup': {'buckets': [],
                            'doc_count_error_upper_bound': 0,
                            'sum_other_doc_count': 0}},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 397}

---

#1 psoriatic and enteropathic arthropathies
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'aggregations': {'dedup': {'buckets': [{'dedup_docs': {'hits': {'hits': [{'_id': 'JbIYgoEBh99XIFUowYQ6',
                                                                           '_index': 'bioconcepts',
                                                                           '_score': 10.000624,
                                                                           '_source': {'ent_id': 'Disease_MESH_D007592',
                                                                        