Overall strategy:

- Two approaches are run and their results are then merged:
  - composite term mapping:
    - first try mapping the whole term
      -> then, for terms still unmapped, try word n-gram (shingle) mapping
  - equivalent term mapping:
    - query the whole term using its scispaCy embedding

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
from typing import Any, Dict, List
from pprint import pprint

import pandas as pd
import numpy as np
import janitor
import spacy
import requests
from pydash import py_
from Levenshtein import distance as levenshtein_distance
from tqdm import tqdm

from funcs import utils, query, paths, settings

In [3]:
# Load the spaCy pipeline stored at the path registered under "scispacy_lg" in
# `paths.models`; the assert fails fast (showing the path) if it is missing.
# `scispacy_model` is used later to embed query terms.
model_path = paths.models["scispacy_lg"]
assert model_path.exists(), model_path
scispacy_model = spacy.load(model_path)

In [4]:
# Resolve the project root via `utils.find_project_root("docker-compose.yml")`
# (presumably the directory that contains that file -- confirm in funcs.utils).
proj_root = utils.find_project_root("docker-compose.yml")

# Input for the whole notebook; fail fast (showing the path) if it is absent.
data_path = proj_root / "data" / "output" / "mvp_traits_clean.json"
assert data_path.exists(), data_path

# Flatten the loaded JSON into a DataFrame.
with data_path.open() as f:
    input_df = pd.json_normalize(json.load(f))

# Print the schema, then show the frame itself as the cell output.
input_df.info()
input_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   trait_id          2079 non-null   object
 1   trait_term        2079 non-null   object
 2   phenotype         2079 non-null   object
 3   dataset           2079 non-null   object
 4   trait_type        2079 non-null   object
 5   trait_term_query  2079 non-null   object
dtypes: object(6)
memory usage: 97.6+ KB


Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type,trait_term_query
0,8-00,Intestinal infection,8,UKBB+MVP,phecode,[Intestinal infection]
1,85-01,Bacterial enteritis,8.5,UKBB+MVP,phecode,[Bacterial enteritis]
2,851-02,Intestinal e.coli,8.51,MVP,phecode,[Intestinal e.coli]
3,852-03,Intestinal infection due to C. difficile,8.52,UKBB+MVP,phecode,[Intestinal infection due to C. difficile]
4,86-04,Viral Enteritis,8.6,UKBB+MVP,phecode,[Viral Enteritis]
...,...,...,...,...,...,...
2074,dbp-at-enrollment-2076,Diastolic blood pressure,DBP (at enrollment),UKBB+MVP,vital status,[Diastolic blood pressure]
2075,p-pulse-at-enrollment-2077,Heart rate,P (Pulse at enrollment),UKBB+MVP,vital status,[Heart rate]
2076,height-in-2078,Heigth,Height (in),MVP,vital status,[Heigth]
2077,weight-lb-2079,Weight,Weight (lb),UKBB+MVP,vital status,[Weight]


# composite term mapping

In [5]:
def unpick_subcomponent(term_list: List[str]) -> List[str]:
    """Drop every term that is a strict substring of another term in the list.

    Comparison is case-insensitive. Terms that are equal once lower-cased do
    not eliminate each other, so case variants of the same term all survive.

    Args:
        term_list: Candidate terms, e.g. ``["body mass", "body mass index"]``.

    Returns:
        The surviving terms, in their original order and original casing.
    """
    lowered = [term.lower() for term in term_list]

    def _is_subcomponent(pos: int) -> bool:
        # True when the term at `pos` is contained in some *other*, longer term.
        needle = lowered[pos]
        return any(
            needle in other and needle != other
            for other_pos, other in enumerate(lowered)
            if other_pos != pos
        )

    return [
        term
        for pos, term in enumerate(term_list)
        if not _is_subcomponent(pos)
    ]

# Quick check: "enteritis", "body mass" and "mass index" are each contained in a
# longer term of the list, so only the two longest terms should be returned.
term_list = ["bacterial enteritis", "enteritis", "body mass", "body mass index", "mass index"]
unpick_subcomponent(term_list)

['bacterial enteritis', 'body mass index']

In [6]:
def query_composite(term_list: List[str]) -> List[Dict[str, Any]]:
    """Run a ``match`` query on ``vector_term.raw`` for each term and merge the hits.

    One request per term is sent to the ``efo-vectors`` index at
    ``settings.es_url``. Hits are de-duplicated by ``ent_id``, reduced to their
    ``_source`` document, and finally filtered so that a hit whose
    ``vector_term`` is a substring of another hit's ``vector_term`` is dropped
    (see ``unpick_subcomponent``).

    Args:
        term_list: Query terms; each one is queried separately.

    Returns:
        The retained ``_source`` documents (``ent_id``, ``ent_term``,
        ``primary_term``, ``vector_term``).

    Raises:
        requests.HTTPError: If any request returns an error status.
    """

    def _search(term: str):
        es_url = settings.es_url
        index_name = "efo-vectors"
        body = {
            "query": {"match": {"vector_term.raw": {"query": term}}},
            "_source": ["ent_id", "ent_term", "primary_term", "vector_term"],
        }
        response = requests.get(f"{es_url}/{index_name}/_search", json=body)
        response.raise_for_status()
        return list(response.json()["hits"]["hits"])

    all_hits = py_.flatten([_search(term) for term in term_list])
    unique_hits = py_.uniq_by(all_hits, lambda hit: hit["_source"]["ent_id"])
    documents = [hit["_source"] for hit in unique_hits]

    # When both "body mass" and "body mass index" come back, always favour the
    # longer "body mass index".
    kept_terms = unpick_subcomponent([doc["vector_term"] for doc in documents])
    return [doc for doc in documents if doc["vector_term"] in kept_terms]

In [7]:
# Smoke-test `query_composite`. Each entry is a one-element list because the
# function takes a list of query terms.
test_terms = [
    ["Intestinal infection"],
    ["Bacterial enteritis"],
    ["Body mass index"],
    ["Tuberculosis"],
]

for idx, _ in enumerate(test_terms):
    print(f"# {idx}, {_}")
    search_res = query_composite(_)
    pprint(search_res)
    print("\n")

# 0, ['Intestinal infection']
[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005741',
  'ent_term': 'infectious disease',
  'primary_term': False,
  'vector_term': 'infection'},
 {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0000544',
  'ent_term': 'infection',
  'primary_term': True,
  'vector_term': 'infection'}]


# 1, ['Bacterial enteritis']
[{'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0000916',
  'ent_term': 'intestinal infectious disease',
  'primary_term': False,
  'vector_term': 'bacterial enteritis'}]


# 2, ['Body mass index']
[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004340',
  'ent_term': 'body mass index',
  'primary_term': True,
  'vector_term': 'body mass index'}]


# 3, ['Tuberculosis']
[{'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0018076',
  'ent_term': 'tuberculosis',
  'primary_term': True,
  'vector_term': 'tuberculosis'},
 {'ent_id': 'http://www.orpha.net/ORDO/Orphanet_3389',
  'ent_term': 'Tuberculosis',
  'primary_term': True,
  'vector_term': 'Tuberculosis'}]


# equivalence term mapping

In [8]:
def query_equivalence_embeddings(term_list: List[str], score_threshold=0.9) -> List[Dict[str, Any]]:
    """Find entities whose stored vector is nearest to each term's embedding.

    Each term is embedded with the notebook-level ``scispacy_model`` and sent as
    a kNN query (``k=3``, ``num_candidates=10``) to the ``_knn_search`` endpoint
    of the ``efo-vectors`` index at ``settings.es_url``.

    Args:
        term_list: Terms to embed and query; one request per term.
        score_threshold: Hits with ``_score`` below this value are discarded.

    Returns:
        The ``_source`` documents (``ent_id``, ``ent_term``, ``primary_term``,
        ``vector_term``) of the retained hits, de-duplicated by ``ent_id``.
        Terms whose embedding is all zeros contribute nothing.

    Raises:
        requests.HTTPError: If a request returns an error status; the response
            body is printed before the error is re-raised.
    """

    def _query(term: str):
        doc_vector = scispacy_model(term).vector
        # Skip the request for an all-zero embedding. Test every component
        # instead of the sum: a non-zero vector whose components cancel out
        # would otherwise be mistaken for an empty one.
        if not np.any(doc_vector):
            return []
        es_url = settings.es_url
        index_name = "efo-vectors"
        url = f"{es_url}/{index_name}/_knn_search"
        payload = {
            "knn": {
                "field": "vector",
                "query_vector": doc_vector.tolist(),
                "k": 3,
                "num_candidates": 10,
            },
            "_source": ["ent_id", "ent_term", "primary_term", "vector_term"],
        }
        r = requests.get(url, json=payload)
        try:
            r.raise_for_status()
        except requests.HTTPError:
            # Surface the server's error body; it explains rejected payloads.
            print(r.text)
            raise
        return [hit for hit in r.json()["hits"]["hits"] if hit["_score"] >= score_threshold]

    nested_res = [_query(term) for term in term_list]
    res = py_.chain(nested_res) \
        .flatten() \
        .uniq_by(lambda e: e["_source"]["ent_id"]) \
        .map(lambda e: e["_source"]) \
        .value()
    return res

In [9]:
# Smoke-test `query_equivalence_embeddings` with its default score threshold.
test_terms = [
    ["Intestinal infection"],
    ["Bacterial enteritis"],
    ["Body mass index"],
]

for idx, _ in enumerate(test_terms):
    print(f"# {idx}, {_}")
    search_res = query_equivalence_embeddings(_)
    pprint(search_res)
    print("\n")

# 0, ['Intestinal infection']
[]


# 1, ['Bacterial enteritis']
[{'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0000916',
  'ent_term': 'intestinal infectious disease',
  'primary_term': False,
  'vector_term': 'bacterial enteritis'},
 {'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0043579',
  'ent_term': 'enteritis',
  'primary_term': True,
  'vector_term': 'enteritis'}]


# 2, ['Body mass index']
[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004340',
  'ent_term': 'body mass index',
  'primary_term': True,
  'vector_term': 'body mass index'},
 {'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0013267',
  'ent_term': 'distal 16p11.2 microdeletion syndrome',
  'primary_term': False,
  'vector_term': 'body mass index QTL16'},
 {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005851',
  'ent_term': 'height-adjusted body mass index',
  'primary_term': True,
  'vector_term': 'height-adjusted body mass index'}]




## term shingles / word ngram

In [10]:
def shingle(text: str, min_size: int = 2, max_size: int = 4) -> List[str]:
    """Return the distinct word n-grams ("shingles") of ``text``.

    The text is split on whitespace and every run of ``size`` consecutive
    tokens, for ``size`` from ``min_size`` to ``max_size`` inclusive, is joined
    back with single spaces.

    Args:
        text: Text to split into word n-grams.
        min_size: Smallest n-gram size. Lowered to the token count when the
            text has fewer tokens, so a short text still yields itself; values
            below 1 are treated as 1.
        max_size: Largest n-gram size; sizes above the token count yield nothing.

    Returns:
        Distinct shingles in a deterministic order: by increasing size, then by
        position in the text. Empty or whitespace-only text yields ``[]``.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to shingle; avoid returning a single empty-string shingle.
        return []
    # Clamp the lower bound into [1, len(tokens)] so that short texts are still
    # returned whole and no zero-length shingles are produced.
    smallest = max(1, min(min_size, len(tokens)))
    shingles = [
        " ".join(tokens[start:start + size])
        for size in range(smallest, max_size + 1)
        for start in range(len(tokens) - size + 1)
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the order depend on string hashing and differ between kernels.
    return list(dict.fromkeys(shingles))

# Demonstrate `shingle` on a three-word, a one-word and a two-word text; the
# one-word case exercises the fallback for texts shorter than `min_size`.
text_list = ["body mass index", "mass", "mass index"]
for idx, text in enumerate(text_list):
    res = shingle(text)
    print(f"#{idx} {text}")
    print(res)
    print("\n")

#0 body mass index
['body mass', 'mass index', 'body mass index']


#1 mass
['mass']


#2 mass index
['mass index']




In [11]:
def query_shingles(term_list: List[str], score_threshold=0.9) -> List[Dict[str, Any]]:
    """Embedding-query every word n-gram of every term.

    Args:
        term_list: Terms to break into shingles (see ``shingle``).
        score_threshold: Passed through to ``query_equivalence_embeddings``.

    Returns:
        Whatever ``query_equivalence_embeddings`` returns for the combined
        list of shingles.
    """
    every_shingle = [piece for term in term_list for piece in shingle(term)]
    return query_equivalence_embeddings(every_shingle, score_threshold=score_threshold)

def query_equivalence_and_shingles(term_list: List[str], score_threshold=0.9) -> List[Dict[str, Any]]:
    """Combine embedding hits for the terms' shingles with hits for the whole terms.

    Args:
        term_list: Terms to query, both whole and as word n-grams.
        score_threshold: Minimum hit score, applied to both queries.

    Returns:
        Shingle hits followed by whole-term hits, de-duplicated by ``ent_id``
        (the first occurrence is kept).
    """
    # Reuse query_shingles rather than repeating its body here.
    shingle_res = query_shingles(term_list, score_threshold=score_threshold)
    equivalence_res = query_equivalence_embeddings(term_list, score_threshold=score_threshold)
    # Both inputs are already flat lists of dicts, so only de-duplication is needed.
    return py_.uniq_by(shingle_res + equivalence_res, lambda e: e["ent_id"])

In [12]:
# Smoke-test `query_equivalence_and_shingles` on the same three terms as above.
test_terms = [
    ["Intestinal infection"],
    ["Bacterial enteritis"],
    ["Body mass index"],
]

for idx, _ in enumerate(test_terms):
    print(f"# {idx}, {_}")
    search_res = query_equivalence_and_shingles(_)
    pprint(search_res)
    print("\n")

# 0, ['Intestinal infection']
[]


# 1, ['Bacterial enteritis']
[{'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0000916',
  'ent_term': 'intestinal infectious disease',
  'primary_term': False,
  'vector_term': 'bacterial enteritis'},
 {'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0043579',
  'ent_term': 'enteritis',
  'primary_term': True,
  'vector_term': 'enteritis'}]


# 2, ['Body mass index']
[{'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0013267',
  'ent_term': 'distal 16p11.2 microdeletion syndrome',
  'primary_term': False,
  'vector_term': 'body mass index QTL16'},
 {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004340',
  'ent_term': 'body mass index',
  'primary_term': True,
  'vector_term': 'body mass index'},
 {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005851',
  'ent_term': 'height-adjusted body mass index',
  'primary_term': True,
  'vector_term': 'height-adjusted body mass index'},
 {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0000790',
  'ent_term': 'obsolete_adipose tis

## plain vanilla full text retrieval

In [13]:
def query_equivalence_fulltext(term_list: List[str]) -> List[Dict[str, Any]]:
    """Run a full-text ``match`` query on ``vector_term`` for each term.

    One request per term is sent to the ``efo-vectors`` index at
    ``settings.es_url``, asking for at most three hits (``size: 3``).

    Args:
        term_list: Query terms; each one is queried separately.

    Returns:
        The ``_source`` documents (``ent_id``, ``ent_term``, ``primary_term``,
        ``vector_term``) of all hits, de-duplicated by ``ent_id``.

    Raises:
        requests.HTTPError: If any request returns an error status.
    """

    def _top_hits(term: str):
        es_url = settings.es_url
        index_name = "efo-vectors"
        body = {
            "query": {"match": {"vector_term": {"query": term}}},
            "size": 3,
            "_source": ["ent_id", "ent_term", "primary_term", "vector_term"],
        }
        response = requests.get(f"{es_url}/{index_name}/_search", json=body)
        response.raise_for_status()
        return list(response.json()["hits"]["hits"])

    all_hits = py_.flatten([_top_hits(term) for term in term_list])
    unique_hits = py_.uniq_by(all_hits, lambda hit: hit["_source"]["ent_id"])
    return [hit["_source"] for hit in unique_hits]

In [14]:
# Smoke-test `query_equivalence_fulltext` on the same three terms as above.
test_terms = [
    ["Intestinal infection"],
    ["Bacterial enteritis"],
    ["Body mass index"],
]

for idx, _ in enumerate(test_terms):
    print(f"# {idx}, {_}")
    search_res = query_equivalence_fulltext(_)
    pprint(search_res)
    print("\n")

# 0, ['Intestinal infection']
[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007231',
  'ent_term': 'cysticercosis',
  'primary_term': False,
  'vector_term': 'intestinal taenia solium infection'},
 {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007230',
  'ent_term': 'cyclosporiasis',
  'primary_term': False,
  'vector_term': 'intestinal infection caused by Cyclospora cayetanensis'}]


# 1, ['Bacterial enteritis']
[{'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0000916',
  'ent_term': 'intestinal infectious disease',
  'primary_term': False,
  'vector_term': 'bacterial enteritis'},
 {'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0043579',
  'ent_term': 'enteritis',
  'primary_term': True,
  'vector_term': 'enteritis'},
 {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0000384',
  'ent_term': "Crohn's disease",
  'primary_term': False,
  'vector_term': 'Enteritis, Granulomatous'}]


# 2, ['Body mass index']
[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004340',
  'ent_term': 'body mass index',
  'prim

## Ontology distance classifier

In [15]:
def get_ontology_distance_score(term: str, term_list: List[str]) -> List[float]:
    """Score ``term`` against each entry of ``term_list`` via the ontology distance service.

    ``term`` is repeated so that it is paired with every entry of
    ``term_list``; the pairs are POSTed as ``text_1`` / ``text_2`` to the
    hard-coded service URL.

    Args:
        term: The reference term.
        term_list: Terms to compare against ``term``.

    Returns:
        One score per element of the service response. Where the
        case-insensitive Levenshtein distance between ``term`` and the
        compared term is below 2, the service's score is replaced by ``1``.

    Raises:
        requests.HTTPError: If the service returns an error status.
    """
    url = "http://ieu-mrbssd1.epi.bris.ac.uk:28015/ontology/distance"
    response = requests.post(
        url,
        json={
            "text_1": [term] * len(term_list),
            "text_2": term_list,
        },
    )
    response.raise_for_status()
    service_scores = response.json()
    # Sanity override: near-identical strings get a fixed score of 1 regardless
    # of what the service says.
    edit_distances = [
        levenshtein_distance(term.lower(), other.lower()) for other in term_list
    ]
    return [
        score if edit_distances[pos] >= 2 else 1
        for pos, score in enumerate(service_scores)
    ]

def query_distance(
    main_term_list: List[str],
    compare_term_list: List[str],
    distance_threshold: float = 1.5
) -> List[str]:
    """Keep the compare terms that are close enough to any of the main terms.

    Args:
        main_term_list: Query terms. A single string is also accepted and is
            treated as a one-element list, rather than being iterated
            character by character.
        compare_term_list: Candidate terms to score against each main term.
        distance_threshold: A candidate is kept when its score from
            ``get_ontology_distance_score`` is less than or equal to this.

    Returns:
        The distinct retained candidate terms, in first-seen order. Returns
        ``[]`` without scoring anything when ``compare_term_list`` is empty.
    """
    if isinstance(main_term_list, str):
        # A str is iterable, so without this guard each character would be
        # scored as if it were a separate term.
        main_term_list = [main_term_list]

    def _query(term: str, term_list: List[str], distance_threshold: float) -> List[str]:
        score_res = get_ontology_distance_score(term, term_list)
        valid_res = [_ for idx, _ in enumerate(term_list) if score_res[idx] <= distance_threshold]
        return valid_res

    if len(compare_term_list) == 0:
        return []
    nested_res = [_query(_, compare_term_list, distance_threshold) for _ in main_term_list]
    flat_res = py_.chain(nested_res) \
        .flatten() \
        .uniq().value()
    return flat_res

In [16]:
# Identical strings have edit distance 0 (< 2), so the sanity override in
# `get_ontology_distance_score` applies and the score is reported as 1.
get_ontology_distance_score("height", ["height"])

[1]

In [17]:
# Smoke-test the distance filter: fetch embedding hits for each term, then keep
# only the hits' `vector_term`s that pass `query_distance` (default threshold).
test_terms = [
    ["Intestinal infection"],
    ["Bacterial enteritis"],
    ["Body mass index"],
]

for idx, _ in enumerate(test_terms):
    print(f"# {idx}, {_}")
    equivalence_res = query_equivalence_embeddings(_)
    print(equivalence_res)
    distance_res = query_distance(
        main_term_list=_,
        compare_term_list=[_["vector_term"] for _ in equivalence_res]
    )
    print(distance_res)
    print("\n")

# 0, ['Intestinal infection']
[]
[]


# 1, ['Bacterial enteritis']
[{'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0000916', 'ent_term': 'intestinal infectious disease', 'vector_term': 'bacterial enteritis', 'primary_term': False}, {'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0043579', 'ent_term': 'enteritis', 'vector_term': 'enteritis', 'primary_term': True}]
['bacterial enteritis', 'enteritis']


# 2, ['Body mass index']
[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004340', 'ent_term': 'body mass index', 'vector_term': 'body mass index', 'primary_term': True}, {'ent_id': 'http://purl.obolibrary.org/obo/MONDO_0013267', 'ent_term': 'distal 16p11.2 microdeletion syndrome', 'vector_term': 'body mass index QTL16', 'primary_term': False}, {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005851', 'ent_term': 'height-adjusted body mass index', 'vector_term': 'height-adjusted body mass index', 'primary_term': True}]
['body mass index']




# Other utils

In [18]:
# Show the case-insensitive Levenshtein distance between a query term and a few
# candidate terms. Note that this rebinds the notebook-level name `term_list`.
main_term = "Body mass index"
term_list = [
    "body mass index",
    "Body Fat",
    "body mass index QTL16",
    "height-adjusted body mass index",
]
for idx, term in enumerate(term_list):
    dist = levenshtein_distance(main_term.lower(), term.lower())
    print(f"# {idx} {term}")
    print(dist)
    print("\n")

# 0 body mass index
0


# 1 Body Fat
9


# 2 body mass index QTL16
6


# 3 height-adjusted body mass index
16




In [19]:
def unpick_obsolete(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Remove items whose ``ent_term`` starts with the ``obsolete_`` prefix.

    The prefix check is case-sensitive.

    Args:
        items: Documents that each carry an ``ent_term`` string.

    Returns:
        The remaining items, in their original order.
    """
    kept = []
    for item in items:
        if item["ent_term"].startswith("obsolete_"):
            continue
        kept.append(item)
    return kept

def pick_perfect_match(term: str, items: List[Dict[str, Any]], score_threshold: int = 0) -> List[Dict[str, Any]]:
    """Select the items whose ``vector_term`` is within an edit distance of ``term``.

    Distances are Levenshtein distances computed on lower-cased strings.

    Args:
        term: The query term.
        items: Documents that each carry a ``vector_term`` string.
        score_threshold: Largest distance still accepted; the default ``0``
            keeps only case-insensitive exact matches.

    Returns:
        The accepted items, in their original order.
    """
    candidates = [item["vector_term"] for item in items]
    distances = [
        levenshtein_distance(term.lower(), candidate.lower())
        for candidate in candidates
    ]
    return [
        item
        for item, distance in zip(items, distances)
        if distance <= score_threshold
    ]

# actual processing

## stage 0 composite mapping

In [20]:
# Stage 0: for every trait, run `query_composite` on its list of query terms and
# drop `obsolete_` entities (`query_res`). `perfect_match` then keeps the hits
# whose `vector_term` equals one of the query terms ignoring case
# (`pick_perfect_match` with its default threshold of 0).
df_composite = input_df \
    .assign(
      query_res = lambda df: 
        [
            unpick_obsolete(query_composite(_)) 
            for _ in tqdm(df["trait_term_query"], total=len(df))
        ]
    ) \
    .assign(
        perfect_match = lambda df: df.apply(
            # NOTE: trait_term_query is a list
            lambda row: py_.flatten([
                pick_perfect_match(_, row["query_res"]) 
                for _ in row["trait_term_query"]
            ]),
            axis=1
        )
    )

df_composite.info()
df_composite

100%|█████████████████████████████████████████████████████████████████████████████| 2079/2079 [00:20<00:00, 103.20it/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   trait_id          2079 non-null   object
 1   trait_term        2079 non-null   object
 2   phenotype         2079 non-null   object
 3   dataset           2079 non-null   object
 4   trait_type        2079 non-null   object
 5   trait_term_query  2079 non-null   object
 6   query_res         2079 non-null   object
 7   perfect_match     2079 non-null   object
dtypes: object(8)
memory usage: 130.1+ KB


Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type,trait_term_query,query_res,perfect_match
0,8-00,Intestinal infection,8,UKBB+MVP,phecode,[Intestinal infection],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005...,[]
1,85-01,Bacterial enteritis,8.5,UKBB+MVP,phecode,[Bacterial enteritis],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://purl.obolibrary.org/obo/MO...
2,851-02,Intestinal e.coli,8.51,MVP,phecode,[Intestinal e.coli],[],[]
3,852-03,Intestinal infection due to C. difficile,8.52,UKBB+MVP,phecode,[Intestinal infection due to C. difficile],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005...,[]
4,86-04,Viral Enteritis,8.6,UKBB+MVP,phecode,[Viral Enteritis],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[]
...,...,...,...,...,...,...,...,...
2074,dbp-at-enrollment-2076,Diastolic blood pressure,DBP (at enrollment),UKBB+MVP,vital status,[Diastolic blood pressure],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0006...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0006...
2075,p-pulse-at-enrollment-2077,Heart rate,P (Pulse at enrollment),UKBB+MVP,vital status,[Heart rate],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004...
2076,height-in-2078,Heigth,Height (in),MVP,vital status,[Heigth],[],[]
2077,weight-lb-2079,Weight,Weight (lb),UKBB+MVP,vital status,[Weight],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004...


In [21]:
# Persist only the traits that have at least one perfect match, as a JSON list
# of records.
output_path = proj_root / "data" / "output" / "results_perfect.json"
with output_path.open("w") as f:
    json.dump(
        df_composite[
            df_composite["perfect_match"].apply(lambda e: len(e) > 0)].to_dict(orient="records"), 
    f)

In [22]:
# Cross-tabulate, per trait, whether the composite query retrieved anything
# (`retrieval`) against whether a perfect match was found (`perfect`).
df_composite \
    .assign(retrieval=lambda df: df["query_res"].apply(lambda e: len(e) > 0),
            perfect=lambda df: df["perfect_match"].apply(lambda e: len(e) > 0)) \
    .value_counts(["retrieval", "perfect"])

retrieval  perfect
True       False      1375
           True        501
False      False       203
dtype: int64

## stage 1 embedding mapping

In [23]:
def query_equivalence(term_list: List[str]) -> List[Dict[str, Any]]:
    """Union of the embedding-based and full-text equivalence queries.

    Args:
        term_list: Query terms, passed unchanged to both queries.

    Returns:
        Embedding hits followed by full-text hits, de-duplicated by ``ent_id``.
    """
    combined = query_equivalence_embeddings(term_list) + query_equivalence_fulltext(term_list)
    return py_.uniq_by(combined, lambda hit: hit["ent_id"])
    
    
# Stage 1: take the traits that got no perfect match in stage 0, keep their
# composite hits under the name `composite_res`, and add `equivalence_res`
# (embedding + full-text hits from `query_equivalence`, obsolete entities removed).
# The original row index is kept, so it is no longer contiguous.
df_embeddings = df_composite[
    df_composite["perfect_match"].apply(lambda e: len(e) == 0)] \
    .drop(columns=["perfect_match"]) \
    .rename(columns={"query_res": "composite_res"}) \
    .assign(
        equivalence_res = lambda df:
        [
            unpick_obsolete(query_equivalence(_))
            for _ in tqdm(df["trait_term_query"], total=len(df))
        ]
    )

df_embeddings.info()
df_embeddings

100%|██████████████████████████████████████████████████████████████████████████████| 1578/1578 [00:49<00:00, 31.76it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1578 entries, 0 to 2076
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   trait_id          1578 non-null   object
 1   trait_term        1578 non-null   object
 2   phenotype         1578 non-null   object
 3   dataset           1578 non-null   object
 4   trait_type        1578 non-null   object
 5   trait_term_query  1578 non-null   object
 6   composite_res     1578 non-null   object
 7   equivalence_res   1578 non-null   object
dtypes: object(8)
memory usage: 111.0+ KB





Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type,trait_term_query,composite_res,equivalence_res
0,8-00,Intestinal infection,8,UKBB+MVP,phecode,[Intestinal infection],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007...
2,851-02,Intestinal e.coli,8.51,MVP,phecode,[Intestinal e.coli],[],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_1000...
3,852-03,Intestinal infection due to C. difficile,8.52,UKBB+MVP,phecode,[Intestinal infection due to C. difficile],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005...,[{'ent_id': 'http://purl.obolibrary.org/obo/MO...
4,86-04,Viral Enteritis,8.6,UKBB+MVP,phecode,[Viral Enteritis],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_1001...
5,87-05,Intestinal infection due to protozoa,8.7,MVP,phecode,[Intestinal infection due to protozoa],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005...
...,...,...,...,...,...,...,...,...
2069,fbrthozfrst-2071,What was the birth weight of your first child?...,FBrthOzFrst,MVP,lifestyle_survey,[What was the birth weight of your first child...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004...
2070,fbrthagefrst-2072,How old were you when you had your first child?,FBrthAgeFrst,UKBB+MVP,lifestyle_survey,[How old were you when you had your first child ],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0009...
2071,fbrthagelst-2073,How old were you when you had your last child?,FBrthAgeLst,UKBB+MVP,lifestyle_survey,[How old were you when you had your last child ],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0009...
2072,fbrstfeed-2074,Did you breastfeed your child/children for at...,FBrstFeed,MVP,lifestyle_survey,[Did you breastfeed your child children for a...,[{'ent_id': 'http://purl.obolibrary.org/obo/UO...,[{'ent_id': 'http://purl.obolibrary.org/obo/UO...


## stage 2 filter

In [24]:
def filter_by_ontology_distance(row):
    """Keep a row's equivalence hits whose ``vector_term`` passes ``query_distance``.

    Only the ``vector_term`` values of the first ten hits are scored, but the
    final filter is applied to every hit in ``equivalence_res``, so a later hit
    is still kept when it shares a ``vector_term`` with an accepted one.

    Args:
        row: A mapping or row with ``trait_term_query`` (list of query terms)
            and ``equivalence_res`` (list of hit documents).

    Returns:
        The retained hit documents, in their original order.
    """
    main_terms = row["trait_term_query"]
    shortlisted_terms = [hit["vector_term"] for hit in row["equivalence_res"][:10]]
    accepted_terms = query_distance(
        main_term_list=main_terms,
        compare_term_list=shortlisted_terms,
    )
    return [
        hit
        for hit in row["equivalence_res"]
        if hit["vector_term"] in accepted_terms
    ]

# Stage 2: add `equivalence_filter_res`, the equivalence hits that survive the
# ontology-distance filter. This calls the distance service row by row and is
# the slowest step of the notebook.
df_filter = df_embeddings \
    .assign(
      equivalence_filter_res=lambda df: [
          filter_by_ontology_distance(_)
          for idx, _ in tqdm(df.iterrows(), total=len(df))
      ]
    )

df_filter.info()

df_filter

100%|██████████████████████████████████████████████████████████████████████████████| 1578/1578 [06:00<00:00,  4.37it/s]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1578 entries, 0 to 2076
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   trait_id                1578 non-null   object
 1   trait_term              1578 non-null   object
 2   phenotype               1578 non-null   object
 3   dataset                 1578 non-null   object
 4   trait_type              1578 non-null   object
 5   trait_term_query        1578 non-null   object
 6   composite_res           1578 non-null   object
 7   equivalence_res         1578 non-null   object
 8   equivalence_filter_res  1578 non-null   object
dtypes: object(9)
memory usage: 123.3+ KB


Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type,trait_term_query,composite_res,equivalence_res,equivalence_filter_res
0,8-00,Intestinal infection,8,UKBB+MVP,phecode,[Intestinal infection],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007...
2,851-02,Intestinal e.coli,8.51,MVP,phecode,[Intestinal e.coli],[],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_1000...,[]
3,852-03,Intestinal infection due to C. difficile,8.52,UKBB+MVP,phecode,[Intestinal infection due to C. difficile],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005...,[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[]
4,86-04,Viral Enteritis,8.6,UKBB+MVP,phecode,[Viral Enteritis],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_1001...,[{'ent_id': 'http://purl.obolibrary.org/obo/MO...
5,87-05,Intestinal infection due to protozoa,8.7,MVP,phecode,[Intestinal infection due to protozoa],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0005...,[]
...,...,...,...,...,...,...,...,...,...
2069,fbrthozfrst-2071,What was the birth weight of your first child?...,FBrthOzFrst,MVP,lifestyle_survey,[What was the birth weight of your first child...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0004...,[]
2070,fbrthagefrst-2072,How old were you when you had your first child?,FBrthAgeFrst,UKBB+MVP,lifestyle_survey,[How old were you when you had your first child ],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0009...,[]
2071,fbrthagelst-2073,How old were you when you had your last child?,FBrthAgeLst,UKBB+MVP,lifestyle_survey,[How old were you when you had your last child ],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0009...,[]
2072,fbrstfeed-2074,Did you breastfeed your child/children for at...,FBrstFeed,MVP,lifestyle_survey,[Did you breastfeed your child children for a...,[{'ent_id': 'http://purl.obolibrary.org/obo/UO...,[{'ent_id': 'http://purl.obolibrary.org/obo/UO...,[]


In [25]:
# Persist the full stage 2 frame (all non-perfect-match traits) as JSON records.
output_path = proj_root / "data" / "output" / "results_equivalence.json"

with output_path.open("w") as f:
    json.dump(df_filter.to_dict(orient="records"), f)

In [26]:
# Cross-tabulate, per trait, whether the composite query and the (unfiltered)
# equivalence query each returned anything.
df_filter \
    .assign(composite=lambda df: df["composite_res"].apply(lambda e: len(e) > 0),
            equivalence=lambda df: df["equivalence_res"].apply(lambda e: len(e) > 0)) \
    .value_counts(["composite", "equivalence"])

composite  equivalence
True       True           1367
False      True            189
           False            14
True       False             8
dtype: int64

In [27]:
# Traits for which both the composite and the equivalence query came back
# empty; the index is reset so rows are numbered from 0.
df_empty = df_filter[
    (df_filter["composite_res"].apply(lambda e: len(e) == 0)) 
        & df_filter["equivalence_res"].apply(lambda e: len(e) == 0)
    ].reset_index(drop=True) 

df_empty.info()

df_empty

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   trait_id                14 non-null     object
 1   trait_term              14 non-null     object
 2   phenotype               14 non-null     object
 3   dataset                 14 non-null     object
 4   trait_type              14 non-null     object
 5   trait_term_query        14 non-null     object
 6   composite_res           14 non-null     object
 7   equivalence_res         14 non-null     object
 8   equivalence_filter_res  14 non-null     object
dtypes: object(9)
memory usage: 1.1+ KB


Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type,trait_term_query,composite_res,equivalence_res,equivalence_filter_res
0,1102-51,Dermatomycoses,110.2,UKBB+MVP,phecode,[Dermatomycoses],[],[],[]
1,134-65,Helminthiases,134,UKBB+MVP,phecode,[Helminthiases],[],[],[]
2,20223-163,Lymphosarcoma,202.23,MVP,phecode,[Lymphosarcoma],[],[],[]
3,2607-296,Polyphagia,260.7,MVP,phecode,[Polyphagia],[],[],[]
4,3841-710,Myringitis,384.1,MVP,phecode,[Myringitis],[],[],[]
5,4759-915,Postnasal drip,475.9,MVP,phecode,[Postnasal drip],[],[],[]
6,51332-972,Orthopnea,513.32,MVP,phecode,[Orthopnea],[],[],[]
7,5309-1044,Heartburn,530.9,UKBB+MVP,phecode,[Heartburn],[],[],[]
8,6121-1256,Galactorrhea,612.1,MVP,phecode,[Galactorrhea],[],[],[]
9,7262-1521,Synoviopathy,726.2,UKBB+MVP,phecode,[Synoviopathy],[],[],[]


In [28]:
# Hand-written replacement query terms, keyed by the original `trait_term`, for
# the traits that returned nothing from either query.
empty_manual_annotation = {
    "Dermatomycoses": "Dermatomycosis",
    "Helminthiases": "Helminthiasis",
    "Lymphosarcoma": "lymphoma",
    "Polyphagia": "hyperphagia",
    "Myringitis": "inflammation of the eardrum",
    "Postnasal drip": "sore throat",
    "Orthopnea": "shortness of breath",
    "Heartburn": "gastro-oesophageal reflux disease",
    "Galactorrhea": "lactation unassociated with childbirth",
    "Synoviopathy": "synovitis",
    "Osteochondropathies": "Osteochondritis dissecans",
    "Cervicalgia": "neck pain",
    "Cigars": "smoking",
    "Heigth": "height",
}

# Overwrite `trait_term_query` in place with a one-element list holding the
# replacement term. NOTE: this mutates `df_empty`, so the frame displayed by the
# cell that created it is stale after this cell runs.
for k, v in empty_manual_annotation.items():
    df_empty.loc[df_empty["trait_term"] == k, "trait_term_query"] = [[v]]
    
df_empty

Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type,trait_term_query,composite_res,equivalence_res,equivalence_filter_res
0,1102-51,Dermatomycoses,110.2,UKBB+MVP,phecode,[Dermatomycosis],[],[],[]
1,134-65,Helminthiases,134,UKBB+MVP,phecode,[Helminthiasis],[],[],[]
2,20223-163,Lymphosarcoma,202.23,MVP,phecode,[lymphoma],[],[],[]
3,2607-296,Polyphagia,260.7,MVP,phecode,[hyperphagia],[],[],[]
4,3841-710,Myringitis,384.1,MVP,phecode,[inflammation of the eardrum],[],[],[]
5,4759-915,Postnasal drip,475.9,MVP,phecode,[sore throat],[],[],[]
6,51332-972,Orthopnea,513.32,MVP,phecode,[shortness of breath],[],[],[]
7,5309-1044,Heartburn,530.9,UKBB+MVP,phecode,[gastro-oesophageal reflux disease],[],[],[]
8,6121-1256,Galactorrhea,612.1,MVP,phecode,[lactation unassociated with childbirth],[],[],[]
9,7262-1521,Synoviopathy,726.2,UKBB+MVP,phecode,[synovitis],[],[],[]


In [29]:
# Re-run all three stages on the manually re-annotated traits: composite query,
# equivalence query (embedding + full-text hits de-duplicated by `ent_id`, then
# obsolete entities removed), and finally the ontology-distance filter.
# NOTE(review): the `equivalence_res` expression repeats what
# `unpick_obsolete(query_equivalence(e))` computes -- consider reusing it.
df_empty_res = df_empty \
    .assign(
        composite_res = lambda df: df["trait_term_query"] \
            .apply(
                lambda e: unpick_obsolete(query_composite(e))
            ),
        equivalence_res = lambda df: df["trait_term_query"] \
            .apply(
                lambda e: py_.chain(
                    query_equivalence_embeddings(e) + query_equivalence_fulltext(e)
                ).uniq_by(lambda e: e["ent_id"]).thru(unpick_obsolete).value()
        ),
    ) \
    .assign(
        equivalence_filter_res=lambda df: df.apply(filter_by_ontology_distance, axis=1)
    )

df_empty_res.info()

df_empty_res

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   trait_id                14 non-null     object
 1   trait_term              14 non-null     object
 2   phenotype               14 non-null     object
 3   dataset                 14 non-null     object
 4   trait_type              14 non-null     object
 5   trait_term_query        14 non-null     object
 6   composite_res           14 non-null     object
 7   equivalence_res         14 non-null     object
 8   equivalence_filter_res  14 non-null     object
dtypes: object(9)
memory usage: 1.1+ KB


Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type,trait_term_query,composite_res,equivalence_res,equivalence_filter_res
0,1102-51,Dermatomycoses,110.2,UKBB+MVP,phecode,[Dermatomycosis],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://purl.obolibrary.org/obo/MO...
1,134-65,Helminthiases,134,UKBB+MVP,phecode,[Helminthiasis],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_1001...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_1001...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_1001...
2,20223-163,Lymphosarcoma,202.23,MVP,phecode,[lymphoma],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0000...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_1000...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0000...
3,2607-296,Polyphagia,260.7,MVP,phecode,[hyperphagia],[],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[]
4,3841-710,Myringitis,384.1,MVP,phecode,[inflammation of the eardrum],[],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[]
5,4759-915,Postnasal drip,475.9,MVP,phecode,[sore throat],[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[{'ent_id': 'http://purl.obolibrary.org/obo/MO...
6,51332-972,Orthopnea,513.32,MVP,phecode,[shortness of breath],[],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0009...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0009...
7,5309-1044,Heartburn,530.9,UKBB+MVP,phecode,[gastro-oesophageal reflux disease],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0003...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0003...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0003...
8,6121-1256,Galactorrhea,612.1,MVP,phecode,[lactation unassociated with childbirth],[],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0009...,[]
9,7262-1521,Synoviopathy,726.2,UKBB+MVP,phecode,[synovitis],[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0008...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0008...,[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0008...


In [30]:
# Persist the results for the manually re-annotated traits as JSON records.
output_path = proj_root / "data" / "output" / "results_empty.json"

with output_path.open("w") as f:
    json.dump(df_empty_res.to_dict(orient="records"), f)

# Finalise