# MVP phenotype terms, round 2, wrap up

In [1]:
# Auto-reload imported modules before each cell runs, so edits to local .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# NOTE(review): lab_black is presumably the JupyterLab black-formatting extension — confirm it is installed in this environment.
%load_ext lab_black

In [2]:
import sys
from pathlib import Path

In [3]:
# Put the notebook's own directory on sys.path (presumably so the modules that
# sit next to this notebook can be imported — confirm against the import cell).
_pwd = Path(".").resolve()
print(_pwd)
# Guard the append so that re-running this cell does not stack duplicate entries.
if str(_pwd) not in sys.path:
    sys.path.append(str(_pwd))

/data/ik18445_cache/projects/phenotype-mapping/analysis/notebooks/mvp_round_2


In [30]:
from typing import List
import re
import json
import math

import pandas as pd
import janitor
import numpy as np
from pydash import py_
from Levenshtein import distance

from common_funcs import utils
from analysis_funcs import paths
import mvp_funcs, mvp_types

In [2]:
# Resolve the project root (find_project_root is given "docker-compose.yml",
# presumably the marker file it searches for) and fail fast if it is missing.
proj_root = utils.find_project_root("docker-compose.yml")
assert proj_root.exists(), proj_root

# All inputs for this notebook live under <root>/data/output/mvp-mapping-round-2.
data_path = proj_root / "data"
input_path = data_path.joinpath("output", "mvp-mapping-round-2")
assert input_path.exists(), input_path

# Load input data

In [4]:
# Load the augmented trait terms (one record per trait) from the round-2 output directory.
input_file = input_path / "mvp-terms-augmented.json"
assert input_file.exists(), input_file

# Decode explicitly as UTF-8 instead of relying on the platform's locale default.
with input_file.open(encoding="utf-8") as f:
    mvp_terms = json.load(f)

print(len(mvp_terms))

5404


In [6]:
# Peek at the first record to see which keys an augmented-term entry carries.
print(mvp_terms[0])

{'trait_id': 'UKBB+MVP-8-00', 'trait_term': 'Intestinal infection', 'trait_term_clean': 'Intestinal infection', 'trait_basic_info': {'Phenotype': '8', 'description': 'Intestinal infection', 'dataset': 'UKBB+MVP'}, 'regular_ents': ['Intestinal infection'], 'kb_ents': ['Intestinal infectious disease (disorder)', 'Small Intestinal Infection', 'Intestinal Diseases, Parasitic', 'Infection of digestive system', 'Gastrointestinal infection'], 'ents': ['Intestinal infection', 'Intestinal infectious disease (disorder)', 'Small Intestinal Infection', 'Intestinal Diseases, Parasitic', 'Infection of digestive system', 'Gastrointestinal infection']}


In [5]:
# Load the candidate mappings produced for each trait from the round-2 output directory.
input_file = input_path / "mvp-mapping.json"
assert input_file.exists(), input_file

# Decode explicitly as UTF-8 instead of relying on the platform's locale default.
with input_file.open(encoding="utf-8") as f:
    mvp_mappings = json.load(f)

print(len(mvp_mappings))

5404


In [7]:
# Peek at the first mapping record (the printed dict is long: it nests the candidate lists).
print(mvp_mappings[0])

{'trait_id': 'UKBB+MVP-8-00', 'trait_term_mapping': {'term': 'Intestinal infection', 'cands_embeddings': [], 'cands_fulltext': [{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007231', 'ent_term': 'cysticercosis', 'vector_term': 'intestinal taenia solium infection', 'primary_term': False}, {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007231', 'ent_term': 'cysticercosis', 'vector_term': 'tapeworm infection: intestinal taenia solum', 'primary_term': False}, {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007230', 'ent_term': 'cyclosporiasis', 'vector_term': 'intestinal infection caused by Cyclospora cayetanensis', 'primary_term': False}]}, 'trait_ents_mapping': [{'term': 'Intestinal infection', 'cands_embeddings': [], 'cands_fulltext': [{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007231', 'ent_term': 'cysticercosis', 'vector_term': 'intestinal taenia solium infection', 'primary_term': False}, {'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007231', 'ent_term': 'cysticercosis', 'vector_term': 'tapeworm infectio

# Processing

In [22]:
# Flag vocabulary stored in the annotation metadata: four outcome flags,
# followed by one flag per source dataset ("<name> dataset").
flags = [
    {"name": "good", "desc": "Results for this item are good"},
    {"name": "poor", "desc": "Results for this item are very poor"},
    {
        "name": "review",
        "desc": "Require further review for the results to make a decision",
    },
    {
        "name": "unmappable",
        "desc": "This item is not mappable in its current state (i.e. need to use an alternative query term)",
    },
    *(
        {"name": dataset, "desc": f"{dataset} dataset"}
        for dataset in ("UKBB+MVP", "MVP", "UKBB", "FinnGen", "Biobank_Japan")
    ),
]

In [10]:
# Keep only the mapping columns used downstream, renaming
# candidates_subset -> candidates and default_picks -> selection.
cols = [
    "trait_id",
    "trait_term_mapping",
    "trait_ents_mapping",
    "candidates_subset",
    "default_picks",
]
cols_rename = {"candidates_subset": "candidates", "default_picks": "selection"}
df_mappings = pd.DataFrame(mvp_mappings).loc[:, cols].rename(columns=cols_rename)

In [23]:
# Assemble the per-trait annotation table: trait info + mapping candidates,
# plus empty placeholder columns (external_selection, trait_flags, cand_flags, notes).
cols = ["trait_id", "trait_term", "trait_term_clean", "trait_basic_info", "ents"]
df_main = (
    pd.DataFrame(mvp_terms)[cols]
    .assign(
        # Category comes from the "dataset" key when present, otherwise from "source".
        category=lambda df: df["trait_basic_info"].map(
            lambda info: info["dataset"] if "dataset" in info else info["source"]
        ),
        # Built with a comprehension instead of a row-wise DataFrame.apply:
        # faster, and the result shape does not depend on the frame being non-empty.
        augmentation_info=lambda df: [
            {"trait_term_clean": term_clean, "ents": ents}
            for term_clean, ents in zip(df["trait_term_clean"], df["ents"])
        ],
    )
    .drop(columns=["ents", "trait_term_clean"])
    # validate= makes pandas raise if trait_id is duplicated on either side,
    # instead of silently multiplying rows.
    .merge(df_mappings, on=["trait_id"], validate="one_to_one")
    .assign(
        # A fresh list per row, so the cells never share one mutable object.
        external_selection=lambda df: [[] for _ in range(len(df))],
        trait_flags=lambda df: [[] for _ in range(len(df))],
        cand_flags=lambda df: [[] for _ in range(len(df))],
        notes=None,
    )
)
print(len(df_main))

5404


In [24]:
# Preview the first rows of the assembled table.
df_main.head()

Unnamed: 0,trait_id,trait_term,trait_basic_info,category,augmentation_info,trait_term_mapping,trait_ents_mapping,candidates,selection,external_selection,trait_flags,cand_flags,notes
0,UKBB+MVP-8-00,Intestinal infection,"{'Phenotype': '8', 'description': 'Intestinal ...",UKBB+MVP,"{'trait_term_clean': 'Intestinal infection', '...","{'term': 'Intestinal infection', 'cands_embedd...","[{'term': 'Intestinal infection', 'cands_embed...",[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[http://purl.obolibrary.org/obo/MONDO_0043424],[],[],[],
1,UKBB+MVP-85-01,Bacterial enteritis,"{'Phenotype': '8.5', 'description': 'Bacterial...",UKBB+MVP,"{'trait_term_clean': 'Bacterial enteritis', 'e...","{'term': 'Bacterial enteritis', 'cands_embeddi...","[{'term': 'Bacterial enteritis', 'cands_embedd...",[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[http://purl.obolibrary.org/obo/MONDO_0000916],[],[],[],
2,MVP-851-02,Intestinal e.coli,"{'Phenotype': '8.51', 'description': 'Intestin...",MVP,"{'trait_term_clean': 'Intestinal e.coli', 'ent...","{'term': 'Intestinal e.coli', 'cands_embedding...","[{'term': 'Intestinal', 'cands_embeddings': [{...",[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0000...,[http://www.ebi.ac.uk/efo/EFO_0000834],[],[],[],
3,UKBB+MVP-852-03,Intestinal infection due to C. difficile,"{'Phenotype': '8.52', 'description': 'Intestin...",UKBB+MVP,{'trait_term_clean': 'Intestinal infection due...,{'term': 'Intestinal infection due to C. diffi...,"[{'term': 'Intestinal infection', 'cands_embed...",[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[http://purl.obolibrary.org/obo/MONDO_0043424],[],[],[],
4,UKBB+MVP-86-04,Viral Enteritis,"{'Phenotype': '8.6', 'description': 'Viral Ent...",UKBB+MVP,"{'trait_term_clean': 'Viral Enteritis', 'ents'...","{'term': 'Viral Enteritis', 'cands_embeddings'...","[{'term': 'Viral Enteritis', 'cands_embeddings...",[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[http://purl.obolibrary.org/obo/MONDO_0043579],[],[],[],


In [32]:
def create_flags(item, threshold=2):
    """Derive the initial trait-level flags for one annotation record.

    Parameters
    ----------
    item : mapping (e.g. a DataFrame row or a dict)
        Must provide ``category``, ``candidates`` (each with ``ent_id``,
        ``ent_term`` and optionally ``synonyms``), ``selection`` (the picked
        ``ent_id`` values) and ``augmentation_info["trait_term_clean"]``.
    threshold : int, default 2
        Maximum Levenshtein distance (case-insensitive) between the cleaned
        trait term and a selected candidate's term or synonym for the trait
        to be flagged "good".

    Returns
    -------
    list of str
        ``[category]``, then "good" if any selected term is within
        ``threshold``, then "review" if something is selected or
        "unmappable" if the selection is empty.
    """
    trait_flags = [item["category"]]
    selected_ids = item["selection"]
    # Lowercase the query once rather than once per comparison.
    query_term = item["augmentation_info"]["trait_term_clean"].lower()
    # Primary term + synonyms of the selected candidates, lowercased and de-duplicated.
    # A candidate with a missing or null "synonyms" entry contributes only its primary term.
    picked_terms = {
        term.lower()
        for cand in item["candidates"]
        if cand["ent_id"] in selected_ids
        for term in [cand["ent_term"], *(cand.get("synonyms") or [])]
    }
    if any(distance(query_term, term) <= threshold for term in picked_terms):
        trait_flags.append("good")
    # NOTE(review): "good" and "review" can both be set for the same trait; this
    # matches the existing outputs — confirm it is intended rather than a missing elif.
    trait_flags.append("review" if len(selected_ids) > 0 else "unmappable")
    return trait_flags

# Fill trait_flags by running create_flags on every row of df_main.
df_main_stage1 = df_main.assign(
    trait_flags=lambda df: df.apply(create_flags, axis=1)
)

df_main_stage1

Unnamed: 0,trait_id,trait_term,trait_basic_info,category,augmentation_info,trait_term_mapping,trait_ents_mapping,candidates,selection,external_selection,trait_flags,cand_flags,notes
0,UKBB+MVP-8-00,Intestinal infection,"{'Phenotype': '8', 'description': 'Intestinal ...",UKBB+MVP,"{'trait_term_clean': 'Intestinal infection', '...","{'term': 'Intestinal infection', 'cands_embedd...","[{'term': 'Intestinal infection', 'cands_embed...",[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[http://purl.obolibrary.org/obo/MONDO_0043424],[],"[UKBB+MVP, review]",[],
1,UKBB+MVP-85-01,Bacterial enteritis,"{'Phenotype': '8.5', 'description': 'Bacterial...",UKBB+MVP,"{'trait_term_clean': 'Bacterial enteritis', 'e...","{'term': 'Bacterial enteritis', 'cands_embeddi...","[{'term': 'Bacterial enteritis', 'cands_embedd...",[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[http://purl.obolibrary.org/obo/MONDO_0000916],[],"[UKBB+MVP, good, review]",[],
2,MVP-851-02,Intestinal e.coli,"{'Phenotype': '8.51', 'description': 'Intestin...",MVP,"{'trait_term_clean': 'Intestinal e.coli', 'ent...","{'term': 'Intestinal e.coli', 'cands_embedding...","[{'term': 'Intestinal', 'cands_embeddings': [{...",[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0000...,[http://www.ebi.ac.uk/efo/EFO_0000834],[],"[MVP, review]",[],
3,UKBB+MVP-852-03,Intestinal infection due to C. difficile,"{'Phenotype': '8.52', 'description': 'Intestin...",UKBB+MVP,{'trait_term_clean': 'Intestinal infection due...,{'term': 'Intestinal infection due to C. diffi...,"[{'term': 'Intestinal infection', 'cands_embed...",[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[http://purl.obolibrary.org/obo/MONDO_0043424],[],"[UKBB+MVP, review]",[],
4,UKBB+MVP-86-04,Viral Enteritis,"{'Phenotype': '8.6', 'description': 'Viral Ent...",UKBB+MVP,"{'trait_term_clean': 'Viral Enteritis', 'ents'...","{'term': 'Viral Enteritis', 'cands_embeddings'...","[{'term': 'Viral Enteritis', 'cands_embeddings...",[{'ent_id': 'http://purl.obolibrary.org/obo/MO...,[http://purl.obolibrary.org/obo/MONDO_0043579],[],"[UKBB+MVP, review]",[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5399,Biobank_Japan-Spinal_canal_stenosis-3320,Spinal canal stenosis,"{'phenocode': 'Spinal_canal_stenosis', 'name':...",Biobank_Japan,"{'trait_term_clean': 'Spinal canal stenosis', ...","{'term': 'Spinal canal stenosis', 'cands_embed...","[{'term': 'Spinal canal stenosis', 'cands_embe...",[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0007...,[http://www.ebi.ac.uk/efo/EFO_0007490],[],"[Biobank_Japan, review]",[],
5400,Biobank_Japan-Mastopathy-3321,Mastopathy,"{'phenocode': 'Mastopathy', 'name': 'Mastopath...",Biobank_Japan,"{'trait_term_clean': 'Mastopathy', 'ents': ['M...","{'term': 'Mastopathy', 'cands_embeddings': [],...","[{'term': 'Mastopathy', 'cands_embeddings': []...",[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0009...,[http://www.ebi.ac.uk/efo/EFO_0009483],[],"[Biobank_Japan, review]",[],
5401,Biobank_Japan-Schizophrenia-3322,Schizophrenia,"{'phenocode': 'Schizophrenia', 'name': 'Schizo...",Biobank_Japan,"{'trait_term_clean': 'Schizophrenia', 'ents': ...","{'term': 'Schizophrenia', 'cands_embeddings': ...","[{'term': 'Schizophrenia', 'cands_embeddings':...",[{'ent_id': 'http://www.ebi.ac.uk/efo/EFO_0000...,"[http://www.ebi.ac.uk/efo/EFO_0000692, http://...",[],"[Biobank_Japan, good, review]",[],
5402,Biobank_Japan-Tonsillitis-3323,Tonsillitis,"{'phenocode': 'Tonsillitis', 'name': 'Tonsilli...",Biobank_Japan,"{'trait_term_clean': 'Tonsillitis', 'ents': ['...","{'term': 'Tonsillitis', 'cands_embeddings': []...","[{'term': 'Tonsillitis', 'cands_embeddings': [...",[{'ent_id': 'http://purl.obolibrary.org/obo/HP...,[],[],"[Biobank_Japan, unmappable]",[],


# Post-processing diagnostics

In [27]:
# Number of traits per category (the dataset/source each trait came from).
df_main_stage1["category"].value_counts()

FinnGen          3095
UKBB+MVP         1490
MVP               575
Biobank_Japan     230
UKBB               14
Name: category, dtype: int64

In [33]:
# Frequency of each flag combination assigned by create_flags.
# NOTE(review): trait_flags holds lists, which are unhashable — confirm value_counts
# still accepts them on the pandas version in use.
df_main_stage1["trait_flags"].value_counts()

[FinnGen, review]                2194
[UKBB+MVP, review]                867
[FinnGen, good, review]           632
[UKBB+MVP, good, review]          534
[MVP, review]                     352
[FinnGen, unmappable]             269
[MVP, good, review]               172
[Biobank_Japan, good, review]     158
[UKBB+MVP, unmappable]             89
[MVP, unmappable]                  51
[Biobank_Japan, review]            47
[Biobank_Japan, unmappable]        25
[UKBB, good, review]                6
[UKBB, review]                      5
[UKBB, unmappable]                  3
Name: trait_flags, dtype: int64

# Wrap up

In [35]:
# Bundle the flag vocabulary and the per-trait records into one annotation payload.
metadata = {"flags": flags}
annotation_records = df_main_stage1.to_dict(orient="records")
annotation_data = {record["trait_id"]: record for record in annotation_records}
# Keying by trait_id would silently drop rows that share an id, so fail loudly instead.
assert len(annotation_data) == len(annotation_records), "duplicate trait_id in df_main_stage1"

annotation_res = {
    "metadata": metadata,
    "data": annotation_data,
}

# Write next to the inputs. input_path is used directly rather than input_file.parent,
# because input_file is reassigned by more than one earlier cell.
output_file = input_path / "mvp-annotation-round-2.json"
with output_file.open("w", encoding="utf-8") as f:
    json.dump(annotation_res, f)