# MVP phenotype terms, round 2, augmentation

# Init

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load the lab_black extension (black-based formatting of code cells).
%load_ext lab_black

In [2]:
import sys
from pathlib import Path

In [3]:
# Resolve the notebook's working directory and make it importable
# (presumably this is where the notebook-local modules such as mvp_funcs
# and mvp_types live -- confirm against the repository layout).
_pwd = Path(".").resolve()
print(_pwd)
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if str(_pwd) not in sys.path:
    sys.path.append(str(_pwd))

/data/ik18445_cache/projects/phenotype-mapping/analysis/notebooks/mvp_round_2


In [4]:
from typing import List
import re
import json

import pandas as pd
import janitor
import numpy as np
from pydash import py_

import spacy
import scispacy

# abbrevs not useful in this dataset
# from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker

import ray

from common_funcs import utils
from analysis_funcs import paths
import mvp_funcs, mvp_types

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [5]:
# Locate the project root via the project helper, keyed on the
# "docker-compose.yml" file name, and fail early if it is missing.
proj_root = utils.find_project_root("docker-compose.yml")
assert proj_root.exists(), proj_root

# Inputs for this notebook are read from <root>/data/output/mvp-mapping-round-2.
data_path = proj_root.joinpath("data")
input_path = data_path.joinpath("output", "mvp-mapping-round-2")
assert input_path.exists(), input_path

# Location of the scispaCy model, taken from the project's path registry.
model_path = paths.models["scispacy_lg"]
assert model_path.exists(), model_path

# Load in

In [6]:
# Load the spaCy pipeline stored at model_path.
scispacy_model = spacy.load(model_path)
# scispacy_model.add_pipe("abbreviation_detector")
# Register the scispaCy entity linker configured for the "umls" linker name.
# The call stays as the cell's last expression so the component repr remains
# the cell output.
scispacy_model.add_pipe(
    "scispacy_linker",
    config=dict(resolve_abbreviations=True, linker_name="umls"),
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


<scispacy.linking.EntityLinker at 0x7ff458463c40>

In [7]:
# Fetch the pipeline component named "scispacy_linker" from the loaded model
# so it can be handed to the annotation step explicitly.
linker = scispacy_model.get_pipe("scispacy_linker")

In [8]:
# Read the cleaned term records (JSON) into a DataFrame.
input_file = input_path.joinpath("mvp-terms-clean.json")
assert input_file.exists(), input_file
with input_file.open() as f:
    df_init = pd.DataFrame(json.loads(f.read()))

# Check the loaded frame against the project's CleanedDf schema.
mvp_types.CleanedDf.validate(df_init)

df_init

Unnamed: 0,trait_id,trait_term,trait_term_clean,trait_basic_info
0,UKBB+MVP-8-00,Intestinal infection,Intestinal infection,"{'Phenotype': '8', 'description': 'Intestinal ..."
1,UKBB+MVP-85-01,Bacterial enteritis,Bacterial enteritis,"{'Phenotype': '8.5', 'description': 'Bacterial..."
2,MVP-851-02,Intestinal e.coli,Intestinal e.coli,"{'Phenotype': '8.51', 'description': 'Intestin..."
3,UKBB+MVP-852-03,Intestinal infection due to C. difficile,Intestinal infection due to C. difficile,"{'Phenotype': '8.52', 'description': 'Intestin..."
4,UKBB+MVP-86-04,Viral Enteritis,Viral Enteritis,"{'Phenotype': '8.6', 'description': 'Viral Ent..."
...,...,...,...,...
5399,Biobank_Japan-Spinal_canal_stenosis-3320,Spinal canal stenosis,Spinal canal stenosis,"{'phenocode': 'Spinal_canal_stenosis', 'name':..."
5400,Biobank_Japan-Mastopathy-3321,Mastopathy,Mastopathy,"{'phenocode': 'Mastopathy', 'name': 'Mastopath..."
5401,Biobank_Japan-Schizophrenia-3322,Schizophrenia,Schizophrenia,"{'phenocode': 'Schizophrenia', 'name': 'Schizo..."
5402,Biobank_Japan-Tonsillitis-3323,Tonsillitis,Tonsillitis,"{'phenocode': 'Tonsillitis', 'name': 'Tonsilli..."


# Processing

In [9]:
def annotate_terms(item, scispacy_model, linker):
    """Annotate a single trait record with the entities found in its term.

    Args:
        item: Mapping that provides the keys "trait_id" and
            "trait_term_clean".
        scispacy_model: Callable pipeline; it is applied to the cleaned term
            and the resulting document's ``ents`` attribute is read.
        linker: Linker object forwarded unchanged to
            ``mvp_funcs.get_kb_ents``.

    Returns:
        A dict with three keys: "trait_id" (copied from ``item``),
        "regular_ents" (the document entities rendered with ``str``) and
        "kb_ents" (whatever ``mvp_funcs.get_kb_ents`` returns for those
        entities).
    """
    doc = scispacy_model(item["trait_term_clean"])
    entities = doc.ents
    linked = mvp_funcs.get_kb_ents(ents=entities, linker=linker)
    return dict(
        trait_id=item["trait_id"],
        regular_ents=list(map(str, entities)),
        kb_ents=linked,
    )

In [10]:
%%time
# Only the id and the cleaned term are needed by the annotation step.
input_items = df_init.loc[:, ["trait_id", "trait_term_clean"]].to_dict(
    orient="records"
)

# Annotate the records one at a time, preserving input order.
annotated_items = [
    annotate_terms(item=record, scispacy_model=scispacy_model, linker=linker)
    for record in input_items
]

# Print both counts so they can be compared by eye.
print(len(input_items))
print(len(annotated_items))

  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


5404
5404
CPU times: user 2min 7s, sys: 58.9 s, total: 3min 6s
Wall time: 1min 52s


In [11]:
# One row per annotated record: trait_id, regular_ents, kb_ents.
df_annotated = pd.DataFrame(annotated_items)

# validate="one_to_one" makes pandas raise a MergeError when trait_id is
# duplicated on either side, instead of silently fanning out rows in the join.
df_augment = df_init.merge(
    df_annotated, on=["trait_id"], validate="one_to_one"
).assign(
    # ents: regular and KB entities concatenated, then de-duplicated on their
    # lower-cased form.
    ents=lambda df: df.apply(
        lambda row: py_.chain(row["regular_ents"] + row["kb_ents"])
        .uniq_by(lambda e: e.lower())
        .value(),
        axis=1,
    )
)

# The join must neither drop nor duplicate traits.
assert len(df_augment) == len(df_init), (len(df_augment), len(df_init))

In [12]:
# Write the augmented table next to the input file as a JSON array of
# row records.
output_file = input_file.with_name("mvp-terms-augmented.json")
with output_file.open("w") as f:
    f.write(json.dumps(df_augment.to_dict(orient="records")))

In [13]:
# CSV export of the id, term and entity columns; the combined `ents` column
# is not part of this file.
output_file = input_file.with_name("mvp-terms-augmented-flat.csv")
cols = ["trait_id", "trait_term", "trait_term_clean", "regular_ents", "kb_ents"]
df_augment.to_csv(output_file, columns=cols, index=False)

# Post-processing diagnostics

# Wrap up