# MVP phenotype terms, round 2, encode terms

In [1]:
# Reload edited modules automatically before each cell executes, so changes
# to the imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# lab_black: JupyterLab extension that formats code cells with black.
%load_ext lab_black

In [2]:
import sys
from pathlib import Path

In [3]:
# Put the notebook's own directory on the import path so that modules stored
# alongside this notebook can be imported.
_pwd = Path(".").resolve()
print(_pwd)
# Guard the append: re-running this cell must not pile up duplicate entries.
if str(_pwd) not in sys.path:
    sys.path.append(str(_pwd))

/data/ik18445_cache/projects/phenotype-mapping/analysis/notebooks/mvp_round_2


In [4]:
from typing import List
import re
import json
import math

import pandas as pd
import janitor
import numpy as np
from pydash import py_

import spacy
import scispacy

import ray

from common_funcs import utils
from analysis_funcs import paths
import mvp_funcs

In [5]:
# Resolve the project root (presumably the directory that contains
# docker-compose.yml -- confirm against utils.find_project_root).
proj_root = utils.find_project_root("docker-compose.yml")
assert proj_root.exists(), proj_root

# Top-level data directory of the project.
data_path = proj_root.joinpath("data")

# Directory with the round-2 MVP mapping files; this notebook reads its input
# from here and writes its output back to the same place.
input_path = data_path.joinpath("output", "mvp-mapping-round-2")
assert input_path.exists(), input_path

# Location of the model registered under the "scispacy_lg" key.
model_path = paths.models["scispacy_lg"]
assert model_path.exists(), model_path

In [6]:
# Number of Ray ItemEncoder actors to start; the input records are also
# partitioned based on this value so the actors can work in parallel.
NUM_WORKERS = 4

# Load input data

In [7]:
# Read the augmented MVP terms (JSON) into a DataFrame.
input_file = input_path.joinpath("mvp-terms-augmented.json")
assert input_file.exists(), input_file
df_init = pd.DataFrame(json.loads(input_file.read_text()))

df_init

Unnamed: 0,trait_id,trait_term,trait_term_clean,trait_basic_info,regular_ents,kb_ents,ents
0,UKBB+MVP-8-00,Intestinal infection,Intestinal infection,"{'Phenotype': '8', 'description': 'Intestinal ...",[Intestinal infection],"[Intestinal infectious disease (disorder), Sma...","[Intestinal infection, Intestinal infectious d..."
1,UKBB+MVP-85-01,Bacterial enteritis,Bacterial enteritis,"{'Phenotype': '8.5', 'description': 'Bacterial...",[Bacterial enteritis],"[Bacterial enteritis, Enteritis, Bacterial gas...","[Bacterial enteritis, Enteritis, Bacterial gas..."
2,MVP-851-02,Intestinal e.coli,Intestinal e.coli,"{'Phenotype': '8.51', 'description': 'Intestin...",[Intestinal],"[Intestines, Intestinal route, Intestinal Neur...","[Intestinal, Intestines, Intestinal route, Int..."
3,UKBB+MVP-852-03,Intestinal infection due to C. difficile,Intestinal infection due to C. difficile,"{'Phenotype': '8.52', 'description': 'Intestin...","[Intestinal infection, C. difficile]","[Intestinal infectious disease (disorder), Sma...","[Intestinal infection, C. difficile, Intestina..."
4,UKBB+MVP-86-04,Viral Enteritis,Viral Enteritis,"{'Phenotype': '8.6', 'description': 'Viral Ent...",[Viral Enteritis],"[Viral enteritis, Mink Viral Enteritis, Enteri...","[Viral Enteritis, Mink Viral Enteritis, Enteri..."
...,...,...,...,...,...,...,...
5399,Biobank_Japan-Spinal_canal_stenosis-3320,Spinal canal stenosis,Spinal canal stenosis,"{'phenocode': 'Spinal_canal_stenosis', 'name':...",[Spinal canal stenosis],"[Spinal canal stenosis, Spinal Stenosis, Cervi...","[Spinal canal stenosis, Spinal Stenosis, Cervi..."
5400,Biobank_Japan-Mastopathy-3321,Mastopathy,Mastopathy,"{'phenocode': 'Mastopathy', 'name': 'Mastopath...",[Mastopathy],[Breast Diseases],"[Mastopathy, Breast Diseases]"
5401,Biobank_Japan-Schizophrenia-3322,Schizophrenia,Schizophrenia,"{'phenocode': 'Schizophrenia', 'name': 'Schizo...",[Schizophrenia],"[Schizophrenia, SCHIZOPHRENIA 1 (disorder), SC...","[Schizophrenia, SCHIZOPHRENIA 1 (disorder), SC..."
5402,Biobank_Japan-Tonsillitis-3323,Tonsillitis,Tonsillitis,"{'phenocode': 'Tonsillitis', 'name': 'Tonsilli...",[Tonsillitis],"[Tonsillitis, Acute tonsillitis]","[Tonsillitis, Acute tonsillitis]"


# Processing

In [8]:
# Set to an integer to encode only the first N terms (quick trial run);
# None processes every term.
SAMPLE_SIZE = None

sample_df = df_init if SAMPLE_SIZE is None else df_init[:SAMPLE_SIZE]
sample = sample_df.to_dict(orient="records")

# Split the records into exactly NUM_WORKERS contiguous chunks whose sizes
# differ by at most one.  Using floor(len / NUM_WORKERS) as a chunk size
# yields an extra remainder chunk whenever the length is not divisible by
# NUM_WORKERS (which is never sent to an encoder), and a chunk size of 0
# when there are fewer records than workers.
base_size, remainder = divmod(len(sample), NUM_WORKERS)
sample_chunks = []
chunk_start = 0
for worker_idx in range(NUM_WORKERS):
    # The first `remainder` chunks each take one of the left-over records.
    chunk_stop = chunk_start + base_size + (1 if worker_idx < remainder else 0)
    sample_chunks.append(sample[chunk_start:chunk_stop])
    chunk_start = chunk_stop

# Every record must land in exactly one chunk.
assert len(sample_chunks) == NUM_WORKERS
assert sum(len(chunk) for chunk in sample_chunks) == len(sample)

print(len(sample_chunks))
print(len(sample_chunks[0]))

4
1351


In [9]:
# Start one ItemEncoder Ray actor per worker slot; each actor is given its
# slot index and initialises its own model from model_path.
encoders = [
    mvp_funcs.ItemEncoder.remote(idx=worker_idx, model_path=model_path)
    for worker_idx in range(NUM_WORKERS)
]

2022-11-07 12:11:43,521	INFO worker.py:1518 -- Started a local Ray instance.


[2m[36m(ItemEncoder pid=21753)[0m Encoder 3: Init model
[2m[36m(ItemEncoder pid=21750)[0m Encoder 0: Init model
[2m[36m(ItemEncoder pid=21751)[0m Encoder 1: Init model
[2m[36m(ItemEncoder pid=21752)[0m Encoder 2: Init model
[2m[36m(ItemEncoder pid=21753)[0m Encoder 3: Model loaded
[2m[36m(ItemEncoder pid=21750)[0m Encoder 0: Model loaded
[2m[36m(ItemEncoder pid=21751)[0m Encoder 1: Model loaded
[2m[36m(ItemEncoder pid=21752)[0m Encoder 2: Model loaded


In [10]:
# Dispatch every chunk to an encoder, cycling round-robin through the actors.
# Pairing chunks with encoders purely by encoder position would silently skip
# any chunk beyond len(encoders) and raise IndexError when there are fewer
# chunks than encoders.
chunk_futures = [
    encoders[chunk_idx % len(encoders)].encode_chunk.remote(chunk)
    for chunk_idx, chunk in enumerate(sample_chunks)
]

# ray.get returns the results in the same order as the submitted futures, so
# flattening the per-chunk lists preserves the original record order.
sample_res = py_.flatten(ray.get(chunk_futures))

[2m[36m(ItemEncoder pid=21753)[0m Encoder 3: Start to process 1351 items
[2m[36m(ItemEncoder pid=21753)[0m 12:12:21 Encoder 3: # 0
[2m[36m(ItemEncoder pid=21750)[0m Encoder 0: Start to process 1351 items
[2m[36m(ItemEncoder pid=21750)[0m 12:12:21 Encoder 0: # 0
[2m[36m(ItemEncoder pid=21751)[0m Encoder 1: Start to process 1351 items
[2m[36m(ItemEncoder pid=21751)[0m 12:12:21 Encoder 1: # 0
[2m[36m(ItemEncoder pid=21752)[0m Encoder 2: Start to process 1351 items
[2m[36m(ItemEncoder pid=21752)[0m 12:12:21 Encoder 2: # 0
[2m[36m(ItemEncoder pid=21753)[0m 12:12:31 Encoder 3: # 200
[2m[36m(ItemEncoder pid=21750)[0m 12:12:35 Encoder 0: # 200
[2m[36m(ItemEncoder pid=21751)[0m 12:12:35 Encoder 1: # 200
[2m[36m(ItemEncoder pid=21752)[0m 12:12:37 Encoder 2: # 200
[2m[36m(ItemEncoder pid=21753)[0m 12:12:44 Encoder 3: # 400
[2m[36m(ItemEncoder pid=21750)[0m 12:12:48 Encoder 0: # 400
[2m[36m(ItemEncoder pid=21752)[0m 12:12:51 Encoder 2: # 400
[2m[36m(I

In [11]:
# Persist the encoded records as JSON next to the input file.
output_path = input_path.joinpath("mvp-encode.json")
assert output_path.parent.exists()
with output_path.open(mode="w") as out_file:
    json.dump(sample_res, out_file)

In [12]:
# Terminate the encoder actors to release their worker processes.
# A plain loop is used rather than a list comprehension: ray.kill is called
# only for its side effect, and the comprehension would echo a meaningless
# [None, None, ...] as the cell output.
for encoder in encoders:
    ray.kill(encoder)

[None, None, None, None]