In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [86]:
from typing import List
import re
import json

import pandas as pd
import janitor
import numpy as np
from string import punctuation

from funcs import utils, query

# init

In [6]:
# Resolve the project root through the project helper.
# NOTE(review): presumably searches for the directory containing
# "docker-compose.yml" -- confirm in funcs.utils.
proj_root = utils.find_project_root("docker-compose.yml")
assert proj_root.exists(), proj_root

# Raw MVP phenotype mapping table read by the next cell; fail early if missing.
data_path = proj_root / "data" / "mvp" / "all_mapped_phenotypes_03JUN2022.csv"
assert data_path.exists(), data_path

In [8]:
# Load the raw phenotype table, print its schema, then display a preview.
df_raw = pd.read_csv(data_path)
df_raw.info()
df_raw

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2081 entries, 0 to 2080
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Phenotype      2081 non-null   object 
 1   description    2079 non-null   object 
 2   dataset        2081 non-null   object 
 3   MVP_Cases      2060 non-null   object 
 4   MVP_Controls   1977 non-null   float64
 5   trait_type     2081 non-null   object 
 6   UKBB_filename  1506 non-null   object 
 7   UKBB_Cases     1505 non-null   float64
 8   UKBB_Controls  1439 non-null   float64
 9   unmapped_plan  648 non-null    object 
dtypes: float64(3), object(7)
memory usage: 162.7+ KB


Unnamed: 0,Phenotype,description,dataset,MVP_Cases,MVP_Controls,trait_type,UKBB_filename,UKBB_Cases,UKBB_Controls,unmapped_plan
0,8,Intestinal infection,UKBB+MVP,9351,454638.0,phecode,phecode-008-both_sexes.tsv.bgz,13044.0,407487.0,
1,8.5,Bacterial enteritis,UKBB+MVP,5609,458380.0,phecode,phecode-008.5-both_sexes.tsv.bgz,3275.0,407487.0,
2,8.51,Intestinal e.coli,MVP,57,463932.0,Phecode,,,,use MVP only
3,8.52,Intestinal infection due to C. difficile,UKBB+MVP,4873,459116.0,phecode,phecode-008.52-both_sexes.tsv.bgz,858.0,407487.0,
4,8.6,Viral Enteritis,UKBB+MVP,1347,462642.0,phecode,phecode-008.6-both_sexes.tsv.bgz,1114.0,407487.0,
...,...,...,...,...,...,...,...,...,...,...
2076,DBP (at enrollment),Diastolic blood pressure,UKBB+MVP,463989,,vital status,continuous-4079-both_sexes-irnt.tsv.bgz,396667.0,,
2077,P (Pulse at enrollment),Heart rate,UKBB+MVP,463989,,vital status,continuous-102-both_sexes-irnt.tsv.bgz,396667.0,,
2078,Height (in),Heigth,MVP,463989,,vital status,,,,use MVP only
2079,Weight (lb),Weight,UKBB+MVP,463989,,vital status,continuous-21002-both_sexes-irnt.tsv.bgz,419316.0,,


# raw diagnosis

In [10]:
# Number of raw rows per (dataset, trait_type) combination.
df_raw[["dataset", "trait_type"]].value_counts()

dataset   trait_type      
UKBB+MVP  phecode             1323
MVP       Phecode              461
          baseline_survey       66
UKBB+MVP  Phecode               55
          icd10                 43
MVP       lab                   35
UKBB+MVP  lab                   34
          lifestyle_survey      23
MVP       lifestyle_survey      14
UKBB      lab                   14
UKBB+MVP  baseline_survey        7
          vital status           5
MVP       vital status           1
dtype: int64

---

# cleaning

- trait_id: phenotype => lowercase, drop punct, replace whitespace with "-", trim
- trait_type: lowercase, trim
- trait_term: drop missing
  - "|": manual annotation
  
"|" manual annotation
- split multiple terms: e.g. "Infestation (lice | mites)" => ["Infestation of lice", "Infestation of mites"]
- drop modifier: e.g. "HIV infection | symptomatic" => "HIV infection"
- as-is: 
  - clearly multiple terms, e.g. "Epilepsy | recurrent seizures | convulsions"
  - unclear terms (but drop the pipe)
  
post manual annotation
- drop code-like phrases, e.g. "G99"
- punctuation
  - apostrophe: let tokenizer handle it
  - replace with whitespace: "/", "?", ":", "[]", "()", ";"

## stage 0 cleaning

In [54]:
def _clean_phenotype_id(text: str) -> str:
    text = text.lower().strip()
    for _ in punctuation:
        text = text.replace(_, "")
    text = text.replace(" ", "-")
    return text

In [55]:
# Stage-0 cleaning: rename the raw columns, slugify the phenotype label,
# derive `trait_id`, keep only the columns used downstream, drop rows with
# any missing value among them, and normalise `trait_type`.
df = (
    df_raw
    .rename(columns={"Phenotype": "phenotype", "description": "trait_term"})
    .transform_column("phenotype", _clean_phenotype_id)
    .assign(
        # Suffix each slug with its row position, zero-padded to at least
        # two digits.
        trait_id=lambda frame: [
            f"{slug}-{pos:02d}"
            for pos, slug in enumerate(frame["phenotype"].tolist())
        ]
    )
    [["trait_id", "trait_term", "phenotype", "dataset", "trait_type"]]
    .dropna()
    .transform_column("trait_type", lambda value: value.lower().strip())
)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2079 entries, 0 to 2080
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   trait_id    2079 non-null   object
 1   trait_term  2079 non-null   object
 2   phenotype   2079 non-null   object
 3   dataset     2079 non-null   object
 4   trait_type  2079 non-null   object
dtypes: object(5)
memory usage: 97.5+ KB


Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type
0,8-00,Intestinal infection,8,UKBB+MVP,phecode
1,85-01,Bacterial enteritis,85,UKBB+MVP,phecode
2,851-02,Intestinal e.coli,851,MVP,phecode
3,852-03,Intestinal infection due to C. difficile,852,UKBB+MVP,phecode
4,86-04,Viral Enteritis,86,UKBB+MVP,phecode
...,...,...,...,...,...
2076,dbp-at-enrollment-2076,Diastolic blood pressure,dbp-at-enrollment,UKBB+MVP,vital status
2077,p-pulse-at-enrollment-2077,Heart rate,p-pulse-at-enrollment,UKBB+MVP,vital status
2078,height-in-2078,Heigth,height-in,MVP,vital status
2079,weight-lb-2079,Weight,weight-lb,UKBB+MVP,vital status


In [87]:
# Output directory for the files this notebook writes.
# The cleaned-traits JSON is intentionally NOT written here: `df1` is only
# created in the stage-1 cleaning section, so dumping it at this point raised
# NameError on a fresh "Restart & Run All". The JSON is written by the export
# cell at the end of stage 1.
output_dir = proj_root / "data" / "output"
assert output_dir.exists(), output_dir

In [57]:
# Collect every trait whose term contains a "|" and export the subset as a
# CSV (used as the worksheet for manual annotation).
df_pipe = (
    df.loc[df["trait_term"].map(lambda term: "|" in term)]
    .reset_index(drop=True)
)

output_path = output_dir / "mvp_traits_pipe.csv"
df_pipe.to_csv(output_path, index=False)

In [58]:
# Load the manually annotated pipe terms; fail early if the file is missing.
annot_file_path = proj_root / "data" / "mvp" / "mvp_traits_pipe_annotated.csv"
assert annot_file_path.exists(), annot_file_path

annot_df = pd.read_csv(annot_file_path)
annot_df.info()
# Cast the ids to str so they compare equal to `df["trait_id"]` in the merge.
annot_df = annot_df.astype({"trait_id": str})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   trait_id    143 non-null    object
 1   trait_term  143 non-null    object
 2   phenotype   143 non-null    int64 
 3   dataset     143 non-null    object
 4   trait_type  143 non-null    object
 5   annotation  143 non-null    object
dtypes: int64(1), object(5)
memory usage: 6.8+ KB


## stage 1 cleaning

In [80]:
# Attach the manual annotation (where one exists) to every cleaned trait;
# a left join keeps all rows of `df`.
df1 = pd.merge(
    df,
    annot_df.loc[:, ["trait_id", "annotation"]],
    how="left",
    on="trait_id",
)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2079 entries, 0 to 2078
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   trait_id    2079 non-null   object
 1   trait_term  2079 non-null   object
 2   phenotype   2079 non-null   object
 3   dataset     2079 non-null   object
 4   trait_type  2079 non-null   object
 5   annotation  143 non-null    object
dtypes: object(6)
memory usage: 113.7+ KB


In [81]:
# Show every row whose trait_id occurs more than once (expected: none).
df1.loc[df1.duplicated(subset="trait_id", keep=False)]

Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type,annotation


In [82]:
# The left merge must neither add nor drop rows.
assert df1.shape[0] == df.shape[0]

In [77]:
def _clean_trait_term(text: str) -> List[str]:
    text = text.strip()
    # drop A123 like
    pat = re.compile(r"[A-Z]\d+ ")
    text = re.sub(pat, "", text)
    # drop select punct
    punct_list = ["/", "?", ":", "[", "]", "(", ")"]
    for _ in punct_list:
        text = text.replace(_, " ")
    # split into multiple terms
    text_split = text.split("|")
    return text_split

In [83]:
# Build the query terms for each trait: use the manual annotation when one
# exists, otherwise fall back to the original trait term, then clean and
# split the chosen text into a list of terms.
# NOTE: this cell consumes the "annotation" column, so re-running it requires
# re-running the merge cell that creates `df1` first.
df1 = (
    df1
    .assign(
        # Vectorised fallback: replaces the former row-wise apply(axis=1)
        # and its scratch "annot_empty" column with the same result.
        trait_term_query=lambda frame: frame["annotation"].fillna(
            frame["trait_term"]
        )
    )
    .transform_column("trait_term_query", _clean_trait_term)
    .drop(columns=["annotation"])
)
df1.info()
df1.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2079 entries, 0 to 2078
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   trait_id          2079 non-null   object
 1   trait_term        2079 non-null   object
 2   phenotype         2079 non-null   object
 3   dataset           2079 non-null   object
 4   trait_type        2079 non-null   object
 5   trait_term_query  2079 non-null   object
dtypes: object(6)
memory usage: 113.7+ KB


Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type,trait_term_query
0,8-00,Intestinal infection,8,UKBB+MVP,phecode,[Intestinal infection]
1,85-01,Bacterial enteritis,85,UKBB+MVP,phecode,[Bacterial enteritis]
2,851-02,Intestinal e.coli,851,MVP,phecode,[Intestinal e.coli]
3,852-03,Intestinal infection due to C. difficile,852,UKBB+MVP,phecode,[Intestinal infection due to C. difficile]
4,86-04,Viral Enteritis,86,UKBB+MVP,phecode,[Viral Enteritis]


In [87]:
# Persist the cleaned traits as a JSON array with one record per trait.
output_dir = proj_root / "data" / "output"
assert output_dir.exists(), output_dir
output_path = output_dir / "mvp_traits_clean.json"
with output_path.open("w") as fh:
    json.dump(df1.to_dict(orient="records"), fh)

---

# post cleaning diagnosis

In [22]:
# Number of cleaned rows per (dataset, trait_type) combination.
df[["dataset", "trait_type"]].value_counts()

dataset   trait_type      
UKBB+MVP  phecode             1378
MVP       phecode              461
          baseline_survey       66
UKBB+MVP  icd10                 43
MVP       lab                   35
UKBB+MVP  lab                   34
          lifestyle_survey      23
MVP       lifestyle_survey      14
UKBB      lab                   14
UKBB+MVP  baseline_survey        7
          vital status           5
MVP       vital status           1
dtype: int64

In [61]:
# Show every cleaned row whose trait_id occurs more than once (expected: none).
df.loc[df.duplicated(subset="trait_id", keep=False)]

Unnamed: 0,trait_id,trait_term,phenotype,dataset,trait_type


In [33]:
# Rows whose trait_term is a float, i.e. a leftover NaN (expected: none).
df.loc[df["trait_term"].map(lambda term: isinstance(term, float))]

Unnamed: 0,trait_id,trait_term,dataset,trait_type


In [34]:
# Rows whose trait_term still contains a "|" separator.
df.loc[df["trait_term"].map(lambda term: "|" in term)]

Unnamed: 0,trait_id,trait_term,dataset,trait_type
32,711,HIV infection | symptomatic,MVP,phecode
37,799,Viremia | NOS,MVP,phecode
62,132,Infestation (lice | mites),MVP,phecode
74,149,Cancer of larynx | pharynx | nasal cavities,UKBB+MVP,phecode
85,1533,Malignant neoplasm of rectum | rectosigmoid ju...,UKBB+MVP,phecode
...,...,...,...,...
1846,1002,Symptoms concerning nutrition | metabolism | a...,UKBB+MVP,phecode
1853,1009,Injury | NOS,UKBB+MVP,phecode
1867,10107,Z63 Other problems related to primary support ...,UKBB+MVP,icd10
1868,10107,Z71 Persons encountering health services for o...,UKBB+MVP,icd10
