# Build model for calibration

- `out/data/<trait>-<group>/<predstd>-<seed>.[train|test].tsv`: training (calibration) and testing data for each trait, individual group, prediction-std type, and random seed.
- `out/model/<pop>-<split>/model.pkl`: model that has been trained from the training individuals.

In [7]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import pandas as pd
from glob import glob
import os
import statsmodels.api as sm
from typing import List
from sklearn.model_selection import train_test_split
import itertools
from tqdm import tqdm
from admix.data import quantile_normalize
import matplotlib.pyplot as plt

In [8]:
# Directory of per-trait input tables; split_data reads "<trait>.tsv.gz" from here.
DATA_DIR = "../compile-data/out/per-trait-info/"

In [9]:
# Excel workbook with the trait table (sheet 0), the covariate table (sheet 1)
# and the table whose `id` column defines `trait_list` (sheet 2).
DATA_URL = "../r2-diff/data-table.xlsx"

df_trait_info = pd.read_excel(DATA_URL, sheet_name=0)
# trait id -> display name: the `short` name when present, otherwise the
# `description`. `pd.notna` is used instead of an identity test against
# `np.nan`, because a missing cell is not guaranteed to be the `np.nan`
# singleton (it may be None or a different NaN float object).
trait_map = {
    row.id: row.short if pd.notna(row.short) else row.description
    for _, row in df_trait_info.iterrows()
}

# covariate id -> short display name
df_covar_info = pd.read_excel(DATA_URL, sheet_name=1)
covar_map = {row.id: row.short for _, row in df_covar_info.iterrows()}

df_display = pd.read_excel(DATA_URL, sheet_name=2)

# ids listed in the third sheet
trait_list = df_display.id.values

# Covariates that are standardized, quantile-normalized and regressed out of
# the phenotype in split_data: five named covariates plus PC1..PC10.
COVAR_COLS = ["AGE", "SEX", "DEPRIVATION_INDEX", "log_BMI", "income"] + [
    f"PC{i}" for i in range(1, 11)
]

In [18]:
def split_data(
    trait: str,
    indiv_group: str,
    adjust_cols: List[str],
    out_prefix: str,
    predstd: str,
    seed: int = 1234,
    n_train: int = 5000,
    n_test: int = 5000,
):
    """
    Build covariate-residualized train / test tables for one trait and write
    them to `<out_prefix>.train.tsv` and `<out_prefix>.test.tsv`.

    Steps: read `<DATA_DIR>/<trait>.tsv.gz`, keep the requested individual
    group, standardize and quantile-normalize every column in COVAR_COLS,
    regress PHENO on those covariates (OLS with intercept) and keep the
    residual, attach a `predstd` column, drop rows with missing values, and
    randomly split the remaining rows.

    Parameters
    ----------
    trait: str
        trait name; selects the input file `<trait>.tsv.gz` in DATA_DIR
    indiv_group: str
        "white": rows whose `group` column equals "United Kingdom";
        "other": all remaining rows
    adjust_cols: List[str]
        covariates used to adjust the standard deviation; these columns are
        written to the output next to `pheno`, `pred` and `predstd`
    out_prefix: str
        output prefix; its parent directory is created when missing
    predstd: str
        type of prediction std
        "const": constant 1.0
        "mcmc": the `SD` column of the input table
        "pc": the `PC1` column (after the covariate transformation)
        TODO: (replace with linear combination of top 20 PCs later)
    seed: int
        random seed passed to train_test_split
    n_train: int
        number of rows in the training table (default 5000)
    n_test: int
        number of rows in the testing table (default 5000)

    Raises
    ------
    NotImplementedError
        if `indiv_group` or `predstd` is not one of the values listed above
    """
    # Fail fast on unsupported options, before the file read and the OLS fit.
    if indiv_group not in ("white", "other"):
        raise NotImplementedError(
            f"indiv_group must be 'white' or 'other', got {indiv_group!r}"
        )
    if predstd not in ("const", "mcmc", "pc"):
        raise NotImplementedError(
            f"predstd must be 'const', 'mcmc' or 'pc', got {predstd!r}"
        )

    df_trait = pd.read_csv(
        os.path.join(DATA_DIR, f"{trait}.tsv.gz"), index_col=0, sep="\t"
    )
    is_uk = df_trait["group"] == "United Kingdom"
    # .copy(): the column assignments below must write to an independent
    # frame, not to a boolean-indexed slice of the table that was just read
    # (avoids pandas chained-assignment warnings).
    df_trait = (df_trait[is_uk] if indiv_group == "white" else df_trait[~is_uk]).copy()

    # standardize covariates
    df_trait[COVAR_COLS] = (
        df_trait[COVAR_COLS] - df_trait[COVAR_COLS].mean()
    ) / df_trait[COVAR_COLS].std()

    for col in COVAR_COLS:
        df_trait[col] = quantile_normalize(df_trait[col])

    # residual after regressing out COVAR_COLS
    # NOTE(review): missing="drop" makes statsmodels skip incomplete rows;
    # presumably the residuals are re-aligned on the row index so those rows
    # end up NaN and are removed by the dropna below -- confirm.
    df_trait["PHENO_RESID"] = (
        sm.OLS(
            df_trait["PHENO"].values,
            sm.add_constant(df_trait[COVAR_COLS]),
            missing="drop",
        )
        .fit()
        .resid
    )

    ## TODO: add `income` variable back (dropping this for now because too many zeros.)
    # NOTE(review): `income` is currently part of COVAR_COLS, so the TODO
    # above looks stale -- confirm which behavior is intended.
    if predstd == "const":
        df_trait["predstd"] = 1.0
    elif predstd == "mcmc":
        df_trait["predstd"] = df_trait["SD"]
    else:  # "pc" (validated above)
        df_trait["predstd"] = df_trait["PC1"]

    # TODO (original note): remove this dropna part
    df_trait = (
        df_trait[["PHENO_RESID", "MEAN", "predstd"] + list(adjust_cols)]
        .rename(columns={"PHENO_RESID": "pheno", "MEAN": "pred"})
        .dropna()
    )

    ## split train and test
    # Requires at least n_train + n_test complete rows.
    df_train, df_test = train_test_split(
        df_trait, train_size=n_train, test_size=n_test, random_state=seed
    )
    out_dir = os.path.dirname(out_prefix)
    # A bare prefix (no directory part) gives "", which os.makedirs rejects.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_train.to_csv(out_prefix + ".train.tsv", sep="\t")
    df_test.to_csv(out_prefix + ".test.tsv", sep="\t")

In [19]:
# One row per job: every (trait, group, predstd, seed) combination, with
# seeds 1..10, plus the output prefix the job writes to.
df_params = pd.DataFrame(
    list(
        itertools.product(
            ["LDL", "height"],
            ["white", "other"],
            ["const"],
            np.arange(1, 11),
        )
    ),
    columns=["trait", "group", "predstd", "seed"],
)
df_params["out_prefix"] = [
    f"out/data/{job.trait}-{job.group}/{job.predstd}-{job.seed}"
    for job in df_params.itertuples(index=False)
]
print(f"{len(df_params)} jobs in total")

40 jobs in total


In [20]:
# Covariate columns passed to split_data as `adjust_cols`; they are written to
# the train/test tables next to pheno, pred and predstd.
adjust_cols = ["AGE", "SEX", "log_BMI", "PC1", "PC2", "PC3", "PC4", "income"]

In [21]:
# Run split_data once per row of df_params, with a tqdm progress bar.
for job in tqdm(df_params.itertuples(index=False), total=df_params.shape[0]):
    split_data(
        out_prefix=job.out_prefix,
        trait=job.trait,
        indiv_group=job.group,
        predstd=job.predstd,
        adjust_cols=adjust_cols,
        seed=job.seed,
    )

100%|██████████| 40/40 [00:26<00:00,  1.51it/s]
