# Build model for calibration

- `out/data/<trait>-<group>/<seed>.[train.tsv|test.tsv]`: calibration (training) and testing data for each trait, individual group, and random-seed split.
- `out/model/<pop>-<split>/model.pkl`: model that has been trained from the training individuals.

In [6]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import pandas as pd
from glob import glob
import os
import statsmodels.api as sm
from typing import List
from sklearn.model_selection import train_test_split
import itertools
from tqdm import tqdm
from admix.data import quantile_normalize
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [7]:
# Directory holding one `<trait>.tsv.gz` table per trait (read by `split_data`).
DATA_DIR = "../compile-data/out/per-trait-info/"
# Excel workbook with the metadata sheets read below (a local path, despite the name).
DATA_URL = "../r2-diff/data-table.xlsx"

# First sheet: map each trait id to its short label, falling back to the long
# description when the short label is missing.
df_trait_info = pd.read_excel(DATA_URL, sheet_name=0)
trait_map = {
    # `pd.notna` rather than `is not np.nan`: an identity test only matches the
    # NaN singleton and would treat any other missing-value marker
    # (a different NaN float object, None, pd.NA) as a valid label.
    row.id: row.short if pd.notna(row.short) else row.description
    for _, row in df_trait_info.iterrows()
}

# Second sheet: map each covariate id to its short label.
df_covar_info = pd.read_excel(DATA_URL, sheet_name=1)
covar_map = {row.id: row.short for _, row in df_covar_info.iterrows()}

# Third sheet: its `id` column defines `trait_list`.
df_display = pd.read_excel(DATA_URL, sheet_name=2)

trait_list = df_display.id.values

# Covariates kept alongside the PGS: seven named covariates plus PC1..PC4.
COVAR_COLS = [
    "AGE",
    "SEX",
    "DEPRIVATION_INDEX",
    "log_BMI",
    "income",
    "ever_smoked",
    "drink_alcohol",
] + [f"PC{i}" for i in range(1, 5)]

In [12]:
def expand_pgs_interact(df, cols=("AGE", "SEX", "PC1", "PC2", "PC3", "PC4")):
    """
    Add PGS-by-covariate interaction columns and an AGE-by-SEX column.

    Parameters
    ----------
    df: pd.DataFrame
        table containing a "PGS" column, every column listed in `cols`, and
        the "AGE" and "SEX" columns. It is not modified.
    cols: iterable of str
        columns to interact with "PGS"; for each `col`, a new column named
        "PGS*<col>" is added. The default is an immutable tuple so that it
        cannot be altered between calls.

    Returns
    -------
    pd.DataFrame
        copy of `df` with the "PGS*<col>" columns and "AGE*SEX" appended.
    """
    # work on a copy so the caller's frame is left untouched
    df = df.copy()
    for col in cols:
        df[f"PGS*{col}"] = df["PGS"] * df[col]
    # AGE*SEX is always added, regardless of `cols`
    df["AGE*SEX"] = df["AGE"] * df["SEX"]
    return df

In [13]:
def split_data(
    trait: str,
    indiv_group: str,
    out_prefix: str,
    q_normalize: str = "none",
    seed: int = 1234,
    n_train: int = 5000,
    n_test: int = 5000,
):
    """
    Split the data of one trait into training and testing sets and write
    them to `<out_prefix>.train.tsv` and `<out_prefix>.test.tsv`.

    Parameters
    ----------
    trait: str
        trait name; the table is read from `DATA_DIR/<trait>.tsv.gz`
    indiv_group: str
        (1) white: white British individuals (rows with group == "United Kingdom")
        (2) other: all other individuals
        (3) all: no filtering
    out_prefix: str
        output prefix; its parent directory is created if needed
    q_normalize: str
        (1) none: no quantile normalization
        (2) pheno: quantile normalization on phenotype
        (3) pheno+covar: quantile normalization for both phenotype and covariates
            (the PGS column is normalized together with the covariates)
    seed: int
        random seed passed to `train_test_split`
    n_train: int
        number of training individuals
    n_test: int
        number of testing individuals; the filtered data must contain at
        least `n_train + n_test` individuals for the split to succeed

    Raises
    ------
    ValueError
        if `q_normalize` is not one of the supported options
    NotImplementedError
        if `indiv_group` is not one of the supported groups
    """
    # Validate arguments up front (explicit raise: `assert` vanishes under -O)
    # and before the potentially slow file read.
    if q_normalize not in ("none", "pheno", "pheno+covar"):
        raise ValueError(
            f"q_normalize must be 'none', 'pheno' or 'pheno+covar', got {q_normalize!r}"
        )
    if indiv_group not in ("white", "other", "all"):
        raise NotImplementedError(
            f"indiv_group must be 'white', 'other' or 'all', got {indiv_group!r}"
        )

    df_trait = pd.read_csv(
        os.path.join(DATA_DIR, f"{trait}.tsv.gz"), index_col=0, sep="\t"
    )
    if indiv_group == "white":
        df_trait = df_trait[df_trait.group == "United Kingdom"]
    elif indiv_group == "other":
        df_trait = df_trait[~(df_trait.group == "United Kingdom")]
    # indiv_group == "all": keep every individual

    # individuals without a phenotype or a PGS cannot be used
    df_trait = df_trait.rename(columns={"MEAN": "PGS", "PHENO": "pheno"}).dropna(
        subset=["pheno", "PGS"]
    )

    # Mean-impute missing values, then standardize; the imputed entries
    # therefore end up at 0 on the standardized scale.
    for col in ["PGS"] + COVAR_COLS:
        df_trait[col] = df_trait[col].fillna(df_trait[col].mean())
        df_trait[col] = (df_trait[col] - df_trait[col].mean()) / df_trait[col].std()
        if q_normalize == "pheno+covar":
            df_trait[col] = quantile_normalize(df_trait[col])

    if q_normalize in ("pheno", "pheno+covar"):
        df_trait["pheno"] = quantile_normalize(df_trait["pheno"])

    df_trait = df_trait[["pheno", "PGS"] + COVAR_COLS]
    df_trait = expand_pgs_interact(df_trait)

    # split train and test
    df_train, df_test = train_test_split(
        df_trait, train_size=n_train, test_size=n_test, random_state=seed
    )
    out_dir = os.path.dirname(out_prefix)
    # a bare prefix (no directory part) yields "", which os.makedirs rejects
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_train.to_csv(out_prefix + ".train.tsv", sep="\t")
    df_test.to_csv(out_prefix + ".test.tsv", sep="\t")

In [14]:
# One job per (trait, individual group, seed): 2 traits x 3 groups x 30 seeds.
df_params = pd.DataFrame(
    list(
        itertools.product(
            ["LDL", "height"],
            ["white", "other", "all"],
            np.arange(1, 31),
        )
    ),
    columns=["trait", "group", "seed"],
)
# Each job writes under out/data/<trait>-<group>/, using the seed as file stem.
df_params["out_prefix"] = [
    f"out/data/{trait_name}-{group_name}/{seed_value}"
    for trait_name, group_name, seed_value in zip(
        df_params["trait"], df_params["group"], df_params["seed"]
    )
]
n_jobs = len(df_params)
print(f"{n_jobs} jobs in total")

180 jobs in total


In [15]:
# Write the train/test split for every job in `df_params`, showing progress.
for job in tqdm(df_params.itertuples(index=False), total=len(df_params)):
    split_data(
        trait=job.trait,
        indiv_group=job.group,
        out_prefix=job.out_prefix,
        seed=job.seed,
    )

100%|██████████| 180/180 [02:02<00:00,  1.47it/s]
