In [1]:
%load_ext lab_black
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import itertools
from tqdm import tqdm
import yaml
import os
import calpgs
import submitit

# Point R_HOME at the R installation inside the `r` conda environment.
# NOTE(review): presumably required by an R-backed dependency imported above
# (e.g. calpgs) -- confirm. The same absolute path is exported again in the
# cluster-job `setup` commands further down; keep the two in sync.
os.environ["R_HOME"] = "/u/project/pasaniuc/kangchen/software/miniconda3/envs/r/lib/R"

In [2]:
# Trait names, read as strings from data/traits.txt.
trait_list = np.loadtxt("data/traits.txt", dtype=str)

# Column metadata shared by the functions below: VAR_COLS and COVAR_COLS
# are read from the YAML file and used when fitting/evaluating models.
with open("data/meta.yaml", "r") as meta_file:
    metadata = yaml.safe_load(meta_file)

VAR_COLS, COVAR_COLS = metadata["VAR_COLS"], metadata["COVAR_COLS"]

In [3]:
def calibrate(prefix: str, trait: str):
    """Fit and evaluate the calibration models for one train/test split.

    Two models are handled in turn, "mean" and "mean+var". For each one,
    ``calpgs.estimate_coef`` is called on the training table with the test
    table as target, then ``calpgs.quantify_r2`` is called on
    ``<prefix>.<method>.pred.tsv`` (presumably written by the first call --
    TODO confirm against calpgs).

    Parameters
    ----------
    prefix : str
        Path prefix of the split; ``<prefix>.train.tsv`` and
        ``<prefix>.test.tsv`` are the inputs, and ``<prefix>.<method>`` is
        passed to calpgs as the output prefix.
    trait : str
        Name of the trait being modelled. A column with this name is left out
        of the variance covariates of the "mean+var" model.
    """

    # Variance covariates per model, in the order the models are fitted.
    variance_covariates = {
        "mean": [],
        # a trait is never its own variance covariate (BMI for BMI, edu for edu)
        "mean+var": [name for name in VAR_COLS if name != trait],
    }

    for method, var_cols in variance_covariates.items():
        model_prefix = f"{prefix}.{method}"

        # fit on the training split, predict for the test split
        calpgs.estimate_coef(
            df_path=f"{prefix}.train.tsv",
            y_col="QPHENO",
            mean_cols=["PGS"] + COVAR_COLS,
            var_cols=var_cols,
            target_df_path=f"{prefix}.test.tsv",
            out_prefix=model_prefix,
        )

        # evaluate the predictions; tested against every column in VAR_COLS
        calpgs.quantify_r2(
            df_path=f"{model_prefix}.pred.tsv",
            y_col="QPHENO",
            pred_col="pred_mean",
            predstd_col="pred_std",
            test_cols=VAR_COLS,
            out_prefix=model_prefix,
            n_bootstrap=10,
        )

In [4]:
def run(
    trait: str,
    group: str,
    n_splits: int = 30,
    train_size: int = 5000,
    test_size: int = 5000,
):
    """Repeatedly split one trait/group table into train/test and calibrate.

    Reads ``out/format-data/<trait>.<group>.tsv``, then for every seed in
    ``1..n_splits`` draws a random train/test split, writes it to
    ``out/calibrate/<trait>.<group>/<seed>.train.tsv`` and ``.test.tsv``, and
    calls ``calibrate`` on that prefix.

    Parameters
    ----------
    trait : str
        Trait name. For ``"LDL"`` the ``PHENO`` column is converted from
        mmol/L to mg/dL before splitting.
    group : str
        Individual group label used in the input/output file names. The
        original notes describe "white" as white British individuals; this
        notebook also submits "all".
    n_splits : int, optional
        Number of random splits; seeds ``1, 2, ..., n_splits`` are used as
        ``random_state``. Defaults to 30.
    train_size : int, optional
        Number of individuals in each training split. Defaults to 5000.
    test_size : int, optional
        Number of individuals in each test split. Defaults to 5000.

    Raises
    ------
    ValueError
        Propagated from ``train_test_split`` when the table has fewer rows
        than ``train_size + test_size``.
    """

    df_trait = pd.read_csv(
        f"out/format-data/{trait}.{group}.tsv", index_col=0, sep="\t"
    )
    if trait == "LDL":
        # mmol/L to mg/dL
        df_trait["PHENO"] *= 38.66976

    # All splits of this trait/group share one output directory; create it
    # once up front instead of on every iteration.
    out_dir = f"out/calibrate/{trait}.{group}"
    os.makedirs(out_dir, exist_ok=True)

    for seed in range(1, n_splits + 1):
        out_prefix = f"{out_dir}/{seed}"

        # The seed doubles as the file name and the RNG state, so each split
        # can be regenerated from its name alone.
        df_train, df_test = train_test_split(
            df_trait,
            train_size=train_size,
            test_size=test_size,
            random_state=seed,
        )
        df_train.to_csv(out_prefix + ".train.tsv", sep="\t")
        df_test.to_csv(out_prefix + ".test.tsv", sep="\t")

        calibrate(prefix=out_prefix, trait=trait)

In [5]:
# Shell lines passed as `setup` to the executor: put the project miniconda on
# PATH, ignore user site-packages, and export the same R_HOME that is set at
# the top of this notebook.
job_setup_commands = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
    "export R_HOME=/u/project/pasaniuc/kangchen/software/miniconda3/envs/r/lib/R",
]

# SGE executor; submission logs go to ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(time_min=30, memory_g=12, setup=job_setup_commands)
# One row per (trait, group) combination; the two columns are what the job
# submission below iterates over.
df_params = pd.DataFrame(
    list(itertools.product(("LDL", "height", "log_BMI"), ("white", "all"))),
    columns=["trait", "group"],
)

In [6]:
# Submit `run(trait, group)` for every row of df_params through the executor's
# map_array; bracket access is used so the column names cannot be confused
# with DataFrame attributes.
jobs = executor.map_array(run, df_params["trait"], df_params["group"])