In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import itertools
import os
import pickle
import re
import subprocess
import tempfile
from glob import glob

import calpgs
import numpy as np
import pandas as pd
import statsmodels.api as sm
import submitit
from tqdm import tqdm

# Set R_HOME for this kernel to a fixed, cluster-specific R installation.
# The same path is exported again in the `setup` lines of the model-fitting
# executor further down, so the two must be kept in sync.
# NOTE(review): presumably needed by an R-backed dependency -- confirm which one.
os.environ["R_HOME"] = "/u/project/pasaniuc/kangchen/software/miniconda3/envs/r/lib/R"

# Metadata (trait, covariate, and display tables)

In [2]:
DATA_DIR = "../compile-data/out/per-trait-info/"

# Excel workbook whose first three sheets are read below.
DATA_URL = "../r2-diff/data-table.xlsx"

# Sheet 0: map each trait id to its short label, falling back to the long
# description when the short label is missing.
df_trait_info = pd.read_excel(DATA_URL, sheet_name=0)
trait_map = {
    # pd.notna rather than `is not np.nan`: a missing cell is not guaranteed to
    # be the np.nan singleton itself, so an identity check can let NaN through.
    row.id: row.short if pd.notna(row.short) else row.description
    for _, row in df_trait_info.iterrows()
}

# Sheet 1: map each covariate id to its short label (no fallback is applied).
df_covar_info = pd.read_excel(DATA_URL, sheet_name=1)
covar_map = {row.id: row.short for _, row in df_covar_info.iterrows()}

# Sheet 2: loaded as-is.
df_display = pd.read_excel(DATA_URL, sheet_name=2)

# Build calibration model

In [3]:
def build_model(data_prefix: str, method: str, out_prefix: str):
    """Fit a calibration model on the training split and apply it to the test split.

    Parameters
    ----------
    data_prefix : str
        <data_prefix>.train.tsv, <data_prefix>.test.tsv will be loaded
    method : str
        'mean', 'mean+var' or 'mean+var+slope'
        mean: PGS + covariates used to fit the mean model, only a constant term is fitted to var
        mean+var: PGS + covariates used to fit both the mean and variance model
        mean+var+slope: same as mean+var, and PGS + covariates are additionally
            passed to calpgs as slope covariates (without a constant column)
    out_prefix : str
        output prefix; two files are written:
        <out_prefix>.test_info.tsv (test table plus `cal_pred` / `cal_predstd`)
        <out_prefix>.param.tsv (fitted coefficients, missing entries written as NA)
    """
    # method -> (use covariates in the variance model, pass slope covariates)
    method_flags = {
        "mean": (False, False),
        "mean+var": (True, False),
        "mean+var+slope": (True, True),
    }
    assert method in method_flags, f"unknown method: {method!r}"
    fit_var, fit_slope = method_flags[method]

    df_train = pd.read_csv(data_prefix + ".train.tsv", sep="\t", index_col=0)
    df_test = pd.read_csv(data_prefix + ".test.tsv", sep="\t", index_col=0)

    # Every column after the first two data columns is used as a covariate.
    # NOTE(review): presumably the first two are `pheno` and `PGS` -- confirm
    # against the files written by the data-preparation step.
    covar_cols = list(df_train.columns[2:])
    mean_cols = ["PGS"] + covar_cols
    var_cols = mean_cols if fit_var else []
    slope_cols = mean_cols if fit_slope else None

    # Design matrices; with an empty `var_cols` the variance design holds only
    # the constant column added by `add_constant`.
    train_x = sm.add_constant(df_train[mean_cols])
    train_z = sm.add_constant(df_train[var_cols])
    train_y = df_train["pheno"].values

    test_x = sm.add_constant(df_test[mean_cols])
    test_z = sm.add_constant(df_test[var_cols])

    if slope_cols is None:
        train_slope_covar = test_slope_covar = None
    else:
        train_slope_covar, test_slope_covar = df_train[slope_cols], df_test[slope_cols]

    # calibrate on train, adjust test
    res = calpgs.calibrate_and_adjust(
        train_mean_covar=train_x,
        train_var_covar=train_z,
        train_y=train_y,
        test_mean_covar=test_x,
        test_var_covar=test_z,
        train_slope_covar=train_slope_covar,
        test_slope_covar=test_slope_covar,
    )
    # The first two returned items are stored as the calibrated prediction and
    # its standard deviation for the test individuals.
    df_test["cal_pred"], df_test["cal_predstd"] = res[0:2]

    # The remaining items are stored as coefficient columns, indexed by the
    # design-matrix columns they are paired with.
    param_columns = [
        pd.Series(res[2], index=train_x.columns, name="mean_beta"),
        pd.Series(res[3], index=train_z.columns, name="var_beta"),
    ]
    if slope_cols is not None:
        param_columns.append(
            pd.Series(res[4], index=train_slope_covar.columns, name="slope_beta")
        )
    df_param = pd.concat(param_columns, axis=1)

    # save; a bare file prefix has an empty dirname, which os.makedirs rejects
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df_test.to_csv(out_prefix + ".test_info.tsv", sep="\t")
    df_param.to_csv(out_prefix + ".param.tsv", sep="\t", na_rep="NA")

In [4]:
def _job_group(prefix):
    """Return '<parent directory>-<numeric last path component // 10>' for a data prefix."""
    parts = prefix.split("/")
    return parts[-2] + "-" + str(int(parts[-1]) // 10)


# Cut each path at its first "." so that files sharing a prefix collapse into
# one entry; np.unique also sorts the prefixes.
data_prefix_list = np.unique([path.split(".")[0] for path in glob("out/data/*/*")])

# One row per (data prefix, method) pair. build_model also accepts
# "mean+var+slope", which is not fitted here.
fit_methods = ["mean", "mean+var"]
df_params = pd.DataFrame(
    list(itertools.product(data_prefix_list, fit_methods)),
    columns=["data_prefix", "method"],
)

# Model outputs mirror the data layout under /model/, suffixed with the method.
df_params["out_prefix"] = (
    df_params["data_prefix"].str.replace("/data/", "/model/")
    + "-"
    + df_params["method"]
)

# Rows sharing a group label are fitted together in one array task.
df_params["group"] = df_params["data_prefix"].apply(_job_group)

# Fit parameters

In [5]:
def submit(df_params):
    """Run `build_model` for every row of `df_params`, one after another.

    Despite its name this function does not submit anything itself; it is the
    payload handed to the executor.

    Parameters
    ----------
    df_params : pd.DataFrame
        needs the columns `data_prefix`, `method` and `out_prefix`
    """
    rows = zip(
        df_params["data_prefix"],
        df_params["method"],
        df_params["out_prefix"],
    )
    for data_prefix, method, out_prefix in tqdm(rows, total=len(df_params)):
        build_model(data_prefix=data_prefix, method=method, out_prefix=out_prefix)

In [6]:
# Shell lines passed as the job `setup`.
# NOTE(review): presumably executed in each job before the Python payload -- confirm.
model_job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
    "export R_HOME=/u/project/pasaniuc/kangchen/software/miniconda3/envs/r/lib/R",
]

# One parameter table per `group` value; each becomes one call to `submit`.
param_chunks = [chunk for _, chunk in df_params.groupby("group")]

executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(time_min=60, memory_g=16, setup=model_job_setup)

jobs = executor.map_array(submit, param_chunks)

# Evaluate and summarize results

In [None]:
def _natural_sort_key(text: str) -> list:
    """Sort key that compares runs of digits numerically ("2-x" before "10-x")."""
    # re.split with a capturing group alternates text / digit-run / text / ...,
    # so every odd position is a digit run and can be converted to int.
    return [
        int(token) if position % 2 else token
        for position, token in enumerate(re.split(r"(\d+)", text))
    ]


def summarize_result(trait: str, indiv_group: str, method: str, out: str):
    """Summarize calibration results

    Runs `calpgs group-stats` on every file matching
    `out/model/<trait>-<indiv_group>/*-<method>.test_info.tsv` and writes the
    concatenated per-file tables to `out`.

    Parameters
    ----------
    trait : str
        trait name
    indiv_group : str
        individual group (the driver cell passes white, other and all)
    method : str
        calibration method suffix of the model files (the driver cell passes
        mean and mean+var)
    out : str
        output path (tab-separated, no index column)

    Raises
    ------
    FileNotFoundError
        if no test_info file matches
    subprocess.CalledProcessError
        if `calpgs group-stats` exits with a non-zero status
    """
    pattern = f"out/model/{trait}-{indiv_group}/*-{method}.test_info.tsv"
    # `glob` is the function imported via `from glob import glob`
    path_list = sorted(glob(pattern), key=_natural_sort_key)
    if not path_list:
        # fail with a readable message instead of pd.concat's
        # "No objects to concatenate"
        raise FileNotFoundError(f"no test_info files match {pattern!r}")

    df_predint = []
    # `seed` in the output is the position of the file in natural sort order
    for seed_i, path in enumerate(path_list):
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Argument list instead of a joined shell string: same argv for the
            # CLI, but paths with spaces or shell metacharacters stay intact.
            cmd = [
                "calpgs",
                "group-stats",
                "--df",
                path,
                "--y",
                "pheno",
                "--pred",
                "cal_pred",
                "--predstd",
                "cal_predstd",
                "--group",
                "PC1,AGE,SEX",
                "--out",
                f"{tmp_dir}/out",
                "--n-bootstrap",
                "10",
                "--n-subgroup",
                "5",
            ]
            subprocess.check_call(cmd)
            # Read both result tables before the temporary directory is removed.
            df_out = pd.read_csv(f"{tmp_dir}/out.predint.tsv", sep="\t")
            df_out.insert(0, "seed", seed_i)
            df_r2 = pd.read_csv(f"{tmp_dir}/out.r2.tsv", sep="\t")
            # NOTE(review): assigned by row position/index; assumes both tables
            # list the groups in the same order -- confirm against calpgs output.
            df_out["r2"] = df_r2["r2"]
            df_predint.append(df_out)
    pd.concat(df_predint, axis=0).to_csv(out, sep="\t", index=False)

In [None]:
# One row per (trait, individual group, method) combination to summarize.
# build_model also accepts "mean+var+slope", which is not summarized here.
# Note that this rebinds `df_params`, replacing the model-fitting job table.
df_params = pd.DataFrame(
    list(
        itertools.product(
            ["LDL", "height"],
            ["white", "other", "all"],
            ["mean", "mean+var"],
        )
    ),
    columns=["trait", "group", "method"],
)
# Output file of each job: out/summary/<trait>-<group>.<method>.tsv
df_params["out"] = (
    "out/summary/"
    + df_params["trait"]
    + "-"
    + df_params["group"]
    + "."
    + df_params["method"]
    + ".tsv"
)

os.makedirs("out/summary/", exist_ok=True)
print(f"{len(df_params)} jobs in total")

# Shell lines passed as the job `setup`.
# NOTE(review): presumably executed in each job before the Python payload -- confirm.
summary_job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PATH=~/project-pasaniuc/software/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]

executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(time_min=10, memory_g=16, setup=summary_job_setup)

# The four columns are passed as parallel argument sequences.
jobs = executor.map_array(
    summarize_result,
    df_params["trait"],
    df_params["group"],
    df_params["method"],
    df_params["out"],
)