In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import pandas as pd
from glob import glob
import os
import statsmodels.api as sm
import itertools
from tqdm import tqdm
import calpgs
import pickle

# Point R_HOME at a specific R installation for this process.
# NOTE(review): hard-coded, machine-specific absolute path; presumably consumed by
# an R bridge used downstream -- confirm. It is set after the imports above, so
# anything that reads R_HOME at import time will not see this value.
os.environ["R_HOME"] = "/u/project/pasaniuc/kangchen/software/miniconda3/envs/r/lib/R"

# Meta data

In [2]:
# Input locations (relative to the notebook's working directory).
DATA_DIR = "../compile-data/out/per-trait-info/"

DATA_URL = "../r2-diff/data-table.xlsx"

# First sheet: trait table. Map each trait id to its short name, falling back to
# the full description when the short name is missing.
# pd.isna is used rather than an identity check against np.nan: a missing value
# is not guaranteed to be the np.nan singleton (e.g. a NaN taken from a float
# column is a distinct object), in which case `is not np.nan` would wrongly keep
# the NaN as the display name.
df_trait_info = pd.read_excel(DATA_URL, sheet_name=0)
trait_map = {
    row.id: row.description if pd.isna(row.short) else row.short
    for _, row in df_trait_info.iterrows()
}

# Second sheet: covariate table, mapped id -> short name.
df_covar_info = pd.read_excel(DATA_URL, sheet_name=1)
covar_map = {row.id: row.short for _, row in df_covar_info.iterrows()}

# Third sheet: loaded as-is for later use.
df_display = pd.read_excel(DATA_URL, sheet_name=2)

# Build calibration model

In [7]:
def build_model(data_prefix: str, method: str, out_prefix: str):
    """Build calibration model

    Reads the train/test tables, fits the model on the training split with
    ``calpgs.calibrate_and_adjust`` and writes two files:

    - ``<out_prefix>.test_info.tsv``: the test table with two added columns,
      ``cal_pred`` and ``cal_predstd`` (first two elements of the result).
    - ``<out_prefix>.param.tsv``: fitted coefficients, one row per term, with a
      ``beta`` column (mean model) and a ``gamma`` column (variance model);
      terms absent from one of the models are written as ``NA``.

    Parameters
    ----------
    data_prefix : str
        <data_prefix>.train.tsv, <data_prefix>.test.tsv will be loaded
    method: str
        'mean' or 'mean+var'
        mean: PGS + covariates used to fit the mean model, only a constant term is fitted to var
        mean+var: PGS + covariates used to fit both the mean and variance model
    out_prefix : str
        output prefix; its parent directory (if any) is created when missing

    Raises
    ------
    AssertionError
        If `method` is not 'mean' or 'mean+var'.
    """
    assert method in ["mean", "mean+var"]

    df_train = pd.read_csv(data_prefix + ".train.tsv", sep="\t", index_col=0)
    df_test = pd.read_csv(data_prefix + ".test.tsv", sep="\t", index_col=0)

    # Everything from the third column onward is treated as a covariate
    # (assumes the first two columns are "pheno" and "PGS" -- TODO confirm).
    covar_cols = list(df_train.columns[2:])
    # The mean model always uses PGS + covariates; only the variance model
    # depends on `method`.
    mean_cols = ["PGS"] + covar_cols
    if method == "mean":
        var_cols = []
    elif method == "mean+var":
        var_cols = mean_cols
    else:
        raise NotImplementedError
    # calibrate
    # NOTE(review): sm.add_constant is applied independently to train and test;
    # confirm both always end up with the same set of columns.
    train_x = sm.add_constant(df_train[mean_cols])
    train_z = sm.add_constant(df_train[var_cols])
    train_y = df_train["pheno"].values

    test_x = sm.add_constant(df_test[mean_cols])
    test_z = sm.add_constant(df_test[var_cols])

    # adjust
    res = calpgs.calibrate_and_adjust(
        train_mean_covar=train_x,
        train_var_covar=train_z,
        train_y=train_y,
        test_mean_covar=test_x,
        test_var_covar=test_z,
    )
    df_test["cal_pred"], df_test["cal_predstd"] = res[0:2]

    # Align mean (beta) and variance (gamma) coefficients on the term names;
    # terms that appear in only one model get NaN in the other column.
    df_param = pd.concat(
        [
            pd.Series(res[2], index=train_x.columns, name="beta"),
            pd.Series(res[3], index=train_z.columns, name="gamma"),
        ],
        axis=1,
    )
    # os.path.dirname returns "" for a prefix without a directory component,
    # and os.makedirs("") raises; in that case the outputs simply go to the
    # current working directory.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df_test.to_csv(out_prefix + ".test_info.tsv", sep="\t")
    df_param.to_csv(out_prefix + ".param.tsv", sep="\t", na_rep="NA")

In [4]:
# Every file matched under out/data/<dir>/ contributes its path up to the first
# "."; np.unique de-duplicates and sorts the resulting prefixes.
data_prefix_list = np.unique([path.split(".")[0] for path in glob("out/data/*/*")])

# One row per (data prefix, calibration method) combination.
df_params = pd.DataFrame(
    list(itertools.product(data_prefix_list, ["mean", "mean+var"])),
    columns=["data_prefix", "method"],
)

# Outputs mirror the input layout under "/model/", suffixed with the method name.
model_prefix = df_params["data_prefix"].str.replace("/data/", "/model/")
df_params["out_prefix"] = model_prefix + "-" + df_params["method"]

In [5]:
# Fit and save one calibration model per (data_prefix, method) row of df_params.
for task in tqdm(df_params.itertuples(index=False), total=len(df_params)):
    build_model(
        data_prefix=task.data_prefix,
        method=task.method,
        out_prefix=task.out_prefix,
    )

  0%|          | 0/240 [00:00<?, ?it/s]


TypeError: calibrate_and_adjust() got an unexpected keyword argument 'train_x'

In [6]:
# Interactive IPython help lookup (not plain Python): displays the signature of
# calpgs.calibrate_and_adjust to check its keyword argument names.
?calpgs.calibrate_and_adjust

[0;31mSignature:[0m
[0mcalpgs[0m[0;34m.[0m[0mcalibrate_and_adjust[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtrain_mean_covar[0m[0;34m:[0m [0mnumpy[0m[0;34m.[0m[0mndarray[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrain_var_covar[0m[0;34m:[0m [0mnumpy[0m[0;34m.[0m[0mndarray[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrain_y[0m[0;34m:[0m [0mnumpy[0m[0;34m.[0m[0mndarray[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtest_mean_covar[0m[0;34m:[0m [0mnumpy[0m[0;34m.[0m[0mndarray[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtest_var_covar[0m[0;34m:[0m [0mnumpy[0m[0;34m.[0m[0mndarray[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrain_slope_covar[0m[0;34m:[0m [0mnumpy[0m[0;34m.[0m[0mndarray[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtest_slope_covar[0m[0;34m:[0m [0mnumpy[0m[0;34m.[0m[0mndarray[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m