In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import pandas as pd
import pickle
import os
import admix_prs
from functools import reduce

# Load data

In [2]:
PHENO_PATH = "/u/project/pasaniuc/kangchen/tmp/prs-1219/REAL-PHENO/all-pheno.csv"

df_pheno = pd.read_csv(PHENO_PATH, index_col=0)
df_pheno["PC1_Q"] = admix_prs.make_levels(df_pheno, stratify_col="PC1", n_level=5)
df_pheno["PC2_Q"] = admix_prs.make_levels(df_pheno, stratify_col="PC2", n_level=5)
df_pheno["AGE_Q"] = admix_prs.make_levels(df_pheno, stratify_col="AGE", n_level=5)

In [3]:
config = "hsq-0.25-pcausal-0.01-hermodel-uniform"

In [4]:
if os.path.exists("df_res_list.pkl"):
    with open("df_res_list.pkl", "rb") as f:
        df_res_list = pickle.load(f)
else:
    df_res_list = admix_prs.load_sim_data(config)
    with open("df_res_list.pkl", "wb") as f:
        pickle.dump(df_res_list, f)

In [5]:
n_sim = len(df_res_list)

In [6]:
res_dict = {
    "r2": {
        "PHENO_G": {"PC1_Q": list(), "PC2_Q": list(), "AGE_Q": list(), "SEX": list()},
        "PHENO": {"PC1_Q": list(), "PC2_Q": list(), "AGE_Q": list(), "SEX": list()},
    },
    "cali": {
        "PHENO_G": {"PC1_Q": list(), "PC2_Q": list(), "AGE_Q": list(), "SEX": list()},
        "PHENO": {"PC1_Q": list(), "PC2_Q": list(), "AGE_Q": list(), "SEX": list()},
    },
}

In [7]:
for i_sim in range(n_sim):
    df_prs = df_res_list[i_sim].copy()
    df_prs["PHENO_G"] -= df_prs["PHENO"].mean()
    df_prs["PHENO"] -= df_prs["PHENO"].mean()
    df_prs["ID"] = df_prs.index
    df_prs["ID"] = df_prs["ID"].apply(lambda x: x.split("_")[0])
    df_prs.index = df_prs["ID"]
    df_prs = df_prs.drop(columns=["ID"])
    df_prs = df_prs.dropna()
    df_prs.index = df_prs.index.astype(int)
    df_prs["PHENO_STD"] = np.sqrt(df_prs["PRS_STD"] ** 2 + 0.75)
    df_prs["PRS_LOW"] = df_prs["PRS_MEAN"] - 1.645 * df_prs["PRS_STD"]
    df_prs["PRS_UPP"] = df_prs["PRS_MEAN"] + 1.645 * df_prs["PRS_STD"]
    df_prs["PHENO_LOW"] = df_prs["PRS_MEAN"] - 1.645 * df_prs["PHENO_STD"]
    df_prs["PHENO_UPP"] = df_prs["PRS_MEAN"] + 1.645 * df_prs["PHENO_STD"]
    df_info = pd.merge(df_pheno, df_prs, left_index=True, right_index=True)

    for x_col in ["PHENO_G", "PHENO"]:
        if x_col == "PHENO_G":
            lower_col, upper_col = "PRS_LOW", "PRS_UPP"
        elif x_col == "PHENO":
            lower_col, upper_col = "PHENO_LOW", "PHENO_UPP"
        else:
            raise NotImplementedError
        for group_col in ["PC1_Q", "PC2_Q", "SEX", "AGE_Q"]:
            res_dict["r2"][x_col][group_col].append(
                admix_prs.stratify_calculate_r2(
                    df_info, x_col=x_col, y_col="PRS_MEAN", group_col=group_col
                )
            )

            res_dict["cali"][x_col][group_col].append(
                admix_prs.eval_calibration(
                    df_info,
                    x_col=x_col,
                    lower_col=lower_col,
                    upper_col=upper_col,
                    group_col=group_col,
                )
            )

In [8]:
pheno_g_r2 = []
for col in ["PC1_Q", "PC2_Q", "AGE_Q", "SEX"]:
    col_res = []
    l = np.shape(res_dict["r2"]["PHENO_G"][col][0])[0]
    for i in range(n_sim):
        col_res = np.append(
            col_res, (np.array(res_dict["r2"]["PHENO_G"][col][i]["R2"]))
        )
    col_res = col_res.reshape([l, n_sim])
    df_col_res = pd.DataFrame(
        {
            col: res_dict["r2"]["PHENO_G"][col][i][col],
            "R2_MEAN": np.mean(col_res, axis=1),
            "R2_STD": np.std(col_res, axis=1),
        }
    )
    pheno_g_r2.append(df_col_res)

In [9]:
pheno_g_r2

[                PC1_Q   R2_MEAN    R2_STD
 0  (-19.271, -13.615]  0.716163  0.030993
 1  (-13.615, -12.613]  0.726285  0.017240
 2  (-12.613, -11.683]  0.715226  0.047323
 3  (-11.683, -10.323]  0.717482  0.034309
 4  (-10.323, 419.396]  0.676173  0.101882,
                PC2_Q   R2_MEAN    R2_STD
 0  (-282.318, 2.103]  0.703374  0.067120
 1     (2.103, 3.217]  0.709214  0.033349
 2      (3.217, 4.07]  0.682952  0.099165
 3      (4.07, 5.039]  0.693313  0.072874
 4    (5.039, 86.112]  0.628547  0.177905,
             AGE_Q   R2_MEAN    R2_STD
 0  (36.999, 48.0]  0.686285  0.019622
 1    (48.0, 55.0]  0.709655  0.010204
 2    (55.0, 60.0]  0.659785  0.029001
 3    (60.0, 64.0]  0.670336  0.018097
 4    (64.0, 73.0]  0.528519  0.047351,
    SEX   R2_MEAN    R2_STD
 0    0  0.685787  0.020763
 1    1  0.601021  0.073135]

In [10]:
pheno_r2 = []
for col in ["PC1_Q", "PC2_Q", "AGE_Q", "SEX"]:
    col_res = []
    l = np.shape(res_dict["r2"]["PHENO"][col][i])[0]
    for i in range(n_sim):
        col_res = np.append(col_res, (np.array(res_dict["r2"]["PHENO"][col][i]["R2"])))
    col_res = col_res.reshape([l, n_sim])
    df_col_res = pd.DataFrame(
        {
            col: res_dict["r2"]["PHENO"][col][i][col],
            "R2_MEAN": np.mean(col_res, axis=1),
            "R2_STD": np.std(col_res, axis=1),
        }
    )
    pheno_r2.append(df_col_res)

In [11]:
pheno_r2

[                PC1_Q   R2_MEAN    R2_STD
 0  (-19.271, -13.615]  0.179066  0.005449
 1  (-13.615, -12.613]  0.182485  0.006698
 2  (-12.613, -11.683]  0.181075  0.019352
 3  (-11.683, -10.323]  0.175547  0.012387
 4  (-10.323, 419.396]  0.166495  0.024278,
                PC2_Q   R2_MEAN    R2_STD
 0  (-282.318, 2.103]  0.178282  0.025438
 1     (2.103, 3.217]  0.179972  0.006258
 2      (3.217, 4.07]  0.167909  0.022995
 3      (4.07, 5.039]  0.168490  0.019496
 4    (5.039, 86.112]  0.152726  0.048741,
             AGE_Q   R2_MEAN    R2_STD
 0  (36.999, 48.0]  0.187104  0.005982
 1    (48.0, 55.0]  0.189509  0.005028
 2    (55.0, 60.0]  0.164961  0.004375
 3    (60.0, 64.0]  0.166808  0.005794
 4    (64.0, 73.0]  0.131881  0.017178,
    SEX   R2_MEAN    R2_STD
 0    0  0.182479  0.010432
 1    1  0.150263  0.018030]

In [12]:
pheno_g_cali = []
for col in ["PC1_Q", "PC2_Q", "AGE_Q", "SEX"]:
    col_res = []
    l = np.shape(res_dict["cali"]["PHENO_G"][col][0])[0]
    for i in range(n_sim):
        col_res = np.append(
            col_res, (np.array(res_dict["cali"]["PHENO_G"][col][i]["coverage"]))
        )
    col_res = col_res.reshape([l, n_sim])
    df_col_res = pd.DataFrame(
        {
            col: res_dict["cali"]["PHENO_G"][col][i][col],
            "Coverage_MEAN": np.mean(col_res, axis=1),
            "Coverage_STD": np.std(col_res, axis=1),
        }
    )
    pheno_g_cali.append(df_col_res)

In [13]:
pheno_g_cali

[                PC1_Q  Coverage_MEAN  Coverage_STD
 0  (-19.271, -13.615]       0.898982      0.006027
 1  (-13.615, -12.613]       0.904141      0.004387
 2  (-12.613, -11.683]       0.880314      0.003860
 3  (-11.683, -10.323]       0.881993      0.011922
 4  (-10.323, 419.396]       0.835332      0.014020,
                PC2_Q  Coverage_MEAN  Coverage_STD
 0  (-282.318, 2.103]       0.897321      0.008850
 1     (2.103, 3.217]       0.905553      0.004998
 2      (3.217, 4.07]       0.881551      0.008335
 3      (4.07, 5.039]       0.887429      0.008159
 4    (5.039, 86.112]       0.821113      0.067916,
             AGE_Q  Coverage_MEAN  Coverage_STD
 0  (36.999, 48.0]       0.893894      0.003317
 1    (48.0, 55.0]       0.904530      0.002049
 2    (55.0, 60.0]       0.883252      0.001788
 3    (60.0, 64.0]       0.892648      0.003457
 4    (64.0, 73.0]       0.835575      0.010602,
    SEX  Coverage_MEAN  Coverage_STD
 0    0       0.895657      0.008299
 1    1       0.8

In [14]:
pheno_cali = []
for col in ["PC1_Q", "PC2_Q", "AGE_Q", "SEX"]:
    col_res = []
    l = np.shape(res_dict["cali"]["PHENO"][col][i])[0]
    for i in range(n_sim):
        col_res = np.append(
            col_res, (np.array(res_dict["cali"]["PHENO"][col][i]["coverage"]))
        )
    col_res = col_res.reshape([l, n_sim])
    df_col_res = pd.DataFrame(
        {
            col: res_dict["cali"]["PHENO"][col][i][col],
            "Coverage_MEAN": np.mean(col_res, axis=1),
            "Coverage_STD": np.std(col_res, axis=1),
        }
    )
    pheno_cali.append(df_col_res)

In [15]:
pheno_cali

[                PC1_Q  Coverage_MEAN  Coverage_STD
 0  (-19.271, -13.615]       0.899860      0.002980
 1  (-13.615, -12.613]       0.899059      0.003762
 2  (-12.613, -11.683]       0.899569      0.005342
 3  (-11.683, -10.323]       0.899574      0.001942
 4  (-10.323, 419.396]       0.893397      0.003263,
                PC2_Q  Coverage_MEAN  Coverage_STD
 0  (-282.318, 2.103]       0.899171      0.001750
 1     (2.103, 3.217]       0.900370      0.002374
 2      (3.217, 4.07]       0.898768      0.005113
 3      (4.07, 5.039]       0.899506      0.002700
 4    (5.039, 86.112]       0.889284      0.012935,
             AGE_Q  Coverage_MEAN  Coverage_STD
 0  (36.999, 48.0]       0.899670      0.001731
 1    (48.0, 55.0]       0.900123      0.002967
 2    (55.0, 60.0]       0.898130      0.003661
 3    (60.0, 64.0]       0.898618      0.001575
 4    (64.0, 73.0]       0.891513      0.007116,
    SEX  Coverage_MEAN  Coverage_STD
 0    0       0.899481      0.000995
 1    1       0.8