In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import pickle
import os
import admix_prs
from functools import reduce

# Load data

In [3]:
# Read the phenotype / covariate table (the first CSV column becomes the index)
# and attach one level column per stratification covariate.
PHENO_PATH = "/u/project/pasaniuc/kangchen/tmp/prs-1219/REAL-PHENO/all-pheno.csv"
df_pheno = pd.read_csv(PHENO_PATH, index_col=0)
for covariate in ("PC1", "PC2", "AGE"):
    # make_levels is asked for n_level=5 levels of `covariate`; the result is
    # stored next to the raw column as "<covariate>_Q".
    df_pheno[covariate + "_Q"] = admix_prs.make_levels(
        df_pheno, stratify_col=covariate, n_level=5
    )

In [4]:
# Name of the simulation setting to analyse. It is handed to
# admix_prs.load_sim_data when no cached results are found on disk.
# NOTE(review): the "hsq-0.25" token presumably corresponds to the simulated
# heritability of 0.25 mentioned in the notes at the end of the notebook — confirm.
config = "hsq-0.25-pcausal-0.01-hermodel-uniform"

In [5]:
# Load the per-replicate simulation results, using a pickle in the working
# directory as a cache: on a miss the results are loaded through admix_prs and
# written to the cache, on a hit they are read back from it.
# NOTE(review): the cache file name does not depend on `config`, so after
# changing `config` a stale cache would still be loaded — delete the file or
# confirm it matches the current setting.
cache_file = "df_res_list.pkl"
if not os.path.exists(cache_file):
    df_res_list = admix_prs.load_sim_data(config)
    with open(cache_file, "wb") as fh:
        pickle.dump(df_res_list, fh)
else:
    with open(cache_file, "rb") as fh:
        df_res_list = pickle.load(fh)

In [6]:
# Number of entries in df_res_list; the evaluation loop further down runs once per entry.
n_sim = len(df_res_list)

In [7]:
# Result container, indexed as res_dict[metric][target][stratifier]:
#   metric      'r2' (stratified R2) or 'cali' (interval calibration)
#   target      'PHENO_G' or 'PHENO'
#   stratifier  'PC1_Q', 'PC2_Q', 'AGE_Q' or 'SEX_Q'
# Every leaf is its own empty list, to be filled with one table per replicate.
res_dict = {
    metric: {
        target: {
            stratifier: []
            for stratifier in ('PC1_Q', 'PC2_Q', 'AGE_Q', 'SEX_Q')
        }
        for target in ('PHENO_G', 'PHENO')
    }
    for metric in ('r2', 'cali')
}

In [8]:

# Evaluate every simulation replicate: align it with the phenotype covariates,
# build prediction intervals, and collect one stratified-R2 table and one
# interval-coverage table per (target, stratifier) in res_dict.

# Normal quantile used for the intervals; the notes at the end of the notebook
# expect 90% coverage from it.
Z_90 = 1.645
# Environmental variance added to PRS_STD ** 2; the notes at the end of the
# notebook derive var[e] = 0.75 from var[y] = 1 and a simulated heritability of 0.25.
VAR_ENV = 0.75

# (key in res_dict, df_info column to group by), in evaluation order.
# SEX is grouped on directly, the other covariates through their *_Q level columns.
STRATA = [("PC1_Q", "PC1_Q"), ("PC2_Q", "PC2_Q"), ("SEX_Q", "SEX"), ("AGE_Q", "AGE_Q")]

# (target column, lower bound column, upper bound column) for the calibration check.
CALI_INTERVALS = [
    ("PHENO_G", "PRS_LOW", "PRS_UPP"),
    ("PHENO", "PHENO_LOW", "PHENO_UPP"),
]

# Start from empty result lists so that re-running this cell does not append a
# second copy of every replicate.
for by_target in res_dict.values():
    for by_stratifier in by_target.values():
        for collected in by_stratifier.values():
            collected.clear()

for i_sim in range(n_sim):
    df_prs = df_res_list[i_sim].copy()

    # Both columns are shifted by the same value: the mean of the uncentered PHENO.
    # NOTE(review): PHENO_G is centered with PHENO's mean, not its own — confirm
    # this is intended.
    pheno_mean = df_prs["PHENO"].mean()
    df_prs["PHENO_G"] -= pheno_mean
    df_prs["PHENO"] -= pheno_mean

    # Keep only the part of each index label before the first "_", drop rows
    # with missing values, then cast the labels to int before the index join
    # with df_pheno below.
    df_prs.index = df_prs.index.map(lambda label: label.split("_")[0]).rename("ID")
    df_prs = df_prs.dropna()
    df_prs.index = df_prs.index.astype(int)

    # Interval for PHENO_G uses PRS_STD alone; the interval for PHENO uses
    # PHENO_STD, which adds the environmental variance.
    df_prs["PHENO_STD"] = np.sqrt(df_prs["PRS_STD"] ** 2 + VAR_ENV)
    df_prs["PRS_LOW"] = df_prs["PRS_MEAN"] - Z_90 * df_prs["PRS_STD"]
    df_prs["PRS_UPP"] = df_prs["PRS_MEAN"] + Z_90 * df_prs["PRS_STD"]
    df_prs["PHENO_LOW"] = df_prs["PRS_MEAN"] - Z_90 * df_prs["PHENO_STD"]
    df_prs["PHENO_UPP"] = df_prs["PRS_MEAN"] + Z_90 * df_prs["PHENO_STD"]

    # Inner join on the index: only individuals present in both tables are evaluated.
    df_info = df_pheno.merge(df_prs, left_index=True, right_index=True)

    # Stratified R2 between each target and PRS_MEAN.
    for x_col in ("PHENO_G", "PHENO"):
        for key, group_col in STRATA:
            res_dict["r2"][x_col][key].append(
                admix_prs.stratify_calculate_r2(
                    df_info, x_col=x_col, y_col="PRS_MEAN", group_col=group_col
                )
            )

    # Stratified calibration of each target against its interval.
    for x_col, lower_col, upper_col in CALI_INTERVALS:
        for key, group_col in STRATA:
            res_dict["cali"][x_col][key].append(
                admix_prs.eval_calibration(
                    df_info,
                    x_col=x_col,
                    lower_col=lower_col,
                    upper_col=upper_col,
                    group_col=group_col,
                )
            )
    

In [9]:
# Inspect the R2 tables collected for PHENO_G grouped by PC1_Q (one table per replicate).
res_dict['r2']['PHENO_G']['PC1_Q']

[                PC1_Q        R2    R2_std
 0  (-19.271, -13.615]  0.728166  0.006713
 1  (-13.615, -12.613]  0.730566  0.006103
 2  (-12.613, -11.683]  0.736377  0.006277
 3  (-11.683, -10.323]  0.730102  0.004629
 4  (-10.323, 419.396]  0.654313  0.003397,
                 PC1_Q        R2    R2_std
 0  (-19.271, -13.615]  0.736676  0.007345
 1  (-13.615, -12.613]  0.723762  0.008901
 2  (-12.613, -11.683]  0.738509  0.007441
 3  (-11.683, -10.323]  0.739132  0.006855
 4  (-10.323, 419.396]  0.693409  0.002826,
                 PC1_Q        R2    R2_std
 0  (-19.271, -13.615]  0.743007  0.007374
 1  (-13.615, -12.613]  0.737428  0.006737
 2  (-12.613, -11.683]  0.740541  0.003912
 3  (-11.683, -10.323]  0.737678  0.005652
 4  (-10.323, 419.396]  0.619142  0.004547,
                 PC1_Q        R2    R2_std
 0  (-19.271, -13.615]  0.727476  0.005709
 1  (-13.615, -12.613]  0.738815  0.006069
 2  (-12.613, -11.683]  0.738477  0.006128
 3  (-11.683, -10.323]  0.739206  0.006148
 4  (-10

In [31]:
# Summarise the PHENO_G R2 tables across simulation replicates, one summary
# table per stratifier.
# Each per-replicate table contributes three columns (group, R2, R2_std) to the
# side-by-side concatenation, so R2 sits at positions 1, 4, 7, ... and R2_std at
# 2, 5, 8, ...  The positions are derived from the concatenated width so that
# any number of replicates is handled, not only five.
# TODO(review): "R2_MEAN_STD" is the standard deviation, across replicates, of
# the per-replicate R2_std values. That is neither the mean of those stds nor
# the std of the R2 estimates; the open question "mean of std or std of mean?"
# still has to be settled.
pheno_g_r2_res_li = []
for col in ['PC1_Q', 'PC2_Q', 'AGE_Q', 'SEX_Q']:
    df_concat = pd.concat(res_dict['r2']['PHENO_G'][col], axis=1, ignore_index=True)
    r2_pos = list(range(1, df_concat.shape[1], 3))
    r2_std_pos = list(range(2, df_concat.shape[1], 3))
    df_pheno_g_r2 = pd.concat(
        [
            df_concat[0],  # group labels, taken from the first replicate
            df_concat[r2_pos].mean(axis=1),
            df_concat[r2_std_pos].std(axis=1),
        ],
        axis=1
    )
    df_pheno_g_r2.columns = [col, "R2_MEAN", "R2_MEAN_STD"]
    pheno_g_r2_res_li.append(df_pheno_g_r2)

In [32]:
# Display the PHENO_G R2 summary tables (one per stratifier) built in the previous cell.
pheno_g_r2_res_li

[                PC1_Q   R2_MEAN  R2_MEAN_STD
 0  (-19.271, -13.615]  0.730363     0.000683
 1  (-13.615, -12.613]  0.730399     0.001216
 2  (-12.613, -11.683]  0.735620     0.001340
 3  (-11.683, -10.323]  0.735693     0.001647
 4  (-10.323, 419.396]  0.617264     0.000983,
                PC2_Q   R2_MEAN  R2_MEAN_STD
 0  (-282.318, 2.103]  0.716426     0.000572
 1     (2.103, 3.217]  0.735690     0.001261
 2      (3.217, 4.07]  0.730716     0.002501
 3      (4.07, 5.039]  0.732090     0.001482
 4    (5.039, 86.112]  0.507596     0.000628,
             AGE_Q   R2_MEAN  R2_MEAN_STD
 0  (36.999, 48.0]  0.613410     0.000789
 1    (48.0, 55.0]  0.629224     0.001390
 2    (55.0, 60.0]  0.669043     0.000386
 3    (60.0, 64.0]  0.678916     0.002444
 4    (64.0, 73.0]  0.661800     0.001417,
    SEX_Q   R2_MEAN  R2_MEAN_STD
 0      0  0.643569     0.001463
 1      1  0.644901     0.001138]

In [43]:
# Average the PHENO_G interval coverage across simulation replicates, one
# summary table per stratifier.
# Each per-replicate table contributes two columns (group, coverage) to the
# side-by-side concatenation, so the coverage sits at positions 1, 3, 5, ...
# The positions are derived from the concatenated width so that any number of
# replicates is handled, not only five.
pheno_g_cali_res_li = []
for col in ['PC1_Q', 'PC2_Q', 'AGE_Q', 'SEX_Q']:
    df_concat = pd.concat(res_dict['cali']['PHENO_G'][col], axis=1, ignore_index=True)
    coverage_pos = list(range(1, df_concat.shape[1], 2))
    df_pheno_g_cali = pd.concat(
        [
            df_concat[0],  # group labels, taken from the first replicate
            df_concat[coverage_pos].mean(axis=1),
        ],
        axis=1
    )
    df_pheno_g_cali.columns = [col, "Coverage_MEAN"]
    pheno_g_cali_res_li.append(df_pheno_g_cali)

In [44]:
# Display the PHENO_G mean-coverage tables (one per stratifier) built in the previous cell.
pheno_g_cali_res_li

[                PC1_Q  Coverage_MEAN
 0  (-19.271, -13.615]       0.873359
 1  (-13.615, -12.613]       0.879091
 2  (-12.613, -11.683]       0.882716
 3  (-11.683, -10.323]       0.882963
 4  (-10.323, 419.396]       0.882634,
                PC2_Q  Coverage_MEAN
 0  (-282.318, 2.103]       0.894827
 1     (2.103, 3.217]       0.886148
 2      (3.217, 4.07]       0.878636
 3      (4.07, 5.039]       0.883385
 4    (5.039, 86.112]       0.849972,
             AGE_Q  Coverage_MEAN
 0  (36.999, 48.0]       0.877740
 1    (48.0, 55.0]       0.880594
 2    (55.0, 60.0]       0.884561
 3    (60.0, 64.0]       0.885229
 4    (64.0, 73.0]       0.881775,
    SEX_Q  Coverage_MEAN
 0      0       0.880759
 1      1       0.882411]

In [45]:
# Summarise the PHENO R2 tables across simulation replicates, one summary table
# per stratifier.
# Each per-replicate table contributes three columns (group, R2, R2_std) to the
# side-by-side concatenation, so R2 sits at positions 1, 4, 7, ... and R2_std at
# 2, 5, 8, ...  The positions are derived from the concatenated width so that
# any number of replicates is handled, not only five.
# TODO(review): "R2_MEAN_STD" is the standard deviation, across replicates, of
# the per-replicate R2_std values, not the std of the R2 estimates themselves —
# confirm this is the intended uncertainty measure.
pheno_r2_res_li = []
for col in ['PC1_Q', 'PC2_Q', 'AGE_Q', 'SEX_Q']:
    df_concat = pd.concat(res_dict['r2']['PHENO'][col], axis=1, ignore_index=True)
    r2_pos = list(range(1, df_concat.shape[1], 3))
    r2_std_pos = list(range(2, df_concat.shape[1], 3))
    df_pheno_r2 = pd.concat(
        [
            df_concat[0],  # group labels, taken from the first replicate
            df_concat[r2_pos].mean(axis=1),
            df_concat[r2_std_pos].std(axis=1),
        ],
        axis=1
    )
    df_pheno_r2.columns = [col, "R2_MEAN", "R2_MEAN_STD"]
    pheno_r2_res_li.append(df_pheno_r2)

In [46]:
# Display the PHENO R2 summary tables (one per stratifier) built in the previous cell.
pheno_r2_res_li

[                PC1_Q   R2_MEAN  R2_MEAN_STD
 0  (-19.271, -13.615]  0.173798     0.000901
 1  (-13.615, -12.613]  0.185154     0.001259
 2  (-12.613, -11.683]  0.184262     0.001752
 3  (-11.683, -10.323]  0.182647     0.001958
 4  (-10.323, 419.396]  0.163284     0.000766,
                PC2_Q   R2_MEAN  R2_MEAN_STD
 0  (-282.318, 2.103]  0.189649     0.000817
 1     (2.103, 3.217]  0.178926     0.001411
 2      (3.217, 4.07]  0.178869     0.001180
 3      (4.07, 5.039]  0.177613     0.001622
 4    (5.039, 86.112]  0.125419     0.001270,
             AGE_Q   R2_MEAN  R2_MEAN_STD
 0  (36.999, 48.0]  0.156666     0.001568
 1    (48.0, 55.0]  0.162842     0.001928
 2    (55.0, 60.0]  0.172637     0.002074
 3    (60.0, 64.0]  0.173591     0.000730
 4    (64.0, 73.0]  0.169796     0.001431,
    SEX_Q   R2_MEAN  R2_MEAN_STD
 0      0  0.167497     0.000392
 1      1  0.165249     0.000742]

In [49]:
# Average the PHENO interval coverage across simulation replicates, one summary
# table per stratifier.
# Each per-replicate table contributes two columns (group, coverage) to the
# side-by-side concatenation, so the coverage sits at positions 1, 3, 5, ...
# The positions are derived from the concatenated width so that any number of
# replicates is handled, not only five.
pheno_cali_res_li = []
for col in ['PC1_Q', 'PC2_Q', 'AGE_Q', 'SEX_Q']:
    df_concat = pd.concat(res_dict['cali']['PHENO'][col], axis=1, ignore_index=True)
    coverage_pos = list(range(1, df_concat.shape[1], 2))
    df_pheno_cali = pd.concat(
        [
            df_concat[0],  # group labels, taken from the first replicate
            df_concat[coverage_pos].mean(axis=1),
        ],
        axis=1
    )
    df_pheno_cali.columns = [col, "Coverage_MEAN"]
    pheno_cali_res_li.append(df_pheno_cali)

In [50]:
# Display the PHENO mean-coverage tables (one per stratifier) built in the previous cell.
pheno_cali_res_li

[                PC1_Q  Coverage_MEAN
 0  (-19.271, -13.615]       0.895587
 1  (-13.615, -12.613]       0.900134
 2  (-12.613, -11.683]       0.899914
 3  (-11.683, -10.323]       0.899531
 4  (-10.323, 419.396]       0.896295,
                PC2_Q  Coverage_MEAN
 0  (-282.318, 2.103]       0.898994
 1     (2.103, 3.217]       0.900790
 2      (3.217, 4.07]       0.896993
 3      (4.07, 5.039]       0.899295
 4    (5.039, 86.112]       0.891028,
             AGE_Q  Coverage_MEAN
 0  (36.999, 48.0]       0.895253
 1    (48.0, 55.0]       0.895594
 2    (55.0, 60.0]       0.899073
 3    (60.0, 64.0]       0.901043
 4    (64.0, 73.0]       0.897092,
    SEX_Q  Coverage_MEAN
 0      0       0.896535
 1      1       0.898119]

1. stratify_calculate_r2 by PC1, PC2, AGE, SEX
- R2 differs across PC1 and PC2 strata, but not across AGE or SEX strata
- do this for both PHENO_G and PHENO

2. eval_calibration using PHENO_G and `[PRS_MEAN - 1.645 * PRS_STD, PRS_MEAN + 1.645 * PRS_STD]`
- expect 90% coverage (1.645 is the two-sided 90% quantile of the standard normal)
3. eval_calibration using PHENO and `[PRS_MEAN - 1.645 * PHENO_STD, PRS_MEAN + 1.645 * PHENO_STD]`

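- PHENO_G is centred on PRS_MEAN with spread STD(PRS)
- PHENO is centred on PRS_MEAN with spread STD(PRS + ENV)

The simulated heritability is 0.25 and the phenotype variance is var[y] = var[gv] + var[e] = 1, so
hsq = var[gv] / (var[gv] + var[e]) = 0.25
var[gv] = 0.25
var[e] = 0.75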

STD(PRS + ENV) = sqrt(PRS_STD ** 2 + ENV_STD ** 2), assuming the PRS uncertainty and the environmental noise are independent

`df_prs["PHENO_STD"] = np.sqrt(df_prs["PRS_STD"] ** 2 + 0.75)`

ENV_STD ** 2 = var[e] = 1 - 0.25 = 0.75