In [10]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import pickle
import os
import admix_prs

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [11]:
# Phenotype / covariate table; the first CSV column becomes the row index.
PHENO_PATH = "/u/project/pasaniuc/kangchen/tmp/prs-1219/REAL-PHENO/all-pheno.csv"
df_pheno = pd.read_csv(PHENO_PATH, index_col=0)

# Attach a 5-level stratum column named "<covariate>_Q" for each continuous
# covariate (presumably quantile bins -- confirm against admix_prs.make_levels).
for covariate in ("PC1", "PC2", "AGE"):
    df_pheno[covariate + "_Q"] = admix_prs.make_levels(
        df_pheno, stratify_col=covariate, n_level=5
    )

In [12]:
# Simulation configuration label handed to admix_prs.load_sim_data in the next cell.
# The label spells out hsq = 0.25 and pcausal = 0.01; "hermodel-uniform" presumably
# names the heritability model -- TODO confirm.
config = "hsq-0.25-pcausal-0.01-hermodel-uniform"

In [13]:
# Load the simulation results for `config`, caching them on disk so that a full
# re-run of the notebook does not have to reload them from scratch.
# The cache file name includes `config`: a single fixed file name would silently
# hand back results cached for a *different* configuration once `config` changes.
cache_path = f"df_res_list-{config}.pkl"
if os.path.exists(cache_path):
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load cache files that this notebook wrote itself.
    with open(cache_path, "rb") as f:
        df_res_list = pickle.load(f)
else:
    df_res_list = admix_prs.load_sim_data(config)
    with open(cache_path, "wb") as f:
        pickle.dump(df_res_list, f)

In [25]:
# Number of entries in df_res_list; the evaluation loop below treats each entry
# as the result frame of one simulation replicate.
n_sim = len(df_res_list)

In [14]:
# Nested accumulator: metric -> evaluated target -> stratifier -> list of
# per-simulation results. Every leaf starts out as its own empty list.
res_dict = {
    metric: {
        target: {stratifier: [] for stratifier in ('pc1', 'pc2', 'age', 'sex')}
        for target in ('pheno_g', 'pheno')
    }
    for metric in ('r2', 'cali')
}

In [61]:

# Evaluate every simulation replicate: stratified R2 and interval calibration,
# for both the genetic component (PHENO_G) and the full phenotype (PHENO).

Z_90 = 1.645    # interval half-width in SD units; +/- 1.645 SD is expected to cover 90%
ENV_VAR = 0.75  # environmental variance = 1 - hsq, with hsq = 0.25 and var[y] = 1

# (result key, stratification column of df_pheno), in evaluation order.
STRATA = [("pc1", "PC1_Q"), ("pc2", "PC2_Q"), ("sex", "SEX"), ("age", "AGE_Q")]
# (result key, truth column, interval lower-bound column, interval upper-bound column)
TARGETS = [
    ("pheno_g", "PHENO_G", "PRS_LOW", "PRS_UPP"),
    ("pheno", "PHENO", "PHENO_LOW", "PHENO_UPP"),
]

# Empty every accumulator first so that re-running this cell replaces the
# results instead of appending a second copy of each simulation to res_dict.
for by_target in res_dict.values():
    for by_stratum in by_target.values():
        for results in by_stratum.values():
            results.clear()

for i_sim in range(n_sim):
    df_prs = df_res_list[i_sim].copy()

    # Both columns are shifted by the mean of PHENO.
    # NOTE(review): PHENO_G is centred with the PHENO mean, not its own mean --
    # kept as in the original analysis; confirm this is intended.
    pheno_mean = df_prs["PHENO"].mean()
    df_prs["PHENO_G"] -= pheno_mean
    df_prs["PHENO"] -= pheno_mean

    # Row labels look like "<id>_<suffix>": keep the part before the first "_",
    # drop incomplete rows, and cast the labels to int before the index join below.
    df_prs.index = df_prs.index.map(lambda x: x.split("_")[0]).rename("ID")
    df_prs = df_prs.dropna()
    df_prs.index = df_prs.index.astype(int)

    # Interval for PHENO_G uses the PRS SD alone; the interval for PHENO widens
    # it by the environmental variance.
    df_prs["PHENO_STD"] = np.sqrt(df_prs["PRS_STD"] ** 2 + ENV_VAR)
    df_prs["PRS_LOW"] = df_prs["PRS_MEAN"] - Z_90 * df_prs["PRS_STD"]
    df_prs["PRS_UPP"] = df_prs["PRS_MEAN"] + Z_90 * df_prs["PRS_STD"]
    df_prs["PHENO_LOW"] = df_prs["PRS_MEAN"] - Z_90 * df_prs["PHENO_STD"]
    df_prs["PHENO_UPP"] = df_prs["PRS_MEAN"] + Z_90 * df_prs["PHENO_STD"]

    # Inner join on the individual id: only individuals present in both tables remain.
    df_info = pd.merge(df_pheno, df_prs, left_index=True, right_index=True)

    # Stratified R2 between the truth column and the PRS mean.
    for target_key, x_col, _, _ in TARGETS:
        for stratum_key, group_col in STRATA:
            res_dict['r2'][target_key][stratum_key].append(
                admix_prs.stratify_calculate_r2(
                    df_info, x_col=x_col, y_col="PRS_MEAN", group_col=group_col
                )
            )

    # Calibration of the interval [lower_col, upper_col] around the truth column.
    for target_key, x_col, lower_col, upper_col in TARGETS:
        for stratum_key, group_col in STRATA:
            res_dict['cali'][target_key][stratum_key].append(
                admix_prs.eval_calibration(
                    df_info,
                    x_col=x_col,
                    lower_col=lower_col,
                    upper_col=upper_col,
                    group_col=group_col,
                )
            )
    

In [62]:
# Inspect the list of per-simulation R2 tables for PHENO_G stratified by PC1_Q
# (the evaluation loop appends one table per simulation).
res_dict['r2']['pheno_g']['pc1']

[                PC1_Q        R2    R2_std
 0  (-19.271, -13.615]  0.729972  0.006727
 1  (-13.615, -12.613]  0.732209  0.008188
 2  (-12.613, -11.683]  0.736849  0.004965
 3  (-11.683, -10.323]  0.733245  0.005711
 4  (-10.323, 419.396]  0.653893  0.003369,
                 PC1_Q        R2    R2_std
 0  (-19.271, -13.615]  0.738519  0.005314
 1  (-13.615, -12.613]  0.724991  0.006109
 2  (-12.613, -11.683]  0.734305  0.005966
 3  (-11.683, -10.323]  0.739448  0.003722
 4  (-10.323, 419.396]  0.694450  0.001878,
                 PC1_Q        R2    R2_std
 0  (-19.271, -13.615]  0.739149  0.009384
 1  (-13.615, -12.613]  0.740538  0.005211
 2  (-12.613, -11.683]  0.735786  0.005596
 3  (-11.683, -10.323]  0.739621  0.005079
 4  (-10.323, 419.396]  0.618882  0.004150,
                 PC1_Q        R2    R2_std
 0  (-19.271, -13.615]  0.724696  0.005586
 1  (-13.615, -12.613]  0.738864  0.005235
 2  (-12.613, -11.683]  0.736219  0.005788
 3  (-11.683, -10.323]  0.740725  0.005321
 4  (-10

In [81]:
from functools import reduce

# Combine the per-simulation R2 tables (PHENO_G stratified by PC1_Q) into one
# wide table with a single row per PC1_Q stratum.
# Each table's R2 / R2_std columns are tagged with the 1-based simulation index
# *before* merging. Merging the untagged tables makes pandas fall back on the
# "_x" / "_y" suffixes, which collide from the fourth table onwards; that yields
# duplicate column names (a warning in older pandas, an error in newer releases).
dfs = [
    res_dict['r2']['pheno_g']['pc1'][i].rename(
        columns={"R2": f"R2_sim{i + 1}", "R2_std": f"R2_std_sim{i + 1}"}
    )
    for i in range(n_sim)
]
df_merged = reduce(lambda left, right: pd.merge(left, right, on='PC1_Q'), dfs)

  df_merged = reduce(lambda  left,right: pd.merge(left,right,


In [82]:
# Give every result column an explicit per-simulation name.
# The merged table is laid out as the key column followed by one (R2, R2_std)
# pair per simulation, in simulation order, so the columns are relabelled by
# position. This works for any number of simulations and does not depend on
# which "_x" / "_y" suffixes the chained merges happened to produce; it is also
# a no-op when the columns already carry these names.
n_merged = (df_merged.shape[1] - 1) // 2
sim_columns = [df_merged.columns[0]]
for i in range(1, n_merged + 1):
    sim_columns += [f"R2_sim{i}", f"R2_std_sim{i}"]
assert len(sim_columns) == df_merged.shape[1], "unexpected column layout in df_merged"

# Work on a copy so the frame produced by the merge cell is not modified in place.
df_merged = df_merged.copy()
df_merged.columns = sim_columns

In [90]:
# Show the merged table: one row per PC1_Q stratum, one (R2, R2_std) column pair per simulation.
df_merged

Unnamed: 0,PC1_Q,R2_sim1,R2_std_sim1,R2_sim2,R2_std_sim2,R2_sim3,R2_std_sim3,R2_sim4,R2_std_sim4,R2_sim5,R2_std_sim5
0,"(-19.271, -13.615]",0.729972,0.006727,0.738519,0.005314,0.739149,0.009384,0.724696,0.005586,0.717078,0.008089
1,"(-13.615, -12.613]",0.732209,0.008188,0.724991,0.006109,0.740538,0.005211,0.738864,0.005235,0.720082,0.003568
2,"(-12.613, -11.683]",0.736849,0.004965,0.734305,0.005966,0.735786,0.005596,0.736219,0.005788,0.721735,0.008365
3,"(-11.683, -10.323]",0.733245,0.005711,0.739448,0.003722,0.739621,0.005079,0.740725,0.005321,0.728794,0.006145
4,"(-10.323, 419.396]",0.653893,0.003369,0.69445,0.001878,0.618882,0.00415,0.647987,0.002166,0.473939,0.003244


In [93]:
# Mean R2 per PC1_Q stratum across all simulations. The column list is derived
# from n_sim rather than spelled out, so it stays correct when the number of
# simulations changes.
df_merged[[f"R2_sim{i + 1}" for i in range(n_sim)]].mean(axis=1)

0    0.729883
1    0.731337
2    0.732979
3    0.736366
4    0.617830
dtype: float64

In [94]:
# Mean of the per-simulation R2_std values per PC1_Q stratum. The column list is
# derived from n_sim rather than spelled out, so it stays correct when the
# number of simulations changes.
df_merged[[f"R2_std_sim{i + 1}" for i in range(n_sim)]].mean(axis=1)

0    0.007020
1    0.005662
2    0.006136
3    0.005195
4    0.002961
dtype: float64

1. stratify_calculate_r2 by PC1, PC2, AGE, SEX
- R2 differs across PC1 and PC2 strata, but not across AGE or SEX strata
- do this for both PHENO_G and PHENO

2. eval_calibration using PHENO_G and `[MEAN - 1.645 * STD, MEAN + 1.645 * STD]` 
- expect 90% coverage
3. eval_calibration using PHENO and `[MEAN - 1.645 * PHENO-STD, MEAN + 1.645 * PHENO-STD]`

- PHENO-G ~ PRS-MEAN + STD(PRS)
- PHENO ~ PRS-MEAN + STD(PRS + ENV)
The simulated heritability is 0.25 and the phenotype variance is 1:
var[y] = var[gv] + var[e] = 1
hsq = var[gv] / (var[gv] + var[e]) = 0.25
var[gv] = 0.25, so var[e] = 1 - 0.25 = 0.75

STD(PRS + ENV) = sqrt(PRS-STD ** 2 + ENV-STD ** 2)

df["PHENO_STD"] = np.sqrt(df["PRS_STD"] ** 2 + 0.75)

ENV-STD ** 2 = 0.75 = 1 - 0.25