# PAGE hm3 analysis

### Step 2: Calculate GRM
Calculate the GRM matrices.

### Step 3: Estimate parameters
```bash
for n_indiv in 5000 10000 20000; do
    for anc_prop in 0.2 0.5; do
        dset_prefix=n_indiv_${n_indiv}_anc_prop_${anc_prop}
        for cor in 0.5 0.8 1.0; do
            qsub estimate_HE.sh ${dset_prefix} ${cor}
        done
    done
done
```

In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import xarray as xr
import numpy as np
import pandas as pd
import admix
import matplotlib.pyplot as plt
from utils import *
import dask
import dask.array as da
from tqdm import tqdm

def quantile_normalize(val):
    """Rank-based transform of `val` onto standard-normal quantiles.

    Each value is replaced by the standard-normal quantile of
    ``(rank - 0.5) / n``, where ``rank`` comes from ``scipy.stats.rankdata``
    (ties share their average rank) and ``n`` is ``len(val)``.

    Parameters
    ----------
    val : array-like
        One-dimensional sequence of values.

    Returns
    -------
    numpy.ndarray
        Transformed values, in the same order as the input.
    """
    from scipy.stats import norm, rankdata

    n_obs = len(val)
    ranks = rankdata(val)
    # Offset by 0.5 so the probabilities lie strictly inside (0, 1) and
    # norm.ppf never returns +/- infinity.
    probabilities = (ranks - 0.5) / n_obs
    return norm.ppf(probabilities)

In [9]:
# Load the hm3 dataset used by every cell below.
# NOTE(review): load_hm3 is not defined in this notebook; presumably it comes
# from the `from utils import *` star import — confirm.
dset = load_hm3()

100%|██████████| 22/22 [00:00<00:00, 57.94it/s]


In [10]:
# Load the precomputed GRM arrays saved under out/admix_grm/.
# NOTE(review): K1/K2 look like per-ancestry GRMs and K12 the cross-ancestry
# term — confirm against the script that wrote these files.
K1 = np.load("out/admix_grm/K1.all.npy")
K2 = np.load("out/admix_grm/K2.all.npy")
K12 = np.load("out/admix_grm/K12.all.npy")

In [11]:
# Attach two (indiv x indiv) matrices to the dataset:
#   A1 = K1 + K2
#   A2 = K12 + K12.T  (K12 plus its transpose, so A2 is symmetric)
dset["A1"] = (("indiv", "indiv"), K1 + K2)
dset["A2"] = (("indiv", "indiv"), K12 + K12.T)

In [1]:
# Phenotype names analysed below, grouped by category. Each name is looked up
# in the dataset as the variable f"{trait}@indiv".
trait_list = [
    # Inflammatory traits
    "crp",
    "total_wbc_cnt",
    "mean_corp_hgb_conc",
    "platelet_cnt",
    # lipid traits
    "hdl",
    "ldl",
    "triglycerides",
    "total_cholesterol",
    # lifestyle traits
    "cigs_per_day_excl_nonsmk_updated",
    "coffee_cup_day",
    # glycemic traits
    "a1c",
    "insulin",
    "glucose",
    "t2d_status",
    # electrocardiogram traits
    "qt_interval",
    "qrs_interval",
    "pr_interval",
    # blood pressure traits
    "systolic_bp",
    "diastolic_bp",
    "hypertension",
    # anthropometric traits
    "waist_hip_ratio",
    "height",
    "bmi",
    # kidney traits
    "egfrckdepi",
]


In [9]:
# Persist the trait names, one per line (no trailing newline after the last).
with open("trait_list.txt", 'w') as trait_file:
    trait_file.write('\n'.join(trait_list))

In [137]:
# For every trait: keep individuals with a non-missing phenotype, quantile-
# normalize the phenotype, run admix.estimate.admix_gen_cor with age, sex,
# study indicators and the first ten genotype EVs as covariates, then collect
# the estimate vector and its covariance matrix for tabulation further down.
dict_result = {"trait": [],
               "n_indiv": [],
               "est": [],
               "est_var": []}

# Covariates shared by every trait; hoisted because they do not depend on the
# loop variable. Study indicator columns are rebuilt per trait (see below).
base_cov_cols = ["age", "sex"]
pc_cov_cols = [f"geno_EV{i}" for i in range(1, 11)]

for trait in tqdm(trait_list):
    # Restrict to individuals whose phenotype for this trait is observed.
    has_pheno = ~np.isnan(dset[f"{trait}@indiv"].values)
    dset_cor = dset.isel(indiv=has_pheno)
    # Use Dataset.sizes for the dimension length: using Dataset.dims as a
    # name -> length mapping is deprecated in xarray.
    n_indiv = dset_cor.sizes["indiv"]

    # Indicator columns for study membership (drop_first=True removes the
    # first level), rebuilt per trait because the retained individuals differ.
    study_dummies = pd.get_dummies(dset_cor["study@indiv"], drop_first=True)
    for c in study_dummies:
        dset_cor[f"study_dummy_{c}@indiv"] = ("indiv", study_dummies[c])
    study_dummy_cols = [f"study_dummy_{c}" for c in study_dummies]

    # Phenotypes are quantile-normalized rather than z-scored.
    pheno = quantile_normalize(dset_cor[f"{trait}@indiv"].values)
    # The first element of the return value is unpacked as (est, est_var).
    est, est_var = admix.estimate.admix_gen_cor(
        dset=dset_cor,
        pheno=pheno,
        cov_cols=base_cov_cols + study_dummy_cols + pc_cov_cols,
    )[0]
    print('--------------')
    print(f"{trait} (N={n_indiv})")
    print(est)
    print(est_var)

    dict_result["trait"].append(trait)
    dict_result["n_indiv"].append(n_indiv)
    dict_result["est"].append(est)
    dict_result["est_var"].append(est_var)

    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
  4%|▍         | 1/24 [00:20<07:58, 20.82s/it]

--------------
crp (N=8521)
[0.42140723 0.39212931 0.82550683]
[[ 0.00601432  0.00846136 -0.00176934]
 [ 0.00846136  0.01635791 -0.00238153]
 [-0.00176934 -0.00238153  0.00074112]]


  8%|▊         | 2/24 [00:43<08:03, 21.99s/it]

--------------
total_wbc_cnt (N=8889)
[ 0.64724382 -1.05345162  0.65345385]
[[ 0.03220871  0.03939174 -0.00999912]
 [ 0.03939174  0.15508984 -0.01077568]
 [-0.00999912 -0.01077568  0.00334822]]


 12%|█▎        | 3/24 [00:47<04:51, 13.87s/it]

--------------
mean_corp_hgb_conc (N=3816)
[0.38898219 0.453761   0.79923986]
[[ 0.02307919  0.03400366 -0.00691036]
 [ 0.03400366  0.05704732 -0.00994381]
 [-0.00691036 -0.00994381  0.00252558]]


 17%|█▋        | 4/24 [01:10<05:49, 17.50s/it]

--------------
platelet_cnt (N=8871)
[0.21047784 0.14839214 0.87670605]
[[ 0.0048363   0.0070087  -0.00145156]
 [ 0.0070087   0.01291104 -0.00203896]
 [-0.00145156 -0.00203896  0.00063983]]


 21%|██        | 5/24 [01:43<07:15, 22.94s/it]

--------------
hdl (N=10248)
[0.25728543 0.20740644 0.82340057]
[[ 0.00357949  0.0050979  -0.00106347]
 [ 0.0050979   0.00976471 -0.00145797]
 [-0.00106347 -0.00145797  0.00047856]]


 25%|██▌       | 6/24 [02:11<07:25, 24.77s/it]

--------------
ldl (N=9875)
[0.17649221 0.15675546 0.80875166]
[[ 0.0031697   0.00460667 -0.00094956]
 [ 0.00460667  0.00822378 -0.00133846]
 [-0.00094956 -0.00133846  0.00043804]]


 29%|██▉       | 7/24 [02:42<07:34, 26.76s/it]

--------------
triglycerides (N=10217)
[0.28831102 0.28268692 0.85939772]
[[ 0.00383346  0.00549872 -0.00113085]
 [ 0.00549872  0.01010744 -0.00156662]
 [-0.00113085 -0.00156662  0.00051294]]


 33%|███▎      | 8/24 [03:15<07:36, 28.54s/it]

--------------
total_cholesterol (N=10300)
[0.16878066 0.07105047 0.80719309]
[[ 0.00316884  0.0045221  -0.00095653]
 [ 0.0045221   0.00885488 -0.00131399]
 [-0.00095653 -0.00131399  0.00043554]]


 38%|███▊      | 9/24 [03:28<05:59, 23.99s/it]

--------------
cigs_per_day_excl_nonsmk_updated (N=7171)
[0.1953002  0.21026018 0.86981171]
[[ 0.00638569  0.0094231  -0.00192279]
 [ 0.0094231   0.01607254 -0.00275878]
 [-0.00192279 -0.00275878  0.00082527]]


 42%|████▏     | 10/24 [04:11<06:57, 29.86s/it]

--------------
coffee_cup_day (N=11862)
[0.12520405 0.14712328 0.65859038]
[[ 0.00127476  0.00185596 -0.00037981]
 [ 0.00185596  0.00316793 -0.00053628]
 [-0.00037981 -0.00053628  0.00019614]]


 46%|████▌     | 11/24 [04:13<04:33, 21.06s/it]

--------------
a1c (N=1854)
[0.63306532 1.04932616 0.78363982]
[[ 0.105331    0.15650988 -0.03155093]
 [ 0.15650988  0.25046873 -0.04597796]
 [-0.03155093 -0.04597796  0.01052222]]


 50%|█████     | 12/24 [04:31<04:02, 20.17s/it]

--------------
insulin (N=7927)
[0.30018263 0.38293101 0.88315166]
[[ 0.0059739   0.00876503 -0.0017673 ]
 [ 0.00876503  0.01471117 -0.00252183]
 [-0.0017673  -0.00252183  0.00076709]]


 54%|█████▍    | 13/24 [04:59<04:10, 22.75s/it]

--------------
glucose (N=9937)
[0.10820453 0.09736257 0.86079365]
[[ 0.00313816  0.00462918 -0.00095034]
 [ 0.00462918  0.00793464 -0.00136514]
 [-0.00095034 -0.00136514  0.0004511 ]]


 58%|█████▊    | 14/24 [06:39<07:38, 45.84s/it]

--------------
t2d_status (N=16176)
[0.06996226 0.0684802  0.47584202]
[[ 3.88384027e-04  5.62437804e-04 -1.15753144e-04]
 [ 5.62437804e-04  9.92612187e-04 -1.62792595e-04]
 [-1.15753144e-04 -1.62792595e-04  6.53962151e-05]]


 62%|██████▎   | 15/24 [06:44<05:02, 33.56s/it]

--------------
qt_interval (N=4196)
[0.2525082  0.32103828 0.8986728 ]
[[ 0.02036245  0.03029722 -0.00615727]
 [ 0.03029722  0.05007502 -0.00894707]
 [-0.00615727 -0.00894707  0.00232919]]


 67%|██████▋   | 16/24 [06:48<03:17, 24.75s/it]

--------------
qrs_interval (N=4185)
[0.40813032 0.56560672 0.83883576]
[[ 0.02043545  0.03033205 -0.00609705]
 [ 0.03033205  0.04980681 -0.00883633]
 [-0.00609705 -0.00883633  0.00227562]]


 71%|███████   | 17/24 [06:53<02:11, 18.72s/it]

--------------
pr_interval (N=4178)
[0.4022199  0.44532517 0.84462778]
[[ 0.02143748  0.03156717 -0.00642499]
 [ 0.03156717  0.05374054 -0.00921667]
 [-0.00642499 -0.00921667  0.00238969]]


 75%|███████▌  | 18/24 [07:32<02:29, 25.00s/it]

--------------
systolic_bp (N=11425)
[0.09143118 0.13835599 0.91418628]
[[ 0.00255633  0.00380927 -0.00077074]
 [ 0.00380927  0.00617015 -0.00112386]
 [-0.00077074 -0.00112386  0.00038916]]


 79%|███████▉  | 19/24 [08:12<02:26, 29.36s/it]

--------------
diastolic_bp (N=11424)
[0.04543015 0.06252073 0.96944557]
[[ 0.00274493  0.00409577 -0.00083755]
 [ 0.00409577  0.00668504 -0.001223  ]
 [-0.00083755 -0.001223    0.00042641]]


 83%|████████▎ | 20/24 [10:01<03:33, 53.25s/it]

--------------
hypertension (N=17195)
[0.03414533 0.04625798 0.38825278]
[[ 1.95250339e-04  2.88258019e-04 -5.86183892e-05]
 [ 2.88258019e-04  4.72772110e-04 -8.44635755e-05]
 [-5.86183892e-05 -8.44635755e-05  3.62157150e-05]]


 88%|████████▊ | 21/24 [10:32<02:20, 46.71s/it]

--------------
waist_hip_ratio (N=10307)
[0.11409023 0.10967004 0.77954601]
[[ 0.00238857  0.00349933 -0.00072149]
 [ 0.00349933  0.00601172 -0.0010272 ]
 [-0.00072149 -0.0010272   0.00034869]]


 92%|█████████▏| 22/24 [12:18<02:09, 64.56s/it]

--------------
height (N=17286)
[0.39970117 0.38581886 0.47209472]
[[ 0.00104545  0.00128381 -0.00029493]
 [ 0.00128381  0.0032907  -0.00032907]
 [-0.00029493 -0.00032907  0.00012583]]


 96%|█████████▌| 23/24 [14:07<01:17, 77.82s/it]

--------------
bmi (N=17264)
[0.37357099 0.41306849 0.8248541 ]
[[ 0.00157557  0.00213294 -0.00044475]
 [ 0.00213294  0.00424611 -0.0005712 ]
 [-0.00044475 -0.0005712   0.00022924]]


100%|██████████| 24/24 [14:26<00:00, 36.10s/it]

--------------
egfrckdepi (N=8261)
[0.14721418 0.12131103 0.68707128]
[[ 0.00322823  0.00472596 -0.00097069]
 [ 0.00472596  0.00834396 -0.00138296]
 [-0.00097069 -0.00138296  0.00042424]]





In [138]:
# Assemble the per-trait results table. Each parameter is rendered as
# "estimate (sqrt of the matching diagonal entry of est_var)", both to two
# decimals — presumably estimate and standard error.
param_names = ["sigma_g", "gamma", "sigma_e"]

rls_columns = {key: dict_result[key] for key in ["trait", "n_indiv"]}
for idx, param in enumerate(param_names):
    formatted = []
    for point_est, cov in zip(dict_result["est"], dict_result["est_var"]):
        formatted.append(f"{point_est[idx]:.2f} ({np.sqrt(cov[idx, idx]):.2f})")
    rls_columns[param] = formatted

df_rls = pd.DataFrame(rls_columns)

In [None]:
df_rls

In [142]:
# Write the formatted results table to result/page.csv without the row index.
# NOTE(review): presumably the result/ directory must already exist — confirm.
df_rls.to_csv("result/page.csv", index=False)