# PAGE hm3 analysis

### Step 2: Calculate GRM
Calculate the GRM matrices.

### Step 3: Estimate parameters
```bash
# Submit one estimate_HE.sh job for every combination of sample size,
# ancestry proportion and `cor` value.
for n_indiv in 5000 10000 20000; do
    for anc_prop in 0.2 0.5; do
        # Dataset prefix passed to the job as its first argument.
        dset_prefix="n_indiv_${n_indiv}_anc_prop_${anc_prop}"
        for cor in 0.5 0.8 1.0; do
            # Quote the expansions so the arguments are never word-split or globbed.
            qsub estimate_HE.sh "${dset_prefix}" "${cor}"
        done
    done
done
```

In [1]:
# Enable autoreload (mode 2) so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the lab_black extension (presumably auto-formats cells with black — confirm).
%load_ext lab_black

In [2]:
import xarray as xr
import numpy as np
import admix
import matplotlib.pyplot as plt
import dask
import dask.array as da
from tqdm import tqdm
import admix_genet_cor
import pandas as pd

In [3]:
# Load the PAGE HM3 dataset through the project helper. The progress bar below
# reports 22 iterations (presumably one per autosome — confirm against the helper).
dset = admix_genet_cor.load_page_hm3()

100%|██████████| 22/22 [00:04<00:00,  4.84it/s]


In [5]:
# a = np.load("out/OLD_admix_grm/K1.all.npy")[0:100, :] + np.load("out/OLD_admix_grm/K2.all.npy")[0:100, :]
# b = np.load("out/admix-grm/hm3.uniform.all.A1.npy")[0: 100, :]
# c = np.load("out/admix-grm/imputed.mafukb.all.A1.npy")[0:100, :]

In [6]:
# K1 = np.load("out/OLD_admix_grm/K1.all.npy")
# K2 = np.load("out/OLD_admix_grm/K2.all.npy")
# K12 = np.load("out/OLD_admix_grm/K12.all.npy")
# dset["A1"] = (("indiv", "indiv"), np.load("out/admix-grm/imputed.mafukb.all.A1.npy"))
# dset["A2"] = (("indiv", "indiv"), np.load("out/admix-grm/imputed.mafukb.all.A2.npy"))
# del K1, K2, K12

# Attach the two precomputed matrices stored under out/admix-grm/ to the
# dataset as (indiv x indiv) variables named "A1" and "A2".
grm_files = {
    "A1": "out/admix-grm/hm3.mafukb.all.A1.npy",
    "A2": "out/admix-grm/hm3.mafukb.all.A2.npy",
}
for grm_name, grm_path in grm_files.items():
    dset[grm_name] = (("indiv", "indiv"), np.load(grm_path))

In [7]:
# Phenotypes to analyse, grouped by category. `trait_list` is the flattened
# list, in exactly the order the groups and traits are written here.
trait_groups = {
    "inflammatory": [
        "crp",
        "total_wbc_cnt",
        "mean_corp_hgb_conc",
        "platelet_cnt",
    ],
    "lipid": [
        "hdl",
        "ldl",
        "triglycerides",
        "total_cholesterol",
    ],
    "lifestyle": [
        "cigs_per_day_excl_nonsmk_updated",
        "coffee_cup_day",
    ],
    "glycemic": [
        "a1c",
        "insulin",
        "glucose",
        "t2d_status",
    ],
    "electrocardiogram": [
        "qt_interval",
        "qrs_interval",
        "pr_interval",
    ],
    "blood_pressure": [
        "systolic_bp",
        "diastolic_bp",
        "hypertension",
    ],
    "anthropometric": [
        "waist_hip_ratio",
        "height",
        "bmi",
    ],
    "kidney": [
        "egfrckdepi",
    ],
}
trait_list = [trait for group in trait_groups.values() for trait in group]

In [None]:
# For every trait: subset to individuals with an observed phenotype, build a
# standardised covariate matrix, run `admix_genet_cor.estimate_genetic_cor`, and
# collect the estimates and their variance-covariance matrix in `dict_result`.
dict_result = {"trait": [], "n_indiv": [], "est": [], "est_var": []}

for trait in tqdm(trait_list):
    # Keep only the individuals whose value for this trait is not NaN.
    dset_cor = dset.isel(indiv=~np.isnan(dset[trait].values))
    # `Dataset.sizes` is the supported name -> length mapping; indexing
    # `Dataset.dims` by name is deprecated in recent xarray releases.
    n_indiv = dset_cor.sizes["indiv"]

    # One-hot encode the study label. The first level is dropped so the dummy
    # columns are not collinear with the intercept column added below.
    study_dummies = pd.get_dummies(dset_cor["study"], drop_first=True)
    for c in study_dummies:
        dset_cor[f"study_dummy_{c}"] = ("indiv", study_dummies[c])
    study_dummy_cols = [f"study_dummy_{c}" for c in study_dummies]

    pheno = admix.data.quantile_normalize(dset_cor[trait].values)

    cov_cols = ["age", "sex"] + study_dummy_cols + [f"geno_EV{i}" for i in range(1, 11)]
    # Column 0 is the intercept; the remaining columns follow `cov_cols`.
    cov = np.c_[
        np.ones((n_indiv, 1)),
        np.vstack([dset_cor[col].data for col in cov_cols]).T,
    ]

    # Standardise every non-intercept column to mean 0 / sd 1. A column that is
    # constant in this subset has sd 0: dividing by it would fill the column
    # with NaN, and it is collinear with the intercept anyway, so drop it.
    normalized_cov = np.array(cov)
    keep_col = np.ones(cov.shape[1], dtype=bool)
    for j in range(1, cov.shape[1]):
        col_std = normalized_cov[:, j].std()
        if col_std == 0:
            keep_col[j] = False
            print(f"{trait}: dropping constant covariate '{cov_cols[j - 1]}'")
            continue
        normalized_cov[:, j] = (normalized_cov[:, j] - normalized_cov[:, j].mean()) / col_std
    normalized_cov = normalized_cov[:, keep_col]

    # Only the first element of the returned sequence is used; it unpacks into
    # the point estimates and their variance-covariance matrix.
    est, est_var = admix_genet_cor.estimate_genetic_cor(
        A1=dset_cor["A1"].data,
        A2=dset_cor["A2"].data,
        pheno=pheno,
        cov=normalized_cov,
        compute_varcov=True,
    )[0]

    print('--------------')
    print(f"{trait} (N={n_indiv})")
    print(est)
    print(est_var)

    dict_result["trait"].append(trait)
    dict_result["n_indiv"].append(n_indiv)
    dict_result["est"].append(est)
    dict_result["est_var"].append(est_var)

    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:58<00:00, 58.72s/it][A
  4%|▍         | 1/24 [02:50<1:05:11, 170.08s/it]

--------------
crp (N=8521)
[0.88029924 0.83271288 0.69111675]
[[ 0.02788207  0.03028527 -0.00789622]
 [ 0.03028527  0.03522145 -0.0084219 ]
 [-0.00789622 -0.0084219   0.00246023]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [01:06<00:00, 66.62s/it][A
  8%|▊         | 2/24 [06:01<1:06:55, 182.54s/it]

--------------
total_wbc_cnt (N=8889)
[1.74641433 0.68560884 0.30634003]
[[ 0.13933774  0.13696613 -0.04127721]
 [ 0.13696613  0.19858521 -0.03718428]
 [-0.04127721 -0.03718428  0.01261213]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:05<00:00,  5.97s/it][A
 12%|█▎        | 3/24 [06:18<37:31, 107.24s/it]  

--------------
mean_corp_hgb_conc (N=3816)
[0.77494268 0.77926016 0.68818139]
[[ 0.09375044  0.10409616 -0.02691856]
 [ 0.10409616  0.11975832 -0.02956756]
 [-0.02691856 -0.02956756  0.00819808]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [01:07<00:00, 67.14s/it][A
 17%|█▋        | 4/24 [09:33<47:13, 141.65s/it]

--------------
platelet_cnt (N=8871)
[0.49392196 0.43896752 0.79236156]
[[ 0.02149385  0.0235601  -0.00616198]
 [ 0.0235601   0.02749632 -0.00664832]
 [-0.00616198 -0.00664832  0.00197389]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [01:42<00:00, 102.18s/it][A
 21%|██        | 5/24 [14:29<1:02:26, 197.21s/it]

--------------
hdl (N=10248)
[0.51158664 0.45558797 0.74830912]
[[ 0.01603851  0.01747448 -0.00458396]
 [ 0.01747448  0.02045756 -0.00490504]
 [-0.00458396 -0.00490504  0.00147529]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [01:31<00:00, 91.27s/it][A
 25%|██▌       | 6/24 [18:53<1:05:58, 219.89s/it]

--------------
ldl (N=9875)
[0.37571103 0.35042785 0.75026797]
[[ 0.01413599  0.01556989 -0.00405984]
 [ 0.01556989  0.01797671 -0.00441476]
 [-0.00405984 -0.00441476  0.00132129]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [01:43<00:00, 103.49s/it][A
 29%|██▉       | 7/24 [23:48<1:09:19, 244.70s/it]

--------------
triglycerides (N=10217)
[0.58297196 0.5524159  0.77315998]
[[ 0.01732809  0.01894667 -0.00492521]
 [ 0.01894667  0.02197755 -0.00530185]
 [-0.00492521 -0.00530185  0.00158128]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [01:47<00:00, 107.30s/it][A
 33%|███▎      | 8/24 [28:55<1:10:32, 264.53s/it]

--------------
total_cholesterol (N=10300)
[0.33505173 0.25069395 0.75631938]
[[ 0.01415283  0.01541836 -0.00409553]
 [ 0.01541836  0.01823924 -0.00437504]
 [-0.00409553 -0.00437504  0.00133486]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:35<00:00, 35.22s/it][A
 38%|███▊      | 9/24 [30:39<53:33, 214.24s/it]  

--------------
cigs_per_day_excl_nonsmk_updated (N=7171)
[0.35779495 0.35253069 0.82336708]
[[ 0.02878913  0.03198806 -0.00834649]
 [ 0.03198806  0.03651117 -0.00919309]
 [-0.00834649 -0.00919309  0.00266916]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [02:38<00:00, 158.83s/it][A
 42%|████▏     | 10/24 [38:10<1:07:02, 287.35s/it]

--------------
coffee_cup_day (N=11862)
[0.27855091 0.28503287 0.61405621]
[[ 0.00618125  0.00684091 -0.00176774]
 [ 0.00684091  0.00779935 -0.00193777]
 [-0.00176774 -0.00193777  0.00058896]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  1.23it/s][A
 46%|████▌     | 11/24 [38:13<43:22, 200.19s/it]  

--------------
a1c (N=1854)
[1.08367148 1.21599014 0.65604049]
[[ 0.35669547  0.39905094 -0.10239748]
 [ 0.39905094  0.46057478 -0.11328944]
 [-0.10239748 -0.11328944  0.03052875]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:48<00:00, 48.43s/it][A
 50%|█████     | 12/24 [40:31<36:16, 181.38s/it]

--------------
insulin (N=7927)
[0.59421066 0.61952427 0.79864545]
[[ 0.02678899  0.02968644 -0.00764744]
 [ 0.02968644  0.03379579 -0.00839804]
 [-0.00764744 -0.00839804  0.00242914]]



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [01:32<00:00, 92.73s/it][A
 54%|█████▍    | 13/24 [44:59<38:05, 207.78s/it]

--------------
glucose (N=9937)
[0.21655688 0.19201887 0.8281267 ]
[[ 0.01390763  0.01541751 -0.00404126]
 [ 0.01541751  0.0177312  -0.00443304]
 [-0.00404126 -0.00443304  0.0013393 ]]



  0%|          | 0/1 [00:00<?, ?it/s][A

In [None]:
# Build the per-trait summary table. Each parameter is rendered as
# "estimate (standard error)", where the standard error is the square root of
# the matching diagonal entry of that trait's variance-covariance matrix.
param_names = ("sigma_g", "gamma", "sigma_e")
summary_columns = {key: dict_result[key] for key in ("trait", "n_indiv")}
for idx, param in enumerate(param_names):
    summary_columns[param] = [
        f"{est_vec[idx]:.2f} ({np.sqrt(var_mat[idx, idx]):.2f})"
        for est_vec, var_mat in zip(dict_result["est"], dict_result["est_var"])
    ]
df_rls = pd.DataFrame(summary_columns)

In [None]:
# Display the summary table (one row per trait that has been processed).
df_rls

In [None]:
# Write the summary table to result/page.csv without the row index.
# NOTE(review): assumes the result/ directory already exists — confirm.
df_rls.to_csv("result/page.csv", index=False)