In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import xarray as xr
import numpy as np
import xrpgen
import pandas as pd
import dask.array as da
import itertools
import submitit
import sys

sys.path.append("/u/project/pasaniuc/kangchen/2021-admix-corr")
import admix_genet_cor

In [2]:
# Chromosome number; interpolated into the `chr{CHROM}.pgen` path that each
# submit_* function in this notebook builds.
CHROM = 22

In [3]:
# Full grid of simulation settings: (snpset, varg, pcausal, gamma).
param_grid = list(itertools.product(["hm3", "all"], [1.0], [1.0], [0.5, 0.8, 1.0]))

df_params = pd.DataFrame(
    param_grid, columns=["snpset", "varg", "pcausal", "gamma"]
)

# Output prefix for each setting, encoding every parameter value in the name.
df_params["out_prefix"] = [
    f"out/pheno/snpset-{snpset}-varg-{varg}-pcausal-{pcausal}-gamma-{gamma}"
    for snpset, varg, pcausal, gamma in param_grid
]

# Keep only the first two parameter combinations.
df_params = df_params.iloc[0:2]

In [4]:
def submit_simulate_pheno(
    snp_set: str, var_g: float, pcausal: float, gamma: float, out_prefix: str
):
    """Simulate continuous phenotypes for one parameter setting and write them out.

    Reads the pgen file for `snp_set` on chromosome `CHROM`, attaches the local
    ancestry array stored next to it (same path with a `.lanc` suffix), runs
    `admix_genet_cor.simulate_continuous_pheno` with 30 simulations and
    `var_e=1.0`, then writes two files:

    - `<out_prefix>.beta.npy`: the simulated effects (`sim["beta"]`)
    - `<out_prefix>.pheno.tsv`: the simulated phenotypes (`sim["pheno"]`),
      tab-separated, index included

    Parameters
    ----------
    snp_set : name of the SNP-set sub-directory holding the pgen files
    var_g : passed through as `var_g` to the simulator
    pcausal : fraction of SNPs used as causal; converted to a count
    gamma : passed through as `gamma` to the simulator
    out_prefix : path prefix for the two output files
    """
    # Set up the project import path inside the function body; presumably needed
    # because the function is executed in a separate job process via submitit.
    import sys

    sys.path.append("/u/project/pasaniuc/kangchen/2021-admix-corr")
    import admix_genet_cor

    pfile = f"/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/s03_aframr/pgen/{snp_set}/chr{CHROM}.pgen"
    lanc_path = pfile.replace(".pgen", ".lanc")

    message = f"pgen_path: {pfile}, var_g: {var_g}, pcausal: {pcausal}, gamma: {gamma}, out_prefix: {out_prefix}"
    print(message)

    # Genotypes plus the local-ancestry array, which shares the genotype dims.
    dset = xrpgen.read_pfile(pfile, phase=True)
    dset["lanc"] = (dset.geno.dims, da.from_zarr(lanc_path))
    dset.attrs["n_anc"] = 2

    # Number of causal SNPs = (total SNP count) * pcausal, truncated to an int.
    n_snp = dset.dims["snp"]
    n_causal = int(n_snp * pcausal)

    sim = admix_genet_cor.simulate_continuous_pheno(
        dset, var_g=var_g, gamma=gamma, var_e=1.0, n_causal=n_causal, n_sim=30
    )

    beta_file = out_prefix + ".beta.npy"
    pheno_file = out_prefix + ".pheno.tsv"
    np.save(beta_file, sim["beta"])
    sim["pheno"].to_csv(pheno_file, index=True, sep="\t")


def submit_compute_grm(snp_set: str, out_prefix: str):
    """Compute the GRM matrices for one SNP set and save them as `.npy` files.

    Reads the pgen file for `snp_set` on chromosome `CHROM`, attaches the local
    ancestry array stored next to it (same path with a `.lanc` suffix), calls
    `admix_genet_cor.compute_grm(dset, center=True)` and writes the three
    returned matrices to `<out_prefix>.K1.npy`, `<out_prefix>.K2.npy` and
    `<out_prefix>.K12.npy`.

    Parameters
    ----------
    snp_set : name of the SNP-set sub-directory holding the pgen files
    out_prefix : path prefix for the three output files
    """
    # Set up the project import path inside the function body; presumably needed
    # because the function is executed in a separate job process via submitit.
    import sys

    sys.path.append("/u/project/pasaniuc/kangchen/2021-admix-corr")
    import admix_genet_cor

    pgen_path = f"/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/s03_aframr/pgen/{snp_set}/chr{CHROM}.pgen"

    print(f"pgen_path: {pgen_path}, out_prefix: {out_prefix}")
    # Use the local `pgen_path`; the previous code referenced an undefined
    # `PGEN_PATH`, which raises NameError on a fresh kernel / in a submitted job.
    dset = xrpgen.read_pfile(pgen_path, phase=True)
    dset["lanc"] = (dset.geno.dims, da.from_zarr(pgen_path.replace(".pgen", ".lanc")))
    dset.attrs["n_anc"] = 2

    K1, K2, K12 = admix_genet_cor.compute_grm(dset, center=True)

    np.save(out_prefix + ".K1.npy", K1)
    np.save(out_prefix + ".K2.npy", K2)
    np.save(out_prefix + ".K12.npy", K12)


def submit_estimate(snp_set: str, pheno: str, out_prefix: str):
    """Run `admix_genet_cor.estimate_genetic_cor` for one SNP set and phenotype file.

    Loads the pgen file and local ancestry for `snp_set` on chromosome `CHROM`,
    attaches two (indiv x indiv) matrices built from the saved GRMs under
    `out/grm/<snp_set>.*.npy` (`A1 = K1 + K2`, `A2 = K12 + K12.T`), reads the
    phenotype table and prints the estimation result.

    Parameters
    ----------
    snp_set : name of the SNP set; selects both the pgen directory and the GRM files
    pheno : path to a whitespace-delimited phenotype table whose first column
        is used as the index
    out_prefix : currently only echoed in the log line; results are printed,
        not written to disk
    """
    # Set up the project import path inside the function body so the function
    # also works when it is executed in a separate process.
    import sys

    sys.path.append("/u/project/pasaniuc/kangchen/2021-admix-corr")
    import admix_genet_cor

    print(f"submit_estimate: snp_set={snp_set}, pheno={pheno}, out_prefix={out_prefix}")

    pgen_path = f"/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/s03_aframr/pgen/{snp_set}/chr{CHROM}.pgen"

    dset = xrpgen.read_pfile(pgen_path, phase=True)
    dset["lanc"] = (dset.geno.dims, da.from_zarr(pgen_path.replace(".pgen", ".lanc")))
    dset.attrs["n_anc"] = 2

    # Step 1: load GRM
    # Each matrix is large, so load only what the next sum needs and release the
    # inputs immediately; at most three full matrices are alive at any time
    # instead of five.
    K1 = np.load(f"out/grm/{snp_set}.K1.npy")
    K2 = np.load(f"out/grm/{snp_set}.K2.npy")
    dset["A1"] = (("indiv", "indiv"), K1 + K2)
    del K1, K2

    K12 = np.load(f"out/grm/{snp_set}.K12.npy")
    dset["A2"] = (("indiv", "indiv"), K12 + K12.T)
    del K12

    # Step 2: load phenotype and estimate
    # `sep=r"\s+"` is the documented replacement for the deprecated
    # `delim_whitespace=True`. A separate name keeps the `pheno` path intact.
    df_pheno = pd.read_csv(pheno, sep=r"\s+", index_col=0)
    print(df_pheno.head())

    rls_list = admix_genet_cor.estimate_genetic_cor(dset, df_pheno.values)
    print(rls_list)

# Simulate phenotype

In [6]:
# Show the parameter table whose columns are passed to the phenotype-simulation jobs.
df_params

Unnamed: 0,snpset,varg,pcausal,gamma,out_prefix
0,hm3,1.0,1.0,0.5,out/pheno/snpset-hm3-varg-1.0-pcausal-1.0-gamm...
1,hm3,1.0,1.0,0.8,out/pheno/snpset-hm3-varg-1.0-pcausal-1.0-gamm...


In [26]:
# Shell lines run before each job: put the project's miniconda first on PATH
# and set PYTHONNOUSERSITE.
job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]

executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(time_min=60 * 3, memory_g=24, setup=job_setup)

# One array task per row of df_params; the columns are passed positionally in
# the order of submit_simulate_pheno's parameters.
pheno_arg_columns = ["snpset", "varg", "pcausal", "gamma", "out_prefix"]
jobs = executor.map_array(
    submit_simulate_pheno,
    *(df_params[column] for column in pheno_arg_columns),
)

In [None]:
# Resource request and shell setup applied to the GRM jobs.
grm_job_parameters = dict(
    time_min=60 * 3,
    memory_g=24,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)
executor.update_parameters(**grm_job_parameters)

# One array task per SNP set, written as (snp_set, out_prefix) pairs.
grm_tasks = [("all", "out/grm/all"), ("hm3", "out/grm/hm3")]
grm_snp_sets, grm_out_prefixes = zip(*grm_tasks)

jobs = executor.map_array(submit_compute_grm, grm_snp_sets, grm_out_prefixes)

In [5]:
# Run the estimation in the notebook kernel for the hm3 / gamma=0.5 phenotypes.
submit_estimate(
    snp_set="hm3",
    pheno="out/pheno/snpset-hm3-varg-1.0-pcausal-1.0-gamma-0.5.pheno.tsv",
    out_prefix="out/estimate",
)

submit_estimate: snp_set=hm3, pheno=out/pheno/snpset-hm3-varg-1.0-pcausal-1.0-gamma-0.5.pheno.tsv, out_prefix=out/estimate
             SIM_0     SIM_1     SIM_2     SIM_3     SIM_4     SIM_5  \
EC012047  0.604813  1.321706 -1.431661  0.569237 -0.263331 -0.809179   
EC150635 -2.882983 -0.483648 -0.063976  0.560419  0.020538  0.236429   
728314   -1.641886  1.264695 -0.553209  2.116093  0.573362 -0.744871   
709411    0.344567 -0.291240 -0.669629 -0.835595 -2.089428 -0.128010   
791264   -1.549024  0.433780 -1.469458 -0.038749  0.110079 -0.074696   

             SIM_6     SIM_7     SIM_8     SIM_9  ...    SIM_20    SIM_21  \
EC012047 -1.117742  1.219374  0.152453  2.158055  ... -0.132645  1.295733   
EC150635  1.646154 -0.193728 -0.513230  0.780463  ...  0.118712 -0.218369   
728314    0.510908 -0.202473  0.634590 -1.728898  ...  0.244214 -0.919846   
709411   -0.281925 -0.750734  0.202396  1.361253  ...  2.621240 -0.265719   
791264   -0.969956 -1.220861 -0.553301 -0.348093  ...  0.28

MemoryError: Unable to allocate 2.23 GiB for an array with shape (17299, 17299) and data type float64

# False positive and power simulation

# Imputation simulation