In [None]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import xarray as xr
import numpy as np
import xrpgen
import pandas as pd
import dask.array as da
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import admix

# Overview of simulation studies
- p_causal: 0.005, 0.1, 1.0
- var_g: 1.0
- var_e: 1.0
- gamma: 0.5, 0.8, 1.0 (stored in the `rho` column of the parameter table below and passed to the simulation as `gamma`)
- set of SNPs: simulate from imputed SNPs / HM3 SNPs

In [None]:
# CONSTANTS

CHROM = 22
DATA_ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01_dataset/out/aframr"
)

# Simulation grid: the Cartesian product of the values below, one row per
# combination, in the column order (snpset, varg, pcausal, rho).

df_simulate_params = pd.DataFrame(
    list(
        itertools.product(
            ["hm3", "imputed"], [1.0], [0.005, 0.1, 1.0], [0.5, 0.8, 1.0]
        )
    ),
    columns=["snpset", "varg", "pcausal", "rho"],
)

# The output prefix of each row spells out the parameter values of that row.
df_simulate_params["out_prefix"] = df_simulate_params.apply(
    lambda row: "out/pheno/"
    + f"snpset-{row.snpset}-varg-{row.varg}-pcausal-{row.pcausal}-rho-{row.rho}",
    axis=1,
)

# Step 1: Simulate phenotype

In [None]:
# for each SNP, calculate the frequency and LDSCORE

In [5]:
def calc_snp_prior_var(df_snp_info, her_model):
    """
    Calculate the SNP prior variance from SNP information.

    Parameters
    ----------
    df_snp_info : pd.DataFrame
        One row per SNP. Column "FREQ" is read for the "gcta" and "ldak"
        models; column "LDAK_WEIGHT" is additionally read for "ldak".
    her_model : str
        One of "uniform", "gcta", "ldak".

    Returns
    -------
    np.ndarray
        One value per row of `df_snp_info`, with f = FREQ:
        - "uniform": 1
        - "gcta":    [f (1 - f)]^(-1)
        - "ldak":    [f (1 - f)]^(-0.25) * LDAK_WEIGHT

    Raises
    ------
    AssertionError
        If `her_model` is not one of the supported models, or (for "gcta" and
        "ldak") if any frequency is not strictly between 0 and 1.
    """
    assert her_model in ["uniform", "gcta", "ldak"]
    if her_model == "uniform":
        return np.ones(len(df_snp_info))

    freq = df_snp_info["FREQ"].values
    # f * (1 - f) is raised to a negative power below, so a frequency of
    # exactly 0 or 1 would give an infinite prior variance.
    assert np.all(
        (freq > 0) & (freq < 1)
    ), "frequencies should be strictly between zero and one"
    het = freq * (1 - freq)

    if her_model == "gcta":
        return np.float_power(het, -1)
    # her_model == "ldak"
    return np.float_power(het, -0.25) * df_snp_info["LDAK_WEIGHT"].values

In [75]:
def submit_simulate_pheno(
    snp_set: str,
    var_g: float,
    pcausal: float,
    gamma: float,
    out_prefix: str,
    her_model: str = "gcta",
):
    """
    Simulate continuous phenotypes for one parameter setting and write them to disk.

    Input files are located from `pgen_path = DATA_ROOT_DIR/<snp_set>/chr<CHROM>`:
    the phased genotypes (`pgen_path`), the local ancestry (`pgen_path + ".lanc"`)
    and the SNP information table (`pgen_path + ".snp_info"`).

    Outputs:
    - `<out_prefix>.beta.npy`: `sim["beta"]` returned by the simulator
    - `<out_prefix>.pheno.tsv`: `sim["pheno"]`, one column `SIM_<i>` per simulation,
      indexed by the individuals of the genotype file

    Parameters
    ----------
    snp_set: sub-directory of DATA_ROOT_DIR holding the SNP set to simulate from
    var_g: forwarded to `admix_genet_cor.simulate_continuous_pheno`
    pcausal: proportion used to derive `n_causal = int(geno.shape[0] * pcausal)`
    gamma: forwarded to `admix_genet_cor.simulate_continuous_pheno`
    out_prefix: prefix of the output files
    her_model: one of [uniform, gcta, ldak]
    """
    import os

    assert her_model in ["uniform", "gcta", "ldak"]

    # Fixed seed: every parameter setting is simulated with the same seed.
    np.random.seed(1234)

    pgen_path = f"{DATA_ROOT_DIR}/{snp_set}/chr{CHROM}"
    lanc_path = pgen_path + ".lanc"
    print(
        f"pgen_path: {pgen_path}, "
        + f"var_g: {var_g}, pcausal: {pcausal}, gamma: {gamma}, out_prefix: {out_prefix}"
    )

    # Make sure the output location exists before the simulation is run, so the
    # job cannot fail at the very end when the results are written.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    pgen, _, df_indiv = xrpgen.read_pfile(pgen_path, phase=True, snp_chunk=1024)
    geno = pgen.data
    lanc = admix.io.read_lanc(lanc_path, snp_chunk=1024)

    df_snp_info = pd.read_csv(pgen_path + ".snp_info", sep="\t")
    # NOTE(review): snp_prior_var is computed but not passed to the simulator
    # below, so `her_model` currently has no effect on the simulated phenotype.
    # TODO confirm how simulate_continuous_pheno expects per-SNP prior variances.
    snp_prior_var = calc_snp_prior_var(df_snp_info, her_model)

    n_causal = int(geno.shape[0] * pcausal)
    sim = admix_genet_cor.simulate_continuous_pheno(
        geno=geno,
        lanc=lanc,
        var_g=var_g,
        gamma=gamma,
        var_e=1.0,
        n_causal=n_causal,
        n_sim=30,
    )
    print(f"n_causal={n_causal}")

    np.save(out_prefix + ".beta.npy", sim["beta"])
    df_pheno = pd.DataFrame(
        sim["pheno"],
        index=df_indiv.index,
        columns=[f"SIM_{i}" for i in range(sim["pheno"].shape[1])],
    )
    df_pheno.to_csv(out_prefix + ".pheno.tsv", index=True, sep="\t")

In [76]:
# Submit the phenotype simulations as an array job. The positional arguments of
# submit_simulate_pheno are taken from the columns of df_simulate_params, in
# signature order (the `rho` column is passed as `gamma`).
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(
    time_min=120,
    memory_g=40,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

jobs = executor.map_array(
    submit_simulate_pheno,
    *(
        df_simulate_params[col]
        for col in ["snpset", "varg", "pcausal", "rho", "out_prefix"]
    ),
)

# Step 2: Compute GRM

In [78]:
def submit_compute_grm(snp_set: str, out_prefix: str):
    """
    Compute the GRM matrices for one SNP set and save them as .npy files.

    Genotypes are read from `DATA_ROOT_DIR/<snp_set>/chr<CHROM>` and the local
    ancestry from the same path with the `.lanc` suffix. The three matrices
    returned by `admix_genet_cor.compute_grm(geno, lanc, center=True)` are
    written to `<out_prefix>.K1.npy`, `<out_prefix>.K2.npy` and
    `<out_prefix>.K12.npy`.

    snp_set: sub-directory of DATA_ROOT_DIR holding the SNP set
    out_prefix: prefix of the output files
    """
    import os

    pgen_path = f"{DATA_ROOT_DIR}/{snp_set}/chr{CHROM}"
    lanc_path = pgen_path + ".lanc"

    print(f"pgen_path: {pgen_path}, out_prefix: {out_prefix}")

    # Make sure the output location exists before the expensive computation, so
    # the job cannot fail at the very end when the matrices are written.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Only the genotype data is needed here; the SNP / individual tables are not.
    pgen, _, _ = xrpgen.read_pfile(pgen_path, phase=True, snp_chunk=1024)
    geno = pgen.data
    lanc = admix.io.read_lanc(lanc_path, snp_chunk=1024)

    K1, K2, K12 = admix_genet_cor.compute_grm(geno, lanc, center=True)

    for suffix, matrix in ((".K1.npy", K1), (".K2.npy", K2), (".K12.npy", K12)):
        np.save(out_prefix + suffix, matrix)


# Submit the GRM computation as an array job, one (snp_set, out_prefix) pair
# per SNP set.
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(
    time_min=180,
    memory_g=40,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# zip(*pairs) turns the (snp_set, out_prefix) pairs into the two parallel
# argument sequences expected by map_array.
jobs = executor.map_array(
    submit_compute_grm,
    *zip(("imputed", "out/grm/imputed"), ("hm3", "out/grm/hm3")),
)

# Step 3: Estimation

In [98]:
# Pair every simulation setting with every SNP set used for estimation.
df_estimate_params = pd.merge(
    pd.DataFrame({"estimate_snp_set": ["hm3", "imputed"]}),
    df_simulate_params,
    how="cross",
)

# Estimation output prefix: the phenotype prefix with "pheno" replaced by
# "estimate", followed by "." and the SNP set used for estimation.
df_estimate_params["estimate_out_prefix"] = df_estimate_params.apply(
    lambda row: ".".join(
        [row.out_prefix.replace("pheno", "estimate"), row.estimate_snp_set]
    ),
    axis=1,
)

In [100]:
def submit_estimate(snp_set: str, pheno: str, out_prefix: str):
    """
    Run `admix_genet_cor.estimate_genetic_cor` on one phenotype file and save
    the results as JSON.

    The GRM matrices are read from `out/grm/<snp_set>.{K1,K2,K12}.npy` and
    combined into `A1 = K1 + K2` and `A2 = K12 + K12.T`. The result is written
    to `<out_prefix>.estimate.json` as a list with one
    `{"estimate": ..., "varcov": ...}` entry per element returned by the
    estimator.

    snp_set: set of SNPs used as GRM, for estimation
    pheno: phenotype file (whitespace-delimited, first column is the index)
    out_prefix: prefix to the output
    """
    import os

    print(f"submit_estimate: snp_set={snp_set}, pheno={pheno}, out_prefix={out_prefix}")

    # Step 1: load GRM
    K1 = np.load(f"out/grm/{snp_set}.K1.npy")
    K2 = np.load(f"out/grm/{snp_set}.K2.npy")
    K12 = np.load(f"out/grm/{snp_set}.K12.npy")

    A1 = K1 + K2
    A2 = K12 + K12.T

    # K1, K2, K12 are no longer used, release memory for them
    del K1, K2, K12

    # Step 2: load phenotype
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True.
    # NOTE(review): the phenotype rows are assumed to be in the same order as
    # the individuals of the GRM; nothing here checks it -- TODO confirm.
    df_pheno = pd.read_csv(pheno, sep=r"\s+", index_col=0)
    print(df_pheno.head())

    # Step 3: estimate
    rls_list = admix_genet_cor.estimate_genetic_cor(A1, A2, df_pheno.values)

    data = [
        {"estimate": rls[0].tolist(), "varcov": rls[1].tolist()} for rls in rls_list
    ]

    # Step 4: write results, creating the output directory if needed
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_prefix + ".estimate.json", "w") as out:
        json.dump(data, out)

In [101]:
# Submit the estimation as an array job. The arguments are taken from
# df_estimate_params; the phenotype file of each row is its simulation
# out_prefix followed by ".pheno.tsv".
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(
    time_min=120,
    memory_g=50,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

jobs = executor.map_array(
    submit_estimate,
    df_estimate_params["estimate_snp_set"],
    df_estimate_params["out_prefix"] + ".pheno.tsv",
    df_estimate_params["estimate_out_prefix"],
)