In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import dapgen
import pandas as pd
import dask.array as da
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
from admix_genet_cor import calc_snp_prior_var
import pandas as pd
import admix
from os.path import join
import os

# Overview of simulation studies
- p_causal: 0.001, 1.0
- var_g: 1.0
- var_e: 1.0
- gamma: 0.5, 0.8, 1.0
- set of SNPs: simulate from the imputed SNPs; the non-zero effects are simulated from SNPs with allele frequencies > 0.005 in both populations.

From S-LDXR paper: 
```
In these simulations, we randomly selected 10% of the SNPs to be causal in each population, with 80% of causal variants in each population shared with the other population, and sampled perfectly correlated causal effect sizes for shared causal variants using Eq. In these simulations, we set the variance of causal effect size of each SNP j in both populations to be proportional to $[p_{j,max}(1−p_{j,max})]^\alpha$, where $p_{j,max}$ is the maximum MAF of SNP j in the two populations. We set $\alpha$ to −0.38, as previously estimated for 25 UK Biobank diseases and complex traits in ref.
```

Here, we simulate from the imputed SNPs: the non-zero effects are simulated from SNPs with allele frequencies > 0.005 in both populations. From these SNPs, we sample 1% to be causal.

In [2]:
# CONSTANTS

# chromosome whose imputed data is used by every step of this notebook
CHROM = 1
DATA_ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01_dataset/out/aframr"
)
GRM_DIR = "/u/project/pasaniuc/kangchen/2021-admix-corr/experiments/03_page_genome_wide/out/admix-grm"


# define the simulation parameters: one row per combination of the grids below

df_simulate_params = pd.DataFrame(
    list(
        itertools.product(
            [0.1, 0.5],  # hsq
            [0.001, 0.01, 1.0],  # pcausal
            [0.6, 0.8, 1.0],  # cor
            ["mafukb"],  # hermodel
        )
    ),
    columns=["hsq", "pcausal", "cor", "hermodel"],
)
# output prefix encodes the full parameter setting of the row
df_simulate_params["out_prefix"] = [
    f"out/pheno/hsq-{hsq}-pcausal-{pcausal}-cor-{cor}-hermodel-{hermodel}"
    for hsq, pcausal, cor, hermodel in df_simulate_params.itertuples(
        index=False, name=None
    )
]

# Step 1: Simulate phenotype

In [4]:
def submit_simulate_pheno(
    hsq: float,
    pcausal: float,
    cor: float,
    hermodel: str,
    out_prefix: str,
):
    """
    Simulate N_SIM sets of ancestry-specific effects and phenotypes on chromosome
    CHROM and write them to disk.

    hsq: passed as `hsq` to admix_genet_cor.simulate_quant_pheno
    pcausal: fraction of the eligible SNPs (EUR_FREQ > 0.005 and AFR_FREQ > 0.005)
        drawn as causal in each simulation
    cor: off-diagonal entry of the 2x2 covariance used to draw the effects of the
        two ancestries at each causal SNP
    hermodel: model name forwarded to calc_snp_prior_var (this notebook uses
        "mafukb")
    out_prefix: output prefix; writes `<out_prefix>.beta.npy` and
        `<out_prefix>.pheno.tsv`
    """
    N_SIM = 100
    N_ANC = 2

    # fixed seed, identical for every call
    np.random.seed(1234)

    # np.save / to_csv do not create missing directories, so make sure it exists
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    pfile = f"{DATA_ROOT_DIR}/imputed/chr{CHROM}"
    print(
        f"hsq: {hsq}, pcausal: {pcausal}, cor: {cor}, hermodel: {hermodel}, out_prefix: {out_prefix}"
    )
    geno, df_snp, df_indiv = dapgen.read_pfile(pfile, phase=True, snp_chunk=1024)
    lanc = admix.io.read_lanc(pfile + ".lanc", snp_chunk=1024)
    df_snp_info = pd.read_csv(pfile + ".snp_info", sep="\t")
    snp_prior_var = calc_snp_prior_var(df_snp_info, hermodel)

    # rows of .snp_info must line up with the genotype SNPs, because causal SNPs
    # are chosen by row position below
    assert np.all(df_snp_info.SNP == df_snp.index.values)

    # simulate effects
    snp_subset = np.where(
        (df_snp_info.EUR_FREQ > 0.005) & (df_snp_info.AFR_FREQ > 0.005)
    )[0]
    beta = np.zeros((df_snp_info.shape[0], N_ANC, N_SIM))  # (n_snp, n_anc, n_sim)
    n_causal = int(len(snp_subset) * pcausal)
    for i_sim in range(N_SIM):
        cau = sorted(np.random.choice(snp_subset, size=n_causal, replace=False))

        # unit-variance effects for the two ancestries with correlation `cor`
        i_beta = np.random.multivariate_normal(
            mean=[0.0, 0.0],
            cov=np.array([[1, cor], [cor, 1]]),
            size=n_causal,
        )

        # scale each causal SNP's draw by the square root of its prior variance
        i_beta = i_beta * np.sqrt(snp_prior_var[cau])[:, None]

        # fill both ancestry columns of this simulation at once; the indexed
        # target has shape (n_causal, N_ANC), matching i_beta
        beta[cau, :, i_sim] = i_beta

    sim = admix_genet_cor.simulate_quant_pheno(
        geno=geno, lanc=lanc, hsq=hsq, beta=beta, n_sim=N_SIM
    )
    np.save(out_prefix + ".beta.npy", sim["beta"])
    df_pheno = pd.DataFrame(
        sim["pheno"],
        index=df_indiv.index,
        columns=[f"SIM_{i}" for i in range(sim["pheno"].shape[1])],
    )
    df_pheno.to_csv(out_prefix + ".pheno.tsv", index=True, sep="\t")

In [5]:
# executor for the phenotype-simulation jobs; "./submitit-logs" is its job folder
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    queue="highp",
    time_min=300,
    memory_g=40,
    # shell lines handed to submitit as the job `setup`
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# one array task per row of df_simulate_params; columns are passed in the order
# of submit_simulate_pheno's positional parameters
jobs = executor.map_array(
    submit_simulate_pheno,
    *(
        df_simulate_params[col]
        for col in ["hsq", "pcausal", "cor", "hermodel", "out_prefix"]
    ),
)

# Step 2: Compute GRM

In [7]:
def submit_compute_grm(hermodel: str, out_prefix: str):
    """
    Compute the three matrices returned by admix_genet_cor.compute_grm on
    chromosome CHROM, restricted to SNPs with EUR_FREQ > 0.005 and
    AFR_FREQ > 0.005, and save them to disk.

    hermodel: model name forwarded to calc_snp_prior_var
    out_prefix: output prefix; writes `<out_prefix>.K1.npy`,
        `<out_prefix>.K2.npy` and `<out_prefix>.K12.npy`
    """

    # np.save does not create missing directories, so make sure it exists
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    pfile = f"{DATA_ROOT_DIR}/imputed/chr{CHROM}"

    geno, df_snp, _ = dapgen.read_pfile(pfile, phase=True, snp_chunk=512)
    lanc = admix.io.read_lanc(pfile + ".lanc", snp_chunk=512)

    df_snp_info = pd.read_csv(pfile + ".snp_info", sep="\t")
    # same alignment check as submit_simulate_pheno: SNPs are subset by row
    # position below, so .snp_info must be in the same order as the genotypes
    assert np.all(
        df_snp_info.SNP == df_snp.index.values
    ), "SNP order in .snp_info does not match the genotype file"

    snp_prior_var = calc_snp_prior_var(df_snp_info, hermodel)
    print("snp_prior_var:")
    print(snp_prior_var)

    snp_subset = np.where(
        (df_snp_info.EUR_FREQ > 0.005) & (df_snp_info.AFR_FREQ > 0.005)
    )[0]

    K1, K2, K12 = admix_genet_cor.compute_grm(
        geno=geno[snp_subset, :, :],
        lanc=lanc[snp_subset, :, :],
        snp_prior_var=snp_prior_var[snp_subset],
        apa_center=False,
    )

    np.save(out_prefix + ".K1.npy", K1)
    np.save(out_prefix + ".K2.npy", K2)
    np.save(out_prefix + ".K12.npy", K12)

In [8]:
# executor for the GRM jobs; "./submitit-logs" is its job folder
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    queue="highp",
    time_min=150,
    memory_g=40,
    # shell lines handed to submitit as the job `setup`
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# one GRM job per heritability model (a single model here)
df_grm_params = pd.DataFrame({"hermodel": ["mafukb"]})

df_grm_params["out_prefix"] = [
    f"out/grm/hermodel-{hermodel}-chrom-{CHROM}"
    for hermodel in df_grm_params["hermodel"]
]
jobs = executor.map_array(
    submit_compute_grm,
    df_grm_params["hermodel"],
    df_grm_params["out_prefix"],
)

# Step 3: HE Estimation

In [3]:
# estimation jobs reuse the simulation grid; the estimate prefix is the phenotype
# prefix with "pheno" swapped for "estimate"
df_estimate_params = df_simulate_params.assign(
    estimate_out_prefix=df_simulate_params["out_prefix"].str.replace(
        "pheno", "estimate", regex=False
    )
)

In [6]:
def submit_estimate(hermodel: str, pheno: str, out_prefix: str):
    """
    Run admix_genet_cor.estimate_genetic_cor on every simulated phenotype in a
    file and write the results as JSON.

    hermodel: model name used to locate the pre-computed GRMs
        `imputed.<hermodel>.chr<CHROM>.A1.npy` / `.A2.npy` under GRM_DIR
    pheno: path to the whitespace-delimited phenotype file (first column is the
        individual index)
    out_prefix: prefix to the output; writes `<out_prefix>.estimate.json`
    """
    print(f"submit_estimate:\npheno={pheno}, out_prefix={out_prefix}")

    # Step 1: load the GRMs (use CHROM rather than a hard-coded chromosome so
    # the path follows the notebook-wide constant, as submit_gcta_estimate does)
    grm_prefix = join(GRM_DIR, f"imputed.{hermodel}.chr{CHROM}")
    A1 = np.load(grm_prefix + ".A1.npy")
    A2 = np.load(grm_prefix + ".A2.npy")

    # Step 2: load phenotype and estimate
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True
    df_pheno = pd.read_csv(pheno, sep=r"\s+", index_col=0)
    print(df_pheno.head())
    n_indiv = df_pheno.shape[0]

    # intercept-only covariate matrix
    rls_list = admix_genet_cor.estimate_genetic_cor(
        A1, A2, df_pheno.values, cov=np.ones((n_indiv, 1)), compute_varcov=True
    )

    # NOTE(review): a length-2 result is treated as an (estimate, varcov) pair;
    # confirm against estimate_genetic_cor's return type
    if len(rls_list[0]) == 2:
        data = [
            {"estimate": rls[0].tolist(), "varcov": rls[1].tolist()} for rls in rls_list
        ]
    else:
        data = [{"estimate": rls.tolist()} for rls in rls_list]

    # open() does not create missing directories, so make sure it exists
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_prefix + ".estimate.json", "w") as out:
        json.dump(data, out)

In [7]:
# executor for the HE-estimation jobs; "./submitit-logs" is its job folder
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    queue="highp",
    time_min=240,
    memory_g=36,
    # shell lines handed to submitit as the job `setup`
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# one array task per parameter setting; the phenotype path is the simulation
# prefix plus the ".pheno.tsv" suffix
jobs = executor.map_array(
    submit_estimate,
    df_estimate_params["hermodel"],
    df_estimate_params.out_prefix + ".pheno.tsv",
    df_estimate_params["estimate_out_prefix"],
)

# GCTA estimate

In [3]:
# GCTA jobs: simulation grid crossed with simulation indices 0..19; the estimate
# prefix is the phenotype prefix with "pheno" swapped for "gcta-estimate"
df_estimate_params = df_simulate_params.assign(
    estimate_out_prefix=df_simulate_params["out_prefix"].str.replace(
        "pheno", "gcta-estimate", regex=False
    )
).merge(pd.DataFrame({"sim_i": np.arange(20)}), how="cross")

In [4]:
def submit_gcta_estimate(hermodel: str, pheno: str, sim_i: int, out_prefix: str):
    """
    Run admix.tools.gcta.reml on one simulated phenotype.

    hermodel: model name used to locate the multi-GRM file
        `imputed.<hermodel>.chr<CHROM>.mgrm.txt` under GRM_DIR
    pheno: path to the whitespace-delimited phenotype file (first column is the
        individual index, remaining columns are named `SIM_<i>`)
    sim_i: index of the simulation column (`SIM_<sim_i>`) to analyse
    out_prefix: output prefix; `.sim_<sim_i>` is appended before it is passed to
        admix.tools.gcta.reml
    """
    # compile phenotype and covariates
    print(
        f"submit_gcta_estimate:\npheno={pheno}, sim_i={sim_i}, out_prefix={out_prefix}"
    )

    mgrm_path = join(GRM_DIR, f"imputed.{hermodel}.chr{CHROM}.mgrm.txt")
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True
    df_pheno = pd.read_csv(pheno, sep=r"\s+", index_col=0)
    # the individual index is used for both FID and IID
    df_pheno = pd.DataFrame(
        {
            "FID": df_pheno.index.values,
            "IID": df_pheno.index.values,
            "trait": df_pheno[f"SIM_{sim_i}"].values,
        }
    )

    # make sure the output directory exists before the output prefix is used
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    admix.tools.gcta.reml(
        mgrm_path=mgrm_path, df_pheno=df_pheno, out_prefix=out_prefix + f".sim_{sim_i}"
    )

In [5]:
# executor for the GCTA jobs; "./submitit-logs" is its job folder
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    queue="highp",
    time_min=120,
    memory_g=24,
    # shell lines handed to submitit as the job `setup`
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# one array task per (parameter setting, simulation index) row
jobs = executor.map_array(
    submit_gcta_estimate,
    df_estimate_params["hermodel"],
    df_estimate_params.out_prefix + ".pheno.tsv",
    df_estimate_params["sim_i"],
    df_estimate_params["estimate_out_prefix"],
)