In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import dapgen
import pandas as pd
import dask.array as da
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
from admix_genet_cor import calc_snp_prior_var
import pandas as pd
import admix
from os.path import join
import os

# Overview of simulation studies
- p_causal: 0.001, 1.0
- var_g: 1.0
- var_e: 1.0
- gamma: 0.5, 0.8, 1.0
- set of SNPs: phenotypes are simulated from the imputed SNPs; the non-zero effects are simulated only from SNPs with allele frequencies > 0.005 in both populations.

From S-LDXR paper: 
```
In these simulations, we randomly selected 10% of the SNPs to be causal in each population, with 80% of causal variants in each population shared with the other population, and sampled perfectly correlated causal effect sizes for shared causal variants using Eq. In these simulations, we set the variance of causal effect size of each SNP j in both populations to be proportional to $[p_{j,max}(1−p_{j,max})]^\alpha$, where pj,max is the maximum MAF of SNP j in the two populations.  We set α to − 0.38, as previously estimated for 25 UK Biobank diseases 
and complex traits in ref.
```

Here, we simulate from the imputed SNPs; the non-zero effects are simulated only from SNPs with allele frequencies > 0.005 in both populations. From these SNPs, we sample 1% to have causal effects.

In [2]:
# CONSTANTS

# Directory holding the per-chromosome imputed genotype files.
DATA_ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr"
)
# Root directory of the GRMs used in the estimation step.
GRM_DIR = "/u/scratch/k/kangchen/admix-grm/rho-model"


def _pheno_out_prefix(row):
    """Return the output prefix that encodes one row of simulation parameters."""
    pcausal_str = np.format_float_positional(row.pcausal)
    return (
        f"out/pheno/hsq-{row.hsq}-pcausal-{pcausal_str}"
        f"-cor-{row.cor}-hermodel-{row.hermodel}"
    )


# Simulation grid: the Cartesian product of the parameter values below.
# (An alternative hsq grid that was tried: [0.1, 0.25, 0.5].)
df_simulate_params = pd.DataFrame(
    list(
        itertools.product(
            [0.25],  # hsq
            [0.00001, 0.0001, 0.001, 0.01],  # pcausal
            [0.9, 0.95, 1.0],  # cor
            ["mafukb"],  # hermodel
        )
    ),
    columns=["hsq", "pcausal", "cor", "hermodel"],
)
# Row-wise apply is kept so every value is formatted from the row itself.
df_simulate_params["out_prefix"] = df_simulate_params.apply(_pheno_out_prefix, axis=1)

# Step 1: Simulate phenotype

In [3]:
def submit_simulate_pheno(
    hsq: float,
    pcausal: float,
    cor: float,
    hermodel: str,
    out_prefix: str,
    n_sim=500,
):
    """
    Simulate `n_sim` quantitative phenotypes from the imputed genotypes and
    write the effects, SNP annotations and phenotypes under `out_prefix`.

    Parameters
    ----------
    hsq : float
        Forwarded as `hsq` to `admix_genet_cor.simulate_quant_pheno`.
    pcausal : float
        Fraction of the frequency-filtered SNPs drawn as causal in each
        simulation.
    cor : float
        Correlation between the two ancestry-specific effects of a causal SNP.
    hermodel : str
        Forwarded to `calc_snp_prior_var` to obtain the per-SNP prior
        variances (this notebook uses "mafukb").
    out_prefix : str
        Prefix of the output files. It also seeds the random number generator,
        so a given parameter setting always produces the same draws.
    n_sim : int
        Number of simulation replicates.

    Raises
    ------
    ValueError
        If `pcausal` is so small that no SNP would be selected as causal.
    AssertionError
        If a `.snp_info` file does not match its pfile, or if the individual
        tables differ between chromosomes.

    Writes `<out_prefix>.beta.npz`, `<out_prefix>.beta_info.tsv.gz` and
    `<out_prefix>.pheno.tsv.gz`.
    """
    np.random.seed(admix.utils.str2int(out_prefix))
    N_ANC = 2
    pfile_list = [f"{DATA_ROOT_DIR}/imputed/chr{chrom}" for chrom in range(1, 23)]

    # make sure the output location exists before any expensive work is done
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    geno = []
    lanc = []
    df_indiv = None
    df_snp_info = []

    # read data
    for pfile in pfile_list:

        this_geno, this_df_snp, this_df_indiv = dapgen.read_pfile(
            pfile, phase=True, snp_chunk=4000
        )
        this_lanc = admix.io.read_lanc(pfile + ".lanc").dask(snp_chunk=4000)
        this_df_snp_info = pd.read_csv(pfile + ".snp_info", sep="\t")
        assert np.all(this_df_snp_info.SNP == this_df_snp.index.values)

        if df_indiv is None:
            df_indiv = this_df_indiv
        else:
            # compare against the table just read; comparing `df_indiv` with
            # itself (as before) could never fail
            assert df_indiv.equals(
                this_df_indiv
            ), ".psam should be consistent for all pfiles"
        geno.append(this_geno)
        lanc.append(this_lanc)
        df_snp_info.append(this_df_snp_info)

    # concatenate along the SNP axis (axis 0)
    geno = da.concatenate(geno, axis=0)
    lanc = da.concatenate(lanc, axis=0)
    df_snp_info = pd.concat(df_snp_info).reset_index(drop=True)

    snp_prior_var = calc_snp_prior_var(df_snp_info, hermodel)

    # simulate effects: only SNPs whose frequency lies in [0.005, 0.995] in
    # both populations are eligible to carry a non-zero effect
    snp_subset = np.where(
        df_snp_info.EUR_FREQ.between(0.005, 0.995)
        & df_snp_info.AFR_FREQ.between(0.005, 0.995)
    )[0]

    # sub-sample SNPs from `geno`, `lanc`, `df_snp_info`, `snp_prior_var`
    n_eff_snp = len(snp_subset)
    geno = geno[snp_subset, :, :]
    lanc = lanc[snp_subset, :, :]
    df_snp_info = df_snp_info.iloc[snp_subset, :]
    snp_prior_var = snp_prior_var[snp_subset]

    beta = np.zeros((n_eff_snp, N_ANC, n_sim))  # (n_snp, n_anc, n_sim)
    n_causal = int(n_eff_snp * pcausal)
    print(f"n_causal: {n_causal}")
    if n_causal < 1:
        # the covariance below is divided by `n_causal`, so zero causal SNPs
        # cannot be simulated
        raise ValueError(
            f"pcausal={pcausal} selects no causal SNP among {n_eff_snp} eligible SNPs"
        )
    for i_sim in range(n_sim):
        cau = sorted(
            np.random.choice(np.arange(n_eff_snp), size=n_causal, replace=False)
        )

        # unit-variance effects with correlation `cor`, divided by `n_causal`
        i_beta = np.random.multivariate_normal(
            mean=[0.0, 0.0],
            cov=np.array([[1, cor], [cor, 1]]) / n_causal,
            size=n_causal,
        )

        # rescale each causal SNP by the square root of its prior variance
        i_beta = i_beta * np.sqrt(snp_prior_var[cau])[:, None]

        for i_anc in range(N_ANC):
            beta[cau, i_anc, i_sim] = i_beta[:, i_anc]

    sim = admix_genet_cor.simulate_quant_pheno(
        geno=geno, lanc=lanc, hsq=hsq, beta=beta, n_sim=n_sim
    )
    np.savez_compressed(out_prefix + ".beta", sim["beta"])
    df_snp_info.to_csv(out_prefix + ".beta_info.tsv.gz", index=False, sep="\t")

    df_pheno = pd.DataFrame(
        sim["pheno"],
        index=df_indiv.index,
        columns=[f"SIM_{i}" for i in range(n_sim)],
    )
    df_pheno.to_csv(out_prefix + ".pheno.tsv.gz", index=True, sep="\t")

In [4]:
# SGE executor for the phenotype simulation array job.
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(
    time_min=1400,
    memory_g=64,
    # queue="highp",
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)


def _pheno_is_missing(row):
    """True when the phenotype file of this parameter row has not been written."""
    return not os.path.exists(row.out_prefix + ".pheno.tsv.gz")


# Only submit the parameter settings that have no phenotype file yet.
df_todo_params = df_simulate_params[
    df_simulate_params.apply(_pheno_is_missing, axis=1)
]
jobs = executor.map_array(
    submit_simulate_pheno,
    df_todo_params["hsq"],
    df_todo_params["pcausal"],
    df_todo_params["cor"],
    df_todo_params["hermodel"],
    df_todo_params["out_prefix"],
)



# Step 2: GCTA Estimation

In [5]:
# define GRM parameters
# NOTE(review): the next cell rebuilds both `df_grm_params` and
# `df_estimate_params`, so the values computed here are overwritten.
def _grm_prefix(p):
    """Name of a GRM: '<snpset>.<hermodel>.<maf digits after "0.">'."""
    maf_digits = str(p.maf)[2:]
    return f"{p.snpset}.{p.hermodel}.{maf_digits}"


df_grm_params = pd.DataFrame(
    list(itertools.product(["hm3", "imputed"], ["mafukb"], [0.005])),
    columns=["snpset", "hermodel", "maf"],
)
df_grm_params["grm_prefix"] = df_grm_params.apply(_grm_prefix, axis=1)

# Estimation results are written next to the phenotypes, with "pheno" swapped
# for "gcta-estimate" in the prefix.
df_estimate_params = df_simulate_params.assign(
    estimate_out_dir=df_simulate_params["out_prefix"].str.replace(
        "pheno", "gcta-estimate", regex=False
    )
)

# Cross every simulation setting with (replicate index, rho) pairs.
df_estimate_params = df_estimate_params.merge(
    pd.DataFrame(
        list(
            itertools.product(
                np.arange(100),
                (np.linspace(0, 1, 21) * 100).astype(int),
            )
        ),
        columns=["sim_i", "rho"],
    ),
    how="cross",
)

In [6]:
# define GRM parameters
def _grm_prefix(p):
    """Name of a GRM: '<snpset>.<hermodel>.<maf digits after "0.">'."""
    maf_digits = str(p.maf)[2:]
    return f"{p.snpset}.{p.hermodel}.{maf_digits}"


def _estimate_is_missing(x):
    """True when the `.hsq` result of this estimation job does not exist yet."""
    hsq_path = os.path.join(
        x.estimate_out_dir, x.grm_prefix, f"sim_{x.sim_i}.rho{x.rho}.hsq"
    )
    return not os.path.exists(hsq_path)


df_grm_params = pd.DataFrame(
    list(itertools.product(["hm3", "imputed"], ["mafukb"], [0.005])),
    columns=["snpset", "hermodel", "maf"],
)
df_grm_params["grm_prefix"] = df_grm_params.apply(_grm_prefix, axis=1)

# Estimation results are written next to the phenotypes, with "pheno" swapped
# for "gcta-estimate" in the prefix.
df_estimate_params = df_simulate_params.assign(
    estimate_out_dir=df_simulate_params["out_prefix"].str.replace(
        "pheno", "gcta-estimate", regex=False
    )
)

# Cross every simulation setting with (replicate index, rho, GRM) triples.
df_estimate_params = df_estimate_params.merge(
    pd.DataFrame(
        list(
            itertools.product(
                np.arange(0, 100),
                (np.linspace(0, 1, 21) * 100).astype(int),
                df_grm_params["grm_prefix"].values,
            )
        ),
        columns=["sim_i", "rho", "grm_prefix"],
    ),
    how="cross",
)

# filter todo jobs
df_estimate_params = df_estimate_params[
    df_estimate_params.apply(_estimate_is_missing, axis=1)
]
print(f"{len(df_estimate_params)} jobs to be done.")

9 jobs to be done.


In [7]:
# def submit_gcta_estimate(pheno: str, grm_prefix: str, sim_i: int, out_dir: str):
#     """
#     Run estimate for all rho values
#     """
#     # compile phenotype and covariates
#     df_pheno = pd.read_csv(pheno, delim_whitespace=True, index_col=0)
#     df_pheno = pd.DataFrame(
#         {
#             "FID": df_pheno.index.values,
#             "IID": df_pheno.index.values,
#             "trait": df_pheno[f"SIM_{sim_i}"].values,
#         }
#     )

#     os.makedirs(os.path.join(out_dir, grm_prefix), exist_ok=True)

#     ### fit different rho
#     rho_list = (np.linspace(0, 1, 21) * 100).astype(int)

#     for rho in rho_list:
#         grm = join(GRM_DIR, grm_prefix, f"rho{rho}")
#         out_prefix = os.path.join(out_dir, grm_prefix, f"sim_{sim_i}.rho{rho}")
#         if not os.path.exists(out_prefix + ".hsq"):
#             admix.tools.gcta.reml(
#                 grm_path=grm,
#                 df_pheno=df_pheno,
#                 out_prefix=out_prefix,
#                 n_thread=2,
#             )


def submit_gcta_estimate(
    pheno: str, grm_prefix: str, rho: int, sim_i: int, out_dir: str
):
    """
    Run GCTA REML for one simulated trait against the GRM of a single rho value.

    Parameters
    ----------
    pheno : str
        Path to the whitespace-delimited phenotype table; the first column is
        used as the individual ID and the trait is read from `SIM_<sim_i>`.
    grm_prefix : str
        Name of the GRM sub-directory under `GRM_DIR`; also used as the
        sub-directory of `out_dir` that receives the results.
    rho : int
        Selects the GRM `rho<rho>` inside the `grm_prefix` directory.
    sim_i : int
        Index of the simulation replicate to analyse.
    out_dir : str
        Root output directory for this simulation setting.

    Results are written with the prefix
    `<out_dir>/<grm_prefix>/sim_<sim_i>.rho<rho>`.
    """
    # compile phenotype and covariates
    # `sep=r"\s+"` replaces the deprecated `delim_whitespace=True`
    df_pheno = pd.read_csv(pheno, sep=r"\s+", index_col=0)
    df_pheno = pd.DataFrame(
        {
            # the individual ID doubles as the family ID
            "FID": df_pheno.index.values,
            "IID": df_pheno.index.values,
            "trait": df_pheno[f"SIM_{sim_i}"].values,
        }
    )
    result_dir = os.path.join(out_dir, grm_prefix)
    os.makedirs(result_dir, exist_ok=True)

    # fit the model for the requested rho
    grm = os.path.join(GRM_DIR, grm_prefix, f"rho{rho}")
    out_prefix = os.path.join(result_dir, f"sim_{sim_i}.rho{rho}")

    admix.tools.gcta.reml(
        grm_path=grm,
        df_pheno=df_pheno,
        out_prefix=out_prefix,
        n_thread=4,
    )

In [8]:
# SGE executor for the GCTA estimation array job (one task per todo row).
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(
    time_min=180,  # a shorter limit tried before: time_min=75
    memory_g=15,
    # queue="highp",
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# Each job reads the phenotype file that belongs to its simulation setting.
jobs = executor.map_array(
    submit_gcta_estimate,
    df_estimate_params["out_prefix"] + ".pheno.tsv.gz",
    df_estimate_params["grm_prefix"],
    df_estimate_params["rho"],
    df_estimate_params["sim_i"],
    df_estimate_params["estimate_out_dir"],
)