In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import dapgen
import pandas as pd
import dask.array as da
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import pandas as pd
import admix
from os.path import join
import os

In [2]:
# CONSTANTS

# Root directory holding the input dataset; per-chromosome files are read
# from its `imputed/` sub-directory by the simulation function.
DATA_ROOT_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr"

# Simulation grid: every combination of `hsq` and number of causal SNPs.
df_simulate_params = pd.DataFrame(
    list(itertools.product([0.025, 0.05, 0.1, 0.2], [62, 125, 250, 500, 1000])),
    columns=["hsq", "ncausal"],
)

# One output prefix per parameter combination; all result files of a
# simulation share this prefix.
df_simulate_params["out_prefix"] = [
    f"out/pheno/hsq-{hsq_value}-ncausal-{int(n_causal)}"
    for hsq_value, n_causal in zip(
        df_simulate_params["hsq"], df_simulate_params["ncausal"]
    )
]

In [3]:
def submit_simulate_pheno(
    hsq: float,
    ncausal: int,
    out_prefix: str,
    cor: float = 1.0,
    n_sim: int = 500,
):
    """
    Simulate quantitative phenotypes on the admixed genotypes and save them.

    For each of `n_sim` replicates, `ncausal` SNPs are drawn without
    replacement and given ancestry-specific effects sampled from a bivariate
    normal with per-ancestry variance `1 / ncausal` and cross-ancestry
    correlation `cor`. Phenotypes are then produced by
    `admix_genet_cor.simulate_quant_pheno`.

    Parameters
    ----------
    hsq : float
        Forwarded as `hsq` to `admix_genet_cor.simulate_quant_pheno`
        (presumably the target heritability -- confirm against that function).
    ncausal : int
        Number of causal SNPs per simulation. Integer-valued floats /
        numpy integers are accepted and coerced to `int`.
    out_prefix : str
        Prefix of all output files. Also used to derive the random seed, so
        the same prefix always reproduces the same simulation.
    cor : float
        Correlation between the effects of the two ancestries at a causal SNP.
    n_sim : int
        Number of simulation replicates.

    Outputs
    -------
    `<out_prefix>.beta.npz`         : `sim["beta"]` as returned by the simulator
    `<out_prefix>.beta_info.tsv.gz` : SNP info table of the SNPs kept for simulation
    `<out_prefix>.pheno.tsv.gz`     : phenotypes, one row per individual and one
                                      `SIM_<i>` column per replicate

    Raises
    ------
    AssertionError
        If the SNP ids of `.snp_info` do not match the pfile, or the
        individual tables differ between pfiles.
    """
    # `ncausal` is used as a sample size below, which must be an integer.
    ncausal = int(ncausal)

    # Make sure the output location exists *before* the expensive simulation,
    # so that the job does not fail only when writing results.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    np.random.seed(admix.utils.str2int(out_prefix))
    N_ANC = 2
    # NOTE: range(1, 2) means only chromosome 1 is used.
    pfile_list = [f"{DATA_ROOT_DIR}/imputed/chr{chrom}" for chrom in range(1, 2)]

    geno = []
    lanc = []
    df_indiv = None
    df_snp_info = []

    # read data
    for pfile in pfile_list:

        this_geno, this_df_snp, this_df_indiv = dapgen.read_pfile(
            pfile, phase=True, snp_chunk=2048
        )
        this_lanc = admix.io.read_lanc(pfile + ".lanc").dask(snp_chunk=2048)
        this_df_snp_info = pd.read_csv(pfile + ".snp_info", sep="\t")
        assert np.all(this_df_snp_info.SNP == this_df_snp.index.values)

        if df_indiv is None:
            df_indiv = this_df_indiv
        else:
            # compare against the table of the *current* pfile (comparing
            # `df_indiv` with itself would always pass)
            assert df_indiv.equals(
                this_df_indiv
            ), ".psam should be consistent for all pfiles"
        geno.append(this_geno)
        lanc.append(this_lanc)
        df_snp_info.append(this_df_snp_info)

    # concatenate along the SNP axis
    geno = da.concatenate(geno, axis=0)
    lanc = da.concatenate(lanc, axis=0)
    df_snp_info = pd.concat(df_snp_info).reset_index(drop=True)

    # keep SNPs whose frequency lies within [0.005, 0.995] in both
    # the EUR_FREQ and AFR_FREQ columns
    snp_subset = np.where(
        df_snp_info.EUR_FREQ.between(0.005, 0.995)
        & df_snp_info.AFR_FREQ.between(0.005, 0.995)
    )[0]

    # sub-sample SNPs from `geno`, `lanc`, `df_snp_info`
    n_eff_snp = len(snp_subset)
    geno = geno[snp_subset, :, :]
    lanc = lanc[snp_subset, :, :]
    df_snp_info = df_snp_info.iloc[snp_subset, :]

    # simulate effects
    beta = np.zeros((n_eff_snp, N_ANC, n_sim))  # (n_snp, n_anc, n_sim)
    for i_sim in range(n_sim):

        # positions (within the filtered SNP set) of the causal SNPs
        cau = sorted(
            np.random.choice(np.arange(n_eff_snp), size=ncausal, replace=False)
        )

        # one row per causal SNP, one column per ancestry
        i_beta = np.random.multivariate_normal(
            mean=[0.0, 0.0],
            cov=np.array([[1, cor], [cor, 1]]) / ncausal,
            size=ncausal,
        )
        # UNIFORM allelic effects distribution
        # uncomment below to use GCTA
        #         scale = (
        #             1
        #             / np.sqrt(
        #                 df_snp_info.FREQ.iloc[cau] * (1 - df_snp_info.FREQ.iloc[cau])
        #             ).values
        #         )
        #         i_beta = np.sign(i_beta) * scale[:, None]

        for i_anc in range(N_ANC):
            beta[cau, i_anc, i_sim] = i_beta[:, i_anc]

    sim = admix_genet_cor.simulate_quant_pheno(
        geno=geno, lanc=lanc, hsq=hsq, beta=beta, n_sim=n_sim
    )

    # `np.savez_compressed` appends ".npz" to the given file name
    np.savez_compressed(out_prefix + ".beta", sim["beta"])
    df_snp_info.to_csv(out_prefix + ".beta_info.tsv.gz", index=False, sep="\t")

    df_pheno = pd.DataFrame(
        sim["pheno"],
        index=df_indiv.index,
        columns=[f"SIM_{i}" for i in range(n_sim)],
    )
    df_pheno.to_csv(out_prefix + ".pheno.tsv.gz", index=True, sep="\t")

In [4]:
# Cluster executor; submission logs are written to ./submitit-logs
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Per-job resource request and shell setup run before each job
executor.update_parameters(
    time_min=200,
    memory_g=30,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# Only submit parameter combinations whose phenotype file does not exist yet,
# so that re-running this cell resumes unfinished work.
df_todo_params = df_simulate_params.loc[
    [
        not os.path.exists(prefix + ".pheno.tsv.gz")
        for prefix in df_simulate_params["out_prefix"]
    ]
]

# One array task per remaining parameter combination
jobs = executor.map_array(
    submit_simulate_pheno,
    df_todo_params["hsq"],
    df_todo_params["ncausal"],
    df_todo_params["out_prefix"],
)