# Simulate phenotype

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import dapgen
import pandas as pd
import dask.array as da
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import pandas as pd
import admix
from os.path import join
import os

In [2]:
# CONSTANTS
# define the simulation parameters
PARAM_COLUMNS = ["group", "region", "hsq", "ncausal"]
# per-causal-variant heritability grid shared by the real and simulated groups
HSQ_GRID = [0.002, 0.004, 0.006]

# real-genotype regions: 100 regions x 3 hsq values x 7 ncausal values
df_real_params = pd.DataFrame(
    list(
        itertools.product(
            ["real"],
            [f"region{i}" for i in range(100)],
            HSQ_GRID,
            [1, 3, 6, 11, 21, 41, 81],
        )
    ),
    columns=PARAM_COLUMNS,
)

# simulated-genotype regions: 2 admixture settings x 100 regions x 3 hsq values
# x 6 ncausal values (all eur20.afr80 regions first, then all eur80.afr20)
df_simu_params = pd.DataFrame(
    list(
        itertools.product(
            ["simu"],
            [
                f"region{i}.{admixture}"
                for admixture in ("eur20.afr80", "eur80.afr20")
                for i in range(100)
            ],
            HSQ_GRID,
            [1, 3, 6, 11, 21, 41],
        )
    ),
    columns=PARAM_COLUMNS,
)

# ignore_index=True: both frames carry their own 0..n-1 index, so a plain concat
# would leave duplicate row labels and make any later label-based alignment or
# lookup on df_params ambiguous.
df_params = pd.concat([df_real_params, df_simu_params], ignore_index=True)

# one output directory per parameter combination; the directory name is also what
# the simulation derives its random seed from, so the format must stay stable.
df_params["out_dir"] = df_params.apply(
    lambda row: f"out/pheno/group-{row.group}-region-{row.region}-hsq-{row.hsq}-ncausal-{int(row.ncausal)}",
    axis=1,
)

In [3]:
def submit_simulate_pheno(
    group: str,
    region: str,
    hsq: float,
    ncausal: int,
    out_dir: str,
    cor: float = 1.0,
    n_sim: int = 100,
) -> None:
    """
    Simulate quantitative phenotypes for one region and write them to `out_dir`.

    Parameters
    ----------
    group: str
        dataset group; genotypes are read from "out/{group}-dataset/{region}"
    region: str
        region id to simulate
    hsq: float
        heritability of all the simulated causal variants, forwarded unchanged
        to `admix_genet_cor.simulate_quant_pheno`
    ncausal: int
        number of simulated causal variants; must be at least 1 and at most the
        number of SNPs left after the frequency filter
    out_dir: str
        output directory (created if missing); the random seed is derived from
        this string, so the same `out_dir` reproduces the same draws
    cor: float
        correlation used in the bivariate normal draw that determines the
        effect signs of the two ancestries
    n_sim: int
        number of simulation replicates

    Outputs
    -------
    Written under `out_dir`:
    - "beta.npz": `sim["beta"]` as returned by the simulator
    - "snplist.gz": index of the SNPs kept after filtering, one per line
    - "pheno.tsv.gz": phenotypes, rows indexed by individual, columns SIM_0..SIM_{n_sim-1}

    Raises
    ------
    ValueError
        if `ncausal` is smaller than 1 or larger than the number of SNPs kept
    """
    if ncausal < 1:
        # the covariance below is divided by ncausal; 0 would divide by zero
        raise ValueError(f"ncausal must be >= 1, got {ncausal}")

    pfile = f"out/{group}-dataset/{region}"
    np.random.seed(admix.utils.str2int(out_dir))

    # read data
    dset = admix.io.read_dataset(pfile, snp_chunk=1024)

    # keep SNPs whose EUR and AFR frequencies both lie within [0.005, 0.995]
    dset = dset[
        (
            dset.snp.EUR_FREQ.between(0.005, 0.995)
            & dset.snp.AFR_FREQ.between(0.005, 0.995)
        ).values
    ]

    if ncausal > dset.n_snp:
        # linspace(...).astype(int) would return repeated indices here, silently
        # simulating fewer causal variants than the output is labelled with
        raise ValueError(
            f"ncausal={ncausal} exceeds the {dset.n_snp} SNPs left after "
            f"frequency filtering in {pfile}"
        )

    # causal SNP positions: the middle SNP for a single causal variant,
    # otherwise evenly spaced indices across the region
    if ncausal == 1:
        cau = [int(dset.n_snp / 2)]
    else:
        cau = np.linspace(0, dset.n_snp - 1, ncausal).astype(int)

    # per-SNP effect magnitude 1 / sqrt(f * (1 - f)); it does not depend on the
    # replicate, so it is computed once outside the loop
    causal_freq = dset.snp.FREQ.iloc[cau]
    scale = 1 / np.sqrt(causal_freq * (1 - causal_freq)).values

    beta = np.zeros((dset.n_snp, dset.n_anc, n_sim))  # (n_snp, n_anc, n_sim)

    for i_sim in range(n_sim):
        i_beta = np.random.multivariate_normal(
            mean=[0.0, 0.0],
            cov=np.array([[1, cor], [cor, 1]]) / ncausal,
            size=ncausal,
        )
        # only the sign of each draw is kept; the magnitude comes from `scale`
        i_beta = np.sign(i_beta) * scale[:, None]

        # the bivariate draw has exactly two columns, so this loop assumes
        # dset.n_anc == 2
        for i_anc in range(dset.n_anc):
            beta[cau, i_anc, i_sim] = i_beta[:, i_anc]

    sim = admix_genet_cor.simulate_quant_pheno(
        geno=dset.geno, lanc=dset.lanc, hsq=hsq, beta=beta, n_sim=n_sim
    )
    os.makedirs(out_dir, exist_ok=True)
    # numpy appends ".npz" to the file name
    np.savez_compressed(out_dir + "/beta", sim["beta"])
    # the ".gz" suffix makes numpy write gzip-compressed text
    np.savetxt(out_dir + "/snplist.gz", dset.snp.index.values, fmt="%s")

    df_pheno = pd.DataFrame(
        sim["pheno"],
        index=dset.indiv.index,
        columns=[f"SIM_{i}" for i in range(n_sim)],
    )
    df_pheno.to_csv(out_dir + "/pheno.tsv.gz", index=True, sep="\t")

In [4]:
# Job executor for the SGE cluster; "./submitit-logs" is the folder handed to submitit.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Per-job resources plus the shell lines run before each job starts.
executor.update_parameters(
    time_min=15,
    memory_g=8,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# Keep only the parameter rows whose phenotype file has not been written yet,
# so re-running the notebook does not resubmit finished simulations.
df_todo_params = df_params[
    df_params.apply(
        lambda row: not os.path.exists(row["out_dir"] + "/pheno.tsv.gz"),
        axis=1,
    )
]

In [5]:
# Submit one array job per pending parameter row.
# The third argument is the heritability passed to the simulation: the
# per-variant `hsq` column multiplied by the number of causal variants.
jobs = executor.map_array(
    submit_simulate_pheno,
    df_todo_params["group"],
    df_todo_params["region"],
    df_todo_params["hsq"] * df_todo_params["ncausal"],
    df_todo_params["ncausal"],
    df_todo_params["out_dir"],
)