In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import matplotlib.pyplot as plt
import numpy as np
import json
import pandas as pd
import seaborn as sns
import os
import admix
from os.path import join
import itertools
import admix_genet_cor
import submitit
import glob
from scipy.stats import pearsonr, linregress
from tqdm import tqdm
import glob
from shutil import copyfile
import os
import structlog

In [2]:
# Define the simulation parameter grid.
# Every row is one (group, region, hsq, ncausal, covar) combination. The "real"
# group is crossed with all three covariate settings, the "simu" group only with
# "lanc" and "none".
param_columns = ["group", "region", "hsq", "ncausal", "covar"]

df_real_params = pd.DataFrame(
    list(
        itertools.product(
            ["real"],
            [f"region{i}" for i in range(100)],
            [0.002, 0.006, 0.01],
            [1, 41],
            ["lanc", "none", "pc"],
        )
    ),
    columns=param_columns,
)
df_simu_params = pd.DataFrame(
    list(
        itertools.product(
            ["simu"],
            [f"region{i}.eur20.afr80" for i in range(100)]
            + [f"region{i}.eur80.afr20" for i in range(100)],
            [0.002, 0.006, 0.01],
            [1, 41],
            ["lanc", "none"],
        )
    ),
    columns=param_columns,
)

# ignore_index=True gives the combined grid a unique 0..n-1 index; without it the
# row labels of the two frames overlap and label-based lookups become ambiguous.
# To restrict the run to the real-genotype grid, use `df_params = df_real_params`.
df_params = pd.concat([df_real_params, df_simu_params], ignore_index=True)

# The config string identifies a phenotype simulation; it deliberately excludes
# `covar`, so the same config is shared by all covariate settings.
df_params["config"] = df_params.apply(
    lambda row: f"group-{row.group}-region-{row.region}-hsq-{row.hsq}-ncausal-{int(row.ncausal)}",
    axis=1,
)


def _has_late_summary(row) -> bool:
    """Return True if any of replicates 90-99 already has a causal summary file."""
    summary_dir = join(f"out/covar-{row.covar}/summary", row.config)
    return any(
        os.path.exists(join(summary_dir, f"sim_{sim_i}.causal.tsv"))
        for sim_i in range(90, 100)
    )


# Keep only the parameter rows whose last ten replicates have produced no
# causal summary yet; these are the rows that still need to be submitted.
df_todo_params = df_params[~df_params.apply(_has_late_summary, axis=1)]

In [4]:
def gwas_clump(config, covar: str, sim_i):
    """Run a GWAS on one simulated phenotype and clump the association results.

    Parameters
    ----------
    config : str
        Dash-separated configuration string; after splitting on "-", field 1 is
        used as the dataset group and field 3 as the region.
    covar : str
        Covariate setting: "none" (no covariates), "lanc" (``AVG_ANC``) or
        "pc" (``geno_EV1`` and ``geno_EV2``).
    sim_i : int
        Replicate index; the phenotype column used is ``SIM_{sim_i}``.

    Side effects
    ------------
    Creates ``out/covar-{covar}/clump/{config}`` if needed, writes the GWAS and
    clumping outputs under the prefix ``sim_{sim_i}`` there, and deletes the
    intermediate ``.assoc`` file once clumping has finished.
    """
    covar_lookup = {
        "none": None,
        "lanc": ["AVG_ANC"],
        "pc": ["geno_EV1", "geno_EV2"],
    }
    assert covar in ["none", "lanc", "pc"]
    if covar not in covar_lookup:
        raise NotImplementedError
    covar_cols = covar_lookup[covar]

    config_fields = config.split("-")
    group, region = config_fields[1], config_fields[3]
    pfile = f"out/{group}-dataset/{region}"

    dset = admix.io.read_dataset(pfile=pfile, n_anc=2)

    pheno_col = f"SIM_{sim_i}"
    df_pheno = pd.read_csv(f"out/pheno/{config}/pheno.tsv.gz", sep="\t", index_col=0)

    if covar_cols is None:
        df_sample_info = df_pheno[[pheno_col]]
    else:
        # standardize each covariate column, then join it onto the phenotype
        # by sample index
        raw_cov = dset.indiv[covar_cols]
        scaled_cov = (raw_cov - raw_cov.mean(axis=0)) / raw_cov.std(axis=0)
        df_sample_info = pd.merge(
            df_pheno[[pheno_col]],
            scaled_cov,
            left_index=True,
            right_index=True,
        )

    out_dir = f"out/covar-{covar}/clump/{config}"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    out_prefix = f"{out_dir}/sim_{sim_i}"
    assoc_path = out_prefix + ".assoc"
    plink_memory = int(12 * 1e3)

    admix.tools.plink2.gwas(
        pfile=pfile,
        df_sample_info=df_sample_info,
        pheno_col=pheno_col,
        covar_cols=covar_cols,
        out_prefix=out_prefix,
        clean_tmp_file=True,
        memory=plink_memory,
    )
    # See Pardinas et al. for description of these set of parameters
    admix.tools.plink.clump(
        bfile=pfile,
        assoc_path=assoc_path,
        out_prefix=out_prefix,
        p1=5e-8,
        p2=1e-4,
        r2=0.1,
        kb=10000,  # 10Mb clumping window
        memory=plink_memory,
    )
    # the association table is only needed as input to clumping
    os.remove(assoc_path)


def test_het(config, covar: str, sim_i):
    ###########
    # read info
    ###########
    assert covar in ["none", "lanc", "pc"]
    if covar == "none":
        covar_cols = None
    elif covar == "lanc":
        covar_cols = ["AVG_ANC"]
    elif covar == "pc":
        covar_cols = ["geno_EV1", "geno_EV2"]
    else:
        raise NotImplementedError

    group = config.split("-")[1]
    region = config.split("-")[3]

    pfile = f"out/{group}-dataset/{region}"

    clump_dir = f"out/covar-{covar}/clump/{config}"
    clumped_file = f"{clump_dir}/sim_{sim_i}.clumped"
    with open(clumped_file) as f:
        lines = f.readlines()
        if len(lines) == 1:
            log = structlog.get_logger()
            log.info(f"clumped_file={clumped_file}, no clumped region")
            return

    df_clump = (
        pd.read_csv(f"{clump_dir}/sim_{sim_i}.clumped", delim_whitespace=True)
        .set_index("SNP")
        .sort_values(["CHR", "BP"])
    )
    df_pheno = pd.read_csv(f"out/pheno/{config}/pheno.tsv.gz", sep="\t", index_col=0)[
        f"SIM_{sim_i}"
    ]

    df_beta = pd.DataFrame(
        index=np.loadtxt(f"out/pheno/{config}/snplist.gz", dtype=str)
    )
    # attached effect sizes
    df_beta[["AFR_BETA", "EUR_BETA"]] = np.load(f"out/pheno/{config}/beta.npz")[
        "arr_0"
    ][:, :, sim_i]
    assert np.allclose(df_beta["AFR_BETA"], df_beta["EUR_BETA"])

    # extract only causal SNPs
    dict_snp_list = {
        "causal": df_beta[df_beta.AFR_BETA != 0].index.values,
        "clump": df_clump.index.values,
    }

    dict_df_summary = {"clump": [], "causal": []}

    dset = admix.io.read_dataset(
        pfile=pfile,
        snp_chunk=128,
        n_anc=2,
    )
    for group in ["clump", "causal"]:
        snp_list = dict_snp_list[group]

        if len(snp_list) == 0:
            continue

        dset_tmp = dset[snp_list]
        if covar_cols is not None:
            cov_values = dset.indiv[covar_cols].values
            cov_values = (cov_values - cov_values.mean(axis=0)) / cov_values.std(axis=0)
        else:
            cov_values = None

        # heterogeneity test
        df_tmp = admix_genet_cor.marginal_het(
            geno=dset_tmp.geno,
            lanc=dset_tmp.lanc,
            y=df_pheno,
            cov=cov_values,
        )

        # association test
        df_tmp["assoc_p"] = admix.assoc.marginal(
            dset=dset_tmp,
            pheno=df_pheno,
            cov=cov_values,
            method="ATT",
        ).P.values

        df_tmp["snp"] = dset_tmp.snp.index.values
        dict_df_summary[group].append(df_tmp)

    for group in ["clump", "causal"]:
        df_tmp = pd.concat(dict_df_summary[group])
        df_tmp = df_tmp[
            [
                "snp",
                "het_pval",
                "coef1",
                "se1",
                "coef2",
                "se2",
                "assoc_p",
            ]
        ]
        df_tmp.to_csv(
            f"out/covar-{covar}/summary/{config}/sim_{sim_i}.{group}.tsv",
            index=False,
            sep="\t",
        )


def submit_job(config, covar, n_sim=100):
    """Process all simulation replicates of one (config, covar) pair.

    For each replicate index in ``range(n_sim)`` this runs ``gwas_clump``
    followed by ``test_het``. Replicates whose
    ``out/covar-{covar}/summary/{config}/sim_{sim_i}.causal.tsv`` already exists
    are skipped, so the function can be re-run to resume an interrupted job.

    Parameters
    ----------
    config : str
        Configuration string identifying the simulated phenotype.
    covar : str
        Covariate setting passed through to ``gwas_clump`` and ``test_het``.
    n_sim : int, default 100
        Number of replicates to process (indices ``0 .. n_sim - 1``).

    Notes
    -----
    A ``ValueError`` raised by either step is reported on stdout together with
    the replicate it belongs to, and processing continues with the next
    replicate. Any other exception propagates.
    """
    clump_dir = f"out/covar-{covar}/clump/{config}"
    summary_dir = f"out/covar-{covar}/summary/{config}"
    # exist_ok=True makes creation idempotent without a separate existence check
    os.makedirs(clump_dir, exist_ok=True)
    os.makedirs(summary_dir, exist_ok=True)

    for sim_i in range(n_sim):
        if os.path.exists(f"{summary_dir}/sim_{sim_i}.causal.tsv"):
            continue
        try:
            # in case no clump was formed
            gwas_clump(config, covar, sim_i)
            test_het(config, covar, sim_i)
        except ValueError as err:
            # best-effort: report which replicate failed and keep going
            print(f"config={config}, covar={covar}, sim_i={sim_i}: {err}")

In [5]:
# Shell lines handed to the executor as `setup`: put the project's miniconda on
# PATH and set PYTHONNOUSERSITE.
sge_setup_lines = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]

# Executor that keeps its logs under ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(time_min=240, memory_g=20, setup=sge_setup_lines)

# One array task per pending parameter row: submit_job(config, covar).
jobs = executor.map_array(
    submit_job,
    df_todo_params.config,
    df_todo_params.covar,
)