In [1]:
import glob
from shutil import copyfile
import os
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import matplotlib.pyplot as plt
import numpy as np
import json
import pandas as pd
import seaborn as sns
import os
import admix
from os.path import join
import itertools
import admix_genet_cor
import submitit
import glob
from scipy.stats import pearsonr, linregress
from tqdm import tqdm

In [3]:
def gwas_clump(config, sim_i):
    """Run a GWAS followed by LD clumping for one simulated phenotype.

    The region index is the second "-"-separated field of ``config`` and selects
    the dataset ``out/datasets/region{region_idx}``. The phenotype column
    ``SIM_{sim_i}`` is read from ``out/pheno/{config}.pheno.tsv.gz`` and merged
    (on the individual index) with the standardized ``AVG_ANC`` column of the
    dataset, which is used as the only covariate.

    The merged clumping table is written, tab-separated and sorted by
    ``CHR``/``BP``, to ``out/clump/{config}/sim_{sim_i}.clumped``. All
    per-chromosome intermediate files (``sim_{sim_i}.chr*``) are removed
    afterwards, whether or not any clump was found.

    Parameters
    ----------
    config : str
        Simulation prefix of the form ``region-<idx>-hsq-<h2>-ncausal-<n>``.
    sim_i : int
        Index of the simulated phenotype (column ``SIM_{sim_i}``).

    Raises
    ------
    ValueError
        If no clump was formed for this simulation (callers rely on this
        exception type to skip the simulation).
    """
    # Only chromosome index 1 is processed. The range is defined once so that the
    # GWAS loop and the merge loop below always iterate over the same values.
    chroms = range(1, 2)

    region_idx = config.split("-")[1]

    pfile = f"out/datasets/region{region_idx}"
    dset = admix.io.read_dataset(
        pfile=pfile,
        n_anc=2,
    )
    df_sample_info = pd.read_csv(
        f"out/pheno/{config}.pheno.tsv.gz", sep="\t", index_col=0
    )
    # extract covariates and perform standardization
    df_cov = dset.indiv[["AVG_ANC"]]
    df_cov = (df_cov - df_cov.mean(axis=0)) / df_cov.std(axis=0)
    df_sample_info = pd.merge(
        df_sample_info[[f"SIM_{sim_i}"]],
        df_cov,
        left_index=True,
        right_index=True,
    )

    # exist_ok avoids the check-then-create race when several jobs start at once
    out_dir = f"out/clump/{config}"
    os.makedirs(out_dir, exist_ok=True)
    clump_file_prefix = f"{out_dir}/sim_{sim_i}"

    for chrom in chroms:
        out_prefix = f"{clump_file_prefix}.chr{chrom}"
        admix.tools.plink2.gwas(
            pfile=pfile,
            df_sample_info=df_sample_info,
            pheno_col=f"SIM_{sim_i}",
            covar_cols=["AVG_ANC"],
            out_prefix=out_prefix,
            clean_tmp_file=True,
            chr=chrom,
            memory=int(18 * 1e3),
        )
        # See Pardinas et al. for description of these set of parameters
        admix.tools.plink.clump(
            bfile=pfile,
            assoc_path=out_prefix + ".assoc",
            out_prefix=out_prefix,
            p1=5e-8,
            p2=1e-4,
            r2=0.1,
            kb=10000,  # 10Mb clumping window
            chr=chrom,
            memory=int(18 * 1e3),
        )
        os.remove(out_prefix + ".assoc")

    # merging all the clumping
    try:
        clump_frames = []
        for chrom in chroms:
            clumped_file = f"{clump_file_prefix}.chr{chrom}.clumped"
            # count lines lazily instead of loading the whole file into memory
            with open(clumped_file) as f:
                n_lines = sum(1 for _ in f)
            if n_lines <= 1:
                # header-only (or empty) file: no clump on this chromosome
                continue
            # sep=r"\s+" is the non-deprecated spelling of delim_whitespace=True
            clump_frames.append(pd.read_csv(clumped_file, sep=r"\s+"))

        if not clump_frames:
            # Same exception type pd.concat([]) would raise, with a message that
            # says which simulation was affected.
            raise ValueError(f"No clump was formed for {config}, sim_{sim_i}")

        df_clump = pd.concat(clump_frames).sort_values(["CHR", "BP"])
        df_clump.to_csv(clump_file_prefix + ".clumped", index=False, sep="\t")
    finally:
        # clean up the per-chromosome intermediates, also when no clump was found
        for tmp_path in glob.glob(clump_file_prefix + ".chr*"):
            os.remove(tmp_path)


def test_het(config, sim_i):
    """Run heterogeneity and association tests at clumped and causal SNPs.

    For one simulated phenotype (``SIM_{sim_i}``) of one configuration, two SNP
    sets are analysed:

    * ``"clump"``  - the index SNPs in ``out/clump/{config}/sim_{sim_i}.clumped``;
    * ``"causal"`` - SNPs whose simulated ``AFR_BETA`` or ``EUR_BETA`` is non-zero.

    For each set, ``admix_genet_cor.marginal_het`` supplies the heterogeneity
    columns (``het_pval``, ``coef1``, ``se1``, ``coef2``, ``se2``) and
    ``admix.assoc.marginal`` (method ``"ATT"``) supplies ``assoc_p``. One
    tab-separated file per set is written to
    ``out/summary/{config}/sim_{sim_i}.{group}.tsv``.

    Parameters
    ----------
    config : str
        Simulation prefix of the form ``region-<idx>-hsq-<h2>-ncausal-<n>``.
    sim_i : int
        Index of the simulated phenotype; also indexes the last axis of the
        effect-size array stored in ``out/pheno/{config}.beta.npz``.
    """
    ###########
    # read info
    ###########
    region_idx = config.split("-")[1]

    pfile = f"out/datasets/region{region_idx}"

    df_clump = (
        pd.read_csv(f"out/clump/{config}/sim_{sim_i}.clumped", sep="\t")
        .set_index("SNP")
        .sort_values(["CHR", "BP"])
    )
    df_pheno = pd.read_csv(f"out/pheno/{config}.pheno.tsv.gz", sep="\t", index_col=0)[
        f"SIM_{sim_i}"
    ]

    # ndmin=1 keeps the index one-dimensional even for a one-line SNP list
    df_beta = pd.DataFrame(
        index=np.loadtxt(f"out/pheno/{config}.snplist.gz", dtype=str, ndmin=1)
    )
    # attached effect sizes; the archive is closed as soon as the slice is taken
    with np.load(f"out/pheno/{config}.beta.npz") as beta_npz:
        df_beta[["AFR_BETA", "EUR_BETA"]] = beta_npz["arr_0"][:, :, sim_i]

    # SNP sets to test: causal SNPs (non-zero effect in either column) and
    # the clumped index SNPs
    dict_snp_list = {
        "causal": df_beta[
            (df_beta.AFR_BETA != 0) | (df_beta.EUR_BETA != 0)
        ].index.values,
        "clump": df_clump.index.values,
    }

    dset = admix.io.read_dataset(
        pfile=pfile,
        snp_chunk=128,
        n_anc=2,
    )
    # The covariate does not depend on the SNP set, so standardize it once.
    # NOTE: this uses numpy's std (ddof=0), whereas gwas_clump standardizes the
    # same column with pandas' std (ddof=1).
    cov_values = dset.indiv[["AVG_ANC"]].values
    cov_values = (cov_values - cov_values.mean(axis=0)) / cov_values.std(axis=0)

    dict_df_summary = {}
    for group in ["clump", "causal"]:
        dset_tmp = dset[list(dict_snp_list[group])]
        # heterogeneity test
        # NOTE(review): df_pheno is passed as read from disk; this assumes its
        # row order matches dset.indiv - TODO confirm.
        df_tmp = admix_genet_cor.marginal_het(
            geno=dset_tmp.geno,
            lanc=dset_tmp.lanc,
            y=df_pheno,
            cov=cov_values,
        )
        # association test
        df_tmp["assoc_p"] = admix.assoc.marginal(
            dset=dset_tmp,
            pheno=df_pheno,
            cov=cov_values,
            method="ATT",
        ).P.values
        df_tmp["snp"] = dset_tmp.snp.index.values
        dict_df_summary[group] = df_tmp

    # Make sure the output folder exists so the function also works when it is
    # called directly rather than through submit_job.
    out_dir = f"out/summary/{config}"
    os.makedirs(out_dir, exist_ok=True)

    # Both groups are computed before anything is written, so a failure in
    # either test leaves no partial output behind.
    summary_cols = [
        "snp",
        "het_pval",
        "coef1",
        "se1",
        "coef2",
        "se2",
        "assoc_p",
    ]
    for group in ["clump", "causal"]:
        dict_df_summary[group][summary_cols].to_csv(
            f"out/summary/{config}/sim_{sim_i}.{group}.tsv",
            index=False,
            sep="\t",
        )


def submit_job(config, n_sim=100):
    """Process every simulation of one configuration (the body of one cluster job).

    Creates ``out/clump/{config}`` and ``out/summary/{config}`` when missing, then
    runs ``gwas_clump`` followed by ``test_het`` for each simulation index in
    ``range(n_sim)``. A simulation is skipped when its
    ``sim_{sim_i}.causal.tsv`` summary already exists, so an interrupted job can
    be resubmitted without redoing finished work.

    Parameters
    ----------
    config : str
        Simulation prefix naming the output sub-folders.
    n_sim : int, default 100
        Number of simulated phenotypes to process.
    """
    # both output folders must be present before any simulation runs
    for out_dir in (f"out/clump/{config}", f"out/summary/{config}"):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    for sim_i in range(n_sim):
        done_marker = f"out/summary/{config}/sim_{sim_i}.causal.tsv"
        if not os.path.exists(done_marker):
            try:
                gwas_clump(config, sim_i)
                test_het(config, sim_i)
            except ValueError as err:
                # raised e.g. when no clump was formed; report it and move on
                print(err)

In [4]:
# Simulation grid: every region label (100 "<i>_20_80" labels followed by
# 100 "<i>_80_20" labels) crossed with one heritability and two causal-SNP counts.
region_labels = [f"{i}_20_80" for i in range(100)]
region_labels += [f"{i}_80_20" for i in range(100)]
param_grid = list(itertools.product(region_labels, [0.004], [1, 41]))

df_params = pd.DataFrame(param_grid, columns=["region", "hsq", "ncausal"])
# The prefix identifies a configuration in all input/output file names.
df_params["prefix"] = [
    f"region-{region}-hsq-{hsq}-ncausal-{int(ncausal)}"
    for region, hsq, ncausal in param_grid
]

# A configuration counts as finished once the causal summary of sim_99 exists
# (the last index under submit_job's default n_sim=100); keep only the rest.
is_finished = pd.Series(
    [
        os.path.exists(join("out/summary", prefix + "/sim_99.causal.tsv"))
        for prefix in df_params["prefix"]
    ],
    index=df_params.index,
    dtype=bool,
)
df_todo_params = df_params[~is_finished]

In [5]:
# Executor whose job logs are written to ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Lines passed as `setup`: they put the project's miniconda first on PATH and
# set PYTHONNOUSERSITE.
job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]
executor.update_parameters(time_min=180, memory_g=26, setup=job_setup)

# Map submit_job over the prefixes of all unfinished configurations.
jobs = executor.map_array(submit_job, df_todo_params["prefix"])