# Simulation to evaluate the cause of inflation

1. Take the chromosome-wide simulation from the first experiment.
2. Perform a GWAS on the data on chromosome 22.
3. Perform clumping on the data on chromosome 22.
4. For every GWAS hit, find the true causal SNPs and evaluate their properties, e.g., how heterogeneity relates to the distance to the causal SNP, and which strategies can be used to identify the causal variants.
5. Run heterogeneity test on the index SNPs.

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import matplotlib.pyplot as plt
import numpy as np
import json
import pandas as pd
import seaborn as sns
import os
import admix
from os.path import join
import itertools
import admix_genet_cor
import submitit
import glob
from scipy.stats import pearsonr, linregress
from tqdm import tqdm
import string
from natsort import natsorted

# Convert PLINK2 to PLINK1

In [2]:
# Output directory of the PAGE-QC data set step that holds the `aframr` genotypes.
GENO_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr"
# Per-individual information table stored next to the genotypes.
SAMPLE_INFO_PATH = f"{GENO_DIR}/sample_info.txt"

# convert to PLINK1
def convert_plink1():
    """Convert the imputed PLINK2 filesets of chromosomes 1-22 to PLINK1 format.

    For every chromosome, PLINK2 is run with ``--make-bed`` on
    ``<GENO_DIR>/imputed/chr<chrom>`` and the result is written to
    ``out/PLINK/chr<chrom>``.
    """
    # make sure the output directory exists before PLINK writes into it
    os.makedirs("out/PLINK", exist_ok=True)
    for chrom in range(1, 23):
        pfile = join(GENO_DIR, f"imputed/chr{chrom}")
        admix.tools.plink2.run(f"--pfile {pfile} --make-bed --out out/PLINK/chr{chrom}")


# convert_plink1()

# Perform GWAS clumping

In [3]:
def gwas_clump(config, sim_i):
    """Run a GWAS followed by LD clumping for one simulated phenotype.

    Parameters
    ----------
    config : str
        Prefix of the simulation setting; phenotypes are read from
        ``out/pheno/{config}.pheno.tsv.gz``.
    sim_i : int
        Index of the simulation replicate; the phenotype column is
        ``SIM_{sim_i}``.

    Notes
    -----
    Writes ``out/clump/{config}.sim_{sim_i}.clumped`` (tab-separated, sorted by
    ``CHR`` and ``BP``) and removes the per-chromosome intermediate files.
    When no chromosome yields a clump, a header-only table is written instead
    of raising, so that the downstream steps can still run.
    """
    # chromosomes to analyze; currently restricted to chromosome 1
    chroms = range(1, 2)
    # column layout of a PLINK 1.9 `.clumped` report, used when nothing is clumped
    clumped_columns = [
        "CHR",
        "F",
        "SNP",
        "BP",
        "P",
        "TOTAL",
        "NSIG",
        "S05",
        "S01",
        "S001",
        "S0001",
        "SP2",
    ]
    clump_file_prefix = f"out/clump/{config}.sim_{sim_i}"
    # make sure the output directory exists before PLINK writes into it
    os.makedirs("out/clump", exist_ok=True)

    # only the individual-level table of this data set is used (for the
    # covariate), so a single chromosome is read
    dset_chrom = admix.io.read_dataset(
        pfile=join(GENO_DIR, "imputed/chr1"),
        indiv_info_file=SAMPLE_INFO_PATH,
        n_anc=2,
    )

    df_sample_info = pd.read_csv(
        f"out/pheno/{config}.pheno.tsv.gz", sep="\t", index_col=0
    )
    # extract covariates and perform standardization
    df_cov = dset_chrom.indiv[["geno_EV1"]]
    df_cov = (df_cov - df_cov.mean(axis=0)) / df_cov.std(axis=0)
    # align phenotype and covariate on the individual index
    df_sample_info = pd.merge(
        df_sample_info[[f"SIM_{sim_i}"]],
        df_cov,
        left_index=True,
        right_index=True,
    )

    for chrom in chroms:
        out_prefix = f"{clump_file_prefix}.chr{chrom}"
        admix.tools.plink2.gwas(
            pfile=join(GENO_DIR, f"imputed/chr{chrom}"),
            df_sample_info=df_sample_info,
            pheno_col=f"SIM_{sim_i}",
            covar_cols=["geno_EV1"],
            out_prefix=out_prefix,
            clean_tmp_file=True,
            memory=int(28 * 1e3),
        )
        # See Pardinas et al. for description of these set of parameters
        admix.tools.plink.clump(
            bfile=f"out/PLINK/chr{chrom}",
            assoc_path=out_prefix + ".assoc",
            out_prefix=out_prefix,
            p1=5e-8,
            p2=1e-4,
            r2=0.1,
            kb=10000,  # 10Mb clumping window
            memory=int(28 * 1e3),
        )
        # the association table is no longer needed once clumping is done
        os.remove(out_prefix + ".assoc")

    # merging all the clumping
    clump_tables = []
    for chrom in chroms:
        clumped_file = f"{clump_file_prefix}.chr{chrom}.clumped"
        with open(clumped_file) as f:
            n_lines = sum(1 for _ in f)
        # at most one line: presumably a header-only report, i.e., no clump
        if n_lines <= 1:
            continue
        clump_tables.append(pd.read_csv(clumped_file, sep=r"\s+"))

    if clump_tables:
        df_clump = pd.concat(clump_tables).sort_values(["CHR", "BP"])
    else:
        # pd.concat raises on an empty list; write a header-only table instead
        df_clump = pd.DataFrame(columns=clumped_columns)
    df_clump.to_csv(clump_file_prefix + ".clumped", index=False, sep="\t")

    # clean up the per-chromosome intermediate files
    for f in glob.glob(clump_file_prefix + ".chr*"):
        os.remove(f)


def test_het(config, sim_i):
    """Run heterogeneity and association tests at clumped and causal SNPs.

    Parameters
    ----------
    config : str
        Prefix of the simulation setting; selects the files under
        ``out/pheno/`` and ``out/clump/``.
    sim_i : int
        Index of the simulation replicate; the phenotype column is
        ``SIM_{sim_i}``.

    Notes
    -----
    Writes one tab-separated table per SNP group to
    ``out/summary/{config}.sim_{sim_i}.{group}.tsv`` with ``group`` in
    ``{"clump", "causal"}``. A group without any SNP (e.g., a replicate
    without GWAS hit) results in a header-only table instead of an error.
    """
    # chromosomes to analyze; currently restricted to chromosome 1
    chroms = range(1, 2)
    summary_cols = ["snp", "het_pval", "coef1", "se1", "coef2", "se2", "assoc_p"]
    os.makedirs("out/summary", exist_ok=True)

    ###########
    # read info
    ###########
    df_clump = (
        pd.read_csv(f"out/clump/{config}.sim_{sim_i}.clumped", sep="\t")
        .set_index("SNP")
        .sort_values(["CHR", "BP"])
    )
    df_pheno = pd.read_csv(f"out/pheno/{config}.pheno.tsv.gz", sep="\t", index_col=0)[
        f"SIM_{sim_i}"
    ]

    df_beta = pd.read_csv(f"out/pheno/{config}.beta_info.tsv.gz", sep="\t", index_col=0)
    # attach effect sizes of this replicate; the archive is closed right after
    with np.load(f"out/pheno/{config}.beta.npz") as npz:
        df_beta[["AFR_BETA", "EUR_BETA"]] = npz["arr_0"][:, :, sim_i]
    # sanity check: the two effect-size columns are expected to be identical
    assert np.allclose(df_beta["AFR_BETA"], df_beta["EUR_BETA"])

    # extract only causal SNPs
    # TODO: maybe threshold this such that the expected p-value < 5e-8
    dict_snp_list = {
        "causal": df_beta[df_beta.AFR_BETA != 0].index.values,
        "clump": df_clump.index.values,
    }

    dict_df_summary = {"clump": [], "causal": []}

    for chrom in chroms:
        dset_chrom = admix.io.read_dataset(
            pfile=join(GENO_DIR, f"imputed/chr{chrom}"),
            indiv_info_file=SAMPLE_INFO_PATH,
            n_anc=2,
        )
        # standardized covariate; identical for both SNP groups
        cov_values = dset_chrom.indiv[["geno_EV1"]].values
        cov_values = (cov_values - cov_values.mean(axis=0)) / cov_values.std(axis=0)

        for group in ["clump", "causal"]:
            snp_list = [
                snp for snp in dict_snp_list[group] if snp.startswith(f"chr{chrom}:")
            ]
            if len(snp_list) == 0:
                continue
            dset_tmp = dset_chrom[snp_list]
            # NOTE(review): `apa` is never used below; kept because it is not
            # visible here whether the call has side effects -- confirm and drop
            apa = dset_tmp.allele_per_anc()

            # NOTE(review): phenotypes are passed as read from file; this
            # assumes they are ordered like `dset_chrom.indiv` -- confirm
            # heterogeneity test
            df_tmp = admix_genet_cor.marginal_het(
                geno=dset_tmp.geno,
                lanc=dset_tmp.lanc,
                y=df_pheno,
                cov=cov_values,
            )
            # association test
            df_tmp["assoc_p"] = admix.assoc.marginal(
                dset=dset_tmp,
                pheno=df_pheno,
                cov=cov_values,
                method="ATT",
            ).P.values
            df_tmp["snp"] = dset_tmp.snp.index.values
            dict_df_summary[group].append(df_tmp)

    for group in ["clump", "causal"]:
        if dict_df_summary[group]:
            df_tmp = pd.concat(dict_df_summary[group])[summary_cols]
        else:
            # pd.concat raises on an empty list; write a header-only table instead
            df_tmp = pd.DataFrame(columns=summary_cols)
        df_tmp.to_csv(
            f"out/summary/{config}.sim_{sim_i}.{group}.tsv",
            index=False,
            sep="\t",
        )


def submit_job(config, sim_i):
    """Job entry point: GWAS + clumping, then heterogeneity tests, for one replicate."""
    gwas_clump(config=config, sim_i=sim_i)
    test_het(config=config, sim_i=sim_i)

In [4]:
# define the simulation parameters
# Enumerate every simulation setting: hsq x ncausal x replicate index.
df_params = pd.DataFrame(
    list(
        itertools.product(
            [0.025, 0.05, 0.1, 0.2], [62, 125, 250, 500, 1000], np.arange(0, 500)
        )
    ),
    columns=["hsq", "ncausal", "sim_i"],
)
# File prefix shared by all replicates of one (hsq, ncausal) setting.
df_params["prefix"] = [
    f"hsq-{hsq}-ncausal-{int(ncausal)}"
    for hsq, ncausal in zip(df_params["hsq"], df_params["ncausal"])
]

# Keep only the replicates whose causal-SNP summary has not been written yet.
df_todo_params = df_params[
    [
        not os.path.exists(join("out/summary", f"{prefix}.sim_{sim_i}.causal.tsv"))
        for prefix, sim_i in zip(df_params["prefix"], df_params["sim_i"])
    ]
]

In [5]:
# Executor that writes its logs to ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Resources requested per task and shell set-up run before each task.
executor.update_parameters(
    time_min=180,
    memory_g=36,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# One task per unfinished (prefix, sim_i) pair.
jobs = executor.map_array(
    submit_job, df_todo_params["prefix"], df_todo_params["sim_i"]
)

In [8]:
# Number of replicates still to run for each simulation setting.
df_todo_params.prefix.value_counts()

hsq-0.025-ncausal-1000    435
hsq-0.025-ncausal-500     376
hsq-0.05-ncausal-1000     247
hsq-0.025-ncausal-250     224
hsq-0.05-ncausal-500       71
hsq-0.025-ncausal-125      36
hsq-0.1-ncausal-1000        5
hsq-0.05-ncausal-250        2
hsq-0.025-ncausal-62        1
Name: prefix, dtype: int64

# Legacy code below

In [6]:
# Guard: always fails, so a top-to-bottom run stops before the legacy code below.
assert False

AssertionError: 

In [None]:
# Legacy analysis: for every clump, collect the index SNP together with the
# tagged SNPs that are causal, then run heterogeneity and association tests.
# NOTE(review): relies on `df_clump`, `causal_snp_list` and `df_pheno`, none of
# which is defined at this level in the visible cells.
df_sum = []
for chrom, df_chrom in df_clump.groupby("CHR"):

    # column-wise accumulator for the per-SNP annotation table of this chromosome
    df_tmp_snp = {"snp": [], "index_snp": [], "is_causal": [], "clump_size": []}

    for index_snp, row in df_chrom.iterrows():
        if row.SP2 != "NONE":
            # drop the last three characters of every SP2 entry
            # (presumably the "(1)" suffix written by PLINK -- TODO confirm)
            snp_list = [s[:-3] for s in row.SP2.split(",")]
            clump_size = len(snp_list)
            # find tag SNPs that are also causal snp
            snp_list = [s for s in snp_list if s in causal_snp_list]
        else:
            snp_list = []
            clump_size = 1
        snp_list.append(index_snp)
        df_tmp_snp["snp"].extend(snp_list)
        df_tmp_snp["index_snp"].extend([index_snp] * len(snp_list))
        # NOTE(review): `index_snp` is appended LAST to `snp_list`, but the
        # `False` flag is placed FIRST here; the flags look misaligned whenever
        # the clump contains causal tag SNPs -- confirm
        df_tmp_snp["is_causal"].extend([False] + [True] * (len(snp_list) - 1))
        df_tmp_snp["clump_size"].extend([clump_size] * len(snp_list))

    df_tmp_snp = pd.DataFrame(df_tmp_snp).set_index("snp")
    # natural sort of the SNP identifiers
    df_tmp_snp = df_tmp_snp.loc[natsorted(df_tmp_snp.index)]

    # load the chromosome and subset it to the collected SNPs
    dset_chrom = admix.io.read_dataset(
        pfile=join(GENO_DIR, f"imputed/chr{chrom}"), n_anc=2
    )[df_tmp_snp.index.values]

    # heterogeneity test, without covariates
    df_tmp_corr = admix_genet_cor.marginal_het(
        dset_chrom.geno, dset_chrom.lanc, df_pheno
    )
    # association test, without covariates
    df_tmp_corr["assoc_p"] = admix.assoc.marginal(
        geno=dset_chrom.geno, lanc=dset_chrom.lanc, pheno=df_pheno, cov=None
    )
    df_tmp_corr.index = df_tmp_snp.index.values