In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import pandas as pd
import admix
import dask
import dask.array as da
from tqdm import tqdm
import statsmodels.api as sm
import matplotlib.pyplot as plt
from admix.data import quantile_normalize
import submitit
import dapgen
import os
import admix_genet_cor

# Process GWAS hits from PAGE / UKB

In [2]:
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp-tables.xlsx?dl=1"

# Fetch the workbook once and pull both sheets out of it; passing a list to
# `sheet_name` returns a dict keyed by sheet name, so the remote file is not
# downloaded and parsed a second time.
trait_sheets = pd.read_excel(
    SUPP_TABLE_URL, sheet_name=["ukb-trait-info", "page-trait-info"]
)

# UKB: keep only the traits flagged "T" in the "in-analysis" column.
df_ukb_trait_info = trait_sheets["ukb-trait-info"]
ukb_trait_list = df_ukb_trait_info.loc[
    df_ukb_trait_info["in-analysis"] == "T", "id"
].values

# PAGE: every trait listed in the sheet is used.
# `df_trait_info` stays bound to the PAGE sheet, as it was after the original cell ran.
df_trait_info = trait_sheets["page-trait-info"]
page_trait_list = df_trait_info["trait"].values

In [24]:
# PAGE inputs used by `submit_locus_hetero` when `study` is not "ukb":
# - genotype directory: datasets are read from `<dir>/chr{N}` via `admix.io.read_dataset`
# - phenotype directory: one table per trait, read from `<dir>/{trait}-sample10pc.tsv`
PAGE_GENO_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr/imputed/"
PAGE_PHENO_DIR = "/u/project/pasaniuc/kangchen/2021-admix-corr/experiments/03-page-genome-wide-profile-likelihood/out/pheno"

# UKB inputs used by `submit_locus_hetero` when `study == "ukb"`; same layout
# (`chr{N}` genotype datasets, `{trait}-sample10pc.tsv` phenotype tables).
UKB_GENO_DIR = "/u/project/sgss/UKBB/UKB-ADMIXED/01-dataset/out/PLINK2/imputed"
UKB_PHENO_DIR = "/u/project/sgss/UKBB/UKB-ADMIXED/02-genet-cor/out/pheno/"

In [27]:
def submit_locus_hetero(study, trait, out):
    """Test every clumped GWAS variant of one trait for ancestry heterogeneity.

    The clumped variants are read from ``out/gwas-assoc/{study}-{trait}.clumped``.
    For each variant on chromosomes 1-22, ``admix_genet_cor.test_snp_het`` and
    ``admix_genet_cor.test_snp_assoc`` are run on the per-ancestry allele counts
    against the quantile-normalized phenotype and covariates, and the results
    are written to ``out`` as a CSV sorted by ``CHROM`` and ``POS``.

    Parameters
    ----------
    study : str
        ``"ukb"`` selects ``UKB_GENO_DIR`` / ``UKB_PHENO_DIR``; any other value
        selects ``PAGE_GENO_DIR`` / ``PAGE_PHENO_DIR``.
    trait : str
        Trait identifier, used in the clump file name and in the phenotype
        file name ``{trait}-sample10pc.tsv``.
    out : str
        Path of the output file.

    Returns
    -------
    None

    Notes
    -----
    When the clump file consists of exactly one line (presumably only the
    header), ``out`` receives the text ``# No clumped region`` and nothing
    else is computed.
    """

    # load clumped variants
    clump_file = f"out/gwas-assoc/{study}-{trait}.clumped"
    # count lines with a context manager so the handle is closed deterministically
    with open(clump_file) as f:
        n_line = sum(1 for _ in f)
    if n_line == 1:
        with open(out, "w") as f:
            f.write("# No clumped region")
        return

    df_assoc = (
        pd.read_csv(clump_file, sep="\t")[["CHR", "SNP", "BP", "P"]]
        .rename(columns={"CHR": "CHROM", "BP": "POS", "P": "PLINK_P"})
        .set_index("SNP")
    )

    # pick the study-specific input directories
    geno_dir = UKB_GENO_DIR if study == "ukb" else PAGE_GENO_DIR
    pheno_dir = UKB_PHENO_DIR if study == "ukb" else PAGE_PHENO_DIR

    # load phenotype (first column) and covariates (remaining columns)
    pheno_path = os.path.join(pheno_dir, f"{trait}-sample10pc.tsv")
    df_trait = pd.read_csv(pheno_path, sep="\t", index_col=0)
    df_trait.index = df_trait.index.astype(str)

    # chr1 is only read to obtain the individuals and align the trait table to them
    dset_indiv = admix.io.read_dataset(
        os.path.join(geno_dir, "chr1"),
        n_anc=2,
    )
    # discard every pre-existing per-individual column before attaching df_trait
    dset_indiv._indiv.drop(columns=list(dset_indiv._indiv.columns), inplace=True)

    # keep only individuals that appear in df_trait
    dset_indiv = dset_indiv[:, dset_indiv.indiv.index.isin(df_trait.index)]
    dset_indiv.append_indiv_info(df_trait)
    pheno_col = df_trait.columns[[0]]
    covar_cols = df_trait.columns[1:]

    df_pheno = dset_indiv.indiv[pheno_col].copy()
    df_covar = dset_indiv.indiv[covar_cols].copy()
    df_covar = admix.data.convert_dummy(df_covar)

    for col in df_pheno.columns:
        df_pheno[col] = admix.data.quantile_normalize(df_pheno[col])

    for col in df_covar.columns:
        df_covar[col] = admix.data.quantile_normalize(df_covar[col])

    # fill na with column mean
    df_covar = df_covar.fillna(df_covar.mean())

    # loop-invariant inputs of the per-SNP tests
    pheno_values = df_pheno.values
    covar_values = df_covar.values
    indiv_ids = df_pheno.index.values

    result_cols = [
        "EUR_af",
        "AFR_af",
        "assoc_pval",
        "HET_pval",
        "EUR_beta",
        "AFR_beta",
        "EUR_beta_stderr",
        "AFR_beta_stderr",
    ]

    for chrom in range(1, 23):
        # subset df_assoc
        df_assoc_chrom = df_assoc[df_assoc.CHROM == chrom]
        if len(df_assoc_chrom) == 0:
            continue
        # restrict this chromosome's dataset to the clumped SNPs and the
        # phenotyped individuals (same order as df_pheno / df_covar)
        dset_chrom = admix.io.read_dataset(
            os.path.join(geno_dir, f"chr{chrom}"),
            n_anc=2,
        )
        dset_chrom = dset_chrom[df_assoc_chrom.index.values, indiv_ids]
        dset_chrom.persist()

        apa = dset_chrom.allele_per_anc()
        af = dset_chrom.af_per_anc()

        for snp_i in tqdm(range(dset_chrom.n_snp)):
            p_het, model_het = admix_genet_cor.test_snp_het(
                apa[snp_i, :, :], pheno_values, covar_values
            )
            # only the p-value of the association test is kept
            p_assoc, _ = admix_genet_cor.test_snp_assoc(
                apa[snp_i, :, :], pheno_values, covar_values
            )
            snp_idx = dset_chrom.snp.index[snp_i]
            # ancestry index 0 is reported as EUR and index 1 as AFR; entries
            # 1 and 2 of the heterogeneity model are the two per-ancestry effects
            df_assoc.loc[snp_idx, result_cols] = [
                af[snp_i, 0],
                af[snp_i, 1],
                p_assoc,
                p_het,
                model_het.params[1],
                model_het.params[2],
                model_het.bse[1],
                model_het.bse[2],
            ]
    # row-wise assignment above may have upcast these columns; restore integers
    df_assoc[["CHROM", "POS"]] = df_assoc[["CHROM", "POS"]].astype(int)
    df_assoc.sort_values(["CHROM", "POS"]).to_csv(out, float_format="%.8g")

In [28]:
# One row per (study, trait) job: all UKB traits first, then all PAGE traits.
df_params = pd.DataFrame(
    {
        "study": ["ukb"] * len(ukb_trait_list) + ["page"] * len(page_trait_list),
        "trait": [*ukb_trait_list, *page_trait_list],
    }
)
# Output CSV path of each job.
df_params["out"] = df_params.apply(
    lambda row: f"out/gwas-het/{row.study}-{row.trait}.csv", axis=1
)

In [29]:
# Shell lines handed to the executor as `setup`
# (presumably executed at the start of every job — confirm against the submitit fork in use).
job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]

executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(time_min=40, memory_g=16, setup=job_setup)

# Submit one array task per row of df_params; arguments are matched positionally
# to submit_locus_hetero(study, trait, out).
jobs = executor.map_array(
    submit_locus_hetero,
    df_params["study"],
    df_params["trait"],
    df_params["out"],
)