In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import pandas as pd
import admix
import dask
import dask.array as da
from tqdm import tqdm
import statsmodels.api as sm
import matplotlib.pyplot as plt
from admix.data import quantile_normalize
import submitit

import admix_genet_cor

# Process GWAS hits from PAGE study

In [2]:
# Trait metadata is read from the "trait-info" sheet of the supplementary
# tables workbook (downloaded from SUPP_TABLE_URL on every run).
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"
trait_info = pd.read_excel(SUPP_TABLE_URL, sheet_name="trait-info")

# Array of the values in the "trait" column; one job is later submitted per entry.
trait_list = trait_info.loc[:, "trait"].values

In [3]:
# Formatted PAGE GWAS hits (tab-separated); the first column becomes the index.
# NOTE(review): downstream code addresses rows as "{trait}:{CHROM}:{POS}" --
# presumably that is the index format of this file; confirm.
df_assoc = pd.read_csv(
    "out/page-gwas-formatted.tsv",
    sep="\t",
    index_col=0,
)

In [4]:
import dapgen
from os.path import join

# Root directory of the dataset; phenotype tables are read from "pheno/" and
# per-chromosome genotype files from "imputed/" underneath it.
ROOT_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr/"

# Directory holding the per-chromosome genotype files (chr1 ... chr22).
PFILE_DIR = join(ROOT_DIR, "imputed")

In [5]:
def submit_locus_hetero(trait, out, duffy_covar=False):
    """Test every GWAS hit of one trait for ancestry-specific effect heterogeneity.

    Rows of the module-level ``df_assoc`` whose ``trait_id`` equals ``trait``
    are selected. For each autosome that has at least one such hit, the
    genotype dataset under ``PFILE_DIR`` is loaded, restricted to the hit
    positions and to the phenotyped individuals, and each remaining SNP is
    passed to ``admix_genet_cor.test_snp_het`` and
    ``admix_genet_cor.test_snp_assoc``. Results are written into the
    selected rows and saved as CSV.

    Parameters
    ----------
    trait : str
        Trait identifier. Selects rows of ``df_assoc``, names the phenotype
        file ``pheno/{trait}.tsv`` under ``ROOT_DIR`` and the phenotype column.
    out : str or path-like
        Destination CSV. Missing parent directories are created.
    duffy_covar : bool, default False
        If True, the local ancestry at the SNP closest to the Duffy locus
        (GRCh38 1:159204893) is added as an extra covariate.

    Returns
    -------
    None
        The table (sorted by ``CHROM`` and ``POS``) is written to ``out`` with
        the added columns ``EUR_af``, ``AFR_af``, ``assoc_pval``, ``HET_pval``,
        ``EUR_beta``, ``AFR_beta``, ``EUR_beta_stderr`` and ``AFR_beta_stderr``.
    """
    # Local import so the function carries its own dependency when it is
    # serialized and executed on a worker.
    import os

    df_assoc_trait = df_assoc[df_assoc.trait_id == trait].copy()

    # ---- phenotype and covariates ------------------------------------
    df_trait = pd.read_csv(join(ROOT_DIR, f"pheno/{trait}.tsv"), sep="\t", index_col=0)
    df_trait.index = df_trait.index.astype(str)

    # chr1 is loaded only to obtain the individual table (and, optionally,
    # the local ancestry at the Duffy locus).
    dset = admix.io.read_dataset(
        join(PFILE_DIR, "chr1"),
        n_anc=2,
    )
    # Keep individuals that have a row in df_trait. This is an index
    # membership check only; rows with NaN phenotypes are not removed here.
    dset = dset[:, dset.indiv.index.isin(df_trait.index)]
    dset.append_indiv_info(df_trait)

    # Every column after the first is treated as a covariate.
    # NOTE(review): assumes the first column of the phenotype file is the
    # phenotype itself (named `trait`) -- confirm against the file layout.
    covar_cols = df_trait.columns[1:]

    df_pheno = dset.indiv[[trait]].copy()
    df_covar = dset.indiv[covar_cols].copy()
    df_covar = admix.data.convert_dummy(df_covar)

    # special case for duffy SNPs, include the duffy SNPs in the covariate
    if duffy_covar:
        # find closest SNPs to Duffy SNP (GRCH38: 1:159204893)
        duffy_snp_loc = np.argmin(np.abs(dset.snp.POS - 159204893))
        assert dset.snp.CHROM.iloc[duffy_snp_loc] == 1
        duffy_lanc = dset[duffy_snp_loc].lanc.sum(axis=[0, 2]).compute()
        df_covar["duffy_lanc"] = duffy_lanc

    for col in df_pheno.columns:
        df_pheno[col] = admix.data.quantile_normalize(df_pheno[col])

    for col in df_covar.columns:
        df_covar[col] = admix.data.quantile_normalize(df_covar[col])

    # fill na with column mean
    df_covar = df_covar.fillna(df_covar.mean())

    # Loop-invariant inputs of the per-SNP tests.
    pheno_values = df_pheno.values
    covar_values = df_covar.values
    indiv_ids = df_pheno.index.values

    # ---- per-chromosome, per-SNP tests -------------------------------
    for chrom in range(1, 23):
        df_assoc_trait_chrom = df_assoc_trait[df_assoc_trait.CHROM == chrom]
        if df_assoc_trait_chrom.empty:
            # No hits on this chromosome: avoid opening its genotype files.
            continue

        dset_chrom = admix.io.read_dataset(
            join(PFILE_DIR, f"chr{chrom}"),
            n_anc=2,
        )
        dset_chrom = dset_chrom[
            dset_chrom.snp.POS.isin(df_assoc_trait_chrom.POS.values).values,
            indiv_ids,
        ]
        if dset_chrom.n_snp == 0:
            # None of the hit positions is present in the genotype data.
            continue
        dset_chrom.persist()

        apa = dset_chrom.allele_per_anc()
        af = dset_chrom.af_per_anc()

        snp_chroms = dset_chrom.snp.CHROM
        snp_positions = dset_chrom.snp.POS

        for snp_i in tqdm(range(dset_chrom.n_snp)):
            snp_apa = apa[snp_i, :, :]
            p_het, model_het = admix_genet_cor.test_snp_het(
                snp_apa, pheno_values, covar_values
            )
            p_assoc, model_assoc = admix_genet_cor.test_snp_assoc(
                snp_apa, pheno_values, covar_values
            )

            # Explicit positional access: a plain `series[snp_i]` is a label
            # lookup and only falls back to position on non-integer indexes.
            snp_idx = f"{trait}:{snp_chroms.iloc[snp_i]}:{snp_positions.iloc[snp_i]}"

            # NOTE(review): ancestry index 0 is labelled EUR and 1 AFR here;
            # confirm this matches the ancestry order of the dataset.
            df_assoc_trait.loc[snp_idx, ["EUR_af", "AFR_af"]] = (
                af[snp_i, 0],
                af[snp_i, 1],
            )

            # NOTE(review): assumes params[1] / params[2] (and the matching
            # bse entries) are the per-ancestry allele effects of the
            # heterogeneity model -- confirm against test_snp_het.
            df_assoc_trait.loc[
                snp_idx,
                [
                    "assoc_pval",
                    "HET_pval",
                    "EUR_beta",
                    "AFR_beta",
                    "EUR_beta_stderr",
                    "AFR_beta_stderr",
                ],
            ] = [
                p_assoc,
                p_het,
                model_het.params[1],
                model_het.params[2],
                model_het.bse[1],
                model_het.bse[2],
            ]

    # ---- output -------------------------------------------------------
    # Create the destination directory if needed so the job does not fail
    # at the very last step after all tests have been computed.
    if isinstance(out, (str, os.PathLike)):
        out_dir = os.path.dirname(out)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    df_assoc_trait.sort_values(["CHROM", "POS"]).to_csv(out)

In [6]:
# One output CSV per trait, listed in the same order as trait_list.
out_paths = [f"out/locus-hetero/{trait}.csv" for trait in trait_list]

# Settings applied to every submitted job; the `setup` lines are shell
# commands selecting the Python environment.
job_parameters = dict(
    time_min=20,
    memory_g=16,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(**job_parameters)

# Job array: submit_locus_hetero(trait, out_path) for each trait.
jobs = executor.map_array(submit_locus_hetero, trait_list, out_paths)