In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black
import pandas as pd
import matplotlib.pyplot as plt
import admix
import numpy as np
import glob
from os.path import join
import os
import itertools
import submitit

In [2]:
# Supplementary-tables workbook that lists the traits of both studies.
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp-tables.xlsx?dl=1"

# Download and open the workbook once, parsing both sheets from the same fetch
# (passing a list of sheet names makes read_excel return a dict keyed by name).
trait_sheets = pd.read_excel(
    SUPP_TABLE_URL, sheet_name=["ukb-trait-info", "page-trait-info"]
)

# UKB: keep only the traits whose "in-analysis" column is "T".
df_ukb_trait_info = trait_sheets["ukb-trait-info"]
ukb_trait_list = df_ukb_trait_info[df_ukb_trait_info["in-analysis"] == "T"].id.values

# PAGE: every trait listed in the sheet is used.
# `df_trait_info` still ends up bound to the PAGE sheet, so any code that reads
# it after this cell sees the same table as before.
df_trait_info = trait_sheets["page-trait-info"]
page_trait_list = df_trait_info.trait.values

In [3]:
# Input locations on the cluster, grouped by kind of data rather than by study.

# Genotype directories: perform_plink_gwas reads `chr{N}` and `PLINK1/chr{N}`
# underneath the directory of the selected study.
UKB_GENO_DIR = "/u/project/sgss/UKBB/UKB-ADMIXED/01-dataset/out/PLINK2/imputed"
PAGE_GENO_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr/imputed/"

# Phenotype directories: perform_plink_gwas reads `{trait}-sample10pc.tsv`
# from the directory of the selected study.
UKB_PHENO_DIR = "/u/project/sgss/UKBB/UKB-ADMIXED/02-genet-cor/out/pheno/"
PAGE_PHENO_DIR = "/u/project/pasaniuc/kangchen/2021-admix-corr/experiments/03-page-genome-wide-profile-likelihood/out/pheno"

In [4]:
def perform_plink_gwas(study, trait, out_prefix):
    """Run a per-chromosome PLINK2 GWAS plus clumping for one trait and merge the results.

    For chromosomes 1-22 this calls ``admix.tools.plink2.gwas`` followed by
    ``admix.tools.plink.clump``, then concatenates the per-chromosome outputs
    into ``{out_prefix}.assoc`` and ``{out_prefix}.clumped`` and finally removes
    every ``{out_prefix}.chr*`` intermediate file.

    Parameters
    ----------
    study : str
        ``"ukb"`` or ``"page"``; selects the genotype and phenotype directories.
        For ``"page"`` the ``study`` covariate is treated as categorical.
    trait : str
        Trait identifier; the sample table is read from
        ``{pheno_dir}/{trait}-sample10pc.tsv``. Its first column is used as the
        phenotype and all remaining columns as covariates.
    out_prefix : str
        Prefix of all output files. Its parent directory is created if missing.

    Raises
    ------
    ValueError
        If ``study`` is neither ``"ukb"`` nor ``"page"``.
    """
    # Fail fast on an unknown study: silently falling back would combine the
    # PAGE directories with the UKB covariate handling.
    if study not in ("ukb", "page"):
        raise ValueError(f"Unknown study {study!r}; expected 'ukb' or 'page'.")

    chroms = range(1, 23)

    # load both phenotype and covariates
    geno_dir = UKB_GENO_DIR if study == "ukb" else PAGE_GENO_DIR
    pheno_dir = UKB_PHENO_DIR if study == "ukb" else PAGE_PHENO_DIR

    pheno_path = join(pheno_dir, f"{trait}-sample10pc.tsv")
    df_sample_info = pd.read_csv(pheno_path, sep="\t", index_col=0, low_memory=False)

    pheno_col = df_sample_info.columns[0]
    covar_cols = df_sample_info.columns[1:]

    # impute covar columns (only for numerical columns)
    if study == "page":
        cat_cols = ["study"]
        numerical_cols = [col for col in covar_cols if col not in cat_cols]
    else:
        cat_cols = None
        numerical_cols = covar_cols

    df_sample_info[numerical_cols] = admix.data.impute_with_mean(
        df_sample_info[numerical_cols].values, axis=0
    )

    # Make sure the output location exists before any tool tries to write there.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for chrom in chroms:
        pfile = join(geno_dir, f"chr{chrom}")
        bfile = join(geno_dir, f"PLINK1/chr{chrom}")
        chrom_prefix = f"{out_prefix}.chr{chrom}"

        # perform plink2 gwas
        admix.tools.plink2.gwas(
            pfile=pfile,
            df_sample_info=df_sample_info,
            pheno_col=pheno_col,
            covar_cols=covar_cols,
            out_prefix=chrom_prefix,
            cat_cols=cat_cols,
            pheno_quantile_normalize=True,
            covar_quantile_normalize=True,
            clean_tmp_file=True,
            vif=100,
            memory=20000,  # presumably MB, as in PLINK's --memory -- TODO confirm
        )
        # clump the association results of this chromosome
        admix.tools.plink.clump(
            bfile=bfile,
            assoc_path=chrom_prefix + ".assoc",
            out_prefix=chrom_prefix,
            p1=5e-8,
            p2=1e-4,
            r2=0.1,
            kb=10000,  # 10Mb clumping window
            memory=20000,
        )

    # merge association results
    # (sep=r"\s+" is the replacement for the deprecated delim_whitespace=True)
    df_assoc = pd.concat(
        [
            pd.read_csv(f"{out_prefix}.chr{chrom}.assoc", sep=r"\s+")
            for chrom in chroms
        ],
        axis=0,
    ).reset_index(drop=True)
    df_assoc.to_csv(out_prefix + ".assoc", index=False, sep="\t")

    # merge clumped results
    clump_tables = []
    for chrom in chroms:
        clumped_file = f"{out_prefix}.chr{chrom}.clumped"
        with open(clumped_file) as f:
            n_lines = sum(1 for _ in f)
        # A file with at most one line holds no clumped region: it is either
        # empty or a single header/placeholder line (presumably written by
        # admix.tools.plink.clump when nothing passes p1 -- TODO confirm).
        if n_lines <= 1:
            continue
        clump_tables.append(pd.read_csv(clumped_file, sep=r"\s+"))

    if clump_tables:
        df_clump = pd.concat(clump_tables).sort_values(["CHR", "BP"])
        df_clump.to_csv(out_prefix + ".clumped", index=False, sep="\t")
    else:
        # Always leave a .clumped file behind, even when no region was found.
        with open(out_prefix + ".clumped", "w") as f:
            f.write("# No clumped region")

    # Remove the per-chromosome intermediates. The prefix is escaped so that
    # glob metacharacters in a trait name are matched literally.
    for tmp_path in glob.glob(glob.escape(out_prefix) + ".chr*"):
        os.remove(tmp_path)

In [5]:
# One row per (study, trait) pair: all UKB traits first, then all PAGE traits.
study_labels = ["ukb"] * len(ukb_trait_list) + ["page"] * len(page_trait_list)
trait_labels = [*ukb_trait_list, *page_trait_list]
df_params = pd.DataFrame({"study": study_labels, "trait": trait_labels})

# Output prefix passed to perform_plink_gwas for each pair.
df_params["out_prefix"] = df_params.apply(
    lambda row: f"out/gwas-assoc/{row['study']}-{row['trait']}", axis=1
)

In [6]:
# Shell lines handed to the executor as `setup`: they put the project's
# miniconda and tool directories on PATH and set PYTHONNOUSERSITE.
job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PATH=~/project-pasaniuc/software/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]

# Job logs go to ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")
# time_min / memory_g are presumably minutes and gigabytes per job -- TODO confirm.
executor.update_parameters(time_min=240, memory_g=36, setup=job_setup)

# Submit one perform_plink_gwas call per row of df_params as a job array.
jobs = executor.map_array(
    perform_plink_gwas,
    df_params["study"],
    df_params["trait"],
    df_params["out_prefix"],
)

In [None]:
# Rows whose merged association file ({out_prefix}.assoc) does not exist yet.
has_assoc = df_params.apply(
    lambda row: os.path.exists(row["out_prefix"] + ".assoc"), axis=1
)
df_todo_params = df_params[~has_assoc].reset_index(drop=True)