In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black
import pandas as pd
import matplotlib.pyplot as plt
import admix
import numpy as np
import glob
from os.path import join
import os
import itertools
import submitit
import subprocess

In [2]:
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp-tables.xlsx?dl=1"

# Fetch the workbook once and parse both sheets from that single download;
# passing a list to `sheet_name` makes read_excel return a dict of DataFrames
# keyed by sheet name.
trait_sheets = pd.read_excel(
    SUPP_TABLE_URL, sheet_name=["ukb-trait-info", "page-trait-info"]
)
df_ukb_trait_info = trait_sheets["ukb-trait-info"]
df_page_trait_info = trait_sheets["page-trait-info"]

# UKB: keep only the ids of traits flagged "T" in the `in-analysis` column.
ukb_trait_list = df_ukb_trait_info.loc[
    df_ukb_trait_info["in-analysis"] == "T", "id"
].values

# PAGE: every trait listed in the sheet is used.
page_trait_list = df_page_trait_info["trait"].values

# This cell used to leave `df_trait_info` bound to the PAGE sheet; the name is
# kept so anything still referring to it sees the same table.
df_trait_info = df_page_trait_info

In [3]:
# PAGE inputs: genotype directory and directory of raw phenotype tables.
PAGE_GENO_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/"
    "PAGE-QC/01-dataset/out/aframr/imputed/"
)
PAGE_PHENO_DIR = (
    "/u/project/pasaniuc/kangchen/2021-admix-corr/"
    "experiments/03-page-genome-wide-profile-likelihood-new/out/pheno"
)

# UKB inputs: both directories live under the same dataset root.
_UKB_ROOT = "/u/project/sgss/UKBB/UKB-ADMIXED"
UKB_GENO_DIR = f"{_UKB_ROOT}/01-dataset/out/PLINK2/imputed"
UKB_PHENO_DIR = f"{_UKB_ROOT}/02-genet-cor/out/pheno/"

In [4]:
# Per-study inputs, declared once: (trait list, directory of raw phenotype tables).
# Insertion order keeps the processing order "page" then "ukb".
study_inputs = {
    "page": (page_trait_list, PAGE_PHENO_DIR),
    "ukb": (ukb_trait_list, UKB_PHENO_DIR),
}

for study, (trait_list, pheno_dir) in study_inputs.items():
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the output folder exists before writing anything into it.
    os.makedirs(f"out/processed-pheno/{study}", exist_ok=True)

    for trait in trait_list:
        df_trait = pd.read_csv(
            os.path.join(pheno_dir, f"{trait}-sample10pc.tsv"),
            sep="\t",
            index_col=0,
            low_memory=False,
        )

        # The first column is treated as the phenotype, the rest as covariates.
        pheno_col = df_trait.columns[0]
        # Center the phenotype (pandas' mean skips missing values).
        df_trait[pheno_col] = df_trait[pheno_col] - df_trait[pheno_col].mean()
        df_trait = df_trait.rename(columns={pheno_col: "pheno"})

        # Expand categorical columns into dummy variables, then fill missing
        # entries column-wise with the column mean.
        df_trait = admix.data.convert_dummy(df_trait)
        df_trait.loc[:, df_trait.columns] = admix.data.impute_with_mean(
            df_trait.values, axis=0
        )
        df_trait.index.name = "#IID"

        # Select by name rather than by position so the split does not depend
        # on the column order returned by convert_dummy.
        df_trait[["pheno"]].to_csv(
            f"out/processed-pheno/{study}/{trait}.pheno.tsv", sep="\t", na_rep="NA"
        )
        df_trait.drop(columns=["pheno"]).to_csv(
            f"out/processed-pheno/{study}/{trait}.covar.tsv", sep="\t", na_rep="NA"
        )

2022-06-03 12:51.53 [info     ] Detected categorical columns: study
2022-06-03 12:51.53 [info     ] Added dummy variables: study_MEC,study_WHI
2022-06-03 12:51.53 [info     ] Detected categorical columns: study
2022-06-03 12:51.53 [info     ] Added dummy variables: study_WHI
2022-06-03 12:51.53 [info     ] Detected categorical columns: study
2022-06-03 12:51.53 [info     ] Added dummy variables: study_WHI
2022-06-03 12:51.53 [info     ] Detected categorical columns: study
2022-06-03 12:51.53 [info     ] Added dummy variables: study_WHI
2022-06-03 12:51.54 [info     ] Detected categorical columns: study
2022-06-03 12:51.54 [info     ] Added dummy variables: study_MEC,study_WHI
2022-06-03 12:51.54 [info     ] Detected categorical columns: study
2022-06-03 12:51.54 [info     ] Added dummy variables: study_MEC,study_WHI
2022-06-03 12:51.54 [info     ] Detected categorical columns: study
2022-06-03 12:51.54 [info     ] Added dummy variables: study_MEC,study_WHI
2022-06-03 12:51.54 [info    

In [5]:
def evaluate_het(study, chrom):
    """Run association testing, clumping and heterogeneity tests for one chromosome.

    For every trait of ``study`` the function shells out to three tools in
    sequence, all writing under ``out/gwas/{study}/{trait}.chr{chrom}``:

    1. ``plink2 --glm`` on the PLINK1 genotype files, followed by a ``sed``
       pass that copies the result to ``.assoc`` with the first ``ID`` on the
       header line replaced by ``SNP``.
    2. ``plink --clump`` on that ``.assoc`` file; the third field of each
       non-empty, non-header line of ``.clumped`` is saved to
       ``.clumped_snp_list``.
    3. ``admix assoc`` restricted to the SNPs in ``.clumped_snp_list``.

    Traits for which step 2 leaves no ``.clumped`` file are skipped.

    Parameters
    ----------
    study : str
        Either ``"ukb"`` or ``"page"``; selects the genotype directory and
        the trait list.
    chrom : int or str
        Chromosome label, interpolated into ``chr{chrom}`` file names.

    Raises
    ------
    ValueError
        If ``study`` is neither ``"ukb"`` nor ``"page"``.
    subprocess.CalledProcessError
        If any external command exits with a non-zero status.
    """
    # Resolve genotype directory and trait list together so they can never
    # disagree (previously an unknown study paired PAGE genotypes with UKB traits).
    if study == "ukb":
        geno_dir, trait_list = UKB_GENO_DIR, ukb_trait_list
    elif study == "page":
        geno_dir, trait_list = PAGE_GENO_DIR, page_trait_list
    else:
        raise ValueError(f"unknown study {study!r}; expected 'ukb' or 'page'")

    pfile = join(geno_dir, f"chr{chrom}")
    bfile = join(geno_dir, f"PLINK1/chr{chrom}")

    # The tools are handed an output prefix inside this folder; create it up
    # front. exist_ok=True keeps concurrent jobs from failing on each other.
    os.makedirs(f"out/gwas/{study}", exist_ok=True)

    def run_shell(command):
        # shell=True is needed for the `>` redirections used by sed and awk.
        subprocess.check_call(command, shell=True)

    for trait in trait_list:
        pheno_file = f"out/processed-pheno/{study}/{trait}.pheno.tsv"
        covar_file = f"out/processed-pheno/{study}/{trait}.covar.tsv"
        out_prefix = f"out/gwas/{study}/{trait}.chr{chrom}"
        clumped_file = f"{out_prefix}.clumped"

        # step 1: association testing
        cmds = [
            "plink2",
            f"--bfile {bfile}",
            f"--pheno iid-only {pheno_file}",
            f"--covar iid-only {covar_file}",
            "--quantile-normalize --glm hide-covar omit-ref --vif 100",
            "--memory 20000",
            f"--out {out_prefix}",
        ]
        run_shell(" ".join(cmds))
        # Rename the ID header to SNP (presumably the column name that
        # `plink --clump` looks for -- confirm against the PLINK docs).
        run_shell(
            f"sed '1 s/ID/SNP/' {out_prefix}.pheno.glm.linear > {out_prefix}.assoc"
        )

        # step 2: clumping
        # Drop any .clumped left by an earlier run first; otherwise the
        # existence check below would accept stale results when this run
        # produces no clumps.
        if os.path.exists(clumped_file):
            os.remove(clumped_file)
        cmds = [
            "plink",
            f"--bfile {bfile}",
            f"--clump {out_prefix}.assoc",
            "--clump-p1 5e-8 --clump-p2 1e-4 --clump-r2 0.1 --clump-kb 10000 --memory 20000",
            f"--out {out_prefix}",
        ]
        run_shell(" ".join(cmds))
        if not os.path.exists(clumped_file):
            # No clump file was written for this trait/chromosome: nothing to test.
            continue

        # Keep the third field of every non-empty line after the header.
        run_shell(
            f"awk '(NF > 0) && (NR > 1) {{print $3 }}' {out_prefix}.clumped > {out_prefix}.clumped_snp_list"
        )

        # step 3: heterogeneity
        cmds = [
            "admix assoc",
            f"--pfile {pfile}",
            f"--pheno {pheno_file}",
            f"--covar {covar_file}",
            f"--out {out_prefix}",
            "--method HET,ADM,ASE,ATT,TRACTOR",
            "--family quant",
            "--quantile-normalize True",
            f"--snp-list {out_prefix}.clumped_snp_list",
        ]
        run_shell(" ".join(cmds))

In [6]:
# One row per (study, chromosome) combination: all 22 autosomes for "ukb",
# followed by all 22 for "page".
studies = ["ukb", "page"]
chromosomes = np.arange(1, 23)
study_chrom_pairs = list(itertools.product(studies, chromosomes))
df_params = pd.DataFrame(study_chrom_pairs, columns=["study", "chrom"])

In [7]:
# Shell lines handed to the executor as `setup`: they put the project's conda
# and tool directories on PATH and set PYTHONNOUSERSITE.
job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PATH=~/project-pasaniuc/software/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]

# Job logs go to ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")
# NOTE(review): time_min / memory_g presumably mean minutes and GB; the plink
# commands in evaluate_het pass `--memory 20000`, so 24 would leave headroom.
executor.update_parameters(time_min=180, memory_g=24, setup=job_setup)

# Submit evaluate_het over the study and chrom columns of df_params.
jobs = executor.map_array(evaluate_het, df_params.study, df_params.chrom)