In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import dapgen
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import admix
from os.path import join
import os
import json

In [6]:
# --- Notebook configuration -------------------------------------------------
# Spreadsheet holding the supplementary tables; only the "trait-info" sheet is
# read below.
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"

# Root directory of the dataset; every other input path is derived from it.
ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr"
)
PFILE_DIR = os.path.join(ROOT_DIR, "imputed")
SAMPLE_INFO_PATH = os.path.join(ROOT_DIR, "sample_info.txt")

# Trait metadata table and the array of trait names to analyse.
trait_info = pd.read_excel(SUPP_TABLE_URL, sheet_name="trait-info")
trait_list = trait_info.loc[:, "trait"].values

In [7]:
def submit_gcta_estimate(snpset, hermodel, trait, duffy_covar=True):
    """Run GCTA REML for one trait / SNP set / heritability model combination.

    The phenotype and covariates are taken from the individual table of the
    chr1 dataset under ``PFILE_DIR``; the multi-GRM list is read from
    ``ROOT_DIR/grm/{snpset}.{hermodel}.all.mgrm.txt``. Results are written by
    ``admix.tools.gcta.reml`` using the prefix
    ``out/gcta-estimate/{trait}.{snpset}.{hermodel}`` (relative to the current
    working directory).

    Parameters
    ----------
    snpset : str
        SNP-set label; selects the mgrm file and is part of the output prefix.
    hermodel : str
        Heritability-model label; selects the mgrm file and is part of the
        output prefix.
    trait : str
        Column of the individual table used as phenotype. Individuals with a
        missing (NaN) value for this column are excluded.
    duffy_covar : bool, default True
        If True, the local ancestry at the chr1 SNP closest to position
        159204893 is added as a covariate and ``.duffy_covar`` is appended to
        the output prefix.

    Returns
    -------
    None
    """
    # base-pair position on chr1 used to pick the "duffy" SNP
    duffy_pos = 159204893

    # compile phenotype and covariates
    dset = admix.io.read_dataset(
        join(PFILE_DIR, "chr1"),
        indiv_info_file=SAMPLE_INFO_PATH,
        n_anc=2,
    )

    # keep only individuals with a non-missing phenotype
    subset_indiv = np.where(~np.isnan(dset.indiv[trait]).values)[0]
    dset_assoc = dset[:, subset_indiv]

    covar_cols = ["age", "sex", "study"] + [f"geno_EV{i}" for i in range(1, 11)]

    df_pheno = dset_assoc.indiv[[trait]].copy()
    df_covar = dset_assoc.indiv[covar_cols].copy()

    # create study dummies variables
    # dtype is pinned: pandas >= 2.0 defaults to bool dummies, which would no
    # longer be 0/1 numeric covariates (uint8 is the pandas 1.x default).
    study_dummies = pd.get_dummies(df_covar["study"], drop_first=True, dtype=np.uint8)
    study_dummies.columns = [f"study_dummy_{s}" for s in study_dummies.columns]
    df_covar = pd.concat([df_covar, study_dummies], axis=1)
    df_covar = df_covar.drop(columns=["study"])

    # special case for duffy SNPs, include the duffy SNPs in the covariate
    if duffy_covar:
        # find closest SNPs
        duffy_snp_loc = np.argmin(np.abs(dset_assoc.snp.POS - duffy_pos))
        assert dset_assoc.snp.CHROM.iloc[duffy_snp_loc] == 1
        # summing over axes 0 and 2 leaves one value per individual
        # (presumably axes are (snp, indiv, haplotype) -- TODO confirm)
        duffy_lanc = dset_assoc[duffy_snp_loc].lanc.sum(axis=[0, 2]).compute()
        df_covar["duffy_lanc"] = duffy_lanc

    for col in df_pheno.columns:
        df_pheno[col] = admix.data.quantile_normalize(df_pheno[col])

    # prepend FID / IID columns (both set to the individual index)
    df_id = pd.DataFrame(
        {"FID": df_pheno.index.values, "IID": df_pheno.index.values},
        index=df_pheno.index.values,
    )
    df_pheno = pd.merge(df_id, df_pheno, left_index=True, right_index=True)
    df_covar = pd.merge(df_id, df_covar, left_index=True, right_index=True)

    out_prefix = f"out/gcta-estimate/{trait}.{snpset}.{hermodel}"
    if duffy_covar:
        out_prefix += ".duffy_covar"

    # make sure the output directory exists before any file is written to it
    os.makedirs(os.path.dirname(out_prefix), exist_ok=True)

    mgrm_path = join(ROOT_DIR, f"grm/{snpset}.{hermodel}.all.mgrm.txt")
    admix.tools.gcta.reml(
        mgrm_path=mgrm_path, df_pheno=df_pheno, df_covar=df_covar, out_prefix=out_prefix
    )

In [4]:
# One row per run: the Cartesian product of SNP set, heritability model,
# trait, and whether the duffy local-ancestry covariate is included.
df_params = pd.DataFrame(
    list(
        itertools.product(["imputed"], ["mafukb", "gcta"], trait_list, [True, False])
    ),
    columns=["snpset", "hermodel", "trait", "duffy_covar"],
)

In [5]:
# Job logs and pickled submissions go to ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Per-task resource request and shell setup executed before each task.
# (time_min / memory_g are presumably minutes and gigabytes -- TODO confirm)
executor.update_parameters(
    time_min=160,
    memory_g=18,
    queue="highp",
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# Submit one array task per row of df_params; the columns are passed in the
# same order as the positional arguments of submit_gcta_estimate.
jobs = executor.map_array(
    submit_gcta_estimate,
    *(df_params[name] for name in ("snpset", "hermodel", "trait", "duffy_covar")),
)

In [11]:
# Show the captured stdout of the first array task as a sanity check.
first_task_log = jobs[0].stdout()
print(first_task_log)

submitit INFO (2021-12-26 21:03:01,553) - Starting with JobEnvironment(job_id=1119101_1, hostname=n7125, local_rank=0(1), node=0(1), global_rank=0(1))
submitit INFO (2021-12-26 21:03:01,553) - Loading pickle: /u/project/pasaniuc/kangchen/2021-admix-corr/experiments/03_page_genome_wide/submitit-logs/1119101_1_submitted.pkl
2021-12-26 21:03.10 [info     ] admix.Dataset: read local ancestry from /u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr/imputed/chr1.lanc
*******************************************************************
* Genome-wide Complex Trait Analysis (GCTA)
* version 1.93.2 beta Linux
* (C) 2010-present, Jian Yang, The University of Queensland
* Please report bugs to Jian Yang <jian.yang.qt@gmail.com>
*******************************************************************
Analysis started at 21:03:25 PST on Sun Dec 26 2021.
Hostname: n7125

Accepted options:
--reml
--reml-no-lrt
--reml-no-constrain
--pheno out/gcta-estimate/crp.imputed.mafukb.duf