In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import dapgen
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import admix
from os.path import join
import os
import json
import glob

In [2]:
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"
ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr"
)
PFILE_DIR = join(ROOT_DIR, "imputed")
trait_list = [
    f.split("/")[-1].split(".")[0] for f in glob.glob(join(ROOT_DIR, "pheno", "*.tsv"))
]
# trait_info = pd.read_excel(SUPP_TABLE_URL, sheet_name="trait-info")
# trait_list = trait_info["trait"].values

In [4]:
def submit_gcta_estimate(snpset, hermodel, trait, duffy_covar=True):
    # compile phenotype and covariates
    dset = admix.io.read_dataset(
        join(PFILE_DIR, "chr1"),
        n_anc=2,
    )
    df_trait = pd.read_csv(join(ROOT_DIR, f"pheno/{trait}.tsv"), sep="\t", index_col=0)
    df_trait.index = df_trait.index.astype(str)

    # subset for individuals with non-nan value in df_trait
    dset = dset[:, dset.indiv.index.isin(df_trait.index)]
    dset.append_indiv_info(df_trait)

    covar_cols = df_trait.columns[1:]

    df_pheno = dset.indiv[[trait]].copy()
    df_covar = dset.indiv[covar_cols].copy()
    df_covar = admix.data.convert_dummy(df_covar)

    # special case for duffy SNPs, include the duffy SNPs in the covariate
    if duffy_covar:
        # find closest SNPs
        duffy_snp_loc = np.argmin(np.abs(dset.snp.POS - 159204893))
        assert dset.snp.CHROM.iloc[duffy_snp_loc] == 1
        duffy_lanc = dset[duffy_snp_loc].lanc.sum(axis=[0, 2]).compute()
        df_covar["duffy_lanc"] = duffy_lanc

    for col in df_pheno.columns:
        df_pheno[col] = admix.data.quantile_normalize(df_pheno[col])

    for col in df_covar.columns:
        df_covar[col] = admix.data.quantile_normalize(df_covar[col])

    # fill na with column mean
    df_covar.fillna(df_covar.mean(), inplace=True)

    df_id = pd.DataFrame(
        {"FID": df_pheno.index.values, "IID": df_pheno.index.values},
        index=df_pheno.index.values,
    )
    df_pheno = pd.merge(df_id, df_pheno, left_index=True, right_index=True)
    df_covar = pd.merge(df_id, df_covar, left_index=True, right_index=True)

    print(df_covar)

    out_prefix = f"out/gcta-estimate/{trait}.{snpset}.{hermodel}"
    if duffy_covar:
        out_prefix += ".duffy_covar"

    ### fit reduced model ###
    reduced_grm = join(ROOT_DIR, f"grm/{snpset}.{hermodel}.all.K_full")
    admix.tools.gcta.reml(
        grm_path=reduced_grm,
        df_pheno=df_pheno,
        df_covar=df_covar,
        out_prefix=out_prefix + ".reduced",
        n_thread=4,
    )

    ### fit full model ###
    mgrm_path = join(ROOT_DIR, f"grm/{snpset}.{hermodel}.all.mgrm.txt")
    admix.tools.gcta.reml(
        mgrm_path=mgrm_path,
        df_pheno=df_pheno,
        df_covar=df_covar,
        out_prefix=out_prefix,
        n_thread=4,
    )

In [7]:
df_params = pd.DataFrame(
    [
        params
        for params in itertools.product(
            ["imputed"], ["mafukb", "gcta"], trait_list, [True, False]
        )
    ],
    columns=["snpset", "hermodel", "trait", "duffy_covar"],
)

In [10]:
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    time_min=300,
    memory_g=18,
    queue="highp",
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

jobs = executor.map_array(
    submit_gcta_estimate,
    df_params.snpset,
    df_params.hermodel,
    df_params.trait,
    df_params.duffy_covar,
)

In [9]:
df_params

Unnamed: 0,snpset,hermodel,trait,duffy_covar
0,imputed,mafukb,crp,True
1,imputed,mafukb,crp,False
2,imputed,mafukb,basophil_cnt,True
3,imputed,mafukb,basophil_cnt,False
4,imputed,mafukb,a1c,True
...,...,...,...,...
179,imputed,gcta,t2d_status,False
180,imputed,gcta,insulin,True
181,imputed,gcta,insulin,False
182,imputed,gcta,glucose,True
