In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import dapgen
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import admix
from os.path import join
import os
import json

In [2]:
# Supplementary-tables workbook (Dropbox direct-download link); its
# "trait-info" sheet is read below.
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"
# Directory of the imputed genotype files; the "chr22" prefix inside it is
# what the functions in this notebook pass to admix.dataset.read_dataset.
PFILE_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01_dataset/out/aframr/imputed"
# Per-individual table passed as `indiv_info` when reading the dataset.
SAMPLE_INFO_PATH = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01_dataset/out/aframr/sample_info.txt"
# NOTE: reading from the URL fetches the workbook every time this cell runs.
trait_info = pd.read_excel(SUPP_TABLE_URL, sheet_name="trait-info")
# Values of the "trait" column; used later to build the job parameter grid.
trait_list = trait_info["trait"].values

In [3]:
def write_gcta_grm(snpset, hermodel):
    """Export the precomputed admix GRMs in GCTA format.

    Loads ``out/admix-grm/{snpset}.{hermodel}.all.A1.npy`` and ``...A2.npy``
    and writes, under ``out/gcta-estimate/grm/{snpset}.{hermodel}.``:

    * ``K1`` and ``K2`` -- one GRM per loaded matrix,
    * ``mgrm.txt``      -- the K1/K2 output prefixes, one per line,
    * ``K_full``        -- the element-wise sum of the matrices (used for the
      likelihood ratio test).

    Parameters
    ----------
    snpset : str
        SNP-set label embedded in the input and output file names.
    hermodel : str
        Model label embedded in the input and output file names.

    Returns
    -------
    None
        The function only writes files and prints the SNP count.
    """
    # The dataset is read only for the number of individuals and their IDs.
    dset = admix.dataset.read_dataset(
        join(PFILE_DIR, "chr22"),
        indiv_info=SAMPLE_INFO_PATH,
        n_anc=2,
    )
    n_indiv = dset.n_indiv
    indiv_ids = dset.indiv.index.values

    prefix = f"out/admix-grm/{snpset}.{hermodel}.all"
    out_prefix = f"out/gcta-estimate/grm/{snpset}.{hermodel}."
    Ks = [np.load(prefix + ".A1.npy"), np.load(prefix + ".A2.npy")]

    # SNP count = number of lines in the weight file; any header line would be
    # counted as well.
    # NOTE(review): an earlier comment described this as the number of SNPs
    # with both (EUR-freq > 0.005) & (AFR-freq > 0.005) -- confirm against how
    # the weight file is produced.
    # The context manager closes the handle (the original left it open).
    with open(prefix + ".weight.tsv") as weight_file:
        n_snp = sum(1 for _ in weight_file)
    print(f"n_snp: {n_snp}")

    def _write(name, K):
        # Fresh id frame / SNP-count vector per call, as in the original, so
        # nothing is shared between write_grm invocations.
        admix.tools.gcta.write_grm(
            out_prefix + name,
            K=K,
            df_id=pd.DataFrame({"0": indiv_ids, "1": indiv_ids}),
            n_snps=np.repeat(n_snp, n_indiv),
        )
        return out_prefix + name

    names = [_write(f"K{i + 1}", K) for i, K in enumerate(Ks)]

    # One GRM prefix per line, no trailing newline (same bytes as before).
    with open(out_prefix + "mgrm.txt", "w") as f:
        f.write("\n".join(names))

    # addition of all GRMs, used for likelihood ratio test
    _write("K_full", sum(Ks))

In [4]:
# Export GCTA-format GRMs for the imputed SNP set under each model label.
for _hermodel in ("mafukb", "gcta"):
    write_gcta_grm("imputed", _hermodel)

n_snp: 7117286
n_snp: 7117286


In [5]:
def submit_gcta_estimate(snpset, hermodel, trait):
    """Assemble phenotype/covariates for one trait and call GCTA REML.

    Despite the name, this function does not submit anything itself: it calls
    ``admix.tools.gcta.reml`` directly and is meant to be the per-task body
    handed to a job executor.

    Parameters
    ----------
    snpset : str
        SNP-set label; selects ``out/gcta-estimate/grm/{snpset}.{hermodel}.mgrm.txt``.
    hermodel : str
        Model label; see ``snpset``.
    trait : str
        Column of ``dset.indiv`` to use as the phenotype. Individuals with a
        NaN value for this trait are dropped.

    Returns
    -------
    None
        Results are written by ``admix.tools.gcta.reml`` under
        ``out/gcta-estimate/estimate/{trait}.{snpset}.{hermodel}``.
    """
    # compile phenotype and covariates
    dset = admix.dataset.read_dataset(
        join(PFILE_DIR, "chr22"),
        indiv_info=SAMPLE_INFO_PATH,
        n_anc=2,
    )
    # Positional indices of individuals with a non-missing phenotype.
    subset_indiv = np.where(~np.isnan(dset.indiv[trait]).values)[0]
    dset_assoc = dset[:, subset_indiv]

    covar_cols = ["age", "sex", "study"] + [f"geno_EV{i}" for i in range(1, 11)]

    df_pheno = dset_assoc.indiv[[trait]].copy()
    df_covar = dset_assoc.indiv[covar_cols].copy()

    # Replace the categorical "study" column by indicator columns (first level
    # dropped as the reference). dtype=int pins 0/1 values: pandas >= 2.0
    # defaults to bool, which would no longer be a numeric covariate.
    study_dummies = pd.get_dummies(df_covar["study"], drop_first=True, dtype=int)
    study_dummies.columns = [f"study_dummy_{s}" for s in study_dummies.columns]
    df_covar = pd.concat([df_covar, study_dummies], axis=1)
    df_covar = df_covar.drop(columns=["study"])

    # Quantile-normalize the phenotype column(s).
    for col in df_pheno.columns:
        df_pheno[col] = admix.data.quantile_normalize(df_pheno[col])

    # Prepend FID/IID columns (both set to the individual index) to each table.
    df_id = pd.DataFrame(
        {"FID": df_pheno.index.values, "IID": df_pheno.index.values},
        index=df_pheno.index.values,
    )
    df_pheno = pd.merge(df_id, df_pheno, left_index=True, right_index=True)
    df_covar = pd.merge(df_id, df_covar, left_index=True, right_index=True)

    out_prefix = f"out/gcta-estimate/estimate/{trait}.{snpset}.{hermodel}"
    mgrm_path = f"out/gcta-estimate/grm/{snpset}.{hermodel}.mgrm.txt"
    admix.tools.gcta.reml(
        mgrm_path=mgrm_path, df_pheno=df_pheno, df_covar=df_covar, out_prefix=out_prefix
    )

In [6]:
# Job grid: one row per (snpset, hermodel, trait) combination, in
# itertools.product order (trait varies fastest).
df_params = pd.DataFrame(
    list(itertools.product(["imputed"], ["mafukb", "gcta"], trait_list)),
    columns=["snpset", "hermodel", "trait"],
)

In [8]:
# Executor for the cluster jobs; "./submitit-logs" is the folder handed to it.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Per-job resource settings and the shell lines run before each job starts.
executor.update_parameters(
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
    memory_g=20,
    time_min=120,
)

# One array task per row of df_params: the three columns are passed as
# parallel iterables of positional arguments to submit_gcta_estimate.
jobs = executor.map_array(
    submit_gcta_estimate,
    *(df_params[col] for col in ("snpset", "hermodel", "trait")),
)