In [1]:
%load_ext lab_black

import pandas as pd
import numpy as np
import dapgen
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import admix
from os.path import join
import os
import json

In [2]:
# Supplementary tables are fetched over the network; the "trait-info" sheet
# provides the "trait" column used as the list of traits to analyse.
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"
trait_info = pd.read_excel(io=SUPP_TABLE_URL, sheet_name="trait-info")
trait_list = trait_info.loc[:, "trait"].values

In [3]:
# Directory passed (joined with "chr22") to admix.dataset.read_dataset.
PFILE_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/"
    "01_dataset/out/aframr/imputed"
)
# Per-individual table passed as `indiv_info` to admix.dataset.read_dataset.
SAMPLE_INFO_PATH = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/"
    "01_dataset/out/aframr/sample_info.txt"
)

In [4]:
def estimate_rg(est, est_var):
    """
    Estimate the ratio ``est[1] / est[0]`` and its delta-method variance.

    Parameters
    ----------
    est : array-like with at least 2 entries
        Point estimates. Only the first two entries are used (the caller in
        this notebook passes a length-3 vector).
    est_var : array-like, at least 2 x 2
        Variance-covariance matrix of ``est``. Only the leading 2 x 2 block
        (the covariance of ``est[0]`` and ``est[1]``) is used.

    Returns
    -------
    rg : float
        The ratio ``est[1] / est[0]``.
    rg_var : float
        First-order (delta-method) variance ``grad' V grad`` where
        ``grad = [-y / x^2, 1 / x]`` is the gradient of ``y / x`` and ``V`` is
        the leading 2 x 2 block of ``est_var``.

    Notes
    -----
    There is no guard against ``est[0] == 0``; the division is performed
    as-is.
    """
    # Coerce so that plain (nested) lists -- e.g. values read back from the
    # JSON results -- are accepted; 2-D slicing below requires an ndarray.
    est = np.asarray(est)
    est_var = np.asarray(est_var)

    x, y = est[0], est[1]
    rg = y / x

    # gradient of y / x with respect to (x, y)
    grad = np.array([-y / (x ** 2), 1 / x])

    # quadratic form grad' V grad
    rg_var = grad @ est_var[0:2, 0:2] @ grad
    return rg, rg_var

In [5]:
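# NOTE: the commented-out lines in this cell duplicate the data-preparation
# steps inside `submit_estimate` (defined in the next cell); nothing in this
# cell is executed.
#     # compile phenotype and covariates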
#     dset = admix.dataset.read_dataset(
#         join(PFILE_DIR, "chr22"),
#         indiv_info=SAMPLE_INFO_PATH,
#         n_anc=2,
#     )
#     subset_indiv = np.where(~np.isnan(dset.indiv[trait]).values)[0]
#     dset_assoc = dset[:, subset_indiv]
#     A1 = A1[np.ix_(subset_indiv, subset_indiv)]
#     A2 = A2[np.ix_(subset_indiv, subset_indiv)]

#     covar_cols = ["age", "sex", "study"] + [f"geno_EV{i}" for i in range(1, 11)]

#     df_pheno = dset_assoc.indiv[[trait]].copy()
#     df_covar = dset_assoc.indiv[covar_cols].copy()
#     # create study dummies variables
#     study_dummies = pd.get_dummies(df_covar["study"], drop_first=True)
#     study_dummies.columns = [f"study_dummy_{s}" for s in study_dummies.columns]
#     df_covar = pd.concat([df_covar, study_dummies], axis=1)
#     df_covar = df_covar.drop(columns=["study"])

#     for col in df_pheno.columns:
#         df_pheno[col] = admix.data.quantile_normalize(df_pheno[col])

#     #     for col in df_covar.columns:
#     #         df_covar[col] = admix.data.quantile_normalize(df_covar[col])

#     cov = np.c_[np.ones(df_covar.shape[0]), df_covar.values]
#     rls_list = admix_genet_cor.estimate_genetic_cor(
#         A1, A2, df_pheno.values, cov=cov, compute_varcov=True
#     )

In [6]:
def submit_estimate(snpset, hermodel, trait):
    """
    Run the genetic-correlation estimation for one parameter combination and
    write the result to ``out/estimate/{snpset}.{hermodel}.{trait}.json``.

    Parameters
    ----------
    snpset : str
        SNP-set label; used only to build input/output file names.
    hermodel : str
        Heritability-model label; used only to build input/output file names.
    trait : str
        Name of a column in the dataset's individual table (``dset.indiv``).
        Individuals with a NaN value in this column are excluded.

    Returns
    -------
    None
        The result is written to disk as JSON with keys ``est``,
        ``est_varcov``, ``rg``, ``rg_var``, ``trait`` and ``n_indiv``.

    Notes
    -----
    Assumes the rows/columns of the ``A1`` / ``A2`` matrices are in the same
    order as the individuals in the dataset read below -- TODO confirm against
    the code that produced ``out/admix-grm``.
    """
    # load the two precomputed matrices for this (snpset, hermodel)
    prefix = f"out/admix-grm/{snpset}.{hermodel}.all"
    A1 = np.load(prefix + ".A1.npy")
    A2 = np.load(prefix + ".A2.npy")

    # compile phenotype and covariates
    dset = admix.dataset.read_dataset(
        join(PFILE_DIR, "chr22"),
        indiv_info=SAMPLE_INFO_PATH,
        n_anc=2,
    )
    # positional indices of individuals with a non-missing phenotype
    subset_indiv = np.where(~np.isnan(dset.indiv[trait]).values)[0]
    dset_assoc = dset[:, subset_indiv]
    # restrict both matrices to the same individuals (rows and columns)
    A1 = A1[np.ix_(subset_indiv, subset_indiv)]
    A2 = A2[np.ix_(subset_indiv, subset_indiv)]

    covar_cols = ["age", "sex", "study"] + [f"geno_EV{i}" for i in range(1, 11)]

    df_pheno = dset_assoc.indiv[[trait]].copy()
    df_covar = dset_assoc.indiv[covar_cols].copy()

    # Replace the categorical "study" column by dummy variables (first level
    # dropped). dtype=float is required: on pandas >= 2.0 get_dummies returns
    # bool columns by default, which would turn `df_covar.values` (and hence
    # the design matrix) into an object-dtype array.
    study_dummies = pd.get_dummies(df_covar["study"], drop_first=True, dtype=float)
    study_dummies.columns = [f"study_dummy_{s}" for s in study_dummies.columns]
    df_covar = pd.concat([df_covar, study_dummies], axis=1)
    df_covar = df_covar.drop(columns=["study"])

    # The phenotype is quantile-normalized; covariates are used as-is.
    for col in df_pheno.columns:
        df_pheno[col] = admix.data.quantile_normalize(df_pheno[col])

    # design matrix: intercept column followed by the covariates
    cov = np.c_[np.ones(df_covar.shape[0]), df_covar.values]
    rls_list = admix_genet_cor.estimate_genetic_cor(
        A1, A2, df_pheno.values, cov=cov, compute_varcov=True
    )
    # a single phenotype column was passed, so only the first result is used
    est, est_varcov = rls_list[0]
    rg, rg_var = estimate_rg(est, est_varcov)
    dict_rls = {
        "est": est.tolist(),
        "est_varcov": est_varcov.tolist(),
        # cast NumPy scalars so the dump does not depend on their dtype
        "rg": float(rg),
        "rg_var": float(rg_var),
        "trait": trait,
        "n_indiv": df_pheno.shape[0],
    }

    out_path = f"out/estimate/{snpset}.{hermodel}.{trait}.json"
    # make sure the output directory exists so the job cannot fail at the
    # very last step after the expensive estimation has finished
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w") as f:
        json.dump(dict_rls, f)

In [13]:
# Full grid of (snpset, hermodel, trait) combinations to estimate.
df_params = pd.DataFrame(
    list(itertools.product(["imputed"], ["mafukb", "gcta"], trait_list)),
    columns=["snpset", "hermodel", "trait"],
)

# Keep only the combinations whose result file has not been written yet.
already_done = df_params.apply(
    lambda row: os.path.exists(
        f"out/estimate/{row.snpset}.{row.hermodel}.{row.trait}.json"
    ),
    axis=1,
)
df_params = df_params[~already_done]

In [12]:
# Submit one array task per remaining row of `df_params`; submitit writes its
# logs under ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Shell lines handed to the executor as `setup` (presumably run before each
# task starts -- confirm against the SgeExecutor implementation in use).
job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]
executor.update_parameters(time_min=30, memory_g=32, setup=job_setup)

jobs = executor.map_array(
    submit_estimate,
    df_params["snpset"],
    df_params["hermodel"],
    df_params["trait"],
)