In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import itertools
import submitit
import matplotlib.pyplot as plt
import admix_genet_cor
import admix
import os
import glob
import scipy
import subprocess

In [2]:
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp-tables.xlsx?dl=1"

ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr"
)
# Parent directory of the GRMs; each `grm_prefix` names a sub-directory of it.
GRM_DIR = "/u/scratch/k/kangchen/admix-grm/rho-model"


def list_traits(pheno_dir="out/pheno"):
    """Return the sorted trait names that have a `<trait>.tsv` file in `pheno_dir`.

    Only the trailing `.tsv` is stripped, so a trait whose name contains a dot
    (e.g. `ldl.v2.tsv` -> `ldl.v2`) still maps back to `<pheno_dir>/<trait>.tsv`.
    The result is sorted because `glob` returns files in arbitrary order.
    """
    suffix = ".tsv"
    return sorted(
        os.path.basename(path)[: -len(suffix)]
        for path in glob.glob(os.path.join(pheno_dir, "*" + suffix))
    )


trait_list = list_traits()

In [3]:
# Full grid of GRM settings: every (snpset, hermodel, maf) combination.
df_params = pd.DataFrame(
    list(
        itertools.product(
            ["hm3", "imputed"],
            ["mafukb", "gcta"],
            [0.005, 0.05],
        )
    ),
    columns=["snpset", "hermodel", "maf"],
)

# The GRM prefix is "<snpset>.<hermodel>.<maf digits>", where the digits are
# what follows "0." in the MAF threshold (0.005 -> "005", 0.05 -> "05").
df_params["grm_prefix"] = [
    ".".join([row.snpset, row.hermodel, str(row.maf)[2:]])
    for row in df_params.itertuples(index=False)
]

In [4]:
def submit_gcta_estimate(grm_prefix, trait):
    """Run `admix estimate-genetic-cor` for one trait on a directory of GRMs.

    Parameters
    ----------
    grm_prefix : str
        Name of the GRM sub-directory under `GRM_DIR`; the joined path is
        passed as `--grm-dir`.
    trait : str
        Trait name; `out/pheno/{trait}.tsv` is passed as `--pheno` and
        `out/gcta-estimate/{trait}-{grm_prefix}` as `--out-dir`.

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero status.
    """
    pheno_path = f"out/pheno/{trait}.tsv"
    out_dir = f"out/gcta-estimate/{trait}-{grm_prefix}"
    grm_dir = os.path.join(GRM_DIR, str(grm_prefix))

    # Pass an argument vector instead of a shell string: trait names come from
    # file names on disk, and a space or shell metacharacter in one of them
    # would otherwise split or alter the command.
    argv = [
        "admix",
        "estimate-genetic-cor",
        "--grm-dir",
        grm_dir,
        "--pheno",
        pheno_path,
        "--out-dir",
        out_dir,
        "--n-thread",
        "4",
    ]
    subprocess.check_call(argv)


def submit_gcta_estimate2(grm_prefix, trait, rho):
    """Run `admix estimate-genetic-cor` for one trait on a single rho-specific GRM.

    Parameters
    ----------
    grm_prefix : str
        Name of the GRM sub-directory under `GRM_DIR`.
    trait : str
        Trait name; `out/pheno/{trait}.tsv` is passed as `--pheno` and
        `out/gcta-estimate/{trait}-{grm_prefix}` as `--out-dir`.
    rho : int
        Selects the GRM: `<GRM_DIR>/<grm_prefix>/rho<rho>` is passed as
        `--grm-prefix`.

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero status.
    """
    pheno_path = f"out/pheno/{trait}.tsv"
    out_dir = f"out/gcta-estimate/{trait}-{grm_prefix}"
    # Kept in its own name so the `grm_prefix` argument is not rebound after
    # it has already been used to build `out_dir`.
    grm_path_prefix = os.path.join(GRM_DIR, grm_prefix, f"rho{rho}")

    # Pass an argument vector instead of a shell string: trait names come from
    # file names on disk, and a space or shell metacharacter in one of them
    # would otherwise split or alter the command.
    argv = [
        "admix",
        "estimate-genetic-cor",
        "--grm-prefix",
        grm_path_prefix,
        "--pheno",
        pheno_path,
        "--out-dir",
        out_dir,
        "--n-thread",
        "2",
    ]
    subprocess.check_call(argv)

In [5]:
# Expand the GRM settings to one row per (grm_prefix, trait, rho), with rho
# taking the 21 integer values 0, 5, ..., 100.
df_params = pd.DataFrame(
    list(itertools.product(df_params.grm_prefix.unique(), trait_list)),
    columns=["grm_prefix", "trait"],
).merge(
    pd.DataFrame({"rho": np.linspace(0, 100, 21).astype(int)}),
    how="cross",
)

# Keep only the combinations whose `rho<rho>.hsq` result file does not exist
# yet, so that re-running the notebook does not resubmit finished jobs.
df_todo_params = df_params[
    np.fromiter(
        (
            not os.path.exists(f"out/gcta-estimate/{trait}-{grm_prefix}/rho{rho}.hsq")
            for grm_prefix, trait, rho in zip(
                df_params.grm_prefix, df_params.trait, df_params.rho
            )
        ),
        dtype=bool,
        count=len(df_params),
    )
]

In [7]:
# Executor whose working folder is ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Per-job resources. `cores=2` matches the `--n-thread 2` that
# `submit_gcta_estimate2` passes to the CLI. The units of `time_min` and
# `memory_g` are presumably minutes and GB -- TODO confirm against the
# executor's documentation.
executor.update_parameters(
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
    cores=2,
    memory_g=10,
    time_min=30,
)

# One array task per remaining (grm_prefix, trait, rho) row.
jobs = executor.map_array(
    submit_gcta_estimate2,
    *(df_todo_params[column] for column in ("grm_prefix", "trait", "rho")),
)