In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import gomics
from glob import glob
import itertools
import submitit
import subprocess

In [2]:
# Selects the reference LD-score set used by the heritability runs. Only
# "baseline" and "baselineLD" are handled further down (anything else raises
# NotImplementedError); the value is also embedded in the output path
# `out/hsq_<version>/...`.
version = "baseline"

In [3]:
# Root of the LDSC resources. The sumstats/, reference LD-score,
# 1000G_Phase3_frq/ and 1000G_Phase3_weights_hm3_no_MHC/ paths handed to ldsc.py
# are all built from this directory.
LDSC_DATA_DIR = "/u/project/pasaniuc/kangchen/DATA/ldsc/"
# Folder whose immediate children are enumerated as annotation groups.
DATA_DIR = "../format-dmr-loop/out/"

# Traits to analyse. Each name is used as `<LDSC_DATA_DIR>/sumstats/<trait>.sumstats`
# on input and as `<out_prefix><trait>` for the ldsc output prefix.
# NOTE: the last entry is what the "already done?" check in this notebook keys on,
# so appending a trait makes every annotation pending again.
trait_list = [
    "PASS_ADHD_Demontis2018",
    "PASS_Alzheimers_Jansen2019",
    "PASS_BIP_Mullins2021",
    "PASS_Insomnia_Jansen2019",
    "PASS_Intelligence_SavageJansen2018",
    "PASS_MDD_Howard2019",
    "PASS_Schizophrenia_Pardinas2018",
    "PASS_VerbalNumericReasoning_Davies2018",
    "UKB_460K.body_BMIz",
    "UKB_460K.cov_EDU_COLLEGE",
    "UKB_460K.cov_EDU_YEARS",
    "UKB_460K.cov_SMOKING_STATUS",
    "UKB_460K.mental_NEUROTICISM",
    "UKB_460K.body_HEIGHTz",
    "PASS_SCZ_Trubetskoy2022",
    "PASS_ASD_Grove2019",
]

In [4]:
# Enumerate every (group, bed) annotation that has LD scores under out/ldscore/.
# Groups are the children of DATA_DIR (names starting with "3C" are skipped); the
# bed names are the children of out/ldscore/<group>/.
# Both globs are sorted so the row order -- and therefore the job-array index each
# annotation receives -- is identical on every run (glob order is otherwise
# arbitrary).
param_records = []
for group_folder in sorted(glob(DATA_DIR + "*")):
    group = os.path.basename(group_folder)
    if group.startswith("3C"):
        continue
    ct_list = [
        os.path.basename(p)
        for p in sorted(glob(os.path.join("out/ldscore", group, "*")))
    ]
    param_records.extend((group, ct) for ct in ct_list)
# Build the frame once from the collected records instead of rebinding a dict
# to a DataFrame under the same name.
df_params = pd.DataFrame(param_records, columns=["group", "bed"])
display(df_params.groupby("group").size())

# The prefixes are plain string concatenations, so build them column-wise rather
# than with a row-by-row apply. Both end in "/" because later code appends the
# trait name directly to out_prefix.
group_bed = df_params["group"] + "/" + df_params["bed"] + "/"
df_params["ldscore_prefix"] = "out/ldscore/" + group_bed
df_params["out_prefix"] = f"out/hsq_{version}/" + group_bed

group
L2DMR                  62
L2DMR-L2LOOP           59
L2DMR-L2LOOPSUMMIT     59
L2LOOP                 89
L2LOOP-L2DMR           59
L2LOOPSUMMIT           89
L2LOOPSUMMIT-L2DMR     59
L3DMR                 195
L3DMR-L3LOOP          159
L3DMR-L3LOOPSUMMIT    159
L3LOOP                159
L3LOOP-L3DMR          159
L3LOOPSUMMIT          159
L3LOOPSUMMIT-L3DMR    159
dtype: int64

In [5]:
# An annotation still needs work if ANY trait is missing its `.results` file.
# calc_hsq skips traits that already have results and carries on when a single
# ldsc run fails, so checking only the last trait would mark an annotation with
# earlier failures as finished and never retry it.
has_all_results = df_params["out_prefix"].map(
    lambda out_prefix: all(
        os.path.exists(out_prefix + trait + ".results") for trait in trait_list
    )
)
# astype(bool) keeps the mask boolean even when df_params has no rows.
df_todo_params = df_params[~has_all_results.astype(bool)].reset_index(drop=True)

In [6]:
# Map the chosen `version` onto the reference LD-score prefix (relative to
# LDSC_DATA_DIR). Unsupported versions are rejected up front.
if version not in ("baselineLD", "baseline"):
    raise NotImplementedError
ref_ld_prefix = (
    "baselineLD_v2.2/baselineLD."
    if version == "baselineLD"
    else "baseline_v1.2/baseline."
)

In [7]:
def calc_hsq(ldscore_prefix, out_prefix):
    """Run `ldsc.py --h2` for every trait in `trait_list` against one annotation.

    Reads the notebook globals `trait_list`, `LDSC_DATA_DIR` and `ref_ld_prefix`.

    Parameters
    ----------
    ldscore_prefix : str
        Prefix of the annotation's LD-score files; passed to `--ref-ld-chr`
        after the reference prefix, separated by a comma.
    out_prefix : str
        Prefix for the outputs. The trait name is appended directly, so a
        directory prefix must end with "/". Its directory part is created if
        it does not exist.

    Returns
    -------
    None

    Notes
    -----
    Traits whose `<out_prefix><trait>.results` file already exists are skipped,
    so the function can be re-run to resume. A failing ldsc run is reported and
    the loop moves on to the next trait (best effort); nothing is raised.
    """
    LDSC_DIR = "/u/project/pasaniuc/kangchen/software/ldsc"
    PYTHON_PATH = (
        "/u/project/pasaniuc/kangchen/software/miniconda3/envs/ldsc/bin/python"
    )
    # dirname() is "" for a prefix without a directory part, and makedirs("")
    # raises, so only create the directory when there is one.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    for trait in trait_list:
        trait_out_prefix = out_prefix + trait
        if os.path.exists(trait_out_prefix + ".results"):
            continue
        # Pass an argument list (no shell) so prefixes containing spaces or
        # shell metacharacters reach ldsc.py unchanged.
        argv = [
            PYTHON_PATH,
            f"{LDSC_DIR}/ldsc.py",
            "--h2",
            f"{LDSC_DATA_DIR}/sumstats/{trait}.sumstats",
            "--ref-ld-chr",
            f"{LDSC_DATA_DIR}/{ref_ld_prefix},{ldscore_prefix}",
            "--frqfile-chr",
            f"{LDSC_DATA_DIR}/1000G_Phase3_frq/1000G.EUR.QC.",
            "--w-ld-chr",
            f"{LDSC_DATA_DIR}/1000G_Phase3_weights_hm3_no_MHC/weights.hm3_noMHC.",
            "--overlap-annot",
            "--print-coefficients",
            "--out",
            trait_out_prefix,
        ]
        try:
            subprocess.check_call(argv)
        except (subprocess.CalledProcessError, OSError) as e:
            # check_call does not capture the child's output, so `e.output` is
            # always None; report which trait failed and why instead. OSError
            # covers an interpreter that cannot be launched at all.
            print(
                f"ldsc failed for trait {trait} (out: {trait_out_prefix}): {e}",
                flush=True,
            )

In [8]:
# Submit one array task per pending annotation; submitit writes its files
# under ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Shell lines handed to the executor as `setup` -- presumably run in the job
# script before the task starts; confirm against the SgeExecutor in use.
job_env_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PATH=~/project-pasaniuc/software/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]
# NOTE(review): time_min / memory_g look like minutes and gigabytes per task --
# confirm with the executor's parameter documentation.
executor.update_parameters(time_min=20, memory_g=16, setup=job_env_setup)

# The two columns are passed as parallel iterables, so task i is expected to
# receive row i's (ldscore_prefix, out_prefix) pair.
todo_args = (df_todo_params.ldscore_prefix, df_todo_params.out_prefix)
jobs = executor.map_array(calc_hsq, *todo_args)