In [1]:
%load_ext lab_black

import numpy as np
import admix
import pandas as pd
import os
import subprocess
import glob
import submitit

# Format data
1. For each phenotype, we have one file in which every individual has a ground-truth phenotype together with the mean, SD, and 5% and 95% quantiles of the PRS.
2. The covariates are shared across all phenotype files.

In [2]:
# Common prefix shared by the three input locations below.
PRS_ROOT = "/u/project/sgss/UKBB/PRS-RESEARCH"

# Holds the per-trait weight files (out/<trait>/...) and the meta/ lists.
DATA_DIR = f"{PRS_ROOT}/02-yi-simulate-prs/experiments/real-trait-sub"
# Holds covar.tsv.
PHENO_DIR = f"{PRS_ROOT}/03-compile-pheno/out"
# Passed to `dapgen score --plink`.
PLINK_DIR = f"{PRS_ROOT}/00-compile-data/out/PLINK/all"

In [3]:
# form the covariate file with group information
# Full covariate table, indexed by its first column; the IID column is dropped.
df_covar = pd.read_csv(
    os.path.join(PHENO_DIR, "covar.tsv"), sep="\t", index_col=0
).drop(columns=["IID"])
# Table that carries the `group` column, indexed by its first column.
df_covar_sub = pd.read_csv(
    os.path.join(DATA_DIR, "meta/covariates.txt"), sep="\t", index_col=0
)
# Align `group` to the covariate index; index values that are absent from
# meta/covariates.txt get NaN.
df_covar["group"] = df_covar_sub["group"].reindex(df_covar.index)
df_covar.index.name = "INDIV"
# to_csv does not create missing directories, so make sure out/ exists first.
os.makedirs("out", exist_ok=True)
df_covar.to_csv("out/covar.tsv", sep="\t", float_format="%.8g")

In [4]:
# format individual list
def _read_indiv_ids(path):
    """Return the sorted integer IDs found in the first column of `path`."""
    with open(path) as f:
        # Iterate the handle directly and skip blank lines, which would
        # otherwise raise IndexError on `split()[0]`.
        return sorted(int(line.split()[0]) for line in f if line.strip())


uk_train_index = _read_indiv_ids(os.path.join(DATA_DIR, "meta/uk_train.list"))
uk_val_index = _read_indiv_ids(os.path.join(DATA_DIR, "meta/uk_val.list"))

# Every index value of the covariate table that appears in neither UK list.
test_index = sorted(
    set(df_covar.index.values) - (set(uk_train_index) | set(uk_val_index))
)

# open(..., "w") does not create missing directories.
os.makedirs("out/indiv", exist_ok=True)
for name, index in zip(
    ["uk-train", "uk-val", "test"], [uk_train_index, uk_val_index, test_index]
):
    with open(f"out/indiv/{name}.indivlist", "w") as f:
        # One ID per line, no trailing newline. `write` emits the joined
        # string in one call; `writelines` would iterate it per character.
        f.write("\n".join(str(i) for i in index))

In [5]:
def submit_summarize(trait):
    """Run `dapgen score` for one trait and write a row-wise score summary.

    The command is given the weights at
    `DATA_DIR/out/<trait>/PHENO.auto.weight.tsv.gz`, `PLINK_DIR` as `--plink`
    and `out/indiv/test.indivlist` as `--keep-fam`, and writes
    `out/pred/<trait>.score.tsv.gz`. That table is then read back and, for
    every row, the mean, SD and 5% / 50% / 95% quantiles across its columns
    are written to `out/pred/<trait>.score_summary.tsv.gz`.

    NOTE(review): rows are presumably individuals and columns the
    `SAMPLE_`-prefixed weight columns; confirm against dapgen's output format.

    Parameters
    ----------
    trait : str
        Name of the trait directory under `DATA_DIR/out`.

    Raises
    ------
    subprocess.CalledProcessError
        If the `dapgen score` command exits with a non-zero status.
    """
    # TODO: add SNP mean + SD.
    out_dir = "out/pred"
    score_path = f"{out_dir}/{trait}.score.tsv.gz"
    summary_path = f"{out_dir}/{trait}.score_summary.tsv.gz"
    # Neither the scoring command's --out target nor to_csv is guaranteed to
    # create the directory, so create it up front.
    os.makedirs(out_dir, exist_ok=True)

    weights_path = os.path.join(DATA_DIR, f"out/{trait}/PHENO.auto.weight.tsv.gz")
    cmds = [
        "dapgen score",
        f"--plink {PLINK_DIR}",
        "--freq-suffix .uk-train.afreq",
        f"--weights {weights_path}",
        "--weight-col-prefix SAMPLE_",
        f"--out {score_path}",
        "--chrom-col CHR --pos-col POS --alt-col A1 --ref-col A2",
        "--keep-fam out/indiv/test.indivlist",
        "--center True",
        "--threads 8",
        "--memory 40000",
    ]
    subprocess.check_call(" ".join(cmds), shell=True)

    df_score = pd.read_csv(score_path, sep="\t", index_col=0)
    df_summary = pd.DataFrame(
        {"MEAN": df_score.mean(axis=1), "SD": df_score.std(axis=1)}
    )
    q_list = [0.05, 0.5, 0.95]
    # quantile(axis=1) returns one row per quantile; transpose so that each
    # quantile becomes a column aligned with df_score's index.
    df_quantile = df_score.quantile(q=q_list, axis=1).T
    # round() rather than int(): q * 100 can land just below the intended
    # integer (e.g. 0.29 * 100 == 28.999999999999996), which int() truncates.
    df_quantile.columns = [f"QUANTILE_{round(q * 100)}" for q in q_list]
    df_summary = pd.merge(df_summary, df_quantile, left_index=True, right_index=True)
    df_summary.to_csv(summary_path, sep="\t", float_format="%.6g")

In [6]:
# One entry per directory under DATA_DIR/out that contains a weight file; the
# entry is the directory name. glob returns matches in arbitrary order, so
# sort to keep the trait order stable from run to run.
trait_list = sorted(
    os.path.basename(os.path.dirname(weight_path))
    for weight_path in glob.glob(
        os.path.join(DATA_DIR, "out/*/PHENO.auto.weight.tsv.gz")
    )
)

In [7]:
# Shell lines handed to the executor as its `setup` parameter.
job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]
# Remaining executor parameters, kept together so they are easy to tune.
job_params = {
    "time_min": 80,
    "memory_g": 60,
    "queue": "highp",
    "setup": job_setup,
}

executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(**job_params)

# Map submit_summarize over every trait; `jobs` keeps what map_array returns.
jobs = executor.map_array(submit_summarize, trait_list)

In [8]:
# for trait in ["height", "insomnia", "darker_skin"]:
#     df_prs = pd.read_csv(
#         os.path.join(DATA_DIR, f"out/{trait}/PHENO.auto.eur.test_prs.tsv.gz"),
#         sep="\t",
#         index_col=0,
#     )
#     df_prs.index = [int(i.split("_")[0]) for i in df_prs.index]
#     df_prs = df_prs[["MEAN"]].rename(columns={"MEAN": "PRS_MEAN"})
#     df_pheno = pd.read_csv(
#         os.path.join(PHENO_DIR, f"{trait}.tsv"), sep="\t", index_col=0
#     ).drop(columns=["IID"])
#     df_trait = pd.merge(df_prs, df_pheno, left_index=True, right_index=True)
#     df_trait.index.name = "INDIV"
#     df_trait.to_csv(f"out/pred/{trait}.tsv", sep="\t", float_format="%.8g")