In [3]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import submitit
import admix
import numpy as np
import pandas as pd
import admix_prs
import os
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import glob
from tqdm import tqdm

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Root directories of the compiled experiment data and of the phenotype tables.
DATA_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/admix-prs-uncertainty/experiments/00-compile-data/out"
PHENO_DIR = "/u/project/sgss/UKBB/PRS-RESEARCH/03-compile-pheno/out"

# Grouping columns to test: two columns used as-is, followed by the "_Q"
# (quantile-coded) counterpart of each continuous column.
TEST_COLS = ["SEX", "glasses"]
TEST_COLS += [
    f"{name}_Q"
    for name in ("AGE", "years_of_edu", "income", "DEPRIVATION_INDEX", "PC1", "PC2")
]
# Covariates regressed out of the phenotype: three base columns, then PC1..PC10.
COVAR_COLS = ["AGE", "SEX", "DEPRIVATION_INDEX", *(f"PC{k}" for k in range(1, 11))]

print(f"Covariates: {', '.join(COVAR_COLS)}")
print(f"Testing: {', '.join(TEST_COLS)}")

Covariates: AGE, SEX, DEPRIVATION_INDEX, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10
Testing: SEX, glasses, AGE_Q, years_of_edu_Q, income_Q, DEPRIVATION_INDEX_Q, PC1_Q, PC2_Q


In [5]:
def load_trait_info(trait):
    """Assemble the per-individual analysis table for one trait.

    Reads ``<PHENO_DIR>/<trait>.tsv``, ``<DATA_DIR>/pred/<trait>.score_summary.tsv.gz``
    and ``<DATA_DIR>/covar.tsv``, joins them on their index, keeps the rows whose
    ``group`` column equals "United Kingdom", and drops every row that still
    contains a missing value.

    Parameters
    ----------
    trait : str
        Trait name; used as the file stem of the phenotype and score files.

    Returns
    -------
    pandas.DataFrame
        Score columns, the trait's phenotype columns (``IID`` removed), the
        covariates, the ``years_of_edu`` / ``glasses`` / ``income`` phenotypes
        and the ``*_Q`` quantile-code columns.
    """

    def _read_pheno(name):
        # Phenotype tables are tab-separated, indexed by their first column;
        # the redundant IID column is discarded.
        pheno_path = os.path.join(PHENO_DIR, f"{name}.tsv")
        return pd.read_csv(pheno_path, sep="\t", index_col=0).drop(columns=["IID"])

    ## 1. phenotype and score for the requested trait
    pheno = _read_pheno(trait)
    score = pd.read_csv(
        os.path.join(DATA_DIR, f"pred/{trait}.score_summary.tsv.gz"),
        sep="\t",
        index_col=0,
    )
    # Score row labels are reduced to the integer before the first "_".
    score.index = [int(label.split("_")[0]) for label in score.index]

    ## 2. covariates, extended with a few phenotypes aligned to the covariate index
    covar = pd.read_csv(os.path.join(DATA_DIR, "covar.tsv"), sep="\t", index_col=0)
    for extra in ("years_of_edu", "glasses", "income"):
        covar[extra] = _read_pheno(extra)["PHENO"].reindex(covar.index)

    # Discretize continuous columns into (at most) 5 quantile bins, stored as
    # integer category codes. Bins are computed on the full covariate table,
    # i.e. before the ancestry filter and the NA drop below.
    for col in ("years_of_edu", "income", "AGE", "DEPRIVATION_INDEX", "PC1", "PC2"):
        binned = pd.qcut(covar[col], q=5, duplicates="drop")
        covar[f"{col}_Q"] = binned.cat.codes

    ## 3. join everything on the shared index
    merged = score.merge(pheno, left_index=True, right_index=True).merge(
        covar, left_index=True, right_index=True
    )

    # Restrict to UK ancestry, then keep complete rows only.
    uk_only = merged[merged.group == "United Kingdom"]
    return uk_only.dropna()


def compute_r2(trait, out_prefix, n_bootstrap=1000):
    """Compute baseline and per-group R2 summaries for one trait and write them to disk.

    The phenotype is residualized on ``COVAR_COLS`` (OLS with an intercept).
    The ``MEAN`` score column is then summarized against that residual with
    ``admix_prs.summarize_pred``: once without grouping (baseline) and once per
    grouping column in ``TEST_COLS``.

    Parameters
    ----------
    trait : str
        Trait name, forwarded to ``load_trait_info``.
    out_prefix : str
        Path prefix of the outputs. ``<out_prefix>.baseline.tsv`` and
        ``<out_prefix>.r2_diff.tsv`` are written; the parent directory is
        created when it does not exist yet.
    n_bootstrap : int, default 1000
        Forwarded to ``admix_prs.summarize_pred`` for the grouped summaries.

    Returns
    -------
    None
    """
    # Make sure the output directory exists *before* the expensive bootstrap,
    # so a missing folder cannot waste the whole computation at write time.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df_trait = load_trait_info(trait)
    # residual after regressing out COVAR_COLS
    df_trait["PHENO_RESID"] = (
        sm.OLS(
            df_trait["PHENO"].values,
            sm.add_constant(df_trait[COVAR_COLS]),
            missing="drop",
        )
        .fit()
        .resid
    )
    # baseline: summary over all individuals, no grouping
    df_baseline = admix_prs.summarize_pred(
        df_trait,
        y_col="PHENO_RESID",
        pred_col="MEAN",
    )
    # R2 diff: one record per grouping column
    df_out = []
    for col in TEST_COLS:

        df_res, df_res_se, r2_diff = admix_prs.summarize_pred(
            df_trait,
            y_col="PHENO_RESID",
            pred_col="MEAN",
            group_col=col,
            n_bootstrap=n_bootstrap,
            return_r2_diff=True,
        )
        # Point estimate: r2 of the last row of df_res minus r2 of its first row.
        # "prob>0": fraction of r2_diff entries that are positive (presumably
        # bootstrap replicates of that difference -- confirm against admix_prs).
        df_out.append(
            [col, df_res["r2"].iloc[-1] - df_res["r2"].iloc[0], np.mean(r2_diff > 0)]
        )

    df_out = pd.DataFrame(df_out, columns=["test_col", "r2_diff", "prob>0"])
    # Write the baseline first and the r2_diff table last: the notebook's
    # "jobs remaining" check treats the .r2_diff.tsv file as the completion marker.
    df_baseline.to_csv(out_prefix + ".baseline.tsv", sep="\t", header=False)
    df_out.to_csv(out_prefix + ".r2_diff.tsv", sep="\t", index=False)

In [6]:
# A trait name is the part of each file name under pred/ before the first ".".
# Several files can share a trait, hence the de-duplication. Sorting gives a
# stable order across kernel restarts (iteration order of a set of strings can
# change between interpreter sessions), which keeps job order and the row order
# of the final summary tables reproducible.
trait_list = sorted(
    {
        os.path.basename(path).split(".")[0]
        for path in glob.glob(os.path.join(DATA_DIR, "pred/*"))
    }
)
print(f"{len(trait_list)} traits in total.")

111 traits in total.


In [8]:
# One row per job: the trait name and the path prefix of the files its job writes.
df_params = pd.DataFrame(
    {
        "trait": trait_list,
        "out_prefix": [f"out/r2-diff/{name}" for name in trait_list],
    }
)
print(f"{len(df_params)} jobs in total")

111 jobs in total


In [9]:
# Settings applied to every submitted job. The `setup` entries are shell lines
# that put the project's miniconda on PATH and set PYTHONNOUSERSITE; they are
# presumably run by the executor before the job starts -- confirm against the
# submitit build in use.
sge_job_params = dict(
    time_min=10,
    memory_g=12,
    queue="highp",
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# Job logs and submission files go under ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(**sge_job_params)

In [10]:
# A job counts as finished once its "<out_prefix>.r2_diff.tsv" file exists;
# keep only the rows whose output is still missing.
already_done = np.array(
    [os.path.exists(prefix + ".r2_diff.tsv") for prefix in df_params["out_prefix"]],
    dtype=bool,
)
df_todo_params = df_params[~already_done]
print(f"{len(df_todo_params)} jobs remains")

1 jobs remains


In [11]:
# Submit compute_r2 for the remaining jobs, passing the trait names and output
# prefixes as two parallel columns (presumably paired element-wise, one job per
# row -- confirm against the submitit build in use).
# NOTE(review): behaviour when df_todo_params is empty is not visible from
# here; verify before re-running with nothing left to do.
jobs = executor.map_array(
    compute_r2,
    df_todo_params.trait,
    df_todo_params.out_prefix,
)

# Summarize the results

In [12]:
# Collect the per-trait outputs written by compute_r2: one [trait, r2] record
# per trait for the baseline, and one r2-difference table per trait.
df_baseline_r2 = []
df_r2_diff = []
for _, row in tqdm(df_params.iterrows(), total=len(df_params)):
    baseline_file = row.out_prefix + ".baseline.tsv"
    r2_diff_file = row.out_prefix + ".r2_diff.tsv"

    # Skip the trait unless BOTH outputs exist. compute_r2 writes the baseline
    # first and the r2_diff table last, so an interrupted job can leave a
    # baseline file without its r2_diff file; checking only the baseline would
    # then abort the whole summary with a FileNotFoundError.
    missing_file = next(
        (path for path in (baseline_file, r2_diff_file) if not os.path.exists(path)),
        None,
    )
    if missing_file is not None:
        print(f"{missing_file} does not exist.")
        continue

    # The baseline file was written without a header; its first column is the
    # index. Squeezing along the column axis always yields a Series, even when
    # the file holds a single row (a bare squeeze() would collapse to a scalar).
    df_tmp = pd.read_csv(baseline_file, sep="\t", header=None, index_col=0).squeeze(
        "columns"
    )
    df_baseline_r2.append([row.trait, df_tmp["r2"]])

    df_tmp = pd.read_csv(r2_diff_file, sep="\t")
    df_tmp.insert(0, "trait", row.trait)
    df_r2_diff.append(df_tmp)

111it [00:00, 140.31it/s]

out/r2-diff/F_length_menstrual_cycle.baseline.tsv does not exist.





In [13]:
# Turn the accumulated [trait, r2] records into a two-column table.
df_baseline_r2 = pd.DataFrame(
    data=df_baseline_r2,
    columns=["trait", "baseline_r2"],
)
# Stack the per-trait r2-difference tables row-wise into a single table.
df_r2_diff = pd.concat(objs=df_r2_diff, axis=0)

In [14]:
# Persist both summary tables as tab-separated files, without the row index.
df_baseline_r2.to_csv(
    path_or_buf="out/baseline_r2.tsv",
    sep="\t",
    index=False,
)
df_r2_diff.to_csv(
    path_or_buf="out/r2_diff.tsv",
    sep="\t",
    index=False,
)