In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import submitit
import admix
import numpy as np
import pandas as pd
import calprs
import os
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import glob
from tqdm import tqdm
import itertools
from typing import List

In [2]:
from utils import DATA_DIR, COVAR_COLS, load_trait_info

In [3]:
# Columns passed to compute_r2 as `test_cols`: each one is used in turn as the
# grouping variable when comparing prediction R2 between groups.
TEST_COLS = [
    "SEX",
    "glasses",
    "AGE",
    "years_of_edu",
    "income",
    "DEPRIVATION_INDEX",
    "PC1",
    "PC2",
]

print(f"Covariates: {', '.join(COVAR_COLS)}")
print(f"Testing: {', '.join(TEST_COLS)}")

Covariates: AGE, SEX, DEPRIVATION_INDEX, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10
Testing: SEX, glasses, AGE, years_of_edu, income, DEPRIVATION_INDEX, PC1, PC2


In [4]:
def compute_r2(
    trait: str,
    indiv_group: str,
    out_prefix: str,
    test_cols: List[str],
    n_bootstrap: int = 1000,
):
    """
    Compute R2 across covariate for trait and group of individuals

    The phenotype is first residualized on COVAR_COLS (OLS with intercept).
    Prediction performance of the "MEAN" column against that residual is then
    summarized once over all individuals (baseline) and once per column in
    `test_cols`, using that column as the grouping variable.

    Parameters
    ----------
    trait: str
        trait to load
    indiv_group: str
        group of individuals
    out_prefix: str
        output prefix
        <out_prefix>.baseline.tsv and <out_prefix>.r2_diff.tsv will be produced
        (the parent directory is created if it does not exist)
    test_cols: List[str]
        columns used one at a time as the grouping variable. A column with more
        than 5 unique values is replaced (in the loaded table) by its quintile
        code before grouping.
    n_bootstrap: int
        forwarded to `calprs.summarize_pred` as `n_bootstrap`

    Returns
    -------
    None. Two files are written:
    <out_prefix>.baseline.tsv: baseline summary, written without a header row.
    <out_prefix>.r2_diff.tsv: one row per test column with columns
        `test_col`, `r2_diff` (r2 of the last group minus r2 of the first
        group) and `prob>0` (fraction of the returned r2_diff values that are
        greater than 0).
    """
    # Create the output directory up front so that a missing directory fails
    # (or is fixed) before the bootstrap work rather than after it.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df_trait = load_trait_info(
        trait, indiv_group, list(set(test_cols) | set(COVAR_COLS))
    )
    # residual after regressing out COVAR_COLS
    df_trait["PHENO_RESID"] = (
        sm.OLS(
            df_trait["PHENO"].values,
            sm.add_constant(df_trait[COVAR_COLS]),
            missing="drop",
        )
        .fit()
        .resid
    )
    # baseline
    # NOTE: the original called `admix_prs.summarize_pred`, but `admix_prs` is
    # never imported in this notebook (NameError on a fresh kernel / in a
    # submitted job); `calprs` is the imported package.
    df_baseline = calprs.summarize_pred(
        df_trait,
        y_col="PHENO_RESID",
        pred_col="MEAN",
    )
    # R2 diff
    df_out = []
    for col in test_cols:
        n_unique = len(np.unique(df_trait[col].values))
        if n_unique > 5:
            # NOTE(review): `.cat.codes` encodes missing values as -1, which
            # would form an extra lowest group -- confirm that load_trait_info
            # returns no missing values in the test columns.
            df_trait[col] = pd.qcut(df_trait[col], q=5, duplicates="drop").cat.codes
            print(f"Converting column '{col}' to 5 quintiles")
        df_res, df_res_se, r2_diff = calprs.summarize_pred(
            df_trait,
            y_col="PHENO_RESID",
            pred_col="MEAN",
            group_col=col,
            n_bootstrap=n_bootstrap,
            return_r2_diff=True,
        )
        # difference between the last and the first group of `df_res`, plus
        # the fraction of r2_diff values above zero
        df_out.append(
            [col, df_res["r2"].iloc[-1] - df_res["r2"].iloc[0], np.mean(r2_diff > 0)]
        )

    df_baseline.to_csv(out_prefix + ".baseline.tsv", sep="\t", header=False)
    pd.DataFrame(df_out, columns=["test_col", "r2_diff", "prob>0"]).to_csv(
        out_prefix + ".r2_diff.tsv", sep="\t", index=False
    )

In [5]:
# Trait name = file name under DATA_DIR/pred up to the first ".".
# A set removes duplicates (several files can share a trait prefix); sorting
# makes the trait / job order reproducible, since iteration order of a set of
# strings changes between interpreter sessions.
trait_list = sorted(
    {
        os.path.basename(path).split(".")[0]
        for path in glob.glob(os.path.join(DATA_DIR, "pred/*"))
    }
)
print(f"{len(trait_list)} traits in total.")

111 traits in total.


In [6]:
# Job table: one row per (trait, group) combination, plus the output prefix
# under out/r2-diff/ that the corresponding job writes to.
df_params = pd.DataFrame(
    list(itertools.product(trait_list, ["white_british", "other"])),
    columns=["trait", "group"],
)
df_params["out_prefix"] = df_params.apply(
    lambda job: "out/r2-diff/{}-{}".format(job.trait, job.group), axis=1
)
print("{} jobs in total".format(len(df_params)))

222 jobs in total


In [7]:
# Executor used to submit the jobs; it is pointed at ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Shell lines handed to the executor as `setup`
# (presumably run before each job starts -- confirm against the executor docs).
sge_setup_lines = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]

# Per-job resource request and queue.
executor.update_parameters(
    time_min=10,
    memory_g=12,
    queue="highp",
    setup=sge_setup_lines,
)

In [8]:
# A job counts as finished once its <out_prefix>.r2_diff.tsv exists;
# keep only the jobs for which that file is still missing.
is_finished = df_params["out_prefix"].map(
    lambda prefix: os.path.exists(prefix + ".r2_diff.tsv")
)
df_todo_params = df_params[~is_finished]
print(f"{len(df_todo_params)} jobs remains")

0 jobs remains


In [9]:
# Submit compute_r2 for every remaining row of the job table; the argument
# sequences are parallel (one entry per job) and every job gets TEST_COLS.
jobs = executor.map_array(
    compute_r2,
    df_todo_params["trait"],
    df_todo_params["group"],
    df_todo_params["out_prefix"],
    [TEST_COLS for _ in range(len(df_todo_params))],
)



# Summarize the results

In [10]:
# Collect the per-trait job outputs into two tables per group:
#   out/baseline_r2.<group>.tsv  (trait, baseline_r2)
#   out/r2_diff.<group>.tsv      (trait, test_col, r2_diff, prob>0)
for group in ["white_british", "other"]:
    df_group_params = df_params[df_params.group == group]
    baseline_rows = []
    r2_diff_frames = []
    for _, row in tqdm(
        df_group_params.iterrows(), total=len(df_group_params), desc=group
    ):
        baseline_file = row.out_prefix + ".baseline.tsv"
        r2_diff_file = row.out_prefix + ".r2_diff.tsv"
        # compute_r2 writes the baseline file before the r2_diff file, so an
        # interrupted job can leave only the former behind: require both.
        missing_files = [
            path for path in (baseline_file, r2_diff_file) if not os.path.exists(path)
        ]
        if missing_files:
            for path in missing_files:
                print(f"{path} does not exist.")
            continue

        # The baseline file has no header: first column is the label, second
        # the value. Squeeze along columns only, so that a one-row file still
        # yields a label-indexed Series rather than a scalar.
        baseline = pd.read_csv(
            baseline_file, sep="\t", header=None, index_col=0
        ).squeeze("columns")
        baseline_rows.append([row.trait, baseline["r2"]])

        trait_r2_diff = pd.read_csv(r2_diff_file, sep="\t")
        trait_r2_diff.insert(0, "trait", row.trait)
        r2_diff_frames.append(trait_r2_diff)

    df_baseline_r2 = pd.DataFrame(baseline_rows, columns=["trait", "baseline_r2"])
    if r2_diff_frames:
        df_r2_diff = pd.concat(r2_diff_frames)
    else:
        # pd.concat raises on an empty list; write header-only tables instead
        # so that a group without any finished job is visible downstream.
        print(f"No results found for group '{group}'; writing header-only tables.")
        df_r2_diff = pd.DataFrame(columns=["trait", "test_col", "r2_diff", "prob>0"])
    df_baseline_r2.to_csv(f"out/baseline_r2.{group}.tsv", sep="\t", index=False)
    df_r2_diff.to_csv(f"out/r2_diff.{group}.tsv", sep="\t", index=False)

111it [00:00, 123.93it/s]
111it [00:00, 126.80it/s]
