In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import submitit
import admix
import numpy as np
import pandas as pd
import calpgs
import os
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import glob
from tqdm import tqdm
import itertools
from typing import List
import subprocess
from admix.data import quantile_normalize

In [2]:
# Directory holding one `<trait>.tsv.gz` table per trait; every trait file read in
# this notebook is resolved relative to it.
DATA_DIR = "../compile-data/out/per-trait-info/"

In [3]:
# Columns passed as `test_cols` to `compute_r2`, where they are handed to
# `calpgs group-stats --group`: demographic / lifestyle variables followed by PC1..PC10.
TEST_COLS = [
    *(
        "AGE",
        "SEX",
        "DEPRIVATION_INDEX",
        "log_BMI",
        "income",
        "ever_smoked",
        "drink_alcohol",
        "glasses",
        "years_of_edu",
    ),
    *(f"PC{idx}" for idx in range(1, 11)),
]

# Columns regressed out of the phenotype before evaluation: AGE, SEX and PC1..PC10.
# (Alternative tried previously: regress out every column of TEST_COLS, i.e. use the
# same list for both COVAR_COLS and TEST_COLS.)
COVAR_COLS = ["AGE", "SEX", *(f"PC{idx}" for idx in range(1, 11))]

In [4]:
# Echo the two column lists so the configuration is visible in the rendered notebook.
for label, cols in (("Covariates:", COVAR_COLS), ("Testing:", TEST_COLS)):
    print(label, ", ".join(cols))

Covariates: AGE, SEX, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10
Testing: AGE, SEX, DEPRIVATION_INDEX, log_BMI, income, ever_smoked, drink_alcohol, glasses, years_of_edu, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10


In [5]:
# Load the height table as an example trait for the column inspection below.
df_trait = pd.read_csv(
    os.path.join(DATA_DIR, "height.tsv.gz"),
    sep="\t",
    index_col=0,
)

In [11]:
# Inspect the distinct values stored in the `drink_alcohol` column.
df_trait["drink_alcohol"].unique()

array([ 1.,  0., nan])

In [14]:
# Number of columns that will be tested.
len(TEST_COLS)

19

In [13]:
# For every tested column print "<col>: <n unique>"; when there are fewer than 10
# distinct values, also append the values themselves.
for col in TEST_COLS:
    distinct = df_trait[col].unique()
    fields = [col, str(len(distinct))]
    if len(distinct) < 10:
        fields.append(",".join(map(str, distinct)))
    print(": ".join(fields))

AGE: 34
SEX: 2: 1,0
DEPRIVATION_INDEX: 37749
log_BMI: 24050
income: 6: 1.41421,2.0,2.23607,1.0,nan,1.73205
ever_smoked: 3: 1.0,0.0,nan
drink_alcohol: 3: 1.0,0.0,nan
glasses: 3: 0.0,1.0,nan
years_of_edu: 6: 10.0,20.0,nan,13.0,19.0,15.0
PC1: 83739
PC2: 109388
PC3: 110619
PC4: 112421
PC5: 112133
PC6: 111659
PC7: 111722
PC8: 111893
PC9: 111243
PC10: 111880


In [15]:
def compute_r2(
    trait: str,
    indiv_group: str,
    out_prefix: str,
    test_cols: List[str],
    n_bootstrap: int = 1000,
):
    """
    Compute R2 across covariates for a trait and a group of individuals.

    The phenotype is first residualized on `COVAR_COLS` (missing covariate values
    are mean-imputed), then baseline statistics are computed in-process with
    `calpgs.compute_group_stats` and per-column statistics by invoking the
    `calpgs group-stats` command line tool on a temporary copy of the table.

    Parameters
    ----------
    trait: str
        trait to load; `<DATA_DIR>/<trait>.tsv.gz` is read
    indiv_group: str
        group of individuals, one of
        - "white": rows whose `group` column equals "United Kingdom"
        - "other": all remaining rows
        - "all": every row
    out_prefix: str
        output prefix
        <out_prefix>.baseline.tsv is written by this function; `out_prefix` is
        also passed to `calpgs group-stats --out` (the rest of this notebook
        expects that command to produce <out_prefix>.r2diff.tsv).
        <out_prefix>.tmp.tsv is used as a scratch file and removed afterwards.
    test_cols: List[str]
        columns passed (comma-joined) to `calpgs group-stats --group`
    n_bootstrap: int
        currently unused: the value is not forwarded to `calpgs`. Kept so that
        existing callers remain valid.

    Raises
    ------
    NotImplementedError
        if `indiv_group` is not one of "white", "other", "all"
    subprocess.CalledProcessError
        if the `calpgs group-stats` command exits with a non-zero status
    """
    # Fail fast on a bad group name, before any file is read.
    if indiv_group not in ("white", "other", "all"):
        raise NotImplementedError(f"Unknown indiv_group: {indiv_group!r}")

    df_trait = pd.read_csv(
        os.path.join(DATA_DIR, f"{trait}.tsv.gz"), index_col=0, sep="\t"
    )
    # Always work on an explicit copy: a new column is assigned below, and writing
    # into a filtered slice would trigger pandas' SettingWithCopyWarning.
    if indiv_group == "all":
        df_trait = df_trait.copy()
    else:
        is_uk = df_trait["group"] == "United Kingdom"
        df_trait = df_trait[is_uk if indiv_group == "white" else ~is_uk].copy()

    # trait and covar can be the same, remove trait in COVAR
    covar_cols = [col for col in COVAR_COLS if col != trait]

    # covariates are mean-imputed column by column before the regression
    df_covar = df_trait[covar_cols]
    df_covar = df_covar.fillna(df_covar.mean())

    # residual after regressing out covar_cols (plus an intercept)
    df_trait["PHENO_RESID"] = (
        sm.OLS(
            df_trait["PHENO"].values,
            sm.add_constant(df_covar),
        )
        .fit()
        .resid
    )

    # baseline: statistics of the prediction column against the residualized phenotype
    df_baseline = calpgs.compute_group_stats(
        df_trait,
        y_col="PHENO_RESID",
        pred_col="MEAN",
    )
    df_baseline.to_csv(out_prefix + ".baseline.tsv", sep="\t", header=False)

    # The command line tool reads its input from disk, so dump the table to a
    # scratch file. The command is passed as an argument list (no shell), so
    # paths containing spaces or shell metacharacters are handled safely.
    tmp_file = out_prefix + ".tmp.tsv"
    cmd = [
        "calpgs",
        "group-stats",
        "--df",
        tmp_file,
        "--y",
        "PHENO_RESID",
        "--pred",
        "MEAN",
        "--group",
        ",".join(test_cols),
        "--cor",
        "spearman",
        "--out",
        out_prefix,
    ]
    try:
        df_trait.to_csv(tmp_file, sep="\t")
        subprocess.check_call(cmd)
    finally:
        # Remove the scratch file even when writing it or running calpgs fails.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

In [16]:
# Trait names are the file names under DATA_DIR with the trailing ".tsv.gz" removed.
# The list is sorted so that the job order (and therefore the row order of the
# summary tables) is reproducible across kernel restarts; a bare `list(set(...))`
# has a run-dependent order.
trait_list = sorted(
    {
        os.path.basename(path).rsplit(".", 2)[0]
        for path in glob.glob(os.path.join(DATA_DIR, "*.tsv.gz"))
    }
)
print(f"{len(trait_list)} traits in total.")

247 traits in total.


In [17]:
# One job per (trait, individual group) combination.
df_params = pd.DataFrame(
    list(itertools.product(trait_list, ["white", "other", "all"])),
    columns=["trait", "group"],
)
# Each job writes its files under out/r2-diff/ using "<trait>-<group>" as prefix.
df_params["out_prefix"] = [
    f"out/r2-diff/{trait_name}-{group_name}"
    for trait_name, group_name in zip(df_params["trait"], df_params["group"])
]
print(f"{len(df_params)} jobs in total")
os.makedirs("out/r2-diff/", exist_ok=True)

741 jobs in total


In [18]:
# Shell lines handed to the executor as `setup` (presumably run before each job
# starts -- confirm against the submitit executor in use): put the project's
# miniconda first on PATH and set PYTHONNOUSERSITE.
job_setup_cmds = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]

# Job logs go to ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")
# NOTE(review): units assumed to be minutes / GB from the parameter names -- confirm.
executor.update_parameters(time_min=120, memory_g=12, setup=job_setup_cmds)

In [21]:
# A job counts as finished once its <out_prefix>.r2diff.tsv exists on disk;
# keep only the parameter rows whose output is still missing.
is_done = (
    df_params["out_prefix"]
    .map(lambda prefix: os.path.exists(prefix + ".r2diff.tsv"))
    .astype(bool)
)
df_todo_params = df_params.loc[~is_done]
print(f"{len(df_todo_params)} jobs remains")

0 jobs remains


In [22]:
# Submit one `compute_r2` call per remaining parameter row. The positional
# iterables line up with compute_r2's (trait, indiv_group, out_prefix, test_cols)
# parameters; the same TEST_COLS list is repeated for every job.
# NOTE(review): presumably submitted as a single job array -- confirm against the
# executor's `map_array` implementation.
jobs = executor.map_array(
    compute_r2,
    df_todo_params.trait,
    df_todo_params.group,
    df_todo_params.out_prefix,
    [TEST_COLS] * len(df_todo_params),
)



# Summarize the results

In [23]:
# Collect the per-job outputs into two tables per individual group:
#   out/baseline_r2.<group>.tsv : one row per trait with its baseline r2
#   out/r2diff.<group>.tsv      : the concatenated <out_prefix>.r2diff.tsv tables,
#                                 with the trait name inserted as first column
for group in ["white", "other", "all"]:
    df_group_params = df_params[df_params["group"] == group]
    baseline_rows = []
    r2diff_frames = []
    for _, row in tqdm(df_group_params.iterrows(), total=len(df_group_params)):
        baseline_file = row.out_prefix + ".baseline.tsv"
        r2diff_file = row.out_prefix + ".r2diff.tsv"
        # Skip the trait unless BOTH outputs exist, so that the two summary tables
        # always describe the same set of traits and a half-finished job does not
        # abort the whole summary with FileNotFoundError.
        missing_files = [
            path for path in (baseline_file, r2diff_file) if not os.path.exists(path)
        ]
        if missing_files:
            for path in missing_files:
                print(f"{path} does not exist.")
            continue

        # The baseline file is a header-less two-column table (statistic, value);
        # squeeze only the column axis so a one-row file still yields a Series.
        baseline_stats = pd.read_csv(
            baseline_file, sep="\t", header=None, index_col=0
        ).squeeze("columns")
        baseline_rows.append([row.trait, baseline_stats["r2"]])

        df_tmp = pd.read_csv(r2diff_file, sep="\t")
        df_tmp.insert(0, "trait", row.trait)
        r2diff_frames.append(df_tmp)

    if not r2diff_frames:
        # pd.concat raises on an empty list; there is nothing to write for this group.
        print(f"No results found for group '{group}', skipping.")
        continue

    df_baseline_r2 = pd.DataFrame(baseline_rows, columns=["trait", "baseline_r2"])
    df_r2_diff = pd.concat(r2diff_frames)
    df_baseline_r2.to_csv(f"out/baseline_r2.{group}.tsv", sep="\t", index=False)
    df_r2_diff.to_csv(f"out/r2diff.{group}.tsv", sep="\t", index=False)

247it [00:10, 24.45it/s]
247it [00:08, 27.57it/s]
247it [00:07, 34.05it/s]
