In [1]:
%load_ext lab_black
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
import subprocess
from tqdm import tqdm
import itertools
import matplotlib.pyplot as plt

In [2]:
# Default analysis configuration: population group, trait and stratifying covariate.
group = "white"
trait = "log_BMI"
covar = "SEX"

In [3]:
# Load a single prediction table (seed 1, "mean+var" method) for a quick look.
# NOTE(review): the path is hard-coded to height/all and does not use the
# `group`/`trait` variables defined in the notebook -- confirm this is intended.
df = pd.read_csv(
    "out/calibrate/height.all/1.mean+var.pred.tsv", sep="\t", index_col=0
)

In [4]:
# Mean PHENO within each SEX level (the cell's displayed result).
df.groupby("SEX").apply(lambda sex_rows: sex_rows["PHENO"].mean())

SEX
-0.891061    161.814139
 1.122248    174.608634
dtype: float64

In [5]:
def regression(trait, group, covar, method, n_seeds=30):
    """Summarise per-stratum calibration statistics, averaged over seeds.

    For every seed in ``1..n_seeds`` the table
    ``out/calibrate/{trait}.{group}/{seed}.{method}.pred.tsv`` is read and its
    rows are split into strata of ``covar``: quintiles (via ``pd.qcut``) when
    the covariate has more than five distinct values, otherwise the raw
    covariate values. Within each stratum ``QPHENO`` is regressed on
    ``pred_mean`` (with an intercept):

    * an OLS fit supplies ``R2`` and ``std(resid)``;
    * a WLS fit weighted by ``1 / pred_std ** 2`` supplies ``slope``;
    * ``std(pred)`` and ``std(y)`` are the spreads of ``pred_mean`` and
      ``QPHENO``.

    All standard deviations are population values (``ddof=0``).

    Parameters
    ----------
    trait, group, method : str
        Components of the input file path.
    covar : str
        Column used for stratification. Only ``"PC1"`` and ``"SEX"`` are
        supported because the row labels of the result are hard-coded.
    n_seeds : int, default 30
        Number of seed files to aggregate (seeds ``1`` to ``n_seeds``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_stats, df_stats_sem)``: the across-seed mean of each statistic
        and its standard error (population std across seeds divided by
        ``sqrt(n_seeds)``), one row per stratum.

    Raises
    ------
    NotImplementedError
        If ``covar`` is neither ``"PC1"`` nor ``"SEX"``. Raised before any
        file is read.
    ValueError
        If ``n_seeds`` is smaller than 1, or if the number of strata found
        does not match the hard-coded row labels.
    """
    # Resolve the row labels first so an unsupported covariate fails
    # immediately instead of after every seed has been loaded and fitted.
    if covar == "PC1":
        index = pd.Index([1, 2, 3, 4, 5], name="PC1 q")
    elif covar == "SEX":
        # NOTE(review): labels are applied positionally to the sorted SEX
        # values; this assumes the smaller code is female -- confirm.
        index = pd.Index(["Female", "Male"], name="Sex")
    else:
        raise NotImplementedError(f"no stratum labels defined for covar={covar!r}")

    if n_seeds < 1:
        raise ValueError("n_seeds must be at least 1")

    columns = ["R2", "std(resid)", "std(pred)", "std(y)", "slope"]
    df_stats_list = []
    for seed in range(1, n_seeds + 1):
        df = pd.read_csv(
            f"out/calibrate/{trait}.{group}/{seed}.{method}.pred.tsv",
            sep="\t",
            index_col=0,
        )
        if len(df[covar].unique()) > 5:
            df["q"] = pd.qcut(df[covar], q=5).cat.codes
        else:
            df["q"] = df[covar].values

        rows = []
        for _, dfq in df.groupby("q"):
            x, y = dfq["pred_mean"], dfq["QPHENO"]
            # Build the design matrix once and share it between both fits.
            exog = sm.add_constant(x)
            ols_fit = sm.OLS(endog=y, exog=exog).fit()
            wls_fit = sm.WLS(
                endog=y, exog=exog, weights=(1 / dfq["pred_std"]) ** 2
            ).fit()
            rows.append(
                {
                    "R2": ols_fit.rsquared,
                    "std(resid)": np.std(ols_fit.resid),
                    "std(pred)": np.std(x),
                    "std(y)": np.std(y),
                    # params is label-indexed (const, pred_mean); use explicit
                    # positional access because integer keys on a labelled
                    # Series are deprecated in pandas.
                    "slope": wls_fit.params.iloc[1],
                }
            )
        df_stats_list.append(pd.DataFrame(rows, columns=columns))

    # Shape (n_seeds, n_strata, n_stats); np.stack raises if seeds disagree
    # on the number of strata.
    stacked = np.stack([seed_stats.to_numpy() for seed_stats in df_stats_list])

    df_stats = pd.DataFrame(stacked.mean(axis=0), index=index, columns=columns)
    df_stats_sem = pd.DataFrame(
        stacked.std(axis=0) / np.sqrt(len(df_stats_list)),
        index=index,
        columns=columns,
    )
    return df_stats, df_stats_sem

In [6]:
# Write one sheet per (trait, method) to case-study.xlsx; every cell is the
# across-seed mean followed by its standard error in parentheses.
with pd.ExcelWriter("case-study.xlsx") as writer:
    for group, trait, covar in [("all", "LDL", "PC1"), ("white", "log_BMI", "SEX")]:
        for method in ["mean+var", "mean+var+interact"]:
            df_stats, df_sem = regression(
                trait=trait, group=group, covar=covar, method=method
            )
            df_tbl = (
                df_stats.round(3).astype(str)
                + "("
                + df_sem.round(3).astype(str)
                + ")"
            )
            df_tbl.to_excel(writer, sheet_name=f"{trait}.{method}", index=True)