In [1]:
%load_ext lab_black
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
import subprocess
from tqdm import tqdm
import itertools
import matplotlib.pyplot as plt
import yaml
from glob import glob

In [2]:
# Analysis configuration: the covariate names (VAR_COLS) come from the YAML
# metadata file, the trait names from a plain-text file with one entry per line.
with open("data/meta.yaml", "r") as f:
    metadata = yaml.safe_load(f)

VAR_COLS = metadata["VAR_COLS"]

# ndmin=1 keeps trait_list one-dimensional (and therefore iterable) even when
# traits.txt contains a single trait; without it np.loadtxt returns a 0-d array
# and every later `for trait in trait_list` loop raises a TypeError.
trait_list = np.loadtxt("data/traits.txt", dtype=str, ndmin=1)

In [3]:
def summarize_params():
    """Gather the variance coefficients and their standard errors in long form.

    For every group in ("white", "other", "all") and every trait in
    ``trait_list`` the table ``out/estimate-quantify/{trait}.{group}.params.tsv``
    is read with its first column as the row index. Its ``var_coef`` and
    ``var_se`` columns are aligned to ``VAR_COLS`` by index label: labels
    absent from the file become NaN, labels not listed in ``VAR_COLS`` are
    dropped.

    Returns
    -------
    pandas.DataFrame
        One row per (group, trait, covar) with the columns
        ``group``, ``trait``, ``covar``, ``var_coef`` and ``var_se``.
    """
    groups = ["white", "other", "all"]

    # Parse each params file exactly once. Both value columns are sliced from
    # these cached frames, so the files are no longer re-read per value column.
    tables = {
        (group, trait): pd.read_csv(
            f"out/estimate-quantify/{trait}.{group}.params.tsv",
            sep="\t",
            index_col=0,
        )
        for group in groups
        for trait in trait_list
    }

    def _sum(value_col):
        """Return `value_col` for all groups/traits as a long-format frame."""
        df_numerics = []
        for group in groups:
            # Wide layout first: covariates as rows, one column per trait.
            df_params = pd.DataFrame(index=VAR_COLS, columns=trait_list)
            df_params.index.name = "covar"
            for trait in trait_list:
                # Column assignment aligns on the covariate labels.
                df_params[trait] = tables[(group, trait)][value_col]
            df_tmp = pd.melt(
                df_params.reset_index(), id_vars=["covar"], value_vars=df_params.columns
            )
            df_tmp.insert(0, "group", group)
            df_numerics.append(df_tmp)
        df_numerics = pd.concat(df_numerics).rename(
            columns={"variable": "trait", "value": value_col}
        )
        return df_numerics[["group", "trait", "covar", value_col]]

    df_var_coef = _sum("var_coef")
    df_var_se = _sum("var_se")
    # Join estimate and standard error side by side on the shared keys.
    return pd.merge(df_var_coef, df_var_se, on=["group", "trait", "covar"])


def summarize_r2(suffix):
    """Collect relative R2 differences for every trait and group.

    For each group in ("white", "other", "all") and each trait in
    ``trait_list`` two files sharing the prefix
    ``./out/estimate-quantify/{trait}.{group}.{suffix}`` are read:

    * ``.baseline.tsv`` -- header-less; the first column supplies the row
      labels and the row labelled ``r2`` supplies the baseline R2.
    * ``.r2diff.tsv`` -- must provide the columns ``group`` (renamed to
      ``covar`` here), ``r2diff`` and ``zscore``.

    Parameters
    ----------
    suffix : str
        File-name component inserted between the group and the
        ``.baseline.tsv`` / ``.r2diff.tsv`` endings.

    Returns
    -------
    pandas.DataFrame
        Columns ``group``, ``trait``, ``covar``, ``R2_reldiff``
        (``r2diff`` divided by the baseline R2), ``R2_reldiff_z`` (a copy of
        ``zscore``) and ``R2_baseline``.
    """
    df_r2 = []
    for group in ["white", "other", "all"]:
        for trait in trait_list:
            prefix = f"./out/estimate-quantify/{trait}.{group}.{suffix}"
            # Squeeze the column axis only: a bare .squeeze() would collapse a
            # baseline file with a single row into a scalar, and the label
            # lookup ["r2"] below would then fail.
            df_baseline = pd.read_csv(
                prefix + ".baseline.tsv", sep="\t", header=None, index_col=0
            ).squeeze("columns")

            # The file's own "group" column is renamed so that it does not
            # collide with the population-group column inserted next.
            df_tmp = pd.read_csv(prefix + ".r2diff.tsv", sep="\t").rename(
                columns={"group": "covar"}
            )
            df_tmp.insert(0, "group", group)
            df_tmp.insert(1, "trait", trait)
            df_tmp["R2_baseline"] = df_baseline["r2"]
            df_r2.append(df_tmp)

    df_r2 = pd.concat(df_r2)
    df_r2["R2_reldiff"] = df_r2["r2diff"] / df_r2["R2_baseline"]
    df_r2["R2_reldiff_z"] = df_r2["zscore"]
    df_r2 = df_r2[
        ["group", "trait", "covar", "R2_reldiff", "R2_reldiff_z", "R2_baseline"]
    ]
    return df_r2

In [4]:
# Build the three summary tables exported below: the variance parameters first,
# then the R2 summaries for the "total" and "resid" result sets.
df_params = summarize_params()
df_total_r2, df_resid_r2 = [summarize_r2(suffix) for suffix in ("total", "resid")]

In [5]:
# Export the R2 and variance-parameter summaries into one workbook, one sheet
# per table. The context manager saves and closes the workbook on exit:
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0, and the
# `with` form also releases the file handle if writing a sheet fails.
with pd.ExcelWriter("results/ukb-r2.xlsx", engine="xlsxwriter") as writer:
    df_total_r2.to_excel(writer, sheet_name="total-r2", index=False)
    df_resid_r2.to_excel(writer, sheet_name="resid-r2", index=False)
    df_params.to_excel(writer, sheet_name="params", index=False)

In [6]:
def summarize_calibration(trait, group):
    """Stack the per-run calibration tables of one trait/group into one frame.

    For each method in ("mean", "mean+var") every file matching
    ``out/calibrate/{trait}.{group}/*.{method}.predint.tsv`` is loaded in
    natural sort order, and the number of matches is printed. Each table gets
    a ``method`` column, a ``seed`` column (the zero-based position of the
    file in that sorted list, not a value parsed from the file name) and an
    ``r2`` column taken, aligned by row index, from the file whose path is
    obtained by replacing "predint" with "r2".

    Parameters
    ----------
    trait, group : str
        Name components of the directory that holds the calibration output.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all loaded tables; the original per-file
        row index is kept.
    """
    from natsort import natsorted

    def _read_run(method, seed_i, predint_path):
        # One run = its prediction-interval table plus the r2 table beside it.
        run = pd.read_csv(predint_path, sep="\t")
        run.insert(0, "seed", seed_i)
        run.insert(0, "method", method)
        r2_table = pd.read_csv(predint_path.replace("predint", "r2"), sep="\t")
        run["r2"] = r2_table["r2"]
        return run

    runs = []
    for method in ("mean", "mean+var"):
        pattern = f"out/calibrate/{trait}.{group}/*.{method}.predint.tsv"
        predint_paths = natsorted(glob(pattern))
        print(f"{method}: {len(predint_paths)}")
        runs.extend(
            _read_run(method, seed_i, predint_path)
            for seed_i, predint_path in enumerate(predint_paths)
        )
    return pd.concat(runs, axis=0)

In [7]:
# Write one calibration sheet per (trait, group) pair into a single workbook.
# The context manager saves and closes the workbook on exit:
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0, and the
# `with` form also releases the file handle if any sheet fails to write.
with pd.ExcelWriter("results/ukb-calibrate.xlsx", engine="xlsxwriter") as writer:
    for trait in ["LDL", "height"]:
        for group in ["white", "all"]:
            df_sum = summarize_calibration(trait=trait, group=group)
            df_sum.to_excel(writer, sheet_name=f"{trait}.{group}", index=False)

mean: 30
mean+var: 30
mean: 30
mean+var: 30
mean: 30
mean+var: 30
mean: 30
mean+var: 30
