In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
from typing import List
import matplotlib.pyplot as plt
import calpgs
from tqdm import tqdm
import pickle
from admix.data import quantile_normalize

# Seed NumPy's global RNG so stochastic steps run after this cell are repeatable.
np.random.seed(42)
# Render all matplotlib text in Arial.
plt.rcParams["font.family"] = "Arial"

In [2]:
# Read the covariate table; the first file column is used as the index on load
# and then replaced by a plain 0..n-1 RangeIndex.
df_cov = pd.read_csv("data/cov.tsv", sep="\t", index_col=0)
df_cov = df_cov.reset_index(drop=True)
n_indiv = len(df_cov)

# Replace PC1 by its quantile-normalized values before standardizing.
df_cov = df_cov.assign(PC1=quantile_normalize(df_cov["PC1"].values))

# Standardize every column: subtract the column mean, divide by the column std.
df_cov = (df_cov - df_cov.mean(axis=0)) / df_cov.std(axis=0)

In [3]:
# Display the standardized covariate table.
df_cov

Unnamed: 0,AGE,SEX,PC1
0,-0.449662,-0.875214,-1.254701
1,-1.614741,-0.875214,0.447978
2,0.365893,-0.875214,-1.503822
3,-1.614741,-0.875214,1.049788
4,0.715416,-0.875214,-0.695179
...,...,...,...
76359,-1.265217,-0.875214,0.709081
76360,-1.148709,1.142563,0.716275
76361,1.530971,-0.875214,0.304237
76362,0.132877,-0.875214,0.737349


In [4]:
from utils import simulate_data

In [5]:
# Show the covariate column names available to the simulation.
df_cov.columns

Index(['AGE', 'SEX', 'PC1'], dtype='object')

In [6]:
# Calibration-set sizes to simulate (passed to simulate_data as n_train).
N_CALIBRATE_LIST = [100, 500, 2500, 5000]
# Number of simulation replicates per calibration size (seeds 0..N_SIM-1).
N_SIM = 100

In [7]:
# Names of the simulation scenarios; each one must be accepted by make_dataset.
DATASET_LIST = ["variable_slope_large", "variable_slope_small", "constant_slope"]

In [8]:
def make_dataset(name):
    """Simulate and cache train/test data sets for one simulation scenario.

    For every calibration size in ``N_CALIBRATE_LIST`` and every seed in
    ``range(N_SIM)``, NumPy's global RNG is re-seeded with that seed and
    ``simulate_data`` is called on ``df_cov``. The resulting
    ``(df_train, df_test)`` pairs are collected in a dict keyed by
    ``(n_calibrate, seed)`` and pickled to ``cache/{name}.data.pkl``.

    Parameters
    ----------
    name : str
        Scenario name: "variable_slope_large", "variable_slope_small" or
        "constant_slope". The scenarios share the same ``var_effects`` and
        differ only in ``slope_effects``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the known scenarios.
    """
    # effects for AGE, SEX, PC1
    var_effects = [0.25, 0.2, 0.15]
    slope_effects_by_name = {
        "variable_slope_large": [0, 0.3, -0.05],
        "variable_slope_small": [0, 0.1, -0.05],
        "constant_slope": [0, 0, 0],
    }
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if name not in slope_effects_by_name:
        raise ValueError(
            f"unknown dataset name {name!r}; "
            f"expected one of {sorted(slope_effects_by_name)}"
        )
    slope_effects = slope_effects_by_name[name]

    # Create the output directory before the simulations run, so a missing
    # directory cannot discard the work when the pickle is written at the end.
    os.makedirs("cache", exist_ok=True)

    dict_data = {}
    for n_calibrate in N_CALIBRATE_LIST:
        for seed in range(N_SIM):
            # Re-seed per replicate so each (n_calibrate, seed) entry is
            # reproducible independently of execution order.
            np.random.seed(seed)
            df_train, df_test = simulate_data(
                df_cov=df_cov,
                var_effects=var_effects,
                baseline_r2=0.3,
                n_train=n_calibrate,
                n_test=5000,
                slope_effects=slope_effects,
            )
            dict_data[(n_calibrate, seed)] = df_train, df_test

    with open(f"cache/{name}.data.pkl", "wb") as f:
        pickle.dump(dict_data, f)

In [9]:
# Simulate and cache every scenario (each call writes cache/<name>.data.pkl).
for name in DATASET_LIST:
    make_dataset(name)

# Compute experiment raw data

In [10]:
from utils import evaluate_metrics

In [11]:
def _adjust_cols_for(adjust: str):
    """Translate an adjustment-scheme label into the ``adjust_cols`` argument.

    Returns a list of column names, or ``None`` for the "none" scheme.
    Raises ``NotImplementedError`` for an unrecognized label.
    """
    base_cols = ["AGE", "SEX", "PC1"]
    if adjust == "all":
        return base_cols
    if adjust == "except-age":
        return ["SEX", "PC1"]
    if adjust == "none":
        return None
    if adjust.startswith("dummy"):
        # Slice off the literal prefix. str.lstrip("dummy") would strip any
        # leading run of the characters d/u/m/y, not the prefix "dummy".
        n_dummy = int(adjust[len("dummy") :])
        # NOTE(review): assumes the cached frames carry DUMMY0..DUMMY{n-1}
        # columns (presumably added by simulate_data) -- confirm.
        return base_cols + [f"DUMMY{i}" for i in range(n_dummy)]
    raise NotImplementedError


def evaluate(dataset: str, fit_slope: bool, n_calibrate_list: List[int]):
    """Evaluate calibration metrics on the cached simulations of one scenario.

    Loads ``cache/{dataset}.data.pkl`` and, for each adjustment scheme
    ("all", "except-age", "none", "dummy5", "dummy25", "dummy50"), each
    calibration size in ``n_calibrate_list`` and each seed in ``range(N_SIM)``,
    calls ``evaluate_metrics`` on the cached ``(df_train, df_test)`` pair.
    The per-replicate results are reshaped into two long tables and written to
    ``cache/{dataset}.fitslope.*`` or ``cache/{dataset}.noslope.*``:

    * ``.stats.tsv``  -- columns: adjust, n, seed, col, coverage, r2, length
    * ``.params.tsv`` -- columns: adjust, n, seed, param, est

    Parameters
    ----------
    dataset : str
        Scenario name; selects the cache file to read and the output prefix.
    fit_slope : bool
        Forwarded to ``evaluate_metrics``; also selects the output suffix.
    n_calibrate_list : List[int]
        Calibration sizes to evaluate. Each ``(n, seed)`` pair must be a key
        of the cached dict, otherwise a ``KeyError`` is raised.

    Returns
    -------
    None
        Results are written to disk only.
    """
    # NOTE: pickle.load can execute arbitrary code; only read cache files that
    # were produced locally (e.g. by make_dataset).
    with open(f"cache/{dataset}.data.pkl", "rb") as f:
        dict_data = pickle.load(f)

    stats_tables = []
    params_tables = []

    for adjust in ["all", "except-age", "none", "dummy5", "dummy25", "dummy50"]:
        adjust_cols = _adjust_cols_for(adjust)

        # One (replicates x columns) frame per calibration size and metric.
        dict_df_coverage = dict()
        dict_df_r2 = dict()
        dict_df_length = dict()
        dict_df_params = dict()

        for n_calibrate in tqdm(n_calibrate_list):
            coverage_reps = []
            r2_reps = []
            length_reps = []
            params_reps = []
            for seed in range(N_SIM):
                df_train, df_test = dict_data[(n_calibrate, seed)]
                tmp_cov, tmp_r2, tmp_length, tmp_params = evaluate_metrics(
                    df_train, df_test, adjust_cols=adjust_cols, fit_slope=fit_slope
                )
                coverage_reps.append(tmp_cov)
                r2_reps.append(tmp_r2)
                length_reps.append(tmp_length)
                params_reps.append(tmp_params)
            # Stack replicates side by side, then transpose: one row per seed.
            dict_df_coverage[n_calibrate] = pd.concat(coverage_reps, axis=1).T
            dict_df_r2[n_calibrate] = pd.concat(r2_reps, axis=1).T
            dict_df_length[n_calibrate] = pd.concat(length_reps, axis=1).T
            dict_df_params[n_calibrate] = pd.concat(params_reps, axis=1).T

        stats_long = {
            "n": [],
            "seed": [],
            "col": [],
            "coverage": [],
            "r2": [],
            "length": [],
        }
        params_long = {"n": [], "seed": [], "param": [], "est": []}

        # summarize coverage / R2 / length; the r2 and length frames are
        # indexed with the coverage frame's column labels.
        for n in dict_df_coverage:
            for col in dict_df_coverage[n].columns:
                covs = dict_df_coverage[n][col].values
                stats_long["n"].extend([n] * len(covs))
                stats_long["seed"].extend(np.arange(len(covs)))
                stats_long["col"].extend([col] * len(covs))
                stats_long["coverage"].extend(covs)
                stats_long["r2"].extend(dict_df_r2[n][col])
                stats_long["length"].extend(dict_df_length[n][col])

        # summarize parameter estimation
        for n in dict_df_params:
            for col in dict_df_params[n].columns:
                ests = dict_df_params[n][col].values
                params_long["n"].extend([n] * len(ests))
                params_long["seed"].extend(np.arange(len(ests)))
                params_long["param"].extend([col] * len(ests))
                params_long["est"].extend(ests)

        # Tag each per-scheme table with its label as the leading column.
        stats_table = pd.DataFrame(stats_long)
        stats_table.insert(0, "adjust", adjust)
        stats_tables.append(stats_table)

        params_table = pd.DataFrame(params_long)
        params_table.insert(0, "adjust", adjust)
        params_tables.append(params_table)

    # format into long tables covering all adjustment schemes
    df_stats = pd.concat(stats_tables)
    df_params = pd.concat(params_tables)

    out_prefix = f"cache/{dataset}" + (".fitslope" if fit_slope else ".noslope")

    df_stats.to_csv(out_prefix + ".stats.tsv", sep="\t", index=False)
    df_params.to_csv(out_prefix + ".params.tsv", sep="\t", index=False)

In [12]:
# Run the evaluation for every scenario. Only fit_slope=False is enabled here;
# add True to the inner list to also run the slope-fitting variant, which is
# evaluated on the calibration sizes 2500 and 5000 only.
for dataset in DATASET_LIST:
    for fit_slope in [False]:
        if fit_slope:
            n_calibrate_list = [2500, 5000]
        else:
            # Reuse the sizes the cache was generated with, so the lookup keys
            # in evaluate() cannot drift from what make_dataset stored.
            n_calibrate_list = N_CALIBRATE_LIST
        evaluate(
            dataset=dataset, fit_slope=fit_slope, n_calibrate_list=n_calibrate_list
        )

100%|██████████| 4/4 [00:16<00:00,  4.22s/it]
100%|██████████| 4/4 [00:16<00:00,  4.16s/it]
100%|██████████| 4/4 [00:16<00:00,  4.18s/it]
100%|██████████| 4/4 [00:19<00:00,  4.79s/it]
100%|██████████| 4/4 [00:52<00:00, 13.18s/it]
100%|██████████| 4/4 [01:32<00:00, 23.13s/it]
100%|██████████| 4/4 [00:18<00:00,  4.57s/it]
100%|██████████| 4/4 [00:17<00:00,  4.42s/it]
100%|██████████| 4/4 [00:18<00:00,  4.58s/it]
100%|██████████| 4/4 [00:21<00:00,  5.28s/it]
100%|██████████| 4/4 [00:34<00:00,  8.57s/it]
100%|██████████| 4/4 [01:41<00:00, 25.45s/it]
100%|██████████| 4/4 [00:18<00:00,  4.72s/it]
100%|██████████| 4/4 [00:19<00:00,  4.75s/it]
100%|██████████| 4/4 [00:17<00:00,  4.42s/it]
100%|██████████| 4/4 [00:24<00:00,  6.00s/it]
100%|██████████| 4/4 [00:36<00:00,  9.06s/it]
100%|██████████| 4/4 [01:33<00:00, 23.42s/it]
