In [1]:
%load_ext lab_black

from scipy.interpolate import CubicSpline
from scipy import stats
import numpy as np
import admix
import pandas as pd
import itertools
import os
import glob
from tqdm import tqdm
import admix_genet_cor

In [2]:
# Download link for the supplementary tables workbook (.xlsx).
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"
# Absolute, site-specific dataset root; PFILE_DIR is its "imputed" subdirectory.
ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr"
)
PFILE_DIR = os.path.join(ROOT_DIR, "imputed")
# One trait per phenotype table under out/pheno: the trait name is the file name
# up to its first ".".  glob.glob returns paths in arbitrary order, so the names
# are sorted to make the trait order reproducible from run to run.
trait_list = sorted(
    os.path.basename(pheno_path).split(".")[0]
    for pheno_path in glob.glob(os.path.join("out/pheno", "*.tsv"))
)

In [3]:
# Exclude the `total_wbc_cnt_duffy` trait.  The set difference is sorted because a
# bare list(set(...)) is ordered by string hashes, which are randomised per
# interpreter session; without sorting, every table derived from trait_list
# would come out in a different row order on each kernel restart.
trait_list = sorted(set(trait_list) - {"total_wbc_cnt_duffy"})

In [4]:
# Full factorial grid of GRM settings: SNP set x "hermodel" x MAF threshold.
grm_settings = list(
    itertools.product(
        ["hm3", "imputed"],
        ["mafukb", "gcta"],
        [0.005, 0.05],
    )
)
df_params = pd.DataFrame(grm_settings, columns=["snpset", "hermodel", "maf"])

# File-name prefix for each setting.  str(maf)[2:] strips the leading "0." so
# that 0.005 becomes "005" and 0.05 becomes "05".
df_params["grm_prefix"] = [
    f"{snpset}.{hermodel}.{str(maf)[2:]}"
    for snpset, hermodel, maf in zip(
        df_params["snpset"], df_params["hermodel"], df_params["maf"]
    )
]
# Re-bind df_params to the cross product of GRM prefixes and traits: one row
# per (grm_prefix, trait) pair, with the GRM prefix varying slowest.
grm_trait_pairs = list(itertools.product(df_params["grm_prefix"], trait_list))
df_params = pd.DataFrame(grm_trait_pairs, columns=["grm_prefix", "trait"])

In [6]:
# dict_loglik[trait][grm_prefix] holds the log-likelihood curve of one fit,
# spline-interpolated from the coarse rho grid onto the dense grid `xs`.
dict_loglik = {trait: dict() for trait in trait_list}
rho_list = np.linspace(0, 1, 21)  # rho values for which a REML result is read
xs = np.linspace(0, 1, 1001)  # dense grid on which the spline is evaluated

for _, param in tqdm(df_params.iterrows(), total=len(df_params)):
    est_dir = f"out/OLD-gcta-estimate-all-duffy-covar/{param.trait}-{param.grm_prefix}"
    loglik_list = [
        admix.tools.gcta.read_reml(
            # Round before converting: rho * 100 is a floating-point product and
            # may land a hair below the intended integer, in which case a bare
            # int() would truncate to the previous value and name the wrong file.
            os.path.join(est_dir, f"rho{int(round(rho * 100))}")
        )["loglik"]
        for rho in rho_list
    ]
    # Cubic spline through the (rho, loglik) points, evaluated on the dense grid.
    cs = CubicSpline(rho_list, loglik_list)
    dict_loglik[param.trait][param.grm_prefix] = cs(xs)

100%|██████████| 192/192 [00:20<00:00,  9.30it/s]


In [8]:
# Meta-analysis across traits: for every GRM setting, add up the per-trait
# log-likelihood curves, then print the grid point that maximises the sum
# together with the result of admix_genet_cor.hdi on the summed curve.
for grm_prefix in df_params.grm_prefix.unique():
    meta_ll = sum(dict_loglik[trait][grm_prefix] for trait in trait_list)
    # Take the maximiser from the grid itself instead of dividing the index by
    # a hard-coded 1000, so the value stays correct if `xs` is ever redefined.
    # (May differ from index / 1000 in the last floating-point digit.)
    meta_mode = xs[meta_ll.argmax()]
    print(grm_prefix, meta_mode, admix_genet_cor.hdi(xs, meta_ll))

hm3.mafukb.005 0.962 (0.933, 0.988)
hm3.mafukb.05 0.938 (0.906, 0.966)
hm3.gcta.005 0.97 (0.9450000000000001, 0.994)
hm3.gcta.05 0.941 (0.911, 0.967)
imputed.mafukb.005 0.938 (0.9, 0.972)
imputed.mafukb.05 0.894 (0.849, 0.932)
imputed.gcta.005 0.941 (0.906, 0.972)
imputed.gcta.05 0.897 (0.854, 0.934)


In [9]:
# Per-(trait, GRM) summary of each log-likelihood curve: the grid point at its
# maximum ("mode") and the interval returned by admix_genet_cor.hdi.
# Rows are collected under their own name so that `df_plot` is only ever bound
# to the final DataFrame, never to an intermediate dict.
plot_columns = ["trait", "grm_prefix", "mode", "lower", "upper"]
plot_records = []

for _, param in tqdm(df_params.iterrows(), total=len(df_params)):
    trait, grm_prefix = param.trait, param.grm_prefix
    loglik_curve = dict_loglik[trait][grm_prefix]
    # Read the mode off the grid rather than dividing the index by a hard-coded
    # 1000, which silently breaks if the resolution of `xs` changes.
    mode = xs[loglik_curve.argmax()]
    interval = admix_genet_cor.hdi(xs, loglik_curve)
    # A list return value is treated as "more than one interval"; such curves
    # have no single (lower, upper) pair, so they are reported and left out.
    if isinstance(interval, list):
        print(f"skipping {trait}, {grm_prefix}, which has more than 1 interval")
        continue
    plot_records.append((trait, grm_prefix, mode, interval[0], interval[1]))

# Explicit columns keep the frame well-formed even if every row was skipped.
df_plot = pd.DataFrame(plot_records, columns=plot_columns)
df_plot["length"] = df_plot["upper"] - df_plot["lower"]

# fig, ax = plt.subplots(figsize=(3.5, 7), dpi=150)
# ax.errorbar(
#     x=df_plot["mode"],
#     y=np.arange(len(df_plot)),
#     xerr=(df_plot["mode"] - df_plot["lower"], df_plot["upper"] - df_plot["mode"]),
#     fmt=".",
# )
# ax.axvline(x=1.0, color="red", alpha=0.5)
# ax.set_yticks(np.arange(len(df_plot)))
# ax.set_yticklabels(
#     [dict_trait_display_name[trait] for trait in df_plot["trait"]], fontsize=9
# )
# ax.set_ylim(-1, len(df_plot))
# plt.tight_layout()
# # plt.savefig("results/genome-wide.pdf", bbox_inches="tight")
# plt.show()

100%|██████████| 192/192 [00:00<00:00, 979.93it/s]


skipping a1c, hm3.gcta.005, which has more than 1 interval


In [10]:
# Write the per-trait summary as a tab-separated table.  The output directory
# is created first so the write cannot fail on a fresh checkout after all of
# the upstream computation has already run.
os.makedirs("results", exist_ok=True)
df_plot.to_csv("results/all-duffy-covar.tsv", index=False, sep="\t")