In [1]:
%load_ext lab_black

import numpy as np
import pandas as pd

import numpy as np
import pandas as pd
import admix_genet_cor
import itertools
import admix
from tqdm import tqdm
import os
from scipy.interpolate import CubicSpline
import matplotlib.pyplot as plt
import glob
import string
from scipy import stats
import matplotlib

from utils import read_estimate

matplotlib.rcParams["font.family"] = ["Liberation Sans"]

In [2]:
def meta_analysis(effects, se, method="random", weights=None):
    # From Omer Weissbrod
    assert method in ["fixed", "random"]
    d = effects
    variances = se ** 2

    # compute random-effects variance tau2
    vwts = 1.0 / variances
    fixedsumm = vwts.dot(d) / vwts.sum()
    Q = np.sum(((d - fixedsumm) ** 2) / variances)
    df = len(d) - 1
    tau2 = np.maximum(0, (Q - df) / (vwts.sum() - vwts.dot(vwts) / vwts.sum()))

    # defing weights
    if weights is None:
        if method == "fixed":
            wt = 1.0 / variances
        else:
            wt = 1.0 / (variances + tau2)
    else:
        wt = weights

    # compute summtest
    summ = wt.dot(d) / wt.sum()
    if method == "fixed":
        varsum = np.sum(wt * wt * variances) / (np.sum(wt) ** 2)
    else:
        varsum = np.sum(wt * wt * (variances + tau2)) / (np.sum(wt) ** 2)
    ###summtest = summ / np.sqrt(varsum)

    summary = summ
    se_summary = np.sqrt(varsum)

    return summary, se_summary

In [3]:
# SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp-tables.xlsx?dl=1"
SUPP_TABLE_URL = "supp-tables.xlsx"
snpset = "imputed.mafukb.005"

# UKB
df_trait_info = pd.read_excel(SUPP_TABLE_URL, sheet_name="ukb-trait-info")
ukb_trait_list = df_trait_info[df_trait_info["in-analysis"] == "T"].id.values
ukb_name_dict = (
    df_trait_info[["id", "display-name"]].set_index("id")["display-name"].to_dict()
)

# PAGE
df_trait_info = pd.read_excel(SUPP_TABLE_URL, sheet_name="page-trait-info")
page_trait_list = df_trait_info.trait.values
page_name_dict = (
    df_trait_info[["trait", "display-name"]]
    .set_index("trait")["display-name"]
    .to_dict()
)

In [4]:
df_sldxr = pd.read_excel("supp-tables.xlsx", sheet_name="shi-sldxr-estimate")

In [5]:
df_sldxr["rg_mean"] = [float(row.split(" ")[0]) for row in df_sldxr["rg"]]
df_sldxr["rg_se"] = [
    float(row.split(" ")[1].lstrip("(").rstrip(")")) for row in df_sldxr["rg"]
]

In [6]:
dict_trait_map = {
    "Body Mass Index (BMI)": [("page", "bmi"), ("ukb", "log_BMI")],
    "Diastolic Blood Pressure (DBP)": [
        ("page", "diastolic_bp"),
        ("ukb", "diastolic_BP"),
    ],
    "Estimated Glomerular Filtration Rate (EGFR)": [("page", "egfrckdepi")],
    "Hemoglobin A1c (HBA1C)": [("page", "a1c"), ("ukb", "log_HbA1c")],
    "High Density Lipoprotein (HDL)": [("page", "hdl"), ("ukb", "log_HDL")],
    "Height (HEIGHT)": [("page", "height"), ("ukb", "height")],
    "Low Density Lipoprotein (LDL)": [("page", "hdl"), ("ukb", "LDL")],
    "Lymphocyte Count (LYMPH)": [("ukb", "log_lymphocyte")],
    "Mean Corpuscular Hemoglobin (MCH)": [("ukb", "MCH")],
    "Monocyte Count (MONO)": [("ukb", "log_monocyte")],
    "Platelet Count (PLT)": [("page", "platelet_cnt"), ("ukb", "log_platelet")],
    "Red Blood Cell Count (RBC)": [("ukb", "erythrocyte")],
    "Systolic Blood Pressure (SBP)": [("page", "systolic_bp"), ("ukb", "systolic_BP")],
    "*Type 2 Diabetes (T2D)": [("page", "t2d_status"), ("ukb", "250.2")],
    "Total Cholesterol (TC)": [("page", "total_cholesterol"), ("ukb", "cholesterol")],
    "Triglyceride (TG)": [("page", "triglycerides"), ("ukb", "log_triglycerides")],
    "White Blood Cell Count (WBC)": [
        ("page", "total_wbc_cnt"),
        ("ukb", "log_leukocyte"),
    ],
}

In [21]:
print(len(dict_trait_map))

17


In [20]:
print(", ".join([t.split("(")[0].strip() for t in dict_trait_map.keys()]))

Body Mass Index, Diastolic Blood Pressure, Estimated Glomerular Filtration Rate, Hemoglobin A1c, High Density Lipoprotein, Height, Low Density Lipoprotein, Lymphocyte Count, Mean Corpuscular Hemoglobin, Monocyte Count, Platelet Count, Red Blood Cell Count, Systolic Blood Pressure, *Type 2 Diabetes, Total Cholesterol, Triglyceride, White Blood Cell Count


In [7]:
dict_loglik, dict_nindiv = read_estimate(
    "imputed.mafukb.005", ukb_trait_list=ukb_trait_list, page_trait_list=page_trait_list
)

In [8]:
df_sldxr_common = df_sldxr[df_sldxr.iloc[:, 0].isin(dict_trait_map)]
dict_loglik_common = dict()
for name in df_sldxr_common.iloc[:, 0]:
    dict_loglik_common[name] = 0
    for study, trait in dict_trait_map[name]:
        dict_loglik_common[name] += dict_loglik[study, trait]

In [9]:
xs = np.linspace(0, 1, 1001)
meta_ll = 0
for trait in dict_loglik_common:
    loglik = dict_loglik_common[trait]
    meta_ll += loglik
interval = admix_genet_cor.hdi(xs, meta_ll)
print(f"{meta_ll.argmax() / 1000:.2g} [{interval[0]:.2g}, {interval[1]:.2g}]")

0.91 [0.86, 0.95]


In [10]:
method = "fixed"

print(f"{len(dict_trait_map)} common traits:")
meta_mean, meta_se = meta_analysis(
    df_sldxr_common["rg_mean"], df_sldxr_common["rg_se"], method=method
)
print(
    f"{meta_mean:.2g} [{meta_mean - meta_se * 1.96:.2g}, {meta_mean + meta_se * 1.96:.2g}]"
)
print(f"{len(df_sldxr)} all S-LDXR traits:")
meta_mean, meta_se = meta_analysis(
    df_sldxr["rg_mean"], df_sldxr["rg_se"], method=method
)
print(
    f"{meta_mean:.2g} [{meta_mean - meta_se * 1.96:.2g}, {meta_mean + meta_se * 1.96:.2g}]"
)

17 common traits:
0.87 [0.85, 0.89]
31 all S-LDXR traits:
0.86 [0.84, 0.87]
