# Showcasing the results
1. Estimation performance
2. Hypothesis testing performance

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import matplotlib.pyplot as plt
import numpy as np
import scipy
import json
import pandas as pd
import seaborn as sns
import os
import admix
import scipy.stats

In [2]:
def estimate_rg(est, est_var):
    """
    Estimate the ratio of genetic correlation.
    est: (3, ) numpy array
    est_var (3, 3) variance-covariance matrix
    """
    x, y = est[0], est[1]
    rg = y / x
    # grad = [-y / x^2, 1 / x]
    grad = np.array([-y / (x ** 2), 1 / x])

    def quad_form(x, A):
        return np.dot(np.dot(x.T, A), x)

    return rg, quad_form(grad, est_var[0:2, 0:2])

# Plot GCTA

In [None]:
df_plot = []

for hsq in [0.1, 0.25, 0.5]:
    for pcausal in [0.00001, 0.0001, 0.001, 0.01]:
        print(f"Loading hsq={hsq}, pcausal={pcausal}")
        for cor in [0.9, 0.95, 1.0]:
            for hermodel in ["mafukb"]:
                for sim_i in range(0, 500):
                    f_name = (
                        f"out/gcta-estimate/hsq-{hsq}-pcausal-{np.format_float_positional(pcausal)}-cor-{cor}"
                        + f"-hermodel-{hermodel}.sim_{sim_i}"
                    )
                    if not os.path.exists(f_name + ".hsq"):
                        continue
                    try:
                        est = admix.tools.gcta.read_reml(f_name)
                        rg, rg_var = estimate_rg(
                            est["est"].Variance.values, est["varcov"].values
                        )
                        rg_stderr = np.sqrt(rg_var)

                        # results from likelihood ratio test
                        reduced_est = admix.tools.gcta.read_reml(f_name + ".reduced")
                        pval_lrt = scipy.stats.chi2.sf(
                            2 * (est["loglik"] - reduced_est["loglik"]), 1
                        )
                        pval_delta = scipy.stats.norm.cdf((rg - 1) / rg_stderr)
                        df_plot.append(
                            est["est"].Variance.values.tolist()
                            + [
                                hsq,
                                cor,
                                pcausal,
                                hermodel,
                                rg,
                                rg_stderr,
                                pval_lrt,
                                pval_delta,
                                sim_i,
                            ]
                        )
                    except ValueError:
                        print("Invalid value!")
df_plot = pd.DataFrame(
    df_plot,
    columns=[
        "estimated_var_g",
        "estimated_rho",
        "estimated_var_e",
        "hsq",
        "cor",
        "pcausal",
        "hermodel",
        "estimated_ratio",
        "estimated_ratio_stderr",
        "pval_lrt",
        "pval_delta",
        "sim_i",
    ],
)

Loading hsq=0.1, pcausal=1e-05
Invalid value!
Invalid value!
Loading hsq=0.1, pcausal=0.0001
Loading hsq=0.1, pcausal=0.001
Loading hsq=0.1, pcausal=0.01
Loading hsq=0.25, pcausal=1e-05
Loading hsq=0.25, pcausal=0.0001


In [7]:
df_plot.to_csv("results/raw.csv", index=False)
df_plot = pd.read_csv("results/raw.csv")

In [8]:
df_summary = (
    df_plot.groupby(["hsq", "cor", "pcausal"])
    .agg(
        {
            "estimated_ratio": ["mean", "std"],
            "estimated_ratio_stderr": "mean",
            "pval_lrt": lambda x: np.mean(x < 0.05),
            "pval_delta": lambda x: np.mean(x < 0.05),
        }
    )
    .reset_index()
)

# Main display table
Fixing p_causal = 0.1%, varying other parameters

In [17]:
# format table
df_tbl = pd.DataFrame(
    {
        "hsq": df_summary["hsq"],
        "cor": df_summary["cor"],
        "pcausal": df_summary["pcausal"],
        "estimate": df_summary["estimated_ratio"].apply(
            lambda x: f"{x['mean']:.3f}$\pm${x['std']:.3f}", axis=1
        ),
        "estimated_stderr": df_summary["estimated_ratio_stderr"]["mean"]
        .round(3)
        .astype(str),
        "pval_delta": df_summary["pval_delta"]["<lambda>"].round(3).astype(str),
        "pval_lrt": df_summary["pval_lrt"]["<lambda>"].round(3).astype(str),
    }
)

# main display table
# hsq = 0.1, 0.25, 0.5; p_causal = 0.1%
df_main = (
    df_tbl[df_tbl.pcausal.isin([0.001])]
    .drop(columns=["pcausal", "pval_delta"])
    .rename(
        columns={
            "hsq": "$h_g^2$",
            "cor": "$r_g$",
            "estimate": "$\widehat{r_g}$",
            "estimated_stderr": "$\widehat{\text{s.e.}(r_g)}$",
            "pval_lrt": "$\text{Pr}[\text{reject } `r_g=1']$",
        }
    )
)
df_main = df_main.sort_values(df_main.columns[1])
print(df_main.to_latex(escape=False, index=False))

\begin{tabular}{rrlll}
\toprule
 $h_g^2$ &  $r_g$ & $\widehat{r_g}$ & $\widehat{\text{s.e.}(r_g)}$ & $\text{Pr}[\text{reject } `r_g=1']$ \\
\midrule
    0.10 &   0.90 & 0.883$\pm$0.114 &                        0.109 &                               0.268 \\
    0.25 &   0.90 & 0.894$\pm$0.052 &                        0.052 &                                0.66 \\
    0.50 &   0.90 & 0.896$\pm$0.035 &                        0.034 &                               0.936 \\
    0.10 &   0.95 & 0.940$\pm$0.093 &                         0.09 &                               0.104 \\
    0.25 &   0.95 & 0.949$\pm$0.042 &                        0.045 &                               0.214 \\
    0.50 &   0.95 & 0.948$\pm$0.030 &                         0.03 &                               0.476 \\
    0.10 &   1.00 & 0.994$\pm$0.083 &                        0.075 &                               0.064 \\
    0.25 &   1.00 & 1.001$\pm$0.039 &                        0.038 &                           

# Numerical supp. table

In [18]:
# all configurations
df_supp = df_tbl.rename(
    columns={
        "cor": "rg",
        "estimate": "rg_estimate",
        "estimated_stderr": "rg_stderr",
        "pval_lrt": "p[reject rg=1]",
    }
)
df_supp["rg_estimate"] = df_supp["rg_estimate"].apply(lambda x: x.replace("$\pm$", "±"))
df_supp["p[reject rg=1]"] = df_supp["p[reject rg=1]"].astype(float).round(3)
df_supp.to_excel("results/genome-wide-simulation.xlsx", index=False)

# Numbers to cite in the manuscript

In [19]:
# relative bias
df_tmp = df_summary[df_summary.pcausal == 0.001]
relative_bias = (df_tmp["estimated_ratio"]["mean"] - df_tmp["cor"]) / df_tmp["cor"]
print(
    f"Relative bias across 9 simulation settings (p_causal = 0.1%) is {np.mean(relative_bias * 100):.2f}%"
)

# standard errors
df_tmp = df_summary[df_summary.pcausal == 0.001]
relative_bias = (
    df_tmp["estimated_ratio_stderr"]["mean"] - df_tmp["estimated_ratio"]["std"]
) / df_tmp["estimated_ratio"]["std"]
print(
    f"Relative bias of standard error across 9 simulation settings (p_causal = 0.1%) is {np.mean(relative_bias * 100):.2f}%"
)

Relative bias across 9 simulation settings (p_causal = 0.1%) is -0.56%
Relative bias of standard error across 9 simulation settings (p_causal = 0.1%) is -1.79%


In [20]:
for pcausal in df_summary.pcausal.unique():
    print(f"--------pcausal={pcausal}--------")
    # relative bias
    df_tmp = df_summary[df_summary.pcausal == pcausal]
    relative_bias = (df_tmp["estimated_ratio"]["mean"] - df_tmp["cor"]) / df_tmp["cor"]
    print(
        f"Relative bias across 9 simulation settings (p_causal = 0.1%) is {np.mean(relative_bias * 100):.2f}% ({np.std(relative_bias * 100):.2f}%)"
    )

    # standard errors
    relative_bias = (
        df_tmp["estimated_ratio_stderr"]["mean"] - df_tmp["estimated_ratio"]["std"]
    ) / df_tmp["estimated_ratio"]["std"]
    print(
        f"Relative bias of standard error across 9 simulation settings (p_causal = 0.1%) is {np.mean(relative_bias * 100):.2f}% ({np.std(relative_bias * 100):.2f}%)"
    )

    # inflation under the null
    pval = df_tmp.loc[df_tmp.cor == 1.0, "pval_lrt"]["<lambda>"]
    print(
        f"Relative bias of standard error across 9 simulation settings (p_causal = 0.1%) is {np.mean(pval):.2f} ({np.std(pval):.2f})"
    )

--------pcausal=1e-05--------
Relative bias across 9 simulation settings (p_causal = 0.1%) is -1.70% (1.37%)
Relative bias of standard error across 9 simulation settings (p_causal = 0.1%) is -40.20% (11.53%)
Relative bias of standard error across 9 simulation settings (p_causal = 0.1%) is 0.22 (0.11)
--------pcausal=0.0001--------
Relative bias across 9 simulation settings (p_causal = 0.1%) is -0.53% (0.62%)
Relative bias of standard error across 9 simulation settings (p_causal = 0.1%) is -11.13% (1.74%)
Relative bias of standard error across 9 simulation settings (p_causal = 0.1%) is 0.08 (0.01)
--------pcausal=0.001--------
Relative bias across 9 simulation settings (p_causal = 0.1%) is -0.56% (0.58%)
Relative bias of standard error across 9 simulation settings (p_causal = 0.1%) is -1.79% (3.90%)
Relative bias of standard error across 9 simulation settings (p_causal = 0.1%) is 0.06 (0.00)
--------pcausal=0.01--------
Relative bias across 9 simulation settings (p_causal = 0.1%) is -0.

In [22]:
# format table
df_tbl = pd.DataFrame(
    {
        "hsq": df_summary["hsq"],
        "cor": df_summary["cor"],
        "pcausal": df_summary["pcausal"].apply(
            lambda x: f"{np.format_float_positional(x * 100)}\%"
        ),
        "estimate": df_summary["estimated_ratio"].apply(
            lambda x: f"{x['mean']:.3f}$\pm${x['std']:.3f}", axis=1
        ),
        "estimated_stderr": df_summary["estimated_ratio_stderr"]["mean"]
        .round(3)
        .astype(str),
        "pval_delta": df_summary["pval_delta"]["<lambda>"].round(3).astype(str),
        "pval_lrt": df_summary["pval_lrt"]["<lambda>"].round(3).astype(str),
    }
)

# main display table
# hsq = 0.1, 0.25, 0.5; p_causal = 0.1%
df_main = (
    df_tbl[(df_tbl.hsq == 0.25)]
    .drop(columns=["hsq", "pval_delta"])
    .rename(
        columns={
            "cor": "$r_g$",
            "pcausal": "$p_\text{causal}$",
            "estimate": "$\widehat{r_g}$",
            "estimated_stderr": "$\widehat{\text{s.e.}(r_g)}$",
            "pval_lrt": "$\text{Pr}[\text{reject } `r_g=1']$",
        }
    )
)

# exchange columns, such that, column 1: p_causal, column 2: rg
df_main = df_main[
    [df_main.columns[1], df_main.columns[0]] + df_main.columns[2:].tolist()
]
print(df_main.to_latex(escape=False, index=False))

\begin{tabular}{lrlll}
\toprule
$p_\text{causal}$ &  $r_g$ & $\widehat{r_g}$ & $\widehat{\text{s.e.}(r_g)}$ & $\text{Pr}[\text{reject } `r_g=1']$ \\
\midrule
          0.001\% &   0.90 & 0.891$\pm$0.094 &                        0.054 &                               0.578 \\
           0.01\% &   0.90 & 0.900$\pm$0.059 &                        0.052 &                               0.582 \\
            0.1\% &   0.90 & 0.894$\pm$0.052 &                        0.052 &                                0.66 \\
             1.\% &   0.90 & 0.898$\pm$0.057 &                        0.052 &                               0.626 \\
          0.001\% &   0.95 & 0.937$\pm$0.084 &                        0.048 &                               0.356 \\
           0.01\% &   0.95 & 0.948$\pm$0.049 &                        0.045 &                               0.258 \\
            0.1\% &   0.95 & 0.949$\pm$0.042 &                        0.045 &                               0.214 \\
             1.\% &   0

In [23]:
assert False

AssertionError: 

# Deprecated below

In [21]:
df_plot = []

for hsq in [0.1, 0.25, 0.5]:
    for pcausal in [0.0001, 0.001, 0.01, 1.0]:
        for cor in [0.6, 0.8, 1.0]:
            for hermodel in ["mafukb"]:
                for sim_i in range(100):
                    f_name = (
                        f"out/BACKUP-gcta-estimate/hsq-{hsq}-pcausal-{pcausal}-cor-{cor}"
                        + f"-hermodel-{hermodel}.sim_{sim_i}"
                    )
                    if not os.path.exists(f_name + ".hsq"):
                        continue
                    try:
                        est = admix.tools.gcta.read_reml(f_name)
                        rg, rg_var = estimate_rg(
                            est["est"].Variance.values, est["varcov"].values
                        )
                        rg_stderr = np.sqrt(rg_var)
                        df_plot.append(
                            est["est"].Variance.values.tolist()
                            + [hsq, cor, pcausal, hermodel, rg, rg_stderr, sim_i]
                        )
                    except ValueError:
                        print("Invalid value!")
df_plot = pd.DataFrame(
    df_plot,
    columns=[
        "estimated_var_g",
        "estimated_rho",
        "estimated_var_e",
        "hsq",
        "cor",
        "pcausal",
        "hermodel",
        "estimated_ratio",
        "estimated_ratio_stderr",
        "sim_i",
    ],
)

rg_hat_std = df_plot.groupby(["hsq", "cor", "pcausal"]).apply(
    lambda x: np.std(x["estimated_ratio"])
)
mean_rg_std_hat = df_plot.groupby(["hsq", "cor", "pcausal"]).apply(
    lambda x: np.mean(x["estimated_ratio_stderr"])
)
df_plot = pd.DataFrame({"rg_hat_std": rg_hat_std, "mean_rg_std_hat": mean_rg_std_hat})

fig, ax = plt.subplots(figsize=(3, 3), dpi=150)
ax.scatter(df_plot.rg_hat_std, df_plot.mean_rg_std_hat)
ax.plot([0, 0.25], [0, 0.25], "r--", alpha=0.5)
ax.set_xlabel("Standard deviation of $\hat{r_g}$")
ax.set_ylabel("Mean of estimated standard error")