In [1]:
%load_ext lab_black
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from admix.data import quantile_normalize
import seaborn as sns
import admix
import admix_genet_cor
import os

In [2]:
# Supplementary workbook holding the trait metadata for both studies.
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp-tables.xlsx?dl=1"

# Fetch the workbook once: passing a list to `sheet_name` makes `read_excel`
# return a {sheet name: DataFrame} dict instead of a single frame.
trait_info_sheets = pd.read_excel(
    SUPP_TABLE_URL, sheet_name=["ukb-trait-info", "page-trait-info"]
)

# UKB: keep only the traits flagged "T" in the `in-analysis` column.
df_ukb_trait_info = trait_info_sheets["ukb-trait-info"]
ukb_trait_list = df_ukb_trait_info[df_ukb_trait_info["in-analysis"] == "T"].id.values

# PAGE: every trait listed in the sheet is used.
# `df_trait_info` stays bound to the PAGE sheet, as it was at the end of the
# original cell, in case another cell still reads it.
df_trait_info = trait_info_sheets["page-trait-info"]
page_trait_list = df_trait_info.trait.values

In [3]:
# Absolute input locations on the compute cluster, grouped by study.
# NOTE(review): the *_GENO_DIR / *_PHENO_DIR names suggest genotype and
# phenotype inputs; none of the cells visible here read these paths -- confirm
# they are still needed.

# UKB
UKB_GENO_DIR = "/u/project/sgss/UKBB/UKB-ADMIXED/01-dataset/out/PLINK2/imputed"
UKB_PHENO_DIR = "/u/project/sgss/UKBB/UKB-ADMIXED/02-genet-cor/out/pheno/"

# PAGE
PAGE_GENO_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr/imputed/"
PAGE_PHENO_DIR = "/u/project/pasaniuc/kangchen/2021-admix-corr/experiments/03-page-genome-wide-profile-likelihood/out/pheno"

In [5]:
# One row per (study, trait) pair: all UKB traits first, then all PAGE traits.
df_params = pd.DataFrame(
    {
        "study": ["ukb"] * len(ukb_trait_list) + ["page"] * len(page_trait_list),
        "trait": [*ukb_trait_list, *page_trait_list],
    }
)

# Path of the per-trait result table that belongs to each row.
df_params["out"] = [
    f"out/gwas-het/{x.study}-{x.trait}.csv"
    for x in df_params.itertuples(index=False)
]

In [6]:
# Read every per-trait result table, tag its rows with the study and trait they
# belong to, and stack the non-empty tables into one frame.
assoc_tables = []
for param in df_params.itertuples(index=False):
    df_tmp = pd.read_csv(param.out)
    if df_tmp.shape[0] == 0:
        # Table has a header but no rows: report it and move on.
        print(param.out, "is empty")
        continue
    # Prepend the identifying columns so the layout is: study, trait, <table columns>.
    df_tmp.insert(0, "trait", param.trait)
    df_tmp.insert(0, "study", param.study)
    assoc_tables.append(df_tmp)

df_assoc = pd.concat(assoc_tables).sort_values(["trait", "CHROM", "POS"])

out/gwas-het/ukb-height.csv is empty
out/gwas-het/ukb-log_BMI.csv is empty
out/gwas-het/ukb-log_heel_BMD.csv is empty
out/gwas-het/ukb-log_leukocyte.csv is empty
out/gwas-het/ukb-log_lymphocyte.csv is empty
out/gwas-het/ukb-log_platelet.csv is empty
out/gwas-het/ukb-neuroticism.csv is empty
out/gwas-het/ukb-years_of_edu.csv is empty
out/gwas-het/page-a1c.csv is empty
out/gwas-het/page-insulin.csv is empty
out/gwas-het/page-qt_interval.csv is empty
out/gwas-het/page-qrs_interval.csv is empty
out/gwas-het/page-systolic_bp.csv is empty
out/gwas-het/page-diastolic_bp.csv is empty
out/gwas-het/page-hypertension.csv is empty
out/gwas-het/page-waist_hip_ratio.csv is empty


In [7]:
# Keep SNPs whose allele frequency lies within [0.005, 0.995] in both the EUR
# and the AFR column.
af_in_range = df_assoc.EUR_af.between(0.005, 0.995) & df_assoc.AFR_af.between(
    0.005, 0.995
)
df_assoc = df_assoc[af_in_range].reset_index(drop=True)

# Expected heterogeneity p-value for each SNP, derived from the observed
# p-values via `quantile_normalize` followed by the normal survival function.
# NOTE(review): presumably `quantile_normalize` returns rank-based normal
# scores -- confirm against admix.data.
df_assoc["expected_HET_pval"] = stats.norm.sf(quantile_normalize(-df_assoc.HET_pval))

# The Bonferroni denominator is the number of SNPs that passed the frequency
# filter above, i.e. it is fixed before the significance filter below.
df_assoc["bonferroni_significant"] = df_assoc["HET_pval"] < 0.05 / len(df_assoc)

# Restrict to associations with assoc_pval < 5e-8. `.copy()` detaches the result
# from the filtered parent so that assigning new columns to `df_assoc` later
# does not raise SettingWithCopyWarning.
df_assoc = df_assoc[df_assoc["assoc_pval"] < 5e-8].copy()

In [34]:
def report_assoc(df_assoc, n_bootstrap=1000, random_state=None):
    """Print summary statistics comparing the EUR and AFR effect estimates.

    Printed, in order: the number of SNPs, Pearson's r between ``EUR_beta`` and
    ``AFR_beta`` with a bootstrap standard error, the Deming regression slope
    with its bootstrap mean and standard error, and the lambda GC of
    ``HET_pval`` with its confidence interval.

    Parameters
    ----------
    df_assoc : pandas.DataFrame
        One row per SNP. Columns read: ``EUR_beta``, ``AFR_beta``,
        ``EUR_beta_stderr``, ``AFR_beta_stderr`` and ``HET_pval``.
    n_bootstrap : int, default 1000
        Number of bootstrap replicates (rows resampled with replacement).
    random_state : int, array-like or numpy.random.RandomState, optional
        Seed or generator for the row resampling. ``None`` (the default) keeps
        the previous behaviour: each resample draws from pandas' default
        random source, so results differ between runs.
        It is not forwarded to ``admix.data.lambda_gc``, so the lambda GC
        interval is not seeded by this argument.

    Returns
    -------
    None
        Results are printed, not returned.
    """

    def deming_slope(df):
        # The first element of the returned value is what this report uses as
        # the slope.
        return admix_genet_cor.locus.deming_regression(
            x=df["EUR_beta"],
            y=df["AFR_beta"],
            sx=df["EUR_beta_stderr"],
            sy=df["AFR_beta_stderr"],
        )[0]

    # One generator is shared by all replicates: re-seeding every replicate
    # with the same integer would draw the identical resample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    bootstrap_slope_list = []
    pearsonr_list = []
    for _ in range(n_bootstrap):
        df_tmp = df_assoc.sample(len(df_assoc), replace=True, random_state=rng)
        bootstrap_slope_list.append(deming_slope(df_tmp))
        pearsonr_list.append(stats.pearsonr(df_tmp.EUR_beta, df_tmp.AFR_beta)[0])

    # Point estimates on the full, un-resampled table.
    slope = deming_slope(df_assoc)
    lgc, lgc_ci = admix.data.lambda_gc(df_assoc.HET_pval, bootstrap_ci=True)

    print(f"{len(df_assoc)} SNPs")
    print(
        f"Pearson's r = {stats.pearsonr(df_assoc.EUR_beta, df_assoc.AFR_beta)[0]:.2g}, SE={np.std(pearsonr_list):.2f}"
    )
    print(
        f"Deming regression slope: {slope:.2f}. Bootstrap mean (se): {np.mean(bootstrap_slope_list):.2f} ({np.std(bootstrap_slope_list):.2f})"
    )
    print(f"lambda gc = {lgc:.2f}, ci=[{lgc_ci[0]:.2f}, {lgc_ci[1]:.2f}]")

In [38]:
# Show the SNPs ordered from smallest to largest heterogeneity p-value.
df_assoc.sort_values(by="HET_pval", ascending=True)

Unnamed: 0,study,trait,SNP,CHROM,POS,PLINK_P,EUR_af,AFR_af,assoc_pval,HET_pval,EUR_beta,AFR_beta,EUR_beta_stderr,AFR_beta_stderr,expected_HET_pval,bonferroni_significant
17,ukb,MCH,chr16:140238:C:T,16,140238,6.540000e-10,0.428832,0.772786,6.542885e-10,1.854274e-24,0.074391,-0.277259,0.031295,0.026369,0.003906,True
19,ukb,MCH,chr16:207212:A:C,16,207212,2.840000e-09,0.490268,0.844401,2.843981e-09,4.842501e-24,0.049764,-0.289335,0.031103,0.028097,0.011719,True
23,ukb,MCH,chr16:291988:C:T,16,291988,5.680000e-14,0.177670,0.247776,5.677463e-14,1.097601e-21,0.124169,-0.371366,0.042412,0.031606,0.019531,True
22,ukb,MCH,chr16:276138:A:G,16,276138,2.750000e-12,0.586679,0.744141,2.748564e-12,2.192808e-17,-0.000019,-0.275971,0.030206,0.026672,0.027344,True
14,ukb,MCH,chr16:82511:G:A,16,82511,1.460000e-08,0.469891,0.571181,1.458276e-08,6.780125e-15,0.038860,-0.234303,0.030518,0.026027,0.035156,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,page,mean_corp_hgb_conc,chr16:244158:T:C,16,244158,6.630000e-10,0.311663,0.359798,6.625713e-10,9.438585e-01,0.142546,0.146045,0.046537,0.025331,0.964844,False
53,ukb,ever_smoked,chr8:95301836:T:C,8,95301836,4.170000e-09,0.571303,0.444071,4.175101e-09,9.640772e-01,0.085532,0.084424,0.019743,0.018205,0.972656,False
98,page,platelet_cnt,chr6:33579153:T:C,6,33579153,3.050000e-10,0.718672,0.739843,3.051170e-10,9.725788e-01,0.106665,0.105793,0.025971,0.017715,0.980469,False
102,page,platelet_cnt,chr19:16092494:C:A,19,16092494,6.000000e-10,0.019876,0.097543,5.995224e-10,9.846546e-01,-0.171431,-0.169185,0.113732,0.028054,0.988281,False


In [40]:
# Show only the SNPs reported for the MCH trait.
df_assoc.loc[df_assoc["trait"].eq("MCH")]

Unnamed: 0,study,trait,SNP,CHROM,POS,PLINK_P,EUR_af,AFR_af,assoc_pval,HET_pval,EUR_beta,AFR_beta,EUR_beta_stderr,AFR_beta_stderr,expected_HET_pval,bonferroni_significant
10,ukb,MCH,chr6:135105435:A:G,6,135105435,2.05e-08,0.259395,0.068354,2.045087e-08,0.6318983,0.187724,0.156203,0.037227,0.056189,0.753906,False
11,ukb,MCH,chr16:42391:G:A,16,42391,2.69e-09,0.079075,0.476997,2.690165e-09,0.04271129,-0.033161,-0.164937,0.061732,0.026178,0.246094,False
12,ukb,MCH,chr16:46496:A:G,16,46496,2.15e-10,0.166058,0.217231,2.153261e-10,2.889832e-13,0.070183,-0.319825,0.043447,0.033769,0.058594,True
13,ukb,MCH,chr16:81470:G:A,16,81470,7.1e-13,0.005474,0.537109,7.100898e-13,0.308098,0.045975,-0.188178,0.229313,0.025934,0.582031,False
14,ukb,MCH,chr16:82511:G:A,16,82511,1.46e-08,0.469891,0.571181,1.458276e-08,6.780125e-15,0.03886,-0.234303,0.030518,0.026027,0.035156,True
15,ukb,MCH,chr16:86889:T:C,16,86889,2.8e-14,0.810523,0.530165,2.804795e-14,0.0002568922,0.248774,0.126545,0.030218,0.027357,0.128906,True
16,ukb,MCH,chr16:111939:G:A,16,111939,1.28e-11,0.867701,0.659071,1.282368e-11,3.650972e-10,0.293704,0.097491,0.031784,0.027832,0.082031,True
17,ukb,MCH,chr16:140238:C:T,16,140238,6.54e-10,0.428832,0.772786,6.542885e-10,1.854274e-24,0.074391,-0.277259,0.031295,0.026369,0.003906,True
18,ukb,MCH,chr16:161106:T:G,16,161106,1.06e-11,0.672749,0.814236,1.064191e-11,0.0001855403,0.242284,0.122894,0.031365,0.028744,0.121094,True
19,ukb,MCH,chr16:207212:A:C,16,207212,2.84e-09,0.490268,0.844401,2.843981e-09,4.842501e-24,0.049764,-0.289335,0.031103,0.028097,0.011719,True


In [35]:
# Run the same report on the full SNP set and on the set without MCH.
mch_mask = df_assoc.trait.isin(["MCH"])
report_sets = (
    ("# all SNPs", df_assoc),
    ("# SNPs removing MCH", df_assoc[~mch_mask]),
)
for heading, df_subset in report_sets:
    print(heading)
    report_assoc(df_subset)
    print("")

# all SNPs
128 SNPs
Pearson's r = 0.73, SE=0.04
Deming regression slope: 1.36. Bootstrap mean (se): 1.37 (0.13)
lambda gc = 3.02, ci=[2.27, 3.53]

# SNPs removing MCH
103 SNPs
Pearson's r = 0.84, SE=0.03
Deming regression slope: 1.12. Bootstrap mean (se): 1.12 (0.05)
lambda gc = 2.64, ci=[1.59, 3.21]



In [None]:
# Colour label per SNP: the trait name when the heterogeneity test is
# Bonferroni-significant, "non-sig" otherwise. It is built here from columns
# that `df_assoc` already has, because `df_assoc` itself has no `hue` column.
hue = (
    df_assoc["trait"]
    .where(df_assoc["bonferroni_significant"], "non-sig")
    .rename("hue")
)
neg_log10_het_pval = -np.log10(df_assoc.HET_pval)

# Both panels share the y axis and styling; only the x quantity differs.
panels = [
    (df_assoc.EUR_af - df_assoc.AFR_af, "EUR freq - AFR freq"),
    (df_assoc.EUR_beta - df_assoc.AFR_beta, "EUR beta - AFR beta"),
]
for x_values, x_label in panels:
    fig, ax = plt.subplots(figsize=(4, 2), dpi=150)
    sns.scatterplot(
        x=x_values,
        y=neg_log10_het_pval,
        hue=hue,
        s=4,
        linewidth=0,
        palette="tab10",
        ax=ax,
    )

    # Rebuild the legend from the plotted handles with a smaller font.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=labels, loc="best", fontsize=6)

    ax.set_xlabel(x_label)
    # Raw string: "\l" is not a valid escape sequence in a normal string literal.
    ax.set_ylabel(r"HET $-\log_{10}(p)$")

In [None]:
def qqplot(pval, ax=None):
    """Draw a QQ plot of observed against expected -log10 p-values.

    Parameters
    ----------
    pval : array-like supporting unary minus (e.g. numpy array, pandas Series)
        Observed p-values.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on; the current axes (``plt.gca()``) are used when omitted.

    Returns
    -------
    None
        The points, the dashed red diagonal and the axis labels are drawn on
        ``ax``.
    """
    if ax is None:
        ax = plt.gca()

    # Expected p-values come from `quantile_normalize` followed by the normal
    # survival function.
    # NOTE(review): presumably `quantile_normalize` returns rank-based normal
    # scores -- confirm against admix.data.
    expected_pval = stats.norm.sf(quantile_normalize(-pval))
    expected_logp = -np.log10(expected_pval)

    ax.scatter(expected_logp, -np.log10(pval), s=2)

    # Diagonal reference line from the origin to the largest expected value.
    lim = max(expected_logp)
    ax.plot([0, lim], [0, lim], "r--")

    # Raw strings: "\l" is not a valid escape sequence in a normal string literal.
    ax.set_xlabel(r"Expected -$\log_{10}(p)$")
    ax.set_ylabel(r"Observed -$\log_{10}(p)$")

In [None]:
# color points by trait after Bonferroni correction
df_plot = df_assoc[["trait_id", "SNPS", "HET_pval", "assoc_pval"]].copy()
df_plot.loc[:, "expected_HET_pval"] = stats.norm.sf(
    quantile_normalize(-df_plot.HET_pval)
)
df_plot.loc[:, "bonferroni_significant"] = df_plot["HET_pval"] < 0.05 / len(df_plot)

df_plot["hue"] = ""
df_plot.loc[df_plot.bonferroni_significant, "hue"] = df_plot.loc[
    df_plot.bonferroni_significant, "trait_id"
]
df_plot.loc[~df_plot.bonferroni_significant, "hue"] = "non-sig"

In [None]:
# QQ plot of the heterogeneity p-values, coloured by the `hue` label of `df_plot`.
expected_logp = -np.log10(df_plot.expected_HET_pval)
observed_logp = -np.log10(df_plot.HET_pval)

fig, ax = plt.subplots(figsize=(4, 4), dpi=150)
sns.scatterplot(
    x=expected_logp,
    y=observed_logp,
    hue=df_plot.hue,
    s=10,
    linewidth=0,
    palette="tab10",
    ax=ax,
)

# Diagonal reference line from the origin to the largest expected value.
lim = max(expected_logp)
ax.plot([0, lim], [0, lim], "r--")

# Raw strings: "\l" is not a valid escape sequence in a normal string literal.
ax.set_xlabel(r"Expected -$\log_{10}(p)$")
ax.set_ylabel(r"Observed -$\log_{10}(p)$")
ax.set_title("Heterogeneity p-value at index SNPs")

# Rebuild the legend from the plotted handles.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels, loc="best")