In [1]:
%load_ext lab_black
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from admix.data import quantile_normalize
import seaborn as sns
import admix
import admix_genet_cor
import os

In [2]:
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp-tables.xlsx?dl=1"

# Fetch the supplementary workbook once and pull both trait sheets from it.
# Passing a list to `sheet_name` makes `read_excel` return a dict of
# DataFrames keyed by sheet name, so the remote file is not downloaded twice.
trait_sheets = pd.read_excel(
    SUPP_TABLE_URL, sheet_name=["ukb-trait-info", "page-trait-info"]
)
df_ukb_trait_info = trait_sheets["ukb-trait-info"]
df_page_trait_info = trait_sheets["page-trait-info"]

# UKB traits: keep only the rows flagged "T" in the `in-analysis` column.
ukb_trait_list = df_ukb_trait_info.loc[
    df_ukb_trait_info["in-analysis"] == "T", "id"
].values

# PAGE traits: every row of the sheet is used.
page_trait_list = df_page_trait_info["trait"].values

# The original cell left `df_trait_info` bound to the PAGE sheet; keep that
# binding in case another cell still refers to it.
df_trait_info = df_page_trait_info

In [3]:
# One row per (study, trait) pair: all UKB traits first, then all PAGE traits.
study_labels = ["ukb"] * len(ukb_trait_list) + ["page"] * len(page_trait_list)
trait_labels = [*ukb_trait_list, *page_trait_list]
df_params = pd.DataFrame({"study": study_labels, "trait": trait_labels})

# Path of the result CSV associated with each (study, trait) pair.
df_params["out"] = [
    f"out/gwas-het/{study}-{trait}.csv"
    for study, trait in zip(df_params["study"], df_params["trait"])
]

In [4]:
# Load every per-trait result file, tag its rows with study/trait, and stack
# everything into a single table sorted by trait and genomic position.
assoc_tables = []
for spec in df_params.itertuples(index=False):
    hits = pd.read_csv(spec.out)
    if not len(hits):
        # The file has no data rows: report it and move on.
        print(spec.out, "is empty")
        continue
    # Cross-joining a one-row frame prepends constant `study`/`trait` columns.
    label = pd.DataFrame({"study": [spec.study], "trait": [spec.trait]})
    assoc_tables.append(label.merge(hits, how="cross"))
df_assoc = pd.concat(assoc_tables).sort_values(["trait", "CHROM", "POS"])

out/gwas-het/ukb-height.csv is empty
out/gwas-het/ukb-log_BMI.csv is empty
out/gwas-het/ukb-log_heel_BMD.csv is empty
out/gwas-het/ukb-log_leukocyte.csv is empty
out/gwas-het/ukb-log_lymphocyte.csv is empty
out/gwas-het/ukb-log_platelet.csv is empty
out/gwas-het/ukb-neuroticism.csv is empty
out/gwas-het/ukb-years_of_edu.csv is empty
out/gwas-het/page-a1c.csv is empty
out/gwas-het/page-insulin.csv is empty
out/gwas-het/page-qrs_interval.csv is empty
out/gwas-het/page-systolic_bp.csv is empty
out/gwas-het/page-diastolic_bp.csv is empty
out/gwas-het/page-hypertension.csv is empty
out/gwas-het/page-waist_hip_ratio.csv is empty


In [5]:
# Display the combined association table (one row per SNP, tagged with its
# study and trait).
df_assoc

Unnamed: 0,study,trait,SNP,CHROM,POS,PLINK_P,EUR_af,AFR_af,assoc_pval,HET_pval,EUR_beta,AFR_beta,EUR_beta_stderr,AFR_beta_stderr
0,ukb,244,chr1:24361499:C:T,1,24361499,1.210000e-10,0.014894,0.000000,1.208568e-10,1.208568e-10,3.319139e-01,-1.244192e-16,5.141827e-02,1.807700e-17
1,ukb,244,chr1:100408441:G:A,1,100408441,8.890000e-09,0.000592,0.011376,8.886700e-09,1.612634e-01,-8.263222e-02,2.891494e-01,2.608440e-01,4.881995e-02
2,ukb,244,chr1:106432245:A:G,1,106432245,3.420000e-09,0.000000,0.012221,3.421417e-09,3.421417e-09,2.228820e-16,2.842415e-01,3.740773e-17,4.798842e-02
3,ukb,244,chr1:212658725:T:C,1,212658725,1.690000e-09,0.034793,0.000000,1.686677e-09,1.686677e-09,2.071622e-01,1.035936e-17,3.430214e-02,1.221305e-17
4,ukb,244,chr1:231284020:T:A,1,231284020,1.980000e-08,0.028580,0.003124,1.981397e-08,4.154756e-04,1.452481e-01,5.107106e-01,3.806755e-02,9.532462e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,page,triglycerides,chr11:116768388:A:G,11,116768388,2.480000e-08,0.926603,0.999418,2.476378e-08,1.177976e-01,-3.228061e-01,-2.923039e-01,5.577032e-02,5.482548e-02
8,page,triglycerides,chr11:116791691:G:C,11,116791691,1.120000e-21,0.057652,0.062173,1.116276e-21,2.056291e-01,2.050204e-01,2.922744e-01,6.176682e-02,3.193581e-02
9,page,triglycerides,chr19:44907291:A:G,19,44907291,5.820000e-12,0.001357,0.025309,5.819801e-12,5.688763e-01,5.680030e-01,3.393113e-01,3.984812e-01,5.009605e-02
10,page,triglycerides,chr19:44919330:A:G,19,44919330,8.910000e-31,0.009950,0.148276,8.909676e-31,2.538615e-01,-4.139284e-01,-2.451674e-01,1.469469e-01,2.158000e-02


In [6]:
# Thresholds used in this cell.
MIN_AF, MAX_AF = 0.005, 0.995  # bounds on the EUR / AFR allele frequencies
GWAS_PVAL_THRESHOLD = 5e-8  # cut-off applied to the association p-value
FWER_ALPHA = 0.05  # family-wise error rate for the heterogeneity test

# Apply ALL row filters first, so the quantities computed below (expected
# p-values and the Bonferroni threshold) describe exactly the set of SNPs
# that is kept. Previously they were computed before the association
# p-value filter, which could make them inconsistent with the final table
# and made the cell non-idempotent when re-run.
df_assoc = df_assoc[
    df_assoc.EUR_af.between(MIN_AF, MAX_AF)
    & df_assoc.AFR_af.between(MIN_AF, MAX_AF)
    & (df_assoc["assoc_pval"] < GWAS_PVAL_THRESHOLD)
].reset_index(drop=True)

# Alternative / additional filters left disabled by the author:
# df_assoc = df_assoc[
#     (df_assoc.EUR_beta_stderr > 1e-6) & (df_assoc.AFR_beta_stderr > 1e-6)
# ].reset_index(drop=True)
# df_assoc = df_assoc[~df_assoc.trait.isin(["MCH"])]

# Expected heterogeneity p-value of each SNP given its rank.
# NOTE(review): assumes `quantile_normalize` maps values to rank-based
# standard-normal scores; negating HET_pval then gives the smallest observed
# p-value the largest score and hence the smallest expected p-value -- confirm.
df_assoc["expected_HET_pval"] = stats.norm.sf(
    quantile_normalize(-df_assoc["HET_pval"])
)

# Bonferroni correction over the number of SNPs actually retained
# (guarded so an empty table does not raise ZeroDivisionError).
n_tests = max(len(df_assoc), 1)
df_assoc["bonferroni_significant"] = df_assoc["HET_pval"] < FWER_ALPHA / n_tests

In [7]:
def report_assoc(df_assoc, n_bootstrap=1000, random_state=0):
    """Print summary statistics comparing EUR and AFR effect sizes.

    Prints, in order: the number of SNPs, Pearson's r between ``EUR_beta`` and
    ``AFR_beta`` with a bootstrap standard error, the Deming regression slope
    of ``AFR_beta`` on ``EUR_beta`` with its bootstrap mean and standard
    error, and the value and confidence interval returned by
    ``admix.data.lambda_gc`` for ``HET_pval``.

    Parameters
    ----------
    df_assoc : pandas.DataFrame
        One row per SNP. Must contain the columns ``EUR_beta``, ``AFR_beta``,
        ``EUR_beta_stderr``, ``AFR_beta_stderr`` and ``HET_pval``.
    n_bootstrap : int, default 1000
        Number of bootstrap replicates (SNPs resampled with replacement).
    random_state : int, numpy.random.Generator or None, default 0
        Seed (or generator) for the bootstrap resampling done in this
        function, so the printed standard errors are reproducible. Pass
        ``None`` to draw fresh entropy on every call. It is not forwarded to
        ``admix.data.lambda_gc``.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If fewer than 2 SNPs are supplied or ``n_bootstrap`` is < 1.
    """
    n_snps = len(df_assoc)
    if n_snps < 2:
        raise ValueError(f"report_assoc needs at least 2 SNPs, got {n_snps}")
    if n_bootstrap < 1:
        raise ValueError(f"n_bootstrap must be >= 1, got {n_bootstrap}")

    def _deming_slope(df):
        # First element of the helper's return value is used as the slope.
        return admix_genet_cor.locus.deming_regression(
            x=df["EUR_beta"],
            y=df["AFR_beta"],
            sx=df["EUR_beta_stderr"],
            sy=df["AFR_beta_stderr"],
        )[0]

    def _pearson_r(df):
        return stats.pearsonr(df["EUR_beta"], df["AFR_beta"])[0]

    # Bootstrap over SNPs with an explicit generator instead of the global
    # NumPy random state, so repeated runs give the same standard errors.
    rng = np.random.default_rng(random_state)
    bootstrap_slope_list = []
    pearsonr_list = []
    for _ in range(n_bootstrap):
        df_tmp = df_assoc.iloc[rng.integers(0, n_snps, size=n_snps)]
        bootstrap_slope_list.append(_deming_slope(df_tmp))
        pearsonr_list.append(_pearson_r(df_tmp))

    # Point estimates on the full (non-resampled) table.
    slope = _deming_slope(df_assoc)
    pearson_r = _pearson_r(df_assoc)
    lgc, lgc_ci = admix.data.lambda_gc(df_assoc.HET_pval, bootstrap_ci=True)

    print(f"{n_snps} SNPs")
    print(f"Pearson's r = {pearson_r:.2g}, SE={np.std(pearsonr_list):.2f}")
    print(
        f"Deming regression slope: {slope:.2f}. Bootstrap mean (se): {np.mean(bootstrap_slope_list):.2f} ({np.std(bootstrap_slope_list):.2f})"
    )
    print(f"lambda gc = {lgc:.2f}, ci=[{lgc_ci[0]:.2f}, {lgc_ci[1]:.2f}]")

In [8]:
# Boolean mask over df_assoc's rows: True for SNPs of the "MCH" trait that
# lie on chromosome 16.
mch_chr16_snps = (df_assoc["trait"] == "MCH") & (df_assoc["CHROM"] == 16)

In [14]:
# List the distinct traits that have at least one row left in df_assoc.
df_assoc.trait.unique()

array(['244', '250.1', '401', 'LDL', 'MCH', 'bmi', 'cholesterol',
       'cigs_per_day_excl_nonsmk_updated', 'crp', 'erythrocyte',
       'ever_smoked', 'glucose', 'hdl', 'height', 'ldl', 'log_HDL',
       'log_HLR_reticulocyte', 'log_monocyte', 'log_triglycerides',
       'mean_corp_hgb_conc', 'platelet_cnt', 'pr_interval', 'systolic_BP',
       't2d_status', 'total_cholesterol', 'total_wbc_cnt',
       'triglycerides'], dtype=object)

In [15]:
# Rows of df_assoc belonging to the "log_triglycerides" trait.
df_assoc.loc[df_assoc["trait"].eq("log_triglycerides")]

Unnamed: 0,study,trait,SNP,CHROM,POS,PLINK_P,EUR_af,AFR_af,assoc_pval,HET_pval,EUR_beta,AFR_beta,EUR_beta_stderr,AFR_beta_stderr,expected_HET_pval,bonferroni_significant
93,ukb,log_triglycerides,chr11:46260284:A:G,11,46260284,4.7e-10,0.272246,0.896642,4.700917e-10,0.215622,-0.125268,-0.17719,0.037316,0.02938,0.452756,False
94,ukb,log_triglycerides,chr19:35065736:T:C,19,35065736,3.45e-09,0.858864,0.289347,3.453136e-09,0.000335,0.19774,0.069047,0.028606,0.030159,0.153543,True


In [16]:
# Rows of df_assoc belonging to the "triglycerides" trait.
df_assoc.loc[df_assoc["trait"].eq("triglycerides")]

Unnamed: 0,study,trait,SNP,CHROM,POS,PLINK_P,EUR_af,AFR_af,assoc_pval,HET_pval,EUR_beta,AFR_beta,EUR_beta_stderr,AFR_beta_stderr,expected_HET_pval,bonferroni_significant
117,page,triglycerides,chr2:27508073:T:C,2,27508073,1.17e-15,0.60341,0.917256,1.170335e-15,0.575712,-0.164749,-0.152418,0.02582,0.019967,0.69685,False
118,page,triglycerides,chr3:171036259:T:C,3,171036259,4.97e-08,0.103387,0.613238,4.969165e-08,0.484314,0.046658,0.079783,0.046864,0.01453,0.673228,False
119,page,triglycerides,chr8:19965681:T:C,8,19965681,2.45e-09,0.034727,0.080749,2.447828e-09,0.991302,0.16199,0.162931,0.08224,0.028652,0.988189,False
120,page,triglycerides,chr8:19973410:C:T,8,19973410,1.01e-13,0.134507,0.479271,1.011596e-13,0.76856,-0.09416,-0.107054,0.043452,0.014528,0.870079,False
121,page,triglycerides,chr8:20119800:C:G,8,20119800,5.19e-12,0.033705,0.285046,5.187268e-12,0.613345,-0.075305,-0.116596,0.080655,0.016906,0.73622,False
122,page,triglycerides,chr11:116714909:G:A,11,116714909,3.8e-09,0.026395,0.09132,3.802519e-09,0.036009,-0.031885,0.168486,0.09228,0.026995,0.240157,False
123,page,triglycerides,chr11:116717913:T:C,11,116717913,3.17e-11,0.801806,0.880437,3.170534e-11,0.582416,-0.142609,-0.131389,0.025628,0.020659,0.704724,False
124,page,triglycerides,chr11:116791691:G:C,11,116791691,1.12e-21,0.057652,0.062173,1.116276e-21,0.205629,0.20502,0.292274,0.061767,0.031936,0.413386,False
125,page,triglycerides,chr19:44919330:A:G,19,44919330,8.91e-31,0.00995,0.148276,8.909676000000001e-31,0.253862,-0.413928,-0.245167,0.146947,0.02158,0.507874,False
126,page,triglycerides,chr19:44928196:G:A,19,44928196,1.2e-08,0.10919,0.115679,1.198909e-08,0.218493,0.07534,0.13709,0.045371,0.024138,0.46063,False


In [21]:
# Print the same summary for the full table and for the table with the
# MCH chromosome-16 SNPs masked out; each report is followed by a blank line.
for heading, subset in [
    ("# all SNPs", df_assoc),
    ("# SNPs removing MCH", df_assoc[~mch_chr16_snps]),
]:
    print(heading)
    report_assoc(subset)
    print("")

# all SNPs
127 SNPs
Pearson's r = 0.73, SE=0.04
Deming regression slope: 1.36. Bootstrap mean (se): 1.37 (0.13)
lambda gc = 2.99, ci=[2.26, 3.52]

# SNPs removing MCH
103 SNPs
Pearson's r = 0.86, SE=0.03
Deming regression slope: 1.11. Bootstrap mean (se): 1.11 (0.05)
lambda gc = 2.35, ci=[1.62, 3.18]



In [26]:
# Midpoint of the two chr1 positions 159600370 and 159676405, i.e. the two
# crp SNPs (chr1:159600370:C:T and chr1:159676405:T:C) listed in df_assoc.
(159600370 + 159676405) / 2

159638387.5

In [25]:
# The 20 rows with the smallest HET_pval once the MCH chromosome-16 SNPs
# are excluded.
df_assoc.loc[~mch_chr16_snps].sort_values(by="HET_pval").head(20)

Unnamed: 0,study,trait,SNP,CHROM,POS,PLINK_P,EUR_af,AFR_af,assoc_pval,HET_pval,EUR_beta,AFR_beta,EUR_beta_stderr,AFR_beta_stderr,expected_HET_pval,bonferroni_significant
51,ukb,erythrocyte,chr16:357598:A:G,16,357598,3.38e-08,0.346212,0.536559,3.376903e-08,4e-06,0.006317,0.165138,0.029748,0.023105,0.106299,True
42,page,crp,chr1:159600370:C:T,1,159600370,2.57e-09,0.577977,0.458041,2.566003e-09,8e-05,0.02014,-0.123454,0.032176,0.017337,0.114173,True
43,page,crp,chr1:159676405:T:C,1,159676405,4.91e-08,0.742752,0.70439,4.91003e-08,0.000248,-0.021937,0.124031,0.035386,0.018975,0.129921,True
56,page,hdl,chr11:116810245:A:C,11,116810245,1.92e-08,0.034267,0.005148,1.920653e-08,0.000263,0.188877,0.67307,0.079191,0.107041,0.145669,True
94,ukb,log_triglycerides,chr19:35065736:T:C,19,35065736,3.45e-09,0.858864,0.289347,3.453136e-09,0.000335,0.19774,0.069047,0.028606,0.030159,0.153543,True
81,page,ldl,chr19:44905371:T:C,19,44905371,1.62e-08,0.082986,0.025212,1.619685e-08,0.002068,-0.089667,-0.302569,0.04967,0.048743,0.161417,False
3,ukb,250.1,chr10:26901162:A:T,10,26901162,1.92e-08,0.005411,0.009335,1.923999e-08,0.00239,0.394186,0.140498,0.0702,0.045379,0.169291,False
2,ukb,250.1,chr9:135451674:C:T,9,135451674,2.87e-08,0.006562,0.022612,2.869065e-08,0.003242,-0.021529,0.182347,0.063115,0.029012,0.185039,False
106,page,total_cholesterol,chr9:104826853:A:T,9,104826853,1.72e-09,0.192852,0.790112,1.715821e-09,0.004541,-0.00148,0.09331,0.034029,0.01451,0.192913,False
100,page,platelet_cnt,chr10:63318234:C:G,10,63318234,1.14e-08,0.426833,0.273351,1.14217e-08,0.008402,0.033297,0.114896,0.027281,0.018303,0.208661,False
