In [4]:
# Download the supplementary-tables workbook from the Springer static-content
# URL below and save it in the working directory as page-supp-tables.xlsx.
!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-019-1310-4/MediaObjects/41586_2019_1310_MOESM3_ESM.xlsx -O page-supp-tables.xlsx

--2021-09-23 18:57:46--  https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-019-1310-4/MediaObjects/41586_2019_1310_MOESM3_ESM.xlsx
Resolving static-content.springer.com (static-content.springer.com)... 151.101.196.95
Connecting to static-content.springer.com (static-content.springer.com)|151.101.196.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1887821 (1.8M) [application/octet-stream]
Saving to: ‘page-supp-tables.xlsx’


2021-09-23 18:57:46 (40.9 MB/s) - ‘page-supp-tables.xlsx’ saved [1887821/1887821]



In [5]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import xarray as xr
import numpy as np
import pandas as pd
import admix
import dask
import dask.array as da
from tqdm import tqdm
import statsmodels.api as sm
from scipy.stats import pearsonr, ttest_ind
import seaborn as sns
import matplotlib.pyplot as plt
from admix.data import quantile_normalize

import requests
import json
import sys
import pickle
import vcf

sys.path.append("../")
from utils import *

# Loci analysis
1. Take the top SNPs at each significant locus
2. Test the heterogeneity score.
3. Aggregate over multiple traits and multiple regions and show QQ-plot

Also see 
- https://github.com/gokceneraslan/opentargets-genetics-python
- https://community.opentargets.org/t/how-to-access-finngen-gwas-data-using-the-open-targets-genetics-portal-api/254/4
- https://api.genetics.opentargets.org/graphql/schema

In [None]:
# Read sheet "ST5 All known SNPs by trait", skipping the first spreadsheet row.
# NOTE(review): URL_SUPP_TABLE is not defined in the visible notebook cells --
# presumably it is provided by `from utils import *`; confirm before running.
df = pd.read_excel(
    URL_SUPP_TABLE,
    sheet_name="ST5 All known SNPs by trait",
    skiprows=1,
)
# Keep only rows that have a chromosome, then store chromosome and hg19
# position as integers.
df = df[df["Chromosome"].notna()].astype({"Chromosome": int, "Pos_hg19": int})

In [4]:
# GraphQL query text for the `studyInfo` field: takes one required string
# variable ($myStudyId) and requests traitReported, source, traitEfos, pmid,
# pubAuthor, ancestryInitial, numAssocLoci and nTotal for that study.
# NOTE(review): not referenced by any other cell visible in this notebook.
study_info_query = """
    query test ($myStudyId: String! ) {
      studyInfo(studyId: $myStudyId) {
          traitReported
          source
          traitEfos
          pmid
          pubAuthor
          ancestryInitial
          numAssocLoci
          nTotal
      }
    }
"""

# GraphQL query text for the `manhattan` field: takes one required string
# variable ($myStudyId) and requests, for every association of that study, the
# variant description (id, rsId, chromosome, position, nearest coding gene and
# its distance) plus the association statistics (pval, credibleSetSize,
# ldSetSize, odds ratio / beta with their CI bounds, direction).
# It is posted to the Open Targets Genetics endpoint once per trait further down.
manhattan_query = """
    query test ($myStudyId: String! ) {
      manhattan(studyId: $myStudyId) {
        associations {
          variant {
            id
            rsId
            chromosome
            position
            nearestCodingGene {
              id
              symbol
            }
            nearestCodingGeneDistance
          }
          pval
          credibleSetSize
          ldSetSize
          oddsRatio
          oddsRatioCILower
          oddsRatioCIUpper
          beta
          betaCILower
          betaCIUpper
          direction
        }
      }
    }
"""

In [5]:
def simulate(apa, beta, cov):
    """Simulate a phenotype from a linear model with Gaussian noise.

    The result is ``apa @ beta + cov @ gamma + eps`` where ``gamma`` holds one
    effect per column of ``cov`` drawn from N(0, 0.1**2) and ``eps`` holds one
    standard-normal draw per row of ``apa``. Randomness comes from NumPy's
    global random state.

    Parameters
    ----------
    apa : array with one row per individual
    beta : effect sizes, combined with ``apa`` through ``np.dot``
    cov : 2-D covariate array (its second axis gives the number of covariates)

    Returns
    -------
    The simulated phenotype, one value per row of ``apa``.
    """
    n_indiv, n_cov = apa.shape[0], cov.shape[1]
    # Draw covariate effects before the residual noise so that results under a
    # fixed seed do not depend on how this function is written.
    gamma = np.random.normal(loc=0, scale=0.1, size=n_cov)
    genetic_part = np.dot(apa, beta)
    covariate_part = np.dot(cov, gamma)
    noise = np.random.normal(size=n_indiv)
    return genetic_part + covariate_part + noise


def test_het(apa, y, cov):
    """Test whether the first two columns of ``apa`` have different effects.

    Fits the OLS model ``y ~ 1 + apa + cov`` and runs an F-test of the linear
    contrast ``coef(apa[:, 0]) - coef(apa[:, 1]) == 0``.

    Parameters
    ----------
    apa : 2-D array with at least two columns (one row per individual); only
        the first two columns enter the contrast.
    y : response vector with one entry per row of ``apa``.
    cov : 2-D covariate array with the same number of rows as ``apa``.

    Returns
    -------
    (p_ftest, model) : the F-test p-value as a Python float, and the fitted
        statsmodels results object. ``model.params[1]`` and
        ``model.params[2]`` are the coefficients of the first two ``apa``
        columns.
    """
    # Always prepend the intercept. With statsmodels' default
    # (has_constant="skip") an already-constant column anywhere in the design
    # suppresses the intercept, which would silently shift every coefficient
    # by one position and make the contrast below compare the wrong terms.
    design = sm.add_constant(np.hstack([apa, cov]), has_constant="add")
    model = sm.OLS(y, design).fit()

    # Contrast row: +1 on the first apa column, -1 on the second (position 0
    # is the intercept).
    A = np.zeros([1, len(model.params)])
    A[0, 1] = 1
    A[0, 2] = -1
    # The p-value may come back as a NumPy scalar or a small array depending
    # on the statsmodels version; squeeze before converting to float.
    p_ftest = float(np.squeeze(model.f_test(A).pvalue))
    return p_ftest, model


def test_assoc(apa, y, cov):
    """Test association between ``y`` and the row sums of ``apa``.

    Fits the OLS model ``y ~ 1 + apa.sum(axis=1) + cov``.

    Parameters
    ----------
    apa : 2-D array (one row per individual); its columns are summed into a
        single regressor.
    y : response vector with one entry per row of ``apa``.
    cov : 2-D covariate array with the same number of rows as ``apa``.

    Returns
    -------
    (pvalue, model) : the p-value of the summed-``apa`` coefficient and the
        fitted statsmodels results object.
    """
    dosage = apa.sum(axis=1)[:, np.newaxis]
    # Always prepend the intercept so that position 1 is guaranteed to be the
    # dosage term; the statsmodels default (has_constant="skip") drops the
    # intercept when the design already contains a constant column, which
    # would make `pvalues[1]` refer to a covariate instead.
    design = sm.add_constant(np.hstack([dosage, cov]), has_constant="add")
    model = sm.OLS(y, design).fit()
    return model.pvalues[1], model

In [6]:
# Location of the workbook that holds the "trait-info" sheet.
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"
# One row per trait; later cells read its "trait" and "open-targets-studyid"
# columns.
trait_info = pd.read_excel(SUPP_TABLE_URL, sheet_name="trait-info")
# Trait names as an array.
trait_list = trait_info["trait"].values

In [7]:
base_url = "https://api.genetics.opentargets.org/graphql"


def _fetch_manhattan_associations(study_id, timeout=60):
    """Return the list of `manhattan` associations for one study.

    Posts `manhattan_query` to `base_url` with `study_id` as `$myStudyId`.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    RuntimeError
        If the GraphQL payload reports errors or contains no manhattan data.
    """
    r = requests.post(
        base_url,
        json={"query": manhattan_query, "variables": {"myStudyId": study_id}},
        timeout=timeout,  # never hang the notebook on a stalled connection
    )
    r.raise_for_status()
    payload = r.json()
    if payload.get("errors"):
        raise RuntimeError(
            f"Open Targets query failed for study {study_id}: {payload['errors']}"
        )
    manhattan = (payload.get("data") or {}).get("manhattan")
    if manhattan is None:
        raise RuntimeError(f"No manhattan data returned for study {study_id}")
    return manhattan["associations"]


# Build one table per trait, then stack them. Only the columns that are kept
# in the final table are extracted; the remaining fields returned by the query
# (odds ratio, beta, their CI bounds, direction, credibleSetSize) are unused.
df_assoc_list = []
for _, row in trait_info.iterrows():
    assoc = _fetch_manhattan_associations(row["open-targets-studyid"])
    variants = [d["variant"] for d in assoc]
    df_assoc_list.append(
        pd.DataFrame(
            {
                "trait": [row["trait"]] * len(assoc),
                "id": [v["id"] for v in variants],
                "rsId": [v["rsId"] for v in variants],
                "chromosome": [v["chromosome"] for v in variants],
                # Position is the second "_"-separated token of the variant id.
                "position": [int(v["id"].split("_")[1]) for v in variants],
                # The nearest coding gene can be null in the response; keep the
                # row and record a missing symbol instead of crashing.
                "nearestCodingGene": [
                    (v["nearestCodingGene"] or {}).get("symbol") for v in variants
                ],
                "nearestCodingGeneDistance": [
                    v["nearestCodingGeneDistance"] for v in variants
                ],
                "GWAS_catalog_pval": [d["pval"] for d in assoc],
                "ldSetSize": [d["ldSetSize"] for d in assoc],
            }
        )
    )
df_assoc = pd.concat(df_assoc_list).reset_index(drop=True)

In [8]:
# Load the main dataset. `load_hm3` is not defined in this notebook
# (presumably it comes from `from utils import *` -- TODO confirm). Later cells
# read `dset.indiv`, `dset.snp`, `dset.lanc` and its "...@indiv" variables.
dset = load_hm3()

100%|██████████| 22/22 [00:03<00:00,  7.11it/s]


In [99]:
DATA_DIR = "/u/project/sgss/PAGE/ImputedGWAS_topmedfrz8/"

# Look up every GWAS hit in the per-chromosome VCF files.
# One reader is opened per chromosome and reused for all hits on it, instead
# of re-opening the file (and re-parsing its header) for every row.
vcf_readers = {}
dict_record = {}
for _, row in df_assoc.iterrows():
    if row.id in dict_record:
        # Same variant reported for another trait: already fetched.
        continue
    chrom = row.chromosome
    position = row.position
    if chrom not in vcf_readers:
        vcf_readers[chrom] = vcf.Reader(
            filename=join(DATA_DIR, f"MEGA_all.chr{chrom}.filtered.vcf.gz")
        )
    try:
        # First record returned for the interval (position - 1, position).
        # NOTE(review): REF/ALT of the record are not compared with the
        # alleles encoded in row.id -- confirm this is intended.
        record = next(vcf_readers[chrom].fetch(f"chr{chrom}", position - 1, position))
        dict_record[row.id] = record
    except StopIteration:
        print(f"{row.id} not available in data")

# Keep only what later cells need from each record: per-sample alleles as an
# integer array, the sample names, and the INFO dictionary.
dict_hit_info = {}
for snp, record in dict_record.items():
    dict_hit_info[snp] = {
        "gt": np.array([sample.gt_alleles for sample in record.samples], dtype=int),
        "sample": [sample.sample for sample in record.samples],
        "info": record.INFO,
    }

# Cache the result on disk so the slow VCF scan does not have to be repeated.
with open("out/locus_het/gwas_hit_info.pkl", "wb") as f:
    pickle.dump(dict_hit_info, f)

  4%|▍         | 1/24 [00:16<06:12, 16.20s/it]

1_159204893_T_C not available in data


  8%|▊         | 2/24 [00:48<09:27, 25.79s/it]

11_5227002_T_A not available in data


 62%|██████▎   | 15/24 [03:56<01:43, 11.52s/it]

21_34455875_G_A not available in data


 67%|██████▋   | 16/24 [04:04<01:26, 10.77s/it]

6_36678991_C_CTA not available in data


100%|██████████| 24/24 [06:22<00:00, 15.95s/it]


In [11]:
# Reload the cached per-SNP dictionary (keys "gt", "sample", "info") from the
# same path the extraction cell writes to. Unpickling can execute arbitrary
# code, so only load a file produced by this notebook.
with open("out/locus_het/gwas_hit_info.pkl", "rb") as f:
    dict_hit_info = pickle.load(f)

In [17]:
# Assemble an xarray dataset of the GWAS-hit SNPs (genotypes, SNP annotations,
# local ancestry, per-individual covariates) and write it to a zarr store.

# Stack the per-SNP genotype arrays along a new last axis; each array is
# presumably (n_sample, 2), giving (n_sample, 2, n_snp) -- TODO confirm.
gt = np.dstack([dict_hit_info[snp]["gt"] for snp in dict_hit_info])
# Every SNP must list exactly the same samples in the same order.
sample = [dict_hit_info[snp]["sample"] for snp in dict_hit_info]
assert all(x == sample[0] for x in sample)
sample = sample[0]

# Per-SNP "R2" and "AF" entries taken from the VCF INFO field.
R2 = [dict_hit_info[snp]["info"]["R2"] for snp in dict_hit_info]
AF = [dict_hit_info[snp]["info"]["AF"] for snp in dict_hit_info]

dset_gwas_hit = xr.Dataset(
    data_vars={
        "geno": (
            ("indiv", "snp", "ploidy"),
            # Swap the last two axes so the array matches the dimension order
            # (indiv, snp, ploidy) declared above.
            da.from_array(np.swapaxes(gt, 1, 2), chunks=-1),
        ),
    },
    coords={
        "snp": np.array(list(dict_hit_info.keys())).astype(str),
        "indiv": np.array(sample).astype(str),
        "R2@snp": ("snp", np.array(R2, dtype=float)),
        "AF@snp": ("snp", np.array(AF, dtype=float)),
    },
    attrs={"n_anc": 2},
)

# SNP ids are "_"-separated; the four tokens are stored as chromosome,
# position, REF and ALT coordinates.
info = [s.split("_") for s in dset_gwas_hit.snp.values]
dset_gwas_hit = dset_gwas_hit.assign_coords(
    {
        "CHROM@snp": ("snp", [int(i[0]) for i in info]),
        "POS@snp": ("snp", [int(i[1]) for i in info]),
        "REF@snp": ("snp", [str(i[2]) for i in info]),
        "ALT@snp": ("snp", [str(i[3]) for i in info]),
    }
)

# align `dset_gwas_hit` to `dset`: keep the individuals of `dset`, in its order
dset_gwas_hit = dset_gwas_hit.sel(indiv=dset.indiv.values)

# fill in lanc: for each hit, borrow the local ancestry of the SNP in `dset`
# that lies on the same chromosome at the smallest absolute position distance
df_dset = dset.snp.to_dataframe().reset_index(drop=True)
snp_index = []
for _, snp in tqdm(dset_gwas_hit.snp.to_dataframe().iterrows()):
    chrom, pos = snp["CHROM@snp"], snp["POS@snp"]
    df_tmp = df_dset[df_dset["CHROM@snp"] == chrom]
    # The index was reset above, so the label returned by idxmin() is also the
    # positional index expected by `isel` below.
    snp_index.append((df_tmp["POS@snp"] - pos).abs().idxmin())
lanc = dset.isel(snp=snp_index).lanc.values
dset_gwas_hit = dset_gwas_hit.assign(
    lanc=(("indiv", "snp", "ploidy"), da.from_array(lanc, chunks=-1))
)

# fill in individual information: copy every "...@indiv" variable of `dset`
# as a coordinate along `indiv`
for col in dset:
    if col.endswith("@indiv"):
        dset_gwas_hit.coords[col] = ("indiv", dset[col].values)
# Presumably adds `allele_per_anc` to the dataset in place (a later cell reads
# that variable) -- confirm against the admix documentation.
admix.tools.allele_per_anc(dset_gwas_hit)
# NOTE(review): no write mode is passed; check whether re-running this cell
# fails when the store already exists.
dset_gwas_hit.to_zarr("out/locus_het/gwas_hit.zarr")

179it [00:00, 217.99it/s]


<xarray.backends.zarr.ZarrStore at 0x2af0a3d649e0>

In [10]:
# Pre-create every result column filled by the per-locus loop, so that rows
# skipped by the loop hold NaN. `assoc_pval` is included as well: the loop
# writes it, and creating it here (last, matching the resulting column order)
# avoids relying on `.loc` silently adding a missing column.
for col in [
    "EUR_af",
    "AFR_af",
    "EUR_beta",
    "EUR_beta_stderr",
    "AFR_beta",
    "AFR_beta_stderr",
    "HET_pval",
    "assoc_pval",
]:
    df_assoc[col] = np.nan

In [11]:
# For every GWAS hit present in `dset_gwas_hit`, regress the quantile-normalized
# trait on the per-ancestry allele counts plus covariates, and record the
# per-ancestry allele frequencies, effect sizes, standard errors, and the
# heterogeneity / association p-values.

# Hoisted out of the loop: constant across rows.
available_snps = set(dset_gwas_hit.snp.values)
covar_cols = ["age", "sex", "study"] + [f"geno_EV{i}" for i in range(1, 11)]

for row_i, row in tqdm(df_assoc.iterrows(), total=len(df_assoc)):
    if row.id not in available_snps:
        continue
    dset_snp = dset_gwas_hit.sel(snp=[row.id])
    # Restrict to individuals with a non-missing value for this trait.
    dset_snp = dset_snp.sel(indiv=~np.isnan(dset_snp[f"{row.trait}@indiv"]))
    apa = dset_snp["allele_per_anc"].values[:, 0, :]
    admix.tools.af_per_anc(dset_snp)

    df_covar = pd.DataFrame(
        {col: dset_snp[col + "@indiv"].values for col in covar_cols}
    )
    # Request float dummies explicitly: recent pandas versions return boolean
    # dummies, which turn the combined matrix into an object array on which
    # the standardization below fails.
    study_dummies = pd.get_dummies(df_covar["study"], drop_first=True, dtype=float)
    study_dummies.columns = [f"study_dummy_{s}" for s in study_dummies.columns]
    df_covar = pd.concat([df_covar.drop(columns=["study"]), study_dummies], axis=1)
    covar = df_covar.to_numpy(dtype=float)
    # Standardize each covariate column.
    covar = (covar - covar.mean(axis=0)) / covar.std(axis=0)

    y = quantile_normalize(dset_snp[f"{row.trait}@indiv"].values)
    p_het, model_het = test_het(apa, y, covar)
    p_assoc, model_assoc = test_assoc(apa, y, covar)

    af_per_anc = dset_snp["af_per_anc"].values
    df_assoc.loc[row_i, ["EUR_af", "AFR_af"]] = (
        af_per_anc[0, 0],
        af_per_anc[0, 1],
    )
    # Positions 1 and 2 of the heterogeneity model are the two ancestry-specific
    # allele-count coefficients (position 0 is the intercept).
    df_assoc.loc[
        row_i,
        [
            "assoc_pval",
            "HET_pval",
            "EUR_beta",
            "AFR_beta",
            "EUR_beta_stderr",
            "AFR_beta_stderr",
        ],
    ] = [
        p_assoc,
        p_het,
        model_het.params[1],
        model_het.params[2],
        model_het.bse[1],
        model_het.bse[2],
    ]

211it [01:20,  2.62it/s]


In [12]:
# Display the per-locus results table.
df_assoc

Unnamed: 0,trait,id,rsId,chromosome,position,nearestCodingGene,nearestCodingGeneDistance,GWAS_catalog_pval,ldSetSize,EUR_af,AFR_af,EUR_beta,EUR_beta_stderr,AFR_beta,AFR_beta_stderr,HET_pval,assoc_pval
0,crp,1_65624099_C_T,rs6700896,1,65624099.0,PDE4B,168415.0,2.000000e-25,129.0,0.408433,0.465942,-0.078825,0.026669,-0.058604,0.016611,0.475105,3.624774e-05
1,crp,1_154454494_A_C,rs2228145,1,154454494.0,TDRD10,47725.0,9.000000e-34,21.0,0.412968,0.076964,-0.156980,0.026353,-0.016431,0.032193,0.000445,1.553822e-06
2,crp,1_159685728_G_A,rs726640,1,159685728.0,CRP,28861.0,6.000000e-117,47.0,0.001030,0.208739,0.232544,0.483246,0.365418,0.020218,0.783419,1.096978e-71
3,crp,1_247438293_T_C,rs12239046,1,247438293.0,OR2B11,19812.0,1.000000e-09,3.0,0.638524,0.451357,0.035252,0.022846,0.035906,0.016719,0.978590,1.791943e-02
4,crp,2_27508073_T_C,rs1260326,2,27508073.0,GCKR,11234.0,3.000000e-27,5.0,0.604076,0.918421,-0.094591,0.027497,-0.065589,0.021587,0.212499,5.435033e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,bmi,13_53528071_G_A,rs12429545,13,53528071.0,OLFM4,499258.0,2.000000e-08,6.0,0.155075,0.018685,-0.014697,0.029416,0.076984,0.043621,0.079769,5.743546e-01
207,bmi,16_3963466_C_T,rs2531995,16,3963466.0,CREBBP,82740.0,5.000000e-09,8.0,0.579632,0.094109,0.001340,0.016689,0.041867,0.020054,0.102777,1.909695e-01
208,bmi,16_53769662_T_A,rs1558902,16,53769662.0,RPGRIP1L,65724.0,4.000000e-29,29.0,0.381213,0.038373,0.063977,0.019643,0.055455,0.030748,0.810922,2.757239e-04
209,bmi,18_60161902_T_C,rs6567160,18,60161902.0,MC4R,210873.0,2.000000e-11,17.0,0.208939,0.182251,0.086659,0.025641,0.050865,0.015117,0.212765,9.926148e-06


In [407]:
# Save the per-locus results table as an Excel workbook, without the row index.
df_assoc.to_excel("out/locus_het/sumstats.xlsx", index=False)