In [1]:
# Download the Springer-hosted supplementary workbook (ESM 3) of article
# 10.1038/s41586-019-1310-4 and save it as `page-supp-tables.xlsx` in the
# current working directory (requires network access and `wget`).
# NOTE(review): no cell visible in this notebook reads `page-supp-tables.xlsx` —
# confirm it is still needed.
!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-019-1310-4/MediaObjects/41586_2019_1310_MOESM3_ESM.xlsx -O page-supp-tables.xlsx

--2021-09-24 08:38:37--  https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-019-1310-4/MediaObjects/41586_2019_1310_MOESM3_ESM.xlsx
Resolving static-content.springer.com (static-content.springer.com)... 151.101.196.95
Connecting to static-content.springer.com (static-content.springer.com)|151.101.196.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1887821 (1.8M) [application/octet-stream]
Saving to: ‘page-supp-tables.xlsx’


2021-09-24 08:38:37 (190 MB/s) - ‘page-supp-tables.xlsx’ saved [1887821/1887821]



In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import xarray as xr
import numpy as np
import pandas as pd
import admix
import dask
import dask.array as da
from tqdm import tqdm
import statsmodels.api as sm
from scipy.stats import pearsonr, ttest_ind
import seaborn as sns
import matplotlib.pyplot as plt
from admix.data import quantile_normalize

import requests
import json
import sys
import pickle
import vcf
import subprocess

sys.path.append("../../")
import common

# Loci analysis
1. Take the top SNP(s) for each significant locus.
2. Test the heterogeneity score.
3. Aggregate over multiple traits and multiple regions and show a QQ-plot.

See also:
- https://github.com/gokceneraslan/opentargets-genetics-python
- https://community.opentargets.org/t/how-to-access-finngen-gwas-data-using-the-open-targets-genetics-portal-api/254/4
- https://api.genetics.opentargets.org/graphql/schema

# Process GWAS hits from PAGE study

In [131]:
# Trait metadata is kept in the "trait-info" sheet of the shared supplementary
# workbook; the workbook is fetched over HTTP every time this cell runs.
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"
trait_info = pd.read_excel(io=SUPP_TABLE_URL, sheet_name="trait-info")

# Trait identifiers as a plain NumPy array.
trait_list = trait_info.loc[:, "trait"].values

In [205]:
# Columns of the association table that are kept for the analysis.
ASSOC_COLUMNS = [
    "DISEASE/TRAIT",
    "INITIAL SAMPLE SIZE",
    "REGION",
    "SNPS",
    "CHR_ID",
    "CHR_POS",
    "P-VALUE",
    "STUDY ACCESSION",
]
# Autosome labels, as strings because CHR_ID is matched as text.
AUTOSOME_IDS = [str(chrom) for chrom in range(1, 23)]

df_assoc = pd.read_csv("page-gwas.tsv", sep="\t")[ASSOC_COLUMNS]
# Hits without a position are dropped; the remaining positions become integers.
df_assoc = df_assoc.dropna(subset=["CHR_POS"]).astype({"CHR_POS": int})
# Keep hits on chromosomes 1-22 only.
df_assoc = df_assoc[df_assoc["CHR_ID"].isin(AUTOSOME_IDS)]
# NOTE: the 24 + 2 additional traits are waist-hip ratio for males and females

# Map each GWAS catalog trait name to the trait id used in `trait_info`.
gwas_catalog_name2id = dict(zip(trait_info["GWAS catalog name"], trait_info["trait"]))
# A plain dict lookup is used on purpose: an unmapped trait name raises KeyError.
trait_ids = [gwas_catalog_name2id[name] for name in df_assoc["DISEASE/TRAIT"]]
df_assoc.insert(0, "trait_id", trait_ids)

In [7]:
!mkdir -p out/locus_het/assoc_loc
for chrom in tqdm(range(1, 2)):
    df_loc = (
        df_assoc[df_assoc["CHR_ID"] == str(chrom)][["CHR_ID", "CHR_POS"]]
        .drop_duplicates()
        .sort_values(["CHR_POS"])
    )
    df_loc["CHR_ID"] = "chr" + df_loc["CHR_ID"]
    df_loc.rename(columns={"CHR_ID": "CHROM", "CHR_POS": "POS"}).to_csv(
        f"out/locus_het/assoc_loc/chr{chrom}.tsv", sep="\t", index=False, header=False
    )
    cmd = (
        f"tabix -hR out/locus_het/assoc_loc/chr{chrom}.tsv "
        + f"/u/project/sgss/PAGE/ImputedGWAS_topmedfrz8/MEGA_all.chr{chrom}.filtered.vcf.gz "
        + f"> out/locus_het/assoc_loc/chr{chrom}.vcf"
    )
    subprocess.check_call(cmd, shell=True)

100%|██████████| 1/1 [02:11<00:00, 131.89s/it]


In [52]:
# Load the PAGE reference dataset through the project-local `common` module.
# Later cells use it for the individual ordering, the local-ancestry array
# (`lanc`) and the per-individual coordinates copied onto `dset_gwas_hit`.
# NOTE(review): "hm3" presumably refers to the HapMap3 SNP set — confirm in `common`.
dset_hm3 = common.load_page_hm3()

100%|██████████| 22/22 [00:00<00:00, 71.99it/s]


In [None]:
# Read the per-chromosome VCF extracts (chr1..chr22) and stack them along `snp`.
dset_list = [
    admix.io.read_vcf(f"out/locus_het/assoc_loc/chr{chrom}.vcf")
    for chrom in tqdm(range(1, 23))
]
dset_gwas_hit = xr.concat(dset_list, dim="snp")

# Pull the genotypes into memory, then re-wrap them as a single-chunk dask array
# under the same dimension names.
geno_dims = dset_gwas_hit.geno.dims
geno_values = dset_gwas_hit.geno.values
dset_gwas_hit["geno"] = (geno_dims, da.from_array(geno_values, chunks=-1))

# Number of ancestries recorded on the dataset.
dset_gwas_hit.attrs["n_anc"] = 2

In [187]:
# align `dset_gwas_hit` to `dset_hm3`: keep the same individuals in the same order
dset_gwas_hit = dset_gwas_hit.sel(indiv=dset_hm3.indiv.values)

# fill in lanc: every GWAS hit borrows the local ancestry of the HM3 SNP that is
# closest in position on the same chromosome
df_dset_hm3 = dset_hm3.snp.to_dataframe().reset_index(drop=True)
snp_index = []
for _, snp in tqdm(dset_gwas_hit.snp.to_dataframe().iterrows()):
    chrom, pos = snp["CHROM"], snp["POS"]
    df_tmp = df_dset_hm3[df_dset_hm3["CHROM"] == chrom]
    # `df_dset_hm3` has a fresh 0..n-1 index, so the label returned by idxmin()
    # is also the positional index expected by `isel` below.
    snp_index.append((df_tmp["POS"] - pos).abs().idxmin())
lanc = dset_hm3.isel(snp=snp_index).lanc.values
dset_gwas_hit = dset_gwas_hit.assign(
    lanc=(("indiv", "snp", "ploidy"), da.from_array(lanc, chunks=-1))
)

# fill in individual information: copy every per-individual coordinate of
# `dset_hm3` (age, sex, traits, ...) onto `dset_gwas_hit`
for k in dset_hm3.coords:
    if dset_hm3.coords[k].dims == ("indiv",):
        dset_gwas_hit.coords[k] = ("indiv", dset_hm3.coords[k].data)

# for duplicated snps (with same CHROM and POS), retain only the one with larger MAF.
# Ties on the maximum MAF are broken by keeping the first record, so that each
# (CHROM, POS) maps to exactly one SNP; the per-hit loop later asserts this.
df_tmp = dset_gwas_hit.snp.to_dataframe()
is_group_max = df_tmp["MAF"] == df_tmp.groupby(["CHROM", "POS"])["MAF"].transform(
    "max"
)
df_tmp = df_tmp[is_group_max].drop_duplicates(subset=["CHROM", "POS"], keep="first")
assert not df_tmp.duplicated(subset=["CHROM", "POS"]).any()
dset_gwas_hit = dset_gwas_hit.sel(snp=dset_gwas_hit.snp.isin(df_tmp["snp"]))

# adds the per-ancestry allele counts read later as `allele_per_anc`
admix.tools.allele_per_anc(dset_gwas_hit)
# mode="w" overwrites an existing store so that the cell can be re-run
dset_gwas_hit.to_zarr("out/locus_het/gwas_hit.zarr", mode="w")

904it [00:03, 301.23it/s]


<xarray.backends.zarr.ZarrStore at 0x2aaffe7f5cf0>

In [188]:
# Pre-allocate (as NaN) every per-hit result column that the association loop
# fills in, so hits that cannot be located in `dset_gwas_hit` keep NaN.
# `assoc_pval` is included as well: the loop writes it together with the other
# columns, and creating it here avoids relying on `.loc` to add a missing column
# during assignment. It is appended last, which is where the loop's first
# assignment would have created it.
for result_col in [
    "EUR_af",
    "AFR_af",
    "EUR_beta",
    "EUR_beta_stderr",
    "AFR_beta",
    "AFR_beta_stderr",
    "HET_pval",
    "assoc_pval",
]:
    df_assoc[result_col] = np.nan

In [207]:
# For every GWAS hit: locate the SNP in `dset_gwas_hit`, restrict to individuals
# with a non-missing value for the hit's trait, and run the heterogeneity and
# association tests from `common` on the per-ancestry allele counts.
# Results are written back into `df_assoc` row by row.

# Loop-invariant lookups, hoisted out of the ~1.4 s/iteration loop.
snp_chrom = dset_gwas_hit.CHROM.values
snp_position = dset_gwas_hit.POS.values
covar_cols = ["age", "sex", "study"] + [f"geno_EV{i}" for i in range(1, 11)]
af_cols = ["EUR_af", "AFR_af"]
stat_cols = [
    "assoc_pval",
    "HET_pval",
    "EUR_beta",
    "AFR_beta",
    "EUR_beta_stderr",
    "AFR_beta_stderr",
]

for row_i, row in tqdm(df_assoc.iterrows(), total=df_assoc.shape[0]):
    chrom = int(row.CHR_ID)
    position = row.CHR_POS

    dset_snp_pos = np.where((snp_chrom == chrom) & (snp_position == position))[0]
    if len(dset_snp_pos) == 0:
        # the hit is absent from the extracted VCF records; its results stay NaN
        print(f"{row_i} is missing")
        continue
    assert len(dset_snp_pos) == 1
    dset_snp = dset_gwas_hit.isel(snp=dset_snp_pos)
    # keep only individuals with an observed value for this trait
    dset_snp = dset_snp.sel(indiv=~np.isnan(dset_snp.coords[row.trait_id]))
    # per-ancestry allele counts of the single selected SNP -> (n_indiv, n_anc)
    apa = dset_snp["allele_per_anc"].values[:, 0, :]
    # adds `af_per_anc` to `dset_snp`, read further below
    admix.tools.af_per_anc(dset_snp)

    # covariates: age, sex, 10 genotype PCs and one-hot encoded study
    df_covar = pd.DataFrame({col: dset_snp.coords[col].values for col in covar_cols})
    study_dummies = pd.get_dummies(df_covar["study"], drop_first=True)
    study_dummies.columns = [f"study_dummy_{s}" for s in study_dummies.columns]
    df_covar = pd.concat([df_covar.drop(columns=["study"]), study_dummies], axis=1)
    # Explicit float conversion: newer pandas returns boolean dummy columns, and
    # mixing them with float columns would give an object array on which the
    # standardisation below fails.
    covar = df_covar.to_numpy(dtype=float)
    covar = (covar - covar.mean(axis=0)) / covar.std(axis=0)

    y = dset_snp[row.trait_id].values
    y = quantile_normalize(y)
    p_het, model_het = common.test_het(apa, y, covar)
    p_assoc, model_assoc = common.test_assoc(apa, y, covar)

    df_assoc.loc[row_i, af_cols] = (
        dset_snp["af_per_anc"].values[0, 0],
        dset_snp["af_per_anc"].values[0, 1],
    )
    # NOTE(review): entries 1 and 2 of `params`/`bse` are taken as the EUR and
    # AFR effects (entry 0 presumably being the intercept) — confirm against
    # `common.test_het`.
    df_assoc.loc[row_i, stat_cols] = [
        p_assoc,
        p_het,
        model_het.params[1],
        model_het.params[2],
        model_het.bse[1],
        model_het.bse[2],
    ]

  2%|▏         | 24/1243 [00:34<30:44,  1.51s/it]

24 is missing


  8%|▊         | 96/1243 [02:15<26:28,  1.39s/it]

97 is missing
98 is missing


 15%|█▍        | 182/1243 [04:13<24:48,  1.40s/it]

183 is missing


 16%|█▋        | 203/1243 [04:42<24:30,  1.41s/it]

204 is missing


 16%|█▋        | 205/1243 [04:43<18:39,  1.08s/it]

206 is missing


 20%|██        | 250/1243 [05:48<24:45,  1.50s/it]

251 is missing


 32%|███▏      | 395/1243 [09:13<20:11,  1.43s/it]

401 is missing


 33%|███▎      | 412/1243 [09:36<19:28,  1.41s/it]

420 is missing


 40%|███▉      | 495/1243 [11:33<17:36,  1.41s/it]

504 is missing


 49%|████▉     | 615/1243 [14:20<14:35,  1.39s/it]

625 is missing
626 is missing
627 is missing


 50%|████▉     | 619/1243 [14:22<07:35,  1.37it/s]

629 is missing


 51%|█████▏    | 639/1243 [14:48<13:58,  1.39s/it]

649 is missing


 67%|██████▋   | 827/1243 [19:14<09:46,  1.41s/it]

840 is missing


 71%|███████   | 883/1243 [20:33<08:29,  1.42s/it]

897 is missing


 79%|███████▊  | 977/1243 [22:46<06:17,  1.42s/it]

991 is missing


 79%|███████▉  | 983/1243 [22:53<05:40,  1.31s/it]

998 is missing


 80%|███████▉  | 991/1243 [23:03<05:42,  1.36s/it]

1006 is missing


 80%|████████  | 998/1243 [23:12<05:26,  1.33s/it]

1013 is missing


 81%|████████  | 1008/1243 [23:25<05:29,  1.40s/it]

1023 is missing


 81%|████████▏ | 1010/1243 [23:26<04:14,  1.09s/it]

1025 is missing


 82%|████████▏ | 1018/1243 [23:36<05:04,  1.35s/it]

1033 is missing


 84%|████████▎ | 1038/1243 [24:03<04:48,  1.41s/it]

1053 is missing


 89%|████████▉ | 1110/1243 [25:42<03:06,  1.40s/it]

1125 is missing


 97%|█████████▋| 1206/1243 [27:57<00:50,  1.38s/it]

1222 is missing


 99%|█████████▉| 1236/1243 [28:38<00:09,  1.40s/it]

1252 is missing


100%|██████████| 1243/1243 [28:47<00:00,  1.39s/it]


In [208]:
# Write the per-hit summary statistics (one row per GWAS hit, including the
# result columns filled in by the loop above) to an Excel workbook, without the
# DataFrame index.
df_assoc.to_excel("out/locus_het/sumstats.xlsx", index=False)