# Loci analysis
1. Take the top SNP for each significant locus.
2. Test the heterogeneity score.
3. Aggregate over multiple traits and multiple regions and show a QQ-plot.

`page-gwas.tsv` was downloaded from the GWAS Catalog at https://www.ebi.ac.uk/gwas/publications/31217584 (via "Download catalog data").

**The links below are not used in this notebook; they are kept just in case.**

See also:
- https://github.com/gokceneraslan/opentargets-genetics-python
- https://community.opentargets.org/t/how-to-access-finngen-gwas-data-using-the-open-targets-genetics-portal-api/254/4
- https://api.genetics.opentargets.org/graphql/schema

!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-019-1310-4/MediaObjects/41586_2019_1310_MOESM3_ESM.xlsx -O page-supp-tables.xlsx


In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import os
import subprocess

import dask
import dask.array as da
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import submitit
import xarray as xr
from tqdm import tqdm

import admix
import admix_genet_cor
from admix.data import quantile_normalize

# Process GWAS hits from PAGE study

In [2]:
# Supplementary workbook (direct-download link); its "trait-info" sheet is read
# as the trait metadata table used throughout the notebook.
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"

trait_info = pd.read_excel(io=SUPP_TABLE_URL, sheet_name="trait-info")
# Array of trait identifiers, in the order they appear in the sheet.
trait_list = trait_info.loc[:, "trait"].values

In [3]:
# Read the GWAS catalog export and keep only the columns used by this notebook.
df_assoc = pd.read_csv("./page-gwas.tsv", sep="\t")[
    [
        "DISEASE/TRAIT",
        "INITIAL SAMPLE SIZE",
        "REGION",
        "SNPS",
        "CHR_ID",
        "CHR_POS",
        "P-VALUE",
        "STUDY ACCESSION",
    ]
]
# Rows without a position are dropped before the position is cast to integer.
df_assoc = df_assoc.dropna(subset=["CHR_POS"]).astype({"CHR_POS": int})

# Keep autosomes only; CHR_ID is matched against the strings "1".."22".
df_assoc = df_assoc.loc[df_assoc["CHR_ID"].isin([str(c) for c in range(1, 23)])]
# NOTE: the 24 + 2 additional traits are waist-hip ratio for males and females

# convert trait_id: GWAS catalog trait name -> trait identifier from `trait_info`
gwas_catalog_name2id = dict(zip(trait_info["GWAS catalog name"], trait_info["trait"]))
# A catalog name missing from the mapping raises KeyError here.
df_assoc.insert(
    0,
    "trait_id",
    [gwas_catalog_name2id[name] for name in df_assoc["DISEASE/TRAIT"]],
)
df_assoc["CHR_ID"] = df_assoc["CHR_ID"].astype(int)

# One row per (trait, chromosome, position), in sorted order.
df_assoc = df_assoc.sort_values(by=["trait_id", "CHR_ID", "CHR_POS"])
df_assoc = df_assoc.drop_duplicates(subset=["trait_id", "CHR_ID", "CHR_POS"])
df_assoc = df_assoc.reset_index(drop=True)

# Row labels have the form "<trait_id>:<CHR_ID>:<CHR_POS>".
df_assoc.index = [
    f"{trait_id}:{chrom_id}:{chrom_pos}"
    for trait_id, chrom_id, chrom_pos in zip(
        df_assoc["trait_id"], df_assoc["CHR_ID"], df_assoc["CHR_POS"]
    )
]

In [5]:
import dapgen
from os.path import join

# Directory holding the per-chromosome genotype files; readers below join it
# with "chr{N}" to form the file prefix.
# NOTE(review): both paths are absolute cluster locations, so the notebook only
# runs where they are mounted.
PFILE_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01_dataset/out/aframr/imputed"
# Table passed as `indiv_info` when a dataset is read.
SAMPLE_INFO_PATH = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01_dataset/out/aframr/sample_info.txt"

In [35]:
def submit_locus_hetero(trait, out):
    """Test the GWAS-catalog hits of one trait for association and heterogeneity.

    For every autosome that has at least one hit for ``trait``, the genotype
    dataset is restricted to the hit positions and to individuals with a
    non-missing phenotype. Each retained SNP is passed to
    ``admix_genet_cor.test_snp_het`` and ``admix_genet_cor.test_snp_assoc``,
    and the results are stored in a per-trait copy of ``df_assoc`` that is
    finally written to ``out``.

    Parameters
    ----------
    trait : str
        Trait identifier; selects rows of ``df_assoc`` by ``trait_id`` and the
        phenotype column of the dataset's individual table.
    out : str
        Path of the CSV file to write. Missing parent directories are created.

    Notes
    -----
    Reads the notebook-level ``df_assoc``, ``PFILE_DIR`` and
    ``SAMPLE_INFO_PATH``. The columns ``EUR_af``, ``AFR_af``, ``assoc_pval``,
    ``HET_pval``, ``EUR_beta``, ``AFR_beta``, ``EUR_beta_stderr`` and
    ``AFR_beta_stderr`` are always present in the output and stay NaN for hits
    that are not found in the genotype data.
    """
    af_cols = ["EUR_af", "AFR_af"]
    test_cols = [
        "assoc_pval",
        "HET_pval",
        "EUR_beta",
        "AFR_beta",
        "EUR_beta_stderr",
        "AFR_beta_stderr",
    ]

    # Create the output directory first so a missing folder fails (or is fixed)
    # before the expensive per-SNP tests rather than at the final write.
    out_dir = os.path.dirname(out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df_assoc_trait = df_assoc[df_assoc.trait_id == trait].copy()
    # Pre-create the result columns so the CSV schema does not depend on
    # whether any hit was found.
    for col in af_cols + test_cols:
        df_assoc_trait[col] = np.nan

    # format phenotype and covariates
    # (chr22 is read here only to obtain the individual table)
    dset = admix.dataset.read_dataset(
        join(PFILE_DIR, "chr22"),
        indiv_info=SAMPLE_INFO_PATH,
        n_anc=2,
    )
    dset_assoc = dset[:, ~np.isnan(dset.indiv[trait]).values]
    covar_cols = ["age", "sex", "study"] + [f"geno_EV{i}" for i in range(1, 11)]

    df_pheno = dset_assoc.indiv[[trait]].copy()
    df_covar = dset_assoc.indiv[covar_cols].copy()
    # create study dummies variables
    study_dummies = pd.get_dummies(df_covar["study"], drop_first=True)
    study_dummies.columns = [f"study_dummy_{s}" for s in study_dummies.columns]
    df_covar = pd.concat([df_covar, study_dummies], axis=1)
    df_covar = df_covar.drop(columns=["study"])

    for col in df_pheno.columns:
        df_pheno[col] = admix.data.quantile_normalize(df_pheno[col])

    for col in df_covar.columns:
        df_covar[col] = admix.data.quantile_normalize(df_covar[col])

    # These do not change across SNPs, so extract the arrays once.
    pheno_values = df_pheno.values
    covar_values = df_covar.values

    for chrom in range(1, 23):
        # subset df_assoc
        df_assoc_trait_chrom = df_assoc_trait[df_assoc_trait.CHR_ID == chrom]
        if df_assoc_trait_chrom.empty:
            # No hit on this chromosome: skip reading its genotype files.
            continue

        # subset dset to the hit positions and the phenotyped individuals
        dset = admix.dataset.read_dataset(
            join(PFILE_DIR, f"chr{chrom}"),
            indiv_info=SAMPLE_INFO_PATH,
            n_anc=2,
        )
        dset = dset[
            dset.snp.POS.isin(df_assoc_trait_chrom.CHR_POS.values).values,
            df_pheno.index.values,
        ]
        dset.persist()

        apa = dset.allele_per_anc()
        af = dset.af_per_anc()
        # Plain array so that integer indexing below is positional.
        snp_positions = dset.snp.POS.values

        for snp_i in tqdm(range(dset.n_snp)):
            p_het, model_het = admix_genet_cor.test_snp_het(
                apa[snp_i, :, :], pheno_values, covar_values
            )
            p_assoc, model_assoc = admix_genet_cor.test_snp_assoc(
                apa[snp_i, :, :], pheno_values, covar_values
            )
            # Row label follows the "<trait>:<chrom>:<pos>" convention of
            # df_assoc's index. Positions were selected from this trait and
            # chromosome's hits, so the label always refers to an existing row.
            # NOTE(review): variants sharing a position map to the same row;
            # the last one processed wins.
            snp_idx = f"{trait}:{chrom}:{snp_positions[snp_i]}"
            df_assoc_trait.loc[snp_idx, af_cols] = (
                af[snp_i, 0],
                af[snp_i, 1],
            )

            df_assoc_trait.loc[snp_idx, test_cols] = [
                p_assoc,
                p_het,
                model_het.params[1],
                model_het.params[2],
                model_het.bse[1],
                model_het.bse[2],
            ]
    df_assoc_trait.to_csv(out)

In [39]:
# Executor used to submit the per-trait jobs; `./submitit-logs` is passed as
# its working/log folder.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Resource request plus shell lines passed as `setup` (they put the conda
# installation on PATH and set PYTHONNOUSERSITE).
# NOTE(review): `time_min` / `memory_g` are presumably minutes and gigabytes —
# confirm against the executor implementation in use.
executor.update_parameters(
    time_min=20,
    memory_g=16,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# One submission per trait: the i-th job calls
# submit_locus_hetero(trait_list[i], "out/locus_hetero/<trait>.csv").
# NOTE(review): this cell does not create `out/locus_hetero/`.
jobs = executor.map_array(
    submit_locus_hetero,
    trait_list,
    [f"out/locus_hetero/{trait}.csv" for trait in trait_list],
)

# DEPRECATED BELOW
-----------

In [None]:
# Disabled (fully commented-out) cell, nothing here is executed. As written it
# would, per chromosome, look up the GWAS-hit positions in the .pvar file, then
# write a plink2 subset and the matching local-ancestry slice under
# out/gwas_hit_geno/.
# for chrom in range(1, 23):
#     df_loc = (
#         df_assoc[df_assoc["CHR_ID"] == str(chrom)][["CHR_ID", "CHR_POS"]]
#         .drop_duplicates()
#         .sort_values(["CHR_POS"])
#     )
#     pfile = join(PFILE_DIR, f"chr{chrom}")
#     df_snp = dapgen.read_pvar(pfile + ".pvar")
#     snp_mask = df_snp.POS.isin(df_loc.CHR_POS)

#     print(f"chrom={chrom}, {sum(snp_mask)}/{len(df_loc)} SNPs is found")

#     lanc = admix.io.read_lanc(pfile + ".lanc")[snp_mask, :, :].compute()
#     lanc = da.from_array(lanc)

#     admix.tools.plink2_subset(
#         pfile, f"out/gwas_hit_geno/chr{chrom}", df_snp.index.values[snp_mask]
#     )
#     admix.io.write_lanc(f"out/gwas_hit_geno/chr{chrom}.lanc", lanc)

In [7]:
# For each chromosome, write the GWAS-hit positions to a tabix regions file and
# extract those positions (with header) from the imputed VCF.
# NOTE(review): only chromosome 1 is processed as written, while the loading
# cell further down reads chr1..chr22 — widen the range as needed.
assoc_loc_dir = "out/locus_het/assoc_loc"
os.makedirs(assoc_loc_dir, exist_ok=True)

for chrom in tqdm(range(1, 2)):
    # Compare as strings so the filter works whether CHR_ID holds integers
    # (as after the preprocessing cell) or strings.
    chrom_mask = df_assoc["CHR_ID"].astype(str) == str(chrom)
    df_loc = (
        df_assoc.loc[chrom_mask, ["CHR_ID", "CHR_POS"]]
        .drop_duplicates()
        .sort_values(["CHR_POS"])
    )
    # The regions file names contigs as "chr<N>".
    df_loc["CHR_ID"] = "chr" + df_loc["CHR_ID"].astype(str)

    regions_path = f"{assoc_loc_dir}/chr{chrom}.tsv"
    df_loc.rename(columns={"CHR_ID": "CHROM", "CHR_POS": "POS"}).to_csv(
        regions_path, sep="\t", index=False, header=False
    )

    vcf_in = f"/u/project/sgss/PAGE/ImputedGWAS_topmedfrz8/MEGA_all.chr{chrom}.filtered.vcf.gz"
    # Argument list + explicit output file instead of a shell string with `>`.
    with open(f"{assoc_loc_dir}/chr{chrom}.vcf", "wb") as vcf_out:
        subprocess.check_call(["tabix", "-hR", regions_path, vcf_in], stdout=vcf_out)

100%|██████████| 1/1 [02:11<00:00, 131.89s/it]


In [None]:
# Read the extracted per-chromosome VCFs (chr1..chr22) and stack them along the
# "snp" dimension.
dset_list = [
    admix.io.read_vcf(f"out/locus_het/assoc_loc/chr{chrom}.vcf")
    for chrom in tqdm(range(1, 23))
]
dset_gwas_hit = xr.concat(dset_list, dim="snp")

# Replace `geno` with a dask array built from the in-memory values
# (chunks=-1: one chunk per dimension).
dset_gwas_hit["geno"] = (
    dset_gwas_hit.geno.dims,
    da.from_array(dset_gwas_hit.geno.values, chunks=-1),
)
dset_gwas_hit.attrs["n_anc"] = 2

In [187]:
# align `dset_gwas_hit` to `dset_hm3`
# NOTE(review): `dset_hm3` is not defined anywhere in the visible notebook; it
# must be loaded before this cell can run on a fresh kernel.
dset_gwas_hit = dset_gwas_hit.sel(indiv=dset_hm3.indiv.values)

# fill in lanc
# Each GWAS-hit SNP takes the `lanc` values of the `dset_hm3` SNP on the same
# chromosome whose position is closest.
df_dset_hm3 = dset_hm3.snp.to_dataframe().reset_index(drop=True)
snp_index = []
for _, snp in tqdm(dset_gwas_hit.snp.to_dataframe().iterrows()):
    chrom, pos = snp["CHROM"], snp["POS"]
    df_tmp = df_dset_hm3[df_dset_hm3["CHROM"] == chrom]
    # the index was reset above, so `idxmin` yields an integer usable with `isel`
    snp_index.append((df_tmp["POS"] - pos).abs().idxmin())
lanc = dset_hm3.isel(snp=snp_index).lanc.values
dset_gwas_hit = dset_gwas_hit.assign(
    lanc=(("indiv", "snp", "ploidy"), da.from_array(lanc, chunks=-1))
)

# fill in individual information
# (copies every coordinate of `dset_hm3` whose only dimension is "indiv")
for k in dset_hm3.coords:
    if dset_hm3.coords[k].dims == ("indiv",):
        dset_gwas_hit.coords[k] = ("indiv", dset_hm3.coords[k].data)

# for duplicated snps (with same CHROM and POS), retain only the one with larger MAF
# (rows tied at the maximum MAF are all retained)
df_tmp = dset_gwas_hit.snp.to_dataframe()
df_tmp = df_tmp.groupby(["CHROM", "POS"], as_index=False).apply(
    lambda group: group.loc[group.MAF == group.MAF.max()]
)
dset_gwas_hit = dset_gwas_hit.sel(snp=dset_gwas_hit.snp.isin(df_tmp.snp))
# NOTE(review): the return value is discarded; presumably this adds
# `allele_per_anc` to the dataset in place (the next cell reads it) — confirm.
admix.tools.allele_per_anc(dset_gwas_hit)
dset_gwas_hit.to_zarr("out/locus_het/gwas_hit.zarr")

904it [00:03, 301.23it/s]


<xarray.backends.zarr.ZarrStore at 0x2aaffe7f5cf0>

In [188]:
# Per-hit association and heterogeneity tests on `dset_gwas_hit`; results are
# written back into `df_assoc` row by row.
# NOTE(review): `common` (used below for test_het / test_assoc) is never
# imported in the visible notebook, so this cell fails on a fresh kernel.
# NOTE(review): "assoc_pval" is written below but, unlike the other result
# columns, is not initialised here.
df_assoc["EUR_af"] = np.nan
df_assoc["AFR_af"] = np.nan

df_assoc["EUR_beta"] = np.nan
df_assoc["EUR_beta_stderr"] = np.nan
df_assoc["AFR_beta"] = np.nan
df_assoc["AFR_beta_stderr"] = np.nan
df_assoc["HET_pval"] = np.nan

for row_i, row in tqdm(df_assoc.iterrows(), total=df_assoc.shape[0]):
    chrom = int(row.CHR_ID)
    position = row.CHR_POS

    # locate the hit in the genotype dataset by chromosome and position
    dset_snp_pos = np.where(
        (dset_gwas_hit.CHROM == chrom) & (dset_gwas_hit.POS == position)
    )[0]
    if len(dset_snp_pos) == 0:
        # hit absent from the genotype data: report it and leave the row as NaN
        print(f"{row_i} is missing")
        continue
    assert len(dset_snp_pos) == 1
    dset_snp = dset_gwas_hit.isel(snp=dset_snp_pos)
    # keep individuals whose phenotype for this row's trait is not NaN
    dset_snp = dset_snp.sel(indiv=~np.isnan(dset_snp.coords[row.trait_id]))
    apa = dset_snp["allele_per_anc"].values[:, 0, :]
    # NOTE(review): return value discarded; presumably this stores `af_per_anc`
    # on `dset_snp`, which is read further down — confirm.
    admix.tools.af_per_anc(dset_snp)
    # covariates: age, sex, 10 genotype EVs, plus study dummies (first level dropped)
    df_covar = dict()
    for col in ["age", "sex", "study"] + [f"geno_EV{i}" for i in range(1, 11)]:
        df_covar[col] = dset_snp.coords[col].values
    df_covar = pd.DataFrame(df_covar)
    study_dummies = pd.get_dummies(df_covar["study"], drop_first=True)
    study_dummies.columns = [f"study_dummy_{s}" for s in study_dummies.columns]
    df_covar = pd.concat([df_covar, study_dummies], axis=1)
    df_covar = df_covar.drop(columns=["study"])
    covar = df_covar.values
    # column-wise standardisation: subtract the mean, divide by the std
    covar = (covar - covar.mean(axis=0)) / covar.std(axis=0)
    y = dset_snp[row.trait_id].values
    y = quantile_normalize(y)
    p_het, model_het = common.test_het(apa, y, covar)
    p_assoc, model_assoc = common.test_assoc(apa, y, covar)
    df_assoc.loc[row_i, ["EUR_af", "AFR_af"]] = (
        dset_snp["af_per_anc"].values[0, 0],
        dset_snp["af_per_anc"].values[0, 1],
    )
    # entries 1 and 2 of the heterogeneity model's params / bse are stored as
    # the EUR and AFR effect estimates and their standard errors
    df_assoc.loc[
        row_i,
        [
            "assoc_pval",
            "HET_pval",
            "EUR_beta",
            "AFR_beta",
            "EUR_beta_stderr",
            "AFR_beta_stderr",
        ],
    ] = [
        p_assoc,
        p_het,
        model_het.params[1],
        model_het.params[2],
        model_het.bse[1],
        model_het.bse[2],
    ]

In [208]:
# Export the per-hit summary statistics. `index=False` omits the row labels
# from the sheet; the trait_id / CHR_ID / CHR_POS columns they were built from
# are still written.
df_assoc.to_excel("out/locus_het/sumstats.xlsx", index=False)