# Loci analysis
1. Take the top SNPs for each significant locus.
2. Test the heterogeneity score.
3. Aggregate over multiple traits and multiple regions and show QQ-plot

`page-gwas.tsv` was downloaded from the GWAS Catalog: https://www.ebi.ac.uk/gwas/publications/31217584 (Download catalog data)

**The links below are not used in this notebook; they are kept for reference.**

See also:
- https://github.com/gokceneraslan/opentargets-genetics-python
- https://community.opentargets.org/t/how-to-access-finngen-gwas-data-using-the-open-targets-genetics-portal-api/254/4
- https://api.genetics.opentargets.org/graphql/schema

!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-019-1310-4/MediaObjects/41586_2019_1310_MOESM3_ESM.xlsx -O page-supp-tables.xlsx


In [2]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import pandas as pd
import admix
import dask
import dask.array as da
from tqdm import tqdm
import statsmodels.api as sm
import matplotlib.pyplot as plt
from admix.data import quantile_normalize
import submitit

import admix_genet_cor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


# Process GWAS hits from PAGE study

In [3]:
# Trait metadata lives in the "trait-info" sheet of the supplementary workbook;
# its "trait" column gives the list of trait identifiers analysed below.
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"
trait_info = pd.read_excel(io=SUPP_TABLE_URL, sheet_name="trait-info")
trait_list = trait_info.loc[:, "trait"].values

In [4]:
# Read the GWAS catalog export and keep only the fields used downstream.
df_assoc = pd.read_csv("./page-gwas.tsv", sep="\t").loc[
    :,
    [
        "DISEASE/TRAIT",
        "INITIAL SAMPLE SIZE",
        "REGION",
        "SNPS",
        "CHR_ID",
        "CHR_POS",
        "P-VALUE",
        "STUDY ACCESSION",
    ],
]

# Drop associations without a position, then store positions as integers.
df_assoc = df_assoc.dropna(subset=["CHR_POS"]).astype({"CHR_POS": int})

# Keep autosomes only; CHR_ID is compared against the strings "1".."22" here.
# NOTE: the 24 + 2 additional traits are waist-hip ratio for males and females
df_assoc = df_assoc.loc[df_assoc["CHR_ID"].isin(list(map(str, range(1, 23))))]

# Translate GWAS catalog trait names into the trait ids used in `trait_info`.
# A catalog name missing from the mapping raises KeyError.
gwas_catalog_name2id = dict(zip(trait_info["GWAS catalog name"], trait_info["trait"]))
df_assoc.insert(
    0, "trait_id", df_assoc["DISEASE/TRAIT"].map(gwas_catalog_name2id.__getitem__)
)

# Integer chromosomes, sorted, one row per (trait, chromosome, position).
df_assoc = (
    df_assoc.astype({"CHR_ID": int})
    .sort_values(["trait_id", "CHR_ID", "CHR_POS"])
    .drop_duplicates(["trait_id", "CHR_ID", "CHR_POS"])
    .reset_index(drop=True)
)

# Row labels have the form "<trait_id>:<chromosome>:<position>".
df_assoc.index = (
    df_assoc["trait_id"]
    + ":"
    + df_assoc["CHR_ID"].astype(str)
    + ":"
    + df_assoc["CHR_POS"].astype(str)
)

In [5]:
import dapgen
from os.path import join

# Directory holding the per-chromosome datasets; read below as
# join(PFILE_DIR, f"chr{chrom}") via admix.dataset.read_dataset.
PFILE_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01_dataset/out/aframr/imputed"
# Per-individual table passed as `indiv_info` to admix.dataset.read_dataset;
# the phenotype and covariate columns are taken from it.
SAMPLE_INFO_PATH = "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01_dataset/out/aframr/sample_info.txt"

In [6]:
def submit_locus_hetero(trait, out, duffy_covar=False):
    """Test every GWAS-catalog hit of one trait for association and heterogeneity.

    Rows of the module-level ``df_assoc`` with ``trait_id == trait`` are
    selected. For each autosome, the dataset is restricted to SNPs whose
    position matches one of those hits and to individuals with a non-missing
    phenotype; every retained SNP is passed to
    ``admix_genet_cor.test_snp_het`` and ``admix_genet_cor.test_snp_assoc``.
    The per-ancestry allele frequencies, both p-values and the two
    ancestry-specific effect estimates (with standard errors) are stored on the
    matching ``"<trait>:<chrom>:<pos>"`` row, and the table is written to
    ``out`` as CSV. Ancestry index 0 is written to the ``EUR_*`` columns and
    index 1 to the ``AFR_*`` columns.

    Parameters
    ----------
    trait : str
        Trait identifier; must be a column of the dataset's individual table
        and a value of ``df_assoc.trait_id``.
    out : str
        Path of the CSV file to write. A missing parent directory is created.
    duffy_covar : bool, default False
        If True, the local ancestry at the SNP closest to the Duffy position
        (GRCh38 1:159204893) is added as the covariate ``duffy_lanc``.

    Returns
    -------
    None
        Results are only written to ``out``.
    """
    import os

    # position of the Duffy SNP on chromosome 1 (GRCH38: 1:159204893)
    duffy_pos = 159204893

    df_assoc_trait = df_assoc[df_assoc.trait_id == trait].copy()

    # format phenotype and covariates
    # (chr1 is read here only to obtain the individual table)
    dset = admix.dataset.read_dataset(
        join(PFILE_DIR, "chr1"),
        indiv_info=SAMPLE_INFO_PATH,
        n_anc=2,
    )
    # keep individuals with a non-missing phenotype
    dset_assoc = dset[:, ~np.isnan(dset.indiv[trait]).values]
    covar_cols = ["age", "sex", "study"] + [f"geno_EV{i}" for i in range(1, 11)]

    df_pheno = dset_assoc.indiv[[trait]].copy()
    df_covar = dset_assoc.indiv[covar_cols].copy()
    # create study dummies variables
    study_dummies = pd.get_dummies(df_covar["study"], drop_first=True)
    study_dummies.columns = [f"study_dummy_{s}" for s in study_dummies.columns]
    df_covar = pd.concat([df_covar, study_dummies], axis=1)
    df_covar = df_covar.drop(columns=["study"])

    # special case for duffy SNPs, include the duffy SNPs in the covariate
    if duffy_covar:
        # find closest SNPs to Duffy SNP
        duffy_snp_loc = np.argmin(np.abs(dset_assoc.snp.POS - duffy_pos))
        assert dset_assoc.snp.CHROM.iloc[duffy_snp_loc] == 1
        duffy_lanc = dset_assoc[duffy_snp_loc].lanc.sum(axis=[0, 2]).compute()
        df_covar["duffy_lanc"] = duffy_lanc

    for col in df_pheno.columns:
        df_pheno[col] = admix.data.quantile_normalize(df_pheno[col])

    # NOTE(review): every covariate, including the study dummies, is
    # quantile-normalized here; confirm this is intended for binary columns.
    for col in df_covar.columns:
        df_covar[col] = admix.data.quantile_normalize(df_covar[col])

    for chrom in range(1, 23):
        # subset df_assoc
        df_assoc_trait_chrom = df_assoc_trait[df_assoc_trait.CHR_ID == chrom]
        if df_assoc_trait_chrom.empty:
            # nothing to test on this chromosome; avoid loading its genotypes
            continue

        # subset dset to the hit positions and the phenotyped individuals
        dset = admix.dataset.read_dataset(
            join(PFILE_DIR, f"chr{chrom}"),
            indiv_info=SAMPLE_INFO_PATH,
            n_anc=2,
        )
        dset = dset[
            dset.snp.POS.isin(df_assoc_trait_chrom.CHR_POS.values).values,
            df_pheno.index.values,
        ]
        dset.persist()

        apa = dset.allele_per_anc()
        af = dset.af_per_anc()

        for snp_i in tqdm(range(dset.n_snp)):
            p_het, model_het = admix_genet_cor.test_snp_het(
                apa[snp_i, :, :], df_pheno.values, df_covar.values
            )
            p_assoc, model_assoc = admix_genet_cor.test_snp_assoc(
                apa[snp_i, :, :], df_pheno.values, df_covar.values
            )
            # snp_i is a position, not a label: use .iloc rather than relying
            # on the deprecated positional fallback of Series[int]
            snp_chrom = dset.snp.CHROM.iloc[snp_i]
            snp_pos = dset.snp.POS.iloc[snp_i]
            snp_idx = f"{trait}:{snp_chrom}:{snp_pos}"
            df_assoc_trait.loc[snp_idx, ["EUR_af", "AFR_af"]] = (
                af[snp_i, 0],
                af[snp_i, 1],
            )

            df_assoc_trait.loc[
                snp_idx,
                [
                    "assoc_pval",
                    "HET_pval",
                    "EUR_beta",
                    "AFR_beta",
                    "EUR_beta_stderr",
                    "AFR_beta_stderr",
                ],
            ] = [
                p_assoc,
                p_het,
                model_het.params[1],
                model_het.params[2],
                model_het.bse[1],
                model_het.bse[2],
            ]

    # make sure the destination directory exists before writing
    if isinstance(out, (str, os.PathLike)):
        out_dir = os.path.dirname(out)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    df_assoc_trait.to_csv(out)

In [8]:
# Run white blood cell count locally, with local ancestry at the Duffy locus
# added as a covariate.
submit_locus_hetero(
    trait="total_wbc_cnt",
    out="out/locus_hetero/total_wbc_cnt.duffy_covar.csv",
    duffy_covar=True,
)

100%|██████████| 20/20 [00:02<00:00,  8.61it/s]
100%|██████████| 3/3 [00:00<00:00, 14.03it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 12.18it/s]
0it [00:00, ?it/s]
100%|██████████| 5/5 [00:00<00:00, 13.27it/s]
100%|██████████| 2/2 [00:00<00:00, 10.97it/s]
100%|██████████| 1/1 [00:00<00:00, 12.78it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 12.40it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 9/9 [00:00<00:00, 12.34it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [39]:
# Submit one array task per trait; each task writes out/locus_hetero/<trait>.csv.
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(
    time_min=20,
    memory_g=16,
    # shell lines run before each task starts
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# The two argument sequences are matched element-wise: (trait, output path).
jobs = executor.map_array(
    submit_locus_hetero,
    trait_list,
    [f"out/locus_hetero/{trait}.csv" for trait in trait_list],
)