# Loci analysis
1. Take the top SNPs for each significant locus.
2. Test the heterogeneity score.
3. Aggregate over multiple traits and multiple regions and show a QQ-plot.

`out/page-gwas.tsv` was downloaded from the GWAS Catalog page for this publication, https://www.ebi.ac.uk/gwas/publications/31217584 (via "Download catalog data").

**The links below are not used in this notebook; they are kept here for reference.**

Also see 
- https://github.com/gokceneraslan/opentargets-genetics-python
- https://community.opentargets.org/t/how-to-access-finngen-gwas-data-using-the-open-targets-genetics-portal-api/254/4
- https://api.genetics.opentargets.org/graphql/schema

!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-019-1310-4/MediaObjects/41586_2019_1310_MOESM3_ESM.xlsx -O page-supp-tables.xlsx


In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import pandas as pd
import admix
import dask
import dask.array as da
from tqdm import tqdm
import statsmodels.api as sm
import matplotlib.pyplot as plt
from admix.data import quantile_normalize
import submitit

import admix_genet_cor

In [2]:
# Location of the supplementary-tables workbook; the "trait-info" sheet holds one
# row per trait, and its "trait" column supplies the trait identifiers.
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp_tables.xlsx?dl=1"
trait_info = pd.read_excel(io=SUPP_TABLE_URL, sheet_name="trait-info")
trait_list = trait_info.loc[:, "trait"].values

In [8]:
# Format the GWAS catalog export into one row per (trait, CHROM, POS) and write it
# to `out/page-gwas-formatted.tsv`, indexed by "<trait>:<CHROM>:<POS>".

# CHR_ID is pinned to `str` so the autosome filter below (which compares against
# the string labels "1".."22") does not depend on the parser's type inference.
df_assoc = pd.read_csv("./out/page-gwas.tsv", sep="\t", dtype={"CHR_ID": str})
df_assoc = (
    df_assoc[
        [
            "DISEASE/TRAIT",
            "INITIAL SAMPLE SIZE",
            "REGION",
            "SNPS",
            "CHR_ID",
            "CHR_POS",
            "P-VALUE",
            "STUDY ACCESSION",
        ]
    ]
    # associations without a mapped position cannot be placed on the genome
    .dropna(subset=["CHR_POS"])
    .rename(columns={"CHR_ID": "CHROM", "CHR_POS": "POS"})
)

# keep autosomes only (chromosome labels "1" through "22")
df_assoc = df_assoc.loc[df_assoc["CHROM"].isin(np.arange(1, 23).astype(str))]
# NOTE: the 24 + 2 additional traits are waist-hip ratio for males and females

# convert trait_id: GWAS catalog trait name -> trait id used in `trait_info`
gwas_catalog_name2id = dict(zip(trait_info["GWAS catalog name"], trait_info["trait"]))
# Fail loudly, listing every catalog trait that has no entry in `trait_info`,
# instead of stopping at the first missing key.
is_mapped = df_assoc["DISEASE/TRAIT"].isin(list(gwas_catalog_name2id))
if not is_mapped.all():
    unmapped_traits = df_assoc.loc[~is_mapped, "DISEASE/TRAIT"].unique().tolist()
    raise KeyError(f"GWAS catalog traits missing from trait_info: {unmapped_traits}")
df_assoc.insert(0, "trait", df_assoc["DISEASE/TRAIT"].map(gwas_catalog_name2id))

df_assoc = (
    # cast BEFORE sorting so rows are ordered numerically (chr 2 before chr 10)
    # rather than lexicographically on the string labels
    df_assoc.astype({"CHROM": int, "POS": int})
    .sort_values(["trait", "CHROM", "POS"])
    # the first record of each (trait, CHROM, POS) group is the one kept
    .drop_duplicates(["trait", "CHROM", "POS"])
    .reset_index(drop=True)
)
# row label: "<trait>:<CHROM>:<POS>"
df_assoc.index = (
    df_assoc["trait"]
    + ":"
    + df_assoc["CHROM"].astype(str)
    + ":"
    + df_assoc["POS"].astype(str)
)
# `df_assoc` itself keeps all columns; only the written file omits these three
df_assoc.drop(columns=["REGION", "INITIAL SAMPLE SIZE", "STUDY ACCESSION"]).to_csv(
    "out/page-gwas-formatted.tsv", sep="\t"
)