In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import dapgen
import admix_genet_cor
import admix
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import glob

In [2]:
# ---- Input locations ----
# Root of the PAGE admixed (aframr) dataset on the cluster file system.
ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr"
)
# Directory with the imputed genotype files, one level below ROOT_DIR.
PFILE_DIR = os.path.join(ROOT_DIR, "imputed")

# Supplementary tables workbook (downloaded over the network when this cell runs).
SUPP_TABLE_URL = "https://www.dropbox.com/s/jck2mhjby2ur55j/supp-tables.xlsx?dl=1"

# ---- Trait metadata ----
# The trait list is taken from the "page-trait-info" sheet. An earlier version
# derived it from the phenotype files on disk instead:
# trait_list = [
#     f.split("/")[-1].split(".")[0]
#     for f in glob.glob(os.path.join(ROOT_DIR, "pheno", "*.tsv"))
# ]
trait_info = pd.read_excel(SUPP_TABLE_URL, sheet_name="page-trait-info")
trait_list = trait_info["trait"].values

# Map each trait identifier to the name used for display.
dict_trait_display_name = dict(zip(trait_info["trait"], trait_info["display-name"]))

In [3]:
# Local ancestry at the Duffy SNP, later used as an extra covariate.
# Base-pair position used to locate the Duffy SNP on chromosome 1.
DUFFY_SNP_POS = 159204893

dset = admix.io.read_dataset(os.path.join(ROOT_DIR, "imputed", "chr1"), n_anc=2)

# The exact position may not be present in the dataset, so take the SNP whose
# position is closest to it.
distance_to_duffy = np.abs(dset.snp.POS - DUFFY_SNP_POS)
duffy_snp_loc = np.argmin(distance_to_duffy)
assert dset.snp.CHROM.iloc[duffy_snp_loc] == 1

# Sum the local-ancestry array over axes 0 and 2, leaving one value per individual.
# NOTE(review): presumably axes are (snp, indiv, haplotype) -- confirm against admix.
duffy_lanc_per_indiv = dset[duffy_snp_loc].lanc.sum(axis=[0, 2]).compute()
duffy_lanc = pd.Series(duffy_lanc_per_indiv, index=dset.indiv.index)

2022-03-24 09:33.47 [info     ] admix.Dataset: read local ancestry from /u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr/imputed/chr1.lanc


In [4]:
# NOTE: this rebinds PFILE_DIR (set to the "imputed" directory in the config
# cell) to the HM3 directory; every cell run after this one sees the new value.
PFILE_DIR = "/u/home/k/kangchen/PAGE-QC/01-dataset/out/aframr/hm3/"

# Read the individuals listed in the KING cutoff "in" file.
king_in_path = os.path.join(PFILE_DIR, "merged/merged.king.cutoff.in.id")
with open(king_in_path) as f:
    next(f, None)  # the first line is skipped (treated as a header)
    unrelated_indiv = [line.strip() for line in f]

In [6]:
# Export one phenotype/covariate table per trait and covariate set, restricted
# to unrelated individuals with a non-missing phenotype.

# Output directory for the exported tables. It is created up front so that a
# fresh checkout (Restart & Run All) does not fail on the first `to_csv`.
PHENO_OUT_DIR = "out/pheno"
os.makedirs(PHENO_OUT_DIR, exist_ok=True)

# Columns removed from the phenotype table for each covariate set.
COVAR_DROP_COLUMNS = {
    "page1pc": [f"geno_EV{i}" for i in range(2, 11)]
    + [f"PC{i}" for i in range(1, 11)],
    "page10pc": [f"PC{i}" for i in range(1, 11)],
    "sample10pc": [f"geno_EV{i}" for i in range(1, 11)],
}

# Traits that receive the Duffy-locus local ancestry as an additional covariate.
DUFFY_COVAR_TRAITS = ["total_wbc_cnt", "crp"]

for trait in trait_list:

    df_trait = (
        pd.read_csv(os.path.join(ROOT_DIR, f"pheno/{trait}.tsv"), sep="\t", index_col=0)
        .dropna(subset=[trait])
        .rename(columns={trait: "pheno"})
    )
    # Cast IDs to str so they can be matched against `unrelated_indiv`,
    # which was read from a text file.
    df_trait.index = df_trait.index.astype(str)
    n_indiv1 = len(df_trait)
    df_trait = df_trait[df_trait.index.isin(unrelated_indiv)]
    n_indiv2 = len(df_trait)

    # Guard against traits with no non-missing phenotype (avoids ZeroDivisionError).
    pct_retained = n_indiv2 / n_indiv1 * 100 if n_indiv1 > 0 else float("nan")
    print(
        f"{trait}: {n_indiv2}/{n_indiv1} ({pct_retained: .3g}%) unrelated individuals are retained"
    )

    # Include Duffy local ancestry as a covariate for total_wbc_cnt and crp.
    # Individuals absent from `duffy_lanc` get NaN from `reindex`.
    if trait in DUFFY_COVAR_TRAITS:
        df_trait["duffy_lanc"] = duffy_lanc.reindex(df_trait.index)

    # Other available covariate sets: "page1pc", "page10pc".
    for covar in ["sample10pc"]:
        if covar not in COVAR_DROP_COLUMNS:
            raise NotImplementedError(f"unknown covariate set: {covar}")
        # `drop` returns a new frame, so `df_trait` is left untouched for the
        # next covariate set.
        df_trait_covar = df_trait.drop(columns=COVAR_DROP_COLUMNS[covar])
        df_trait_covar.to_csv(
            f"{PHENO_OUT_DIR}/{trait}-{covar}.tsv", sep="\t", na_rep="NA"
        )

crp: 8321/8521 ( 97.7%) unrelated individuals are retained
total_wbc_cnt: 8615/8889 ( 96.9%) unrelated individuals are retained
mean_corp_hgb_conc: 3650/3816 ( 95.6%) unrelated individuals are retained
platelet_cnt: 8597/8871 ( 96.9%) unrelated individuals are retained
hdl: 9929/10248 ( 96.9%) unrelated individuals are retained
ldl: 9574/9875 ( 97%) unrelated individuals are retained
triglycerides: 9896/10217 ( 96.9%) unrelated individuals are retained
total_cholesterol: 9981/10300 ( 96.9%) unrelated individuals are retained
cigs_per_day_excl_nonsmk_updated: 6995/7171 ( 97.5%) unrelated individuals are retained
coffee_cup_day: 11587/11862 ( 97.7%) unrelated individuals are retained
a1c: 1740/1854 ( 93.9%) unrelated individuals are retained
insulin: 7753/7927 ( 97.8%) unrelated individuals are retained
glucose: 9646/9937 ( 97.1%) unrelated individuals are retained
t2d_status: 14516/15014 ( 96.7%) unrelated individuals are retained
qt_interval: 4089/4196 ( 97.4%) unrelated individuals ar