# Trait-specific file

One file per trait. Each file includes all covariate information (it is okay to have redundant information across traits) together with the PRS mean and standard errors. The standard errors can come from different sources, e.g., (1) sampling, (2) PC1, (3) genetic distance (L2 distance over all PCs).

In [2]:
%load_ext lab_black

import glob
import os
import warnings
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import submitit
from tqdm import tqdm

In [3]:
# Covariates to adjust for: age, sex, deprivation index and PC1..PC10.
COVAR_COLS = ["AGE", "SEX", "DEPRIVATION_INDEX", *(f"PC{k}" for k in range(1, 11))]

# Covariates to test.
TEST_COLS = [
    "SEX",
    "glasses",
    "AGE",
    "years_of_edu",
    "income",
    "DEPRIVATION_INDEX",
    "PC1",
    "PC2",
    "ever_smoked",
]

# Directory holding this analysis' own files (pred/, covar.tsv).
DATA_DIR = "out"
# Directory holding one {name}.tsv phenotype file per trait / covariate.
PHENO_DIR = "/u/project/sgss/UKBB/PRS-RESEARCH/03-compile-pheno/out"

In [4]:
def load_trait_info(trait: str, covar_cols: List[str]) -> pd.DataFrame:
    """
    Assemble one table combining score, trait value and covariates for a trait.

    Reads and joins on the row index:
    (1) trait values PHENO_DIR/{trait}.tsv
    (2) polygenic score DATA_DIR/pred/{trait}.score_summary.tsv.gz
    (3) covariates DATA_DIR/covar.tsv (all of its columns are kept)
    (4) every column in `covar_cols` that covar.tsv lacks, taken from the
        "PHENO" column of PHENO_DIR/{col}.tsv when that file exists

    Parameters
    ----------
    trait: str
        trait name
    covar_cols: List[str]
        covariate columns wanted in the result (covariates to adjust for,
        e.g., age, sex, top 10 PCs, and covariates to test). Columns already
        in covar.tsv are used as is; the others are looked up in PHENO_DIR.

    Returns
    -------
    pd.DataFrame
        Score columns, trait columns and covariate columns, restricted to the
        index values present in all three tables (pd.merge default join).

    Warns
    -----
    UserWarning
        If a requested column is neither in covar.tsv nor available as
        PHENO_DIR/{col}.tsv; that column is then left out of the result.
    """

    ## 1. load trait and score
    df_trait = pd.read_csv(
        os.path.join(PHENO_DIR, f"{trait}.tsv"), sep="\t", index_col=0
    ).drop(columns=["IID"])

    df_score = pd.read_csv(
        os.path.join(DATA_DIR, f"pred/{trait}.score_summary.tsv.gz"),
        sep="\t",
        index_col=0,
    )
    # Keep only the part before the first "_" as an integer id so the score
    # index can be matched against the other tables.
    # NOTE(review): presumably the raw ids look like "{FID}_{IID}" -- confirm.
    df_score.index = [int(i.split("_")[0]) for i in df_score.index]

    ## 2. load covariates
    df_covar = pd.read_csv(os.path.join(DATA_DIR, "covar.tsv"), sep="\t", index_col=0)

    # add some phenotype to the covariates
    for col in covar_cols:
        if col in df_covar.columns:
            continue

        tmp_path = os.path.join(PHENO_DIR, f"{col}.tsv")
        if not os.path.exists(tmp_path):
            # Best effort: report the missing file and carry on without it.
            warnings.warn(f"{tmp_path} does not exist")
            continue

        df_tmp = pd.read_csv(tmp_path, sep="\t", index_col=0).drop(columns=["IID"])
        # Align to the covariate index; individuals without a value get NaN.
        df_covar[col] = df_tmp["PHENO"].reindex(df_covar.index)

    # merge all files together
    df_trait = pd.merge(df_score, df_trait, left_index=True, right_index=True)
    df_trait = pd.merge(df_trait, df_covar, left_index=True, right_index=True)

    return df_trait

In [5]:
# A trait name is the part of a file name before its first "."; the set
# collapses several files belonging to the same trait. Sorting makes the
# order reproducible: plain set iteration order for strings can change from
# one interpreter run to the next.
trait_list = sorted(
    {
        path.split("/")[-1].split(".")[0]
        for path in glob.glob(os.path.join(DATA_DIR, "pred/*"))
    }
)
print(f"{len(trait_list)} traits in total.")

111 traits in total.


In [12]:
# Traits for which a per-trait info file is written; each name is passed to
# load_trait_info() and is also used as the output file stem.
MAIN_TRAIT_LIST = [
    "log_monocyte",
    "log_leukocyte",
    "log_CRP",
    "log_triglycerides",
    "LDL",
    "log_SHBG",
    "systolic_BP",
    "diastolic_BP",
    "FEV1",
    "log_heel_BMD",
    "height",
    "years_of_edu",
    "neuroticism",
]

In [13]:
# Columns to request for every trait: tested plus adjusted covariates,
# de-duplicated once and in a fixed order. A set union would give an order
# that can change between interpreter runs, and with it the order of the
# columns appended to the output files.
info_cols = list(dict.fromkeys(TEST_COLS + COVAR_COLS))

# DataFrame.to_csv does not create missing parent directories.
out_dir = "out/per-trait-info"
os.makedirs(out_dir, exist_ok=True)

for trait in tqdm(MAIN_TRAIT_LIST):
    df_trait = load_trait_info(trait, info_cols)
    df_trait.index.name = "indiv"
    # Compression is inferred from the ".gz" suffix; missing values are
    # written as "NA".
    df_trait.to_csv(os.path.join(out_dir, f"{trait}.tsv.gz"), sep="\t", na_rep="NA")

100%|██████████| 13/13 [02:37<00:00, 12.14s/it]
