# Trait-specific file

- One file per trait, including all covariate information and the PRS mean and standard errors.
- The standard errors can come from different sources, e.g., (1) sampling, (2) PC1, (3) genetic distance (L2 distance over all PCs).

In [1]:
%load_ext lab_black

import numpy as np
import pandas as pd
import os
import glob
from typing import List
from tqdm import tqdm
import submitit
import matplotlib.pyplot as plt

In [2]:
# Covariates to adjust for: demographics, genetic distance and PC1..PC10.
COVAR_COLS = [
    "AGE",
    "SEX",
    "DEPRIVATION_INDEX",
    "GENET_DIST",
    *(f"PC{i}" for i in range(1, 11)),
]

# Covariates to test.
TEST_COLS = [
    "SEX",
    "glasses",
    "AGE",
    "years_of_edu",
    "income",
    "DEPRIVATION_INDEX",
    "PC1",
    "PC2",
    "drink_alcohol",
    "ever_smoked",
    "log_BMI",
]

# Directory holding covar.tsv and pred/ (relative to the working directory).
DATA_DIR = "out"
# Directory holding one {name}.tsv phenotype table per trait/covariate.
PHENO_DIR = "/u/project/sgss/UKBB/PRS-RESEARCH/03-compile-pheno/out"

In [3]:
def load_trait_info(trait: str, covar_cols: List[str]) -> pd.DataFrame:
    """
    Build one per-individual table for a trait.

    Loads and inner-joins on the row index:
    (1) trait values PHENO_DIR/{trait}.tsv
    (2) polygenic score summary DATA_DIR/pred/{trait}.score_summary.tsv.gz
    (3) covariates DATA_DIR/covar.tsv (all of its columns are kept)
    (4) extra covariates: every name in `covar_cols` that is not already a
        column of covar.tsv is taken from the "PHENO" column of
        PHENO_DIR/{name}.tsv, when that file exists

    Parameters
    ----------
    trait: str
        trait name
    covar_cols: List[str]
        covariate names to make available; names missing from covar.tsv are
        looked up as phenotype files, and a warning is issued for names found
        in neither place

    Returns
    -------
    pd.DataFrame
        score, trait and covariate columns for the individuals present in all
        three sources
    """
    # Imported here because the notebook's import cell does not provide it;
    # without this, the missing-file branch below raises NameError.
    import warnings

    ## 1. load trait and score
    df_trait = pd.read_csv(
        os.path.join(PHENO_DIR, f"{trait}.tsv"), sep="\t", index_col=0
    ).drop(columns=["IID"])

    df_score = pd.read_csv(
        os.path.join(DATA_DIR, f"pred/{trait}.score_summary.tsv.gz"),
        sep="\t",
        index_col=0,
    )
    # keep only the part of each score row label before the first "_" and
    # convert it to int so it can be matched against the other tables
    # (presumably labels look like "<id>_<id>" -- TODO confirm)
    df_score.index = [int(i.split("_")[0]) for i in df_score.index]

    ## 2. load covariates
    df_covar = pd.read_csv(os.path.join(DATA_DIR, "covar.tsv"), sep="\t", index_col=0)

    # add some phenotype to the covariates
    for col in covar_cols:
        if col in df_covar.columns:
            continue

        tmp_path = os.path.join(PHENO_DIR, f"{col}.tsv")
        if not os.path.exists(tmp_path):
            # best effort: the column is simply absent from the result
            warnings.warn(f"{tmp_path} does not exist")
            continue

        df_tmp = pd.read_csv(tmp_path, sep="\t", index_col=0).drop(columns=["IID"])
        # align to the covariate rows; individuals without a value get NaN
        df_covar[col] = df_tmp["PHENO"].reindex(df_covar.index)

    # merge all files together (pd.merge defaults to an inner join)
    df_trait = pd.merge(df_score, df_trait, left_index=True, right_index=True)
    df_trait = pd.merge(df_trait, df_covar, left_index=True, right_index=True)

    return df_trait

In [13]:
# Collect the unique trait names that have a score summary in DATA_DIR/pred.
score_paths = glob.glob(os.path.join(DATA_DIR, "pred/*.score_summary.tsv.gz"))
unique_traits = set()
for score_path in score_paths:
    file_name = score_path.split("/")[-1]
    # drop the last three dot-separated parts (".score_summary.tsv.gz")
    unique_traits.add(file_name.rsplit(".", 3)[0])
trait_list = list(unique_traits)
print(f"{len(trait_list)} traits in total.")

247 traits in total.


In [15]:
# Hand-picked subset of traits. In the visible cells it is only referenced by
# the commented-out alternative loop header in the export cell.
MAIN_TRAIT_LIST = [
    "height",
    "log_monocyte",
    "log_leukocyte",
    "log_CRP",
    "log_triglycerides",
    "LDL",
    "log_SHBG",
    "systolic_BP",
    "diastolic_BP",
    "FEV1",
    "log_heel_BMD",
    "years_of_edu",
    "neuroticism",
]

In [16]:
# Union of tested and adjusted covariates, built once outside the loop.
# dict.fromkeys drops duplicates while keeping declaration order, so the
# column order of the written files is the same on every run (a set's
# iteration order for strings can change between interpreter sessions).
all_cols = list(dict.fromkeys(TEST_COLS + COVAR_COLS))

# Write next to the other pipeline outputs and make sure the folder exists.
out_dir = os.path.join(DATA_DIR, "per-trait-info")
os.makedirs(out_dir, exist_ok=True)

# Replace `trait_list` with `MAIN_TRAIT_LIST` to export only the main traits.
for trait in tqdm(trait_list):
    df_trait = load_trait_info(trait, all_cols)
    df_trait.index.name = "indiv"
    # one gzip-compressed TSV per trait; missing values are written as "NA"
    df_trait.to_csv(os.path.join(out_dir, f"{trait}.tsv.gz"), sep="\t", na_rep="NA")

100%|██████████| 247/247 [42:24<00:00, 10.30s/it]


# Example of trait DataFrame

In [12]:
# Display the current per-trait table (whatever `df_trait` was last assigned).
df_trait

Unnamed: 0_level_0,MEAN,SD,QUANTILE_5,QUANTILE_50,QUANTILE_95,PHENO,AGE,SEX,DEPRIVATION_INDEX,PC1,...,PC15,PC16,group,GENET_DIST,ever_smoked,drink_alcohol,income,glasses,log_BMI,years_of_edu
indiv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4633435,-0.225385,0.873744,-1.645130,-0.191055,1.102090,,51,0,-2.762980,-13.6054,...,-1.219610,0.277595,,,1.0,1.0,1.41421,1.0,2.98457,20.0
4922412,0.673161,0.991759,-0.871505,0.650041,2.251180,6.0,41,0,-2.205510,37.6587,...,-1.821650,1.598630,,,0.0,1.0,2.23607,0.0,3.25274,10.0
3552392,-0.852878,0.880013,-2.286050,-0.874720,0.569765,2.0,58,0,1.088330,-14.1111,...,-0.321982,4.764330,,,0.0,1.0,1.41421,1.0,3.43025,
3627963,-0.140656,1.246800,-2.237720,-0.114406,1.893670,8.0,41,0,-2.233650,155.0930,...,0.378568,-0.826590,China,0.485502,0.0,1.0,,1.0,2.94416,10.0
1381646,-0.166826,0.887041,-1.601780,-0.167901,1.268050,4.0,59,1,-3.525300,-12.6543,...,1.866450,-0.465458,United Kingdom,0.022395,1.0,1.0,1.73205,1.0,3.18577,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5375742,0.439923,0.974878,-1.104630,0.410228,2.043050,,45,1,-4.631790,73.8697,...,-1.397690,-0.568204,,,0.0,0.0,,1.0,3.35516,10.0
5188752,-0.992854,0.959634,-2.469140,-1.048510,0.556549,2.0,68,0,-3.067060,20.2387,...,-4.412340,-0.064192,Ashkenazi,0.165428,0.0,0.0,1.00000,1.0,3.49722,
3269187,0.252616,0.994390,-1.275170,0.269551,1.916540,6.0,56,0,-0.423068,76.8315,...,-1.163240,0.967158,,,0.0,1.0,2.23607,1.0,3.25591,20.0
5461955,0.604064,0.886059,-0.787771,0.573940,2.164640,6.0,58,0,-0.335369,-13.0097,...,-0.835211,-2.236190,United Kingdom,0.032901,0.0,1.0,2.00000,1.0,3.09423,10.0
