# Trait-specific file

- One trait per file; each file includes all covariate information plus the PRS mean and standard errors.
- The standard errors can come from different sources, e.g., (1) sampling, (2) PC1, (3) genetic distance (L2 distance over all PCs).

In [1]:
%load_ext lab_black

import glob
import os
import warnings
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import submitit
from pandas.api.types import is_numeric_dtype
from tqdm import tqdm

In [2]:
# Root directory for this notebook's inputs and outputs.
DATA_DIR = "out"
# Directory holding one phenotype file per trait.
PHENO_DIR = "/u/project/sgss/UKBB/PRS-RESEARCH/03-compile-pheno/out"

# Covariate columns attached to every per-trait table.
COVAR_COLS = [
    "AGE",
    "SEX",
    "DEPRIVATION_INDEX",
    "group",
    # PC1 ... PC10
    *(f"PC{idx}" for idx in range(1, 11)),
    "glasses",
    "years_of_edu",
    "income",
    "drink_alcohol",
    "ever_smoked",
    "log_BMI",
]

In [3]:
def load_trait_info(trait: str, covar_cols: List[str]) -> pd.DataFrame:
    """
    Build one table holding the trait values, score summary and covariates.

    Load
    (1) trait values PHENO_DIR/{trait}.tsv
    (2) polygenic score DATA_DIR/pred/{trait}.score_summary.tsv.gz
    (3) covariates DATA_DIR/covar.tsv
    (4) covariates that are stored as phenotypes, PHENO_DIR/{covariate}.tsv
        (used for every requested covariate absent from covar.tsv)

    Parameters
    ----------
    trait: str
        trait name
    covar_cols: List[str]
        list of covariates to load, from the covar file or, failing that,
        from the phenotype-specific files

    Returns
    -------
    pd.DataFrame
        Individuals present in the score, trait and covariate tables
        (inner joins on the index), restricted to rows with a non-missing
        ``PHENO`` and at most 20% missing values among the loaded covariates.
        Missing covariate values are left as NaN (no imputation).

    Warns
    -----
    UserWarning
        If a requested covariate is found neither in covar.tsv nor as a
        phenotype file; that covariate is then omitted from the result.
    """

    ## 1. load trait and score
    df_pheno = pd.read_csv(
        os.path.join(PHENO_DIR, f"{trait}.tsv"), sep="\t", index_col=0
    ).drop(columns=["IID"])

    df_score = pd.read_csv(
        os.path.join(DATA_DIR, f"pred/{trait}.score_summary.tsv.gz"),
        sep="\t",
        index_col=0,
    )
    # keep the part of each score label before the first "_" as an integer id
    # (presumably labels look like "<FID>_<IID>" -- TODO confirm); str() keeps
    # this working if pandas already parsed the labels as integers
    df_score.index = [int(str(i).split("_")[0]) for i in df_score.index]

    ## 2. load covariates
    df_covar = pd.read_csv(os.path.join(DATA_DIR, "covar.tsv"), sep="\t", index_col=0)
    # remove unspecified columns; copy so that adding columns below writes to
    # an independent frame instead of a slice of the original
    df_covar = df_covar[[col for col in df_covar.columns if col in covar_cols]].copy()

    ## 3. load covariates that are also phenotypes (from phenotype-specific files)
    for col in covar_cols:
        if col in df_covar.columns:
            continue
        tmp_path = os.path.join(PHENO_DIR, f"{col}.tsv")
        if not os.path.exists(tmp_path):
            warnings.warn(f"{tmp_path} does not exist")
            continue
        df_tmp = pd.read_csv(tmp_path, sep="\t", index_col=0)
        # align on the covariate table's individuals; absent ones become NaN
        df_covar[col] = df_tmp["PHENO"].reindex(df_covar.index)

    ## 4. merge all files together (inner joins on the individual index)
    df_trait = pd.merge(df_score, df_pheno, left_index=True, right_index=True)
    df_trait = pd.merge(df_trait, df_covar, left_index=True, right_index=True)

    # drop individuals with missing phenotype
    df_trait = df_trait.dropna(subset=["PHENO"])

    # drop individuals with > 20% missing covariates; use the covariates that
    # were requested AND loaded, so a covariate skipped with a warning above
    # does not raise a KeyError here
    loaded_covars = [col for col in covar_cols if col in df_trait.columns]
    if loaded_covars:
        frac_missing = df_trait[loaded_covars].isna().mean(axis=1)
        df_trait = df_trait.loc[frac_missing <= 0.2, :]
    return df_trait

In [4]:
# Trait names are the base names of DATA_DIR/pred/*.score_summary.tsv.gz with
# the three trailing extensions (".score_summary.tsv.gz") stripped.
# Sorted so that the processing order is the same on every run (iterating a
# set of strings has no guaranteed order).
trait_list = sorted(
    {
        os.path.basename(path).rsplit(".", 3)[0]
        for path in glob.glob(os.path.join(DATA_DIR, "pred/*.score_summary.tsv.gz"))
    }
)
print(f"{len(trait_list)} traits in total.")

247 traits in total.


In [5]:
# Hand-picked subset of trait names.
# NOTE(review): not referenced in the visible part of this notebook -- confirm
# where it is used.
MAIN_TRAIT_LIST = [
    "height",
    "log_monocyte",
    "log_leukocyte",
    "log_CRP",
    "log_triglycerides",
    "LDL",
    "log_SHBG",
    "systolic_BP",
    "diastolic_BP",
    "FEV1",
    "log_heel_BMD",
    "years_of_edu",
    "neuroticism",
]

In [6]:
# Write one gzip-compressed TSV per trait to DATA_DIR/per-trait-info/.
out_dir = os.path.join(DATA_DIR, "per-trait-info")
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(out_dir, exist_ok=True)

for trait in tqdm(trait_list):
    df_trait = load_trait_info(trait, COVAR_COLS)
    df_trait.index.name = "indiv"
    df_trait.to_csv(
        os.path.join(out_dir, f"{trait}.tsv.gz"),
        sep="\t",
        na_rep="NA",
        float_format="%.6g",
    )

100%|██████████| 247/247 [29:34<00:00,  7.18s/it]


# Example of trait DataFrame

In [7]:
# Display the table currently bound to `df_trait` (the last trait handled by
# the export loop, assuming the cells were run in order).
df_trait

Unnamed: 0_level_0,MEAN,SD,QUANTILE_5,QUANTILE_50,QUANTILE_95,PHENO,AGE,SEX,DEPRIVATION_INDEX,PC1,...,PC8,PC9,PC10,group,glasses,years_of_edu,income,drink_alcohol,ever_smoked,log_BMI
indiv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4964808,0.000284,0.009923,-0.015132,0.000220,0.015607,0.0,50,1,-2.762980,-11.58470,...,0.079360,-2.853440,3.51401,United Kingdom,0.0,10.0,1.41421,1.0,1.0,3.33046
4086323,-0.004358,0.009590,-0.020142,-0.004142,0.012187,0.0,41,0,-3.668480,-11.21340,...,0.930548,-15.803400,-0.36375,United Kingdom,0.0,10.0,2.00000,1.0,0.0,3.13822
4633435,0.000599,0.010101,-0.014051,0.000723,0.017039,0.0,51,0,-2.762980,-13.60540,...,-1.693800,2.946900,2.28279,,1.0,20.0,1.41421,1.0,1.0,2.98457
4922412,0.004542,0.011191,-0.012844,0.001767,0.023848,0.0,41,0,-2.205510,37.65870,...,-5.229370,1.283640,-9.55639,,0.0,10.0,2.23607,1.0,0.0,3.25274
3552392,-0.002265,0.010054,-0.018628,-0.002123,0.013244,0.0,58,0,1.088330,-14.11110,...,-1.100970,-11.368900,2.09082,,1.0,,1.41421,1.0,0.0,3.43025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5461955,-0.000237,0.010790,-0.019410,0.000364,0.017549,0.0,58,0,-0.335369,-13.00970,...,1.472760,-0.929611,0.71619,United Kingdom,1.0,10.0,2.00000,1.0,0.0,3.09423
3300490,-0.003579,0.010559,-0.020335,-0.002767,0.014068,0.0,63,1,-2.572370,72.73360,...,2.829890,-0.307037,-7.44235,,1.0,,1.00000,1.0,0.0,3.40981
1582348,0.000065,0.010857,-0.017562,0.000749,0.017107,0.0,69,0,-4.248580,-11.41350,...,-3.884010,1.815370,-1.06846,United Kingdom,1.0,10.0,1.41421,1.0,1.0,3.28244
3988244,-0.002076,0.009665,-0.018693,-0.000873,0.010493,0.0,48,1,6.637230,-4.12484,...,-5.466680,1.353550,-2.85062,United Kingdom,1.0,20.0,2.00000,1.0,1.0,3.53470
