In [1]:
%load_ext lab_black
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
from admix.data import quantile_normalize
from tqdm import tqdm
import itertools
import yaml

In [2]:
# Trait identifiers to process. ndmin=1 keeps the result 1-D even when the
# file lists a single trait; without it np.loadtxt squeezes the result to a
# 0-d array, which cannot be iterated over.
trait_list = np.loadtxt("data/traits.txt", dtype=str, ndmin=1)

metadata = {
    # Columns carried into every formatted table (see format_data).
    "VAR_COLS": [
        "AGE",
        "SEX",
        "PC1",
        "PC2",
        "log_BMI",
        "ever_smoked",
        "drink_alcohol",
        "glasses",
        "income",
        "DEPRIVATION_INDEX",
        "years_of_edu",
    ],
    # Columns regressed out of the quantile-normalized phenotype
    # (see format_data): AGE, SEX, their product, and PC1..PC10.
    "COVAR_COLS": ["AGE", "SEX", "AGE*SEX"] + [f"PC{i}" for i in range(1, 11)],
}

# Persist the column lists next to the data
# (presumably for reuse by other notebooks/scripts -- TODO confirm).
with open("data/meta.yaml", "w") as f:
    yaml.dump(metadata, f)

VAR_COLS, COVAR_COLS = metadata["VAR_COLS"], metadata["COVAR_COLS"]

In [3]:
def format_data(trait, group):
    """Build and save the formatted table for one trait and one sample group.

    Reads ``../compile-data/out/per-trait-info/{trait}.tsv.gz``, keeps the
    requested subset of rows, drops rows without a phenotype, imputes and
    standardizes every column except ``PHENO``, adds a quantile-normalized
    phenotype (``QPHENO``) and its residual after regressing out
    ``COVAR_COLS`` (``QPHENO_RESID``), and writes the result to
    ``out/format-data/{trait}.{group}.tsv``.

    Parameters
    ----------
    trait : str
        Trait name; selects the input file and names the output file.
    group : str
        Row subset to keep, based on the ``group`` column of the input:
        ``"white"`` keeps rows equal to ``"United Kingdom"``, ``"other"``
        keeps every remaining row, ``"all"`` keeps all rows.

    Returns
    -------
    None
        The table is written to disk; nothing is returned.

    Raises
    ------
    NotImplementedError
        If ``group`` is not one of ``"white"``, ``"other"``, ``"all"``.
    """
    DATA_DIR = "../compile-data/out/per-trait-info/"
    OUT_DIR = "out/format-data"

    # Reject an unsupported group before doing any file I/O.
    if group not in ("white", "other", "all"):
        raise NotImplementedError(
            f"unknown group {group!r}; expected 'white', 'other' or 'all'"
        )

    df_trait = pd.read_csv(
        os.path.join(DATA_DIR, f"{trait}.tsv.gz"), index_col=0, sep="\t"
    )

    # filter individuals ("all" keeps every row)
    if group == "white":
        df_trait = df_trait[df_trait.group == "United Kingdom"]
    elif group == "other":
        df_trait = df_trait[~(df_trait.group == "United Kingdom")]

    # Explicit copy: the frame is modified below, and assigning into a
    # filtered selection can otherwise trigger SettingWithCopyWarning.
    df_trait = df_trait.dropna(subset=["PHENO"]).copy()

    df_trait["AGE*SEX"] = df_trait["AGE"] * df_trait["SEX"]
    df_trait["PGS"] = df_trait["MEAN"]
    df_trait = df_trait.loc[
        :, ["PHENO", "PGS"] + sorted(set(VAR_COLS) | set(COVAR_COLS))
    ].copy()

    # Impute missing values with the column median, then standardize to
    # mean 0 and SD 1. PHENO (column 0) is skipped; PGS is included.
    # NOTE(review): a column with zero variance in the selected subset becomes
    # all-NaN here (division by a zero std) -- confirm this cannot occur.
    for col in df_trait.columns[1:]:
        filled = df_trait[col].fillna(df_trait[col].median())
        df_trait[col] = (filled - filled.mean()) / filled.std()

    # quantile normalization of the raw phenotype
    df_trait.insert(1, "QPHENO", quantile_normalize(df_trait["PHENO"]))

    # residual after regressing out covar_cols (with an intercept)
    ols_fit = sm.OLS(
        df_trait["QPHENO"].values,
        sm.add_constant(df_trait[COVAR_COLS]),
    ).fit()
    df_trait.insert(2, "QPHENO_RESID", ols_fit.resid)

    # Create the output directory on first use so a fresh checkout does not
    # fail at the write step.
    os.makedirs(OUT_DIR, exist_ok=True)
    path = f"{OUT_DIR}/{trait}.{group}.tsv"
    df_trait.to_csv(path, sep="\t")

In [4]:
# Write one formatted table per (trait, group) pair; the progress bar
# advances once per trait, after all of its groups are written.
GROUPS = ("white", "other", "all")
for trait in tqdm(trait_list):
    for group in GROUPS:
        format_data(trait=trait, group=group)

100%|██████████| 72/72 [08:20<00:00,  6.96s/it]
