In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import admix
import numpy as np
import pandas as pd
import admix_prs
import os
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [2]:
# Input locations (absolute paths on the compute cluster).
# PHENO_DIR: per-trait phenotype tables, read below as "<trait>.tsv".
PHENO_DIR = "/u/project/sgss/UKBB/PRS-RESEARCH/03-compile-pheno/out"
# DATA_DIR: compiled experiment outputs, read below as "covar.tsv" and
# "pred/<trait>.score_summary.tsv.gz".
DATA_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/admix-prs-uncertainty/experiments/00-compile-data/out"

In [3]:
# Covariate columns: three demographic fields followed by PC1..PC10.
COVAR_COLS = ["AGE", "SEX", "DEPRIVATION_INDEX"]
COVAR_COLS.extend(f"PC{pc}" for pc in range(1, 11))

# Columns to test: two columns used under their own name, then the "_Q"
# (quantile-coded) counterpart of each remaining variable.
TEST_COLS = ["SEX", "glasses"]
TEST_COLS += [
    f"{name}_Q"
    for name in ("AGE", "years_of_edu", "income", "DEPRIVATION_INDEX", "PC1", "PC2")
]

# Echo both lists so the notebook output records the configuration used.
print("Covariates:", ", ".join(COVAR_COLS))
print("Testing:", ", ".join(TEST_COLS))

Covariates: AGE, SEX, DEPRIVATION_INDEX, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10
Testing: SEX, glasses, AGE_Q, years_of_edu_Q, income_Q, DEPRIVATION_INDEX_Q, PC1_Q, PC2_Q


In [4]:
# Covariate table; the first column of the TSV becomes the row index.
df_covar = pd.read_csv(
    os.path.join(DATA_DIR, "covar.tsv"),
    sep="\t",
    index_col=0,
)

# Attach a few phenotypes as extra covariate columns. Each phenotype file is
# aligned to the covariate index, so rows of df_covar without a match in the
# phenotype file get NaN.
for trait in ["years_of_edu", "glasses", "income"]:
    df_trait = pd.read_csv(
        os.path.join(PHENO_DIR, f"{trait}.tsv"), sep="\t", index_col=0
    )
    df_trait = df_trait.drop(columns=["IID"])
    df_covar[trait] = df_trait["PHENO"].reindex(index=df_covar.index)

# Discretize continuous columns into quantile bins, stored as "<col>_Q".
# - q=5 requests quintiles; duplicates="drop" merges repeated bin edges, so a
#   column with many tied values may end up with fewer than 5 bins.
# - .cat.codes yields integer bin codes starting at 0; missing values get -1.
for col in ["years_of_edu", "income", "AGE", "DEPRIVATION_INDEX", "PC1", "PC2"]:
    col_bins = pd.qcut(df_covar[col], q=5, duplicates="drop")
    df_covar[f"{col}_Q"] = col_bins.cat.codes

In [5]:
trait = "height"

# Phenotype table for the trait (IID column discarded; the row index is kept).
df_trait = pd.read_csv(os.path.join(PHENO_DIR, f"{trait}.tsv"), sep="\t", index_col=0)
df_trait = df_trait.drop(columns=["IID"])

# Score summary for the same trait.
df_score = pd.read_csv(
    os.path.join(DATA_DIR, f"pred/{trait}.score_summary.tsv.gz"),
    sep="\t",
    index_col=0,
)
# Score row labels are strings; keep only the integer that precedes the first
# underscore so they can be matched against the phenotype index.
df_score.index = [int(label.partition("_")[0]) for label in df_score.index]

# Inner-join on the row index: scores + phenotype, then + covariates.
df_trait = df_score.merge(df_trait, left_index=True, right_index=True)
df_trait = df_trait.merge(df_covar, left_index=True, right_index=True)

# Restrict to rows whose `group` is "United Kingdom", then discard every row
# that has a missing value in any column.
is_uk = df_trait["group"] == "United Kingdom"
df_trait = df_trait[is_uk].dropna()

# Regress the phenotype on the covariates (plus an intercept column added by
# add_constant) and keep the residuals as PHENO_RESID.
ols_fit = sm.OLS(
    df_trait["PHENO"].values,
    sm.add_constant(df_trait[COVAR_COLS]),
    missing="drop",
).fit()
df_trait["PHENO_RESID"] = ols_fit.resid

  x = pd.concat(x[::order], 1)
