# Phenotype prediction for all individuals

In [1]:
%load_ext autoreload
%load_ext lab_black
%autoreload 2

import admix
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from os.path import join
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
import seaborn as sns
import admix_prs
import statsmodels.api as sm

In [2]:
# Label of the simulation setting. summarize() parses the second
# "-"-separated field of such a label as the heritability (hsq).
prefix = "hsq-0.25-pcausal-0.01"
# Directory holding this setting's LDpred2 files (same pattern that
# submit_predict() rebuilds from its own `prefix` argument).
ldpred2_dir = f"out/ldpred2/{prefix}"
# Root directory; per-group inputs are addressed as <PLINK_DIR>/<group>/merged.
PLINK_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/admix-prs-uncertainty/data/PLINK/"

In [3]:
# Sample groups to score and, for each one, the path <PLINK_DIR>/<group>/merged
# that submit_predict() hands to admix_prs.calc_prs.
group_list = ["eur_test", "admix"]
bfile_list = [join(PLINK_DIR, group, "merged") for group in group_list]

In [4]:
def submit_predict(prefix, sim_i):
    """Score every sample group with the weights of one simulation replicate.

    Reads ``out/ldpred2/{prefix}/sim_{sim_i}.weight.tsv.gz``, renames the
    CHR/A1/A2 columns to CHROM/REF/ALT and indexes the table by SNP. Then,
    for each (group, bfile) pair taken from the notebook-level ``group_list``
    and ``bfile_list``, the result of ``admix_prs.calc_prs`` is written to
    ``out/ldpred2/{prefix}/sim_{sim_i}.prs.{group}.tsv.gz``.

    Parameters
    ----------
    prefix : str
        Simulation setting label used as the sub-directory name.
    sim_i : int
        Simulation replicate number used in the file names.
    """
    out_dir = f"out/ldpred2/{prefix}"
    column_map = {"CHR": "CHROM", "A1": "REF", "A2": "ALT"}

    weights = pd.read_csv(join(out_dir, f"sim_{sim_i}.weight.tsv.gz"), sep="\t")
    weights = weights.rename(columns=column_map).set_index("SNP")

    for group_name, group_bfile in zip(group_list, bfile_list):
        prs_path = join(out_dir, f"sim_{sim_i}.prs.{group_name}.tsv.gz")
        admix_prs.calc_prs(group_bfile, weights).to_csv(prs_path, sep="\t")


import submitit

# SGE-backed submitit executor; folder= points it at ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    # NOTE(review): presumably a 70-minute time limit and 20 GB of memory per
    # job -- confirm the units against the submitit SGE plugin.
    time_min=70,
    memory_g=20,
    # Shell lines for the job environment: put the project miniconda first on
    # PATH and set PYTHONNOUSERSITE.
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# Submission is left disabled. Uncommenting the next line would map
# submit_predict over sim_i = 0..9, all with the same `prefix`.
# jobs = executor.map_array(submit_predict, [prefix] * 10, np.arange(10))

In [49]:
def summarize(prefix, sim_i, quantiles=(0.1, 0.9), seed=1234, n_samples=500):
    """Build a per-individual summary of PRS-based phenotype predictions.

    Loads the PRS tables of the ``eur_test`` and ``admix`` groups for one
    simulation replicate, attaches the simulated genetic value and phenotype,
    adds Gaussian noise with variance ``1 - hsq`` to every PRS sample column,
    and summarizes the noisy samples per individual.

    Parameters
    ----------
    prefix : str
        Simulation setting label; the second "-"-separated field is parsed
        as the heritability, e.g. ``"hsq-0.25-pcausal-0.01"`` gives 0.25.
    sim_i : int
        Simulation replicate; selects the ``sim_{sim_i}`` PRS files and the
        ``SIM_{sim_i}`` column of the phenotype tables.
    quantiles : sequence of float
        Quantiles of the noisy samples to report, one ``PRS_Q_{q}`` column each.
    seed : int
        Seed passed to ``np.random.seed`` before the noise is drawn.
    n_samples : int
        Number of sample columns to use: ``SAMPLE_1`` .. ``SAMPLE_{n_samples}``.

    Returns
    -------
    pandas.DataFrame
        Indexed like the concatenated PRS tables, with columns ``PRS_MEAN``,
        ``PRS_SD``, ``GV``, ``GROUP``, ``PHENO`` and one ``PRS_Q_{q}`` column
        per requested quantile.
    """
    # The *global* NumPy generator is seeded on purpose (rather than a local
    # one): code that runs after this call and draws from the global state,
    # such as an unseeded train_test_split, stays reproducible as before.
    np.random.seed(seed)
    hsq = float(prefix.split("-")[1])
    print(f"Using hsq={hsq} to generate posterior predictive samples for phenotypes")

    # Materialize once so any iterable can be both passed to np.quantile and
    # looped over below.
    quantiles = list(quantiles)

    # read PRS (eur_test rows first, then admix)
    df_prs = pd.concat(
        [
            pd.read_csv(
                f"out/ldpred2/{prefix}/sim_{sim_i}.prs.{group}.tsv.gz",
                sep="\t",
                index_col=0,
            )
            for group in ("eur_test", "admix")
        ]
    )

    # read genetic value and phenotype, aligned to the PRS individuals
    sim_col = f"SIM_{sim_i}"
    df_pheno_g = (
        pd.read_csv(f"out/pheno/{prefix}/sim.pheno_g.tsv", sep="\t", index_col=0)[
            ["GROUP", sim_col]
        ]
        .rename(columns={sim_col: "GV"})
        .reindex(df_prs.index)
    )
    df_pheno = (
        pd.read_csv(f"out/pheno/{prefix}/sim.pheno.tsv", sep="\t", index_col=0)[
            [sim_col]
        ]
        .rename(columns={sim_col: "PHENO"})
        .reindex(df_prs.index)
    )

    df_prs = pd.concat([df_pheno, df_pheno_g, df_prs], axis=1)

    # predict in phenotype space: add N(0, 1 - hsq) noise to every sample.
    # A new frame is built instead of using `+=` on a column subset of df_prs,
    # so nothing derived from df_prs is modified in place.
    df_samples = df_prs[[f"SAMPLE_{i}" for i in range(1, n_samples + 1)]]
    df_pred = df_samples + np.random.normal(
        scale=np.sqrt(1 - hsq), size=df_samples.shape
    )

    df_plot = pd.DataFrame(
        {
            "PRS_MEAN": df_prs["MEAN"],
            # population SD (ddof=0), i.e. what np.std computes by default
            "PRS_SD": df_pred.std(axis=1, ddof=0),
            "GV": df_prs["GV"],
            "GROUP": df_prs["GROUP"],
            "PHENO": df_prs["PHENO"],
        }
    )
    # rows of pred_interval follow the order of `quantiles`
    pred_interval = np.quantile(df_pred, q=quantiles, axis=1)
    for q_i, q in enumerate(quantiles):
        df_plot[f"PRS_Q_{q}"] = pred_interval[q_i, :]

    return df_plot

In [50]:
# read local ancestry
df_lanc = pd.read_csv("out/admix_lanc.tsv", sep="\t", index_col=0)
# work with the complement of the stored value (1 - lanc)
df_lanc["lanc"] = 1.0 - df_lanc["lanc"]
# quantile-bin the individuals into 5 groups, labelled 1..5
df_lanc["lanc_q"] = pd.qcut(df_lanc.lanc, q=5).cat.codes + 1

# read covariates (SEX, AGE, PC1..PC10) for both sample groups
PHENO_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/admix-prs-uncertainty/data/pheno"
df_covar = (
    pd.concat(
        [
            # sep=r"\s+" is the supported spelling of the deprecated
            # delim_whitespace=True (any run of whitespace is one delimiter)
            pd.read_csv(join(PHENO_DIR, f"{group}.covar"), sep=r"\s+")
            for group in ["eur_test", "admix"]
        ]
    )
    .reset_index(drop=True)
    .astype({"FID": str, "IID": str})
)
# index individuals as "<FID>_<IID>"
df_covar.index = df_covar.FID + "_" + df_covar.IID
df_covar = df_covar[["SEX", "AGE"] + [f"PC{i}" for i in range(1, 11)]]

In [62]:
# Simulation replicate inspected in the interactive cells that follow.
sim_i = 2

In [63]:
# Summarize one replicate, then relabel every individual listed in df_lanc
# from its original GROUP to "admix_<k>", where k is its lanc_q bin.
df_summary = summarize(prefix, sim_i)
df_summary.loc[df_lanc.index, "GROUP"] = "admix_" + df_lanc["lanc_q"].astype(str)

Using hsq=0.25 to generate posterior predictive samples for phenotypes


In [65]:
# Squared Pearson correlation between PHENO and PRS_MEAN within each group.
# Group by the column name itself rather than ["GROUP"]: with a length-1 list
# newer pandas yields 1-tuples as keys, which would print as group=('admix_1',).
for group, df_group in df_summary.groupby("GROUP"):
    raw_r2 = pearsonr(df_group["PHENO"], df_group["PRS_MEAN"])[0] ** 2
    print(f"group={group}, raw_r2={raw_r2}")

group=admix_1, raw_r2=0.13313194674812118
group=admix_2, raw_r2=0.14628630903908763
group=admix_3, raw_r2=0.1358655928029286
group=admix_4, raw_r2=0.14172655988925145
group=admix_5, raw_r2=0.12091171823504585
group=eur_test, raw_r2=0.15415001071293716


In [34]:
def submit_summarize(prefix):
    """Summarize and calibrate 10 simulation replicates and save one table.

    For each ``sim_i`` in 0..9 this calls ``summarize(prefix, sim_i)`` with
    its default seed, relabels the individuals listed in the notebook-level
    ``df_lanc`` as ``admix_<lanc_q>``, and merges in ``df_covar``. Within
    each of the ``admix`` and ``eur_test`` groups the individuals are split
    50/50 into calibration and test halves. ``admix_prs.calibrate_prs`` is
    then run with ``method`` None, "shift" and "scale"; its values are stored
    as the two columns ``{CENTER|SHIFT|SCALE}_PRS_Q_0.1`` / ``_PRS_Q_0.9``.

    The concatenated result is written to ``out/summary/{prefix}.tsv.gz``
    with the columns INDIV, GROUP, PHENO, IN_CAL, SIM_I and every column
    whose name contains "PRS_Q".

    Parameters
    ----------
    prefix : str
        Simulation setting label, forwarded to ``summarize`` and used in the
        output file name.
    """
    import os  # local import: only needed to create the output directory

    np.random.seed(1234)

    # Make sure the destination exists *before* the long loop, so a missing
    # directory cannot make the final to_csv fail after all the work is done.
    out_path = f"out/summary/{prefix}.tsv.gz"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # covariates used by the calibration; identical for every replicate
    mean_cov_cols = ["SEX", "AGE"] + [f"PC{i}" for i in range(1, 3)]
    quantile_cov_cols = ["PC1", "PC2"]

    summaries = []
    for sim_i in tqdm(range(10)):
        df_summary = summarize(prefix, sim_i)
        df_summary.loc[df_lanc.index, "GROUP"] = "admix_" + df_lanc["lanc_q"].astype(
            str
        )
        df_summary = pd.merge(df_summary, df_covar, left_index=True, right_index=True)

        # calibrate separately for eur_test and admix
        for group in ["admix", "eur_test"]:
            # "admix" matches all admix_<k> labels through the prefix test
            df_tmp = df_summary[df_summary["GROUP"].str.startswith(group)].copy()
            # no random_state is passed, so the split draws from the global
            # NumPy state seeded above / inside summarize()
            idx_cal, idx_test = train_test_split(df_tmp.index, train_size=0.5)
            for method in [None, "shift", "scale"]:
                df_calibrated = admix_prs.calibrate_prs(
                    df_tmp,
                    idx_cal,
                    mean_cov_cols=mean_cov_cols,
                    q=0.1,
                    method=method,
                    quantile_cov_cols=quantile_cov_cols,
                )
                method_prefix = "CENTER" if method is None else method.upper()
                df_summary.loc[
                    df_calibrated.index,
                    [f"{method_prefix}_PRS_Q_0.1", f"{method_prefix}_PRS_Q_0.9"],
                ] = df_calibrated.values
            df_summary.loc[idx_cal, "IN_CAL"] = True
            df_summary.loc[idx_test, "IN_CAL"] = False

        df_summary["SIM_I"] = sim_i

        df_summary = df_summary[
            ["GROUP", "PHENO"]
            + ["IN_CAL", "SIM_I"]
            + [col for col in df_summary.columns if "PRS_Q" in col]
        ]

        df_summary.index.name = "INDIV"
        df_summary = df_summary.reset_index()
        # append to all
        summaries.append(df_summary)

    df_plot = pd.concat(summaries)
    df_plot.to_csv(out_path, sep="\t", index=False)

In [36]:
# Build and save the summary table for the hsq=0.05 setting (a different
# label from the `prefix` variable defined near the top of the notebook).
submit_summarize("hsq-0.05-pcausal-0.01")

  0%|          | 0/10 [00:00<?, ?it/s]

Using hsq=0.05 to generate posterior predictive samples for phenotypes


 10%|█         | 1/10 [00:18<02:43, 18.14s/it]

Using hsq=0.05 to generate posterior predictive samples for phenotypes


 20%|██        | 2/10 [00:35<02:23, 17.89s/it]

Using hsq=0.05 to generate posterior predictive samples for phenotypes


 30%|███       | 3/10 [00:52<02:01, 17.36s/it]

Using hsq=0.05 to generate posterior predictive samples for phenotypes


 40%|████      | 4/10 [01:08<01:41, 16.94s/it]

Using hsq=0.05 to generate posterior predictive samples for phenotypes


 50%|█████     | 5/10 [01:24<01:22, 16.45s/it]

Using hsq=0.05 to generate posterior predictive samples for phenotypes


 60%|██████    | 6/10 [01:41<01:06, 16.53s/it]

Using hsq=0.05 to generate posterior predictive samples for phenotypes


 70%|███████   | 7/10 [01:58<00:49, 16.66s/it]

Using hsq=0.05 to generate posterior predictive samples for phenotypes


 80%|████████  | 8/10 [02:13<00:32, 16.19s/it]

Using hsq=0.05 to generate posterior predictive samples for phenotypes


 90%|█████████ | 9/10 [02:29<00:16, 16.22s/it]

Using hsq=0.05 to generate posterior predictive samples for phenotypes


100%|██████████| 10/10 [02:45<00:00, 16.55s/it]
