# Train PRS on European population and apply to admixed population

1. Divide into training / validation / testing for European
    - prs/pheno/eur_{train, val, test}.indiv
    - Admixed testing population from plink/admix.merged.fam
2. Extract phenotypes
    - Raw phenotypes: prs/pheno/{group}.{trait}.pheno 
    - Raw covariates: prs/covar/{group}.covar
    - Regressed phenotypes: prs/pheno/{group}.{trait}.residual_pheno
    - Regression model (train on European and apply to other populations): prs/pheno/eur_train.{trait}.model
3. Perform GWAS
    - Simple PLINK GWAS
4. Format GWAS
    - Convert the PLINK association output to LDpred2 format
5. Apply PRS uncertainty

# Divide into training / validation / testing for European

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [2]:
import admix
import numpy as np
import pandas as pd
from os.path import join
import statsmodels.api as sm
import matplotlib.pyplot as plt

# 1. Divide into training / validation / testing for European

In [16]:
# Directory holding the per-group PLINK .fam files ("{group}.fam") that the
# cells below read to get each group's FID/IID list.
PLINK_DIR = "../../data/PLINK"
# Directory where the per-group .covar, .pheno and .residual_pheno files and
# the fitted eur_train.{trait}.model files are written (and later re-read).
OUT_DIR = "../../data/pheno"

In [7]:
# RUN ONLY ONCE 

# df_admix = admix.tools.plink.read_fam(join(PLINK_DIR, "admix.fam"))
# assert np.all(df_admix.FID == df_admix.IID)

# df_eur_fam = admix.tools.plink.read_fam(
#     "/u/project/sriram/ukbiobank/data/geno/cal/filter4.fam"
# ).astype(str)

# np.random.seed(1234)
# n_train = 250_000
# n_val = 20_000
# n_total = df_eur_fam.shape[0]
# # the rest is testing individuals
# train_index = np.sort(np.random.choice(np.arange(n_total), size=n_train, replace=False))
# val_test_index = np.setdiff1d(np.arange(n_total), train_index)
# val_index = np.sort(np.random.choice(val_test_index, size=n_val, replace=False))
# test_index = np.setdiff1d(val_test_index, val_index)

# for name, index in zip(
#     ["eur_train", "eur_val", "eur_test"], [train_index, val_index, test_index]
# ):
#     df_tmp = df_eur_fam.iloc[index, :]
#     df_tmp = df_tmp[~df_tmp.FID.isin(df_admix.FID.values)]
#     df_tmp.to_csv(join(PLINK_DIR, f"{name}.fam"), sep="\t", index=False, header=False)

# 2. Extract phenotypes

In [None]:
import numpy as np
import pandas as pd

# First phenotype file: individual ID, age, sex and the 40 "22009-0.*"
# columns, which are renamed to PC1..PC40.
col_dict = {"eid": "ID", "21003-0.0": "AGE", "31-0.0": "SEX"}
col_dict.update({f"22009-0.{i}": f"PC{i}" for i in range(1, 41)})

# Load only the mapped columns, rename them, and index the table by the
# string-typed individual ID.
pheno1 = (
    pd.read_csv(
        "/u/project/sriram/ukbiobank/33127/ukb21970.csv",
        usecols=col_dict.keys(),
        encoding="unicode_escape",
    )
    .rename(columns=col_dict)
    .astype({"ID": str})
    .set_index("ID")
)

# extract phenotype from another file (because in another UKB application)
col_dict = {
    "eid": "ID",
    "31-0.0": "SEX",
    "21000-0.0": "SIRE",
    "50-0.0": "height",
    "23104-0.0": "bmi",
    "30897-0.0": "dilution_factor",
    "30690-0.0": "cholesterol",
    "30780-0.0": "ldl_direct",
    "30760-0.0": "hdl_cholesterol",
}

pheno2 = (
    pd.read_csv(
        "/u/project/sriram/ukbiobank/33127/ukb39967.enc_ukb.converted2.csv",
        usecols=col_dict.keys(),
        encoding="unicode_escape",
    )
    .rename(columns=col_dict)
    .astype({"ID": str})
    .set_index("ID")
)

# SEX is present in both tables; keep the copy from pheno1 only.  The merge is
# an inner join on the ID index, so only individuals found in both files
# remain.
df_all = pheno1.merge(
    pheno2.drop(columns="SEX"), left_index=True, right_index=True
)
df_all.to_csv("out/all_pheno.csv")

In [20]:
# Phenotype columns of out/all_pheno.csv that are processed below; each name is
# also used in the output file names ("{group}.{trait}.pheno", ...).
trait_list = ["height", "bmi", "cholesterol", "hdl_cholesterol", "ldl_direct"]
# Sample groups; each must have a matching "{group}.fam" file under PLINK_DIR.
group_list = ["eur_train", "eur_val", "eur_test", "admix"]

In [24]:
# Map each group name to its .fam table, with every column cast to str so the
# FID/IID keys can be merged against the string IDs of the phenotype table.
dict_df_group = {}
for group in group_list:
    fam_path = join(PLINK_DIR, f"{group}.fam")
    dict_df_group[group] = admix.tools.plink.read_fam(fam_path).astype(str)

In [31]:
import pandas as pd

# Write, for every group, a covariate table and one quantile-normalised
# phenotype table per trait (tab-separated, missing values written as "NA").

df_pheno = pd.read_csv("out/all_pheno.csv").astype({"ID": str})

# Covariates: the ID is duplicated to serve as both FID and IID, followed by
# SEX, AGE and the first 10 PCs.
df_covar = df_pheno[["ID", "ID", "SEX", "AGE"] + [f"PC{i}" for i in range(1, 11)]]
df_covar.columns = ["FID", "IID"] + list(df_covar.columns[2:])

# Quantile-normalise each trait ONCE.  The normalisation runs over every
# individual in df_pheno and does not depend on the group, so doing it inside
# the group loop would only repeat identical work for each group.
# NOTE(review): this relies on admix.data.quantile_normalize being
# deterministic for a given input -- confirm.
dict_df_norm_trait = {}
for trait in trait_list:
    df_trait = df_pheno[["ID", "ID", trait]].copy()
    df_trait.columns = ["FID", "IID", "PHENO"]
    df_trait["PHENO"] = admix.data.quantile_normalize(df_trait["PHENO"])
    dict_df_norm_trait[trait] = df_trait

for group in group_list:
    # Inner merge: keeps only the group's individuals that have a row in
    # df_covar.
    df_group_covar = dict_df_group[group].merge(df_covar, on=["FID", "IID"])
    df_group_covar.to_csv(
        join(OUT_DIR, f"{group}.covar"), sep="\t", index=False, na_rep="NA"
    )

    for trait in trait_list:
        df_trait = dict_df_norm_trait[trait]
        df_group_trait = dict_df_group[group].merge(df_trait, on=["FID", "IID"])
        df_group_trait.to_csv(
            join(OUT_DIR, f"{group}.{trait}.pheno"), sep="\t", index=False, na_rep="NA"
        )

In [37]:
# Regress the covariates out of every trait: fit an OLS model on eur_train
# only, save it, then subtract its prediction from the phenotype of every
# group and write the residuals to "{group}.{trait}.residual_pheno".

dict_df_covar = {
    group: pd.read_csv(join(OUT_DIR, f"{group}.covar"), sep="\t")
    for group in group_list
}
# Covariates used in the regression (identical for every trait, so defined
# once outside the loop).
col_covar = ["SEX", "AGE"] + [f"PC{i}" for i in range(1, 11)]

for trait in trait_list:
    # load trait
    dict_df_trait = {
        group: pd.read_csv(join(OUT_DIR, f"{group}.{trait}.pheno"), sep="\t")
        for group in group_list
    }

    df_train = pd.merge(
        dict_df_trait["eur_train"], dict_df_covar["eur_train"], on=["FID", "IID"]
    )
    # has_constant="add" always prepends the intercept column.  The default
    # ("skip") silently returns the data unchanged when some covariate happens
    # to be a non-zero constant, which would make the design matrix of such a
    # group inconsistent with the one the model was trained on.
    model = sm.OLS(
        df_train["PHENO"],
        sm.add_constant(df_train[col_covar].values, has_constant="add"),
        missing="drop",  # rows with missing phenotype/covariates are excluded
    ).fit()
    model.save(join(OUT_DIR, f"eur_train.{trait}.model"))

    # regress out
    for group in group_list:
        df_group = pd.merge(
            dict_df_trait[group], dict_df_covar[group], on=["FID", "IID"]
        )
        df_group["PHENO"] -= model.predict(
            sm.add_constant(df_group[col_covar].values, has_constant="add")
        )
        df_group[["FID", "IID", "PHENO"]].to_csv(
            join(OUT_DIR, f"{group}.{trait}.residual_pheno"),
            sep="\t",
            index=False,
            na_rep="NA",
        )

# 3. Perform GWAS
```bash
for trait in height bmi cholesterol hdl_cholesterol ldl_direct; do
    qsub prepare-train-gwas.sh ${trait}
done
```

# 4. Format GWAS

In [11]:
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import admix
import admix_prs
from os.path import join

In [12]:
# Same traits as in the phenotype-extraction section; each one names a
# sub-directory of GWAS_DIR that holds that trait's association results.
trait_list = ["height", "bmi", "cholesterol", "hdl_cholesterol", "ldl_direct"]

In [13]:
GWAS_DIR = "../../data/train_gwas"

# Pass each trait's assoc.PHENO.glm.linear file through
# admix_prs.plink2_assoc_to_ldpred2 and save the result next to it as a
# tab-separated assoc.ldpred2.tsv (no index column).
for trait in trait_list:
    trait_dir = join(GWAS_DIR, trait)
    assoc = admix_prs.plink2_assoc_to_ldpred2(
        join(trait_dir, "assoc.PHENO.glm.linear")
    )
    assoc.to_csv(join(trait_dir, "assoc.ldpred2.tsv"), sep="\t", index=False)