# Train PRS on European population and apply to admixed population

1. Divide into training / validation / testing for European
    - prs/pheno/eur_{train, val, test}.indiv
    - Admixed testing population from plink/admix.merged.fam
2. Extract phenotypes
    - Raw phenotypes: prs/pheno/{group}.{trait}.pheno 
    - Raw covariates: prs/covar/{group}.covar
    - Regressed phenotypes: prs/pheno/{group}.{trait}.residual_pheno
    - Regression model (train on European and apply to other populations): prs/pheno/eur_train.{trait}.model
2. Perform GWAS
    - Simple PLINK GWAS
3. Apply PRS uncertainty

# Divide into training / validation / testing for European

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [2]:
import admix
import numpy as np
import pandas as pd
from os.path import join
import statsmodels.api as sm

# 1. Divide into training / validation / testing for European

In [3]:
df_admix = admix.tools.plink_read_fam("../data/PLINK/admix.fam")
assert np.all(df_admix.FID == df_admix.IID)

In [6]:
df_eur_fam = admix.tools.plink_read_fam(
    "/u/project/sriram/ukbiobank/data/geno/cal/filter4.fam"
).astype(str)

np.random.seed(1234)
n_train = 250_000
n_val = 20_000
n_total = df_eur_fam.shape[0]
# the rest is testing individuals
train_index = np.sort(np.random.choice(np.arange(n_total), size=n_train, replace=False))
val_test_index = np.setdiff1d(np.arange(n_total), train_index)
val_index = np.sort(np.random.choice(val_test_index, size=n_val, replace=False))
test_index = np.setdiff1d(val_test_index, val_index)

for name, index in zip(
    ["eur_train", "eur_val", "eur_test"], [train_index, val_index, test_index]
):
    df_tmp = df_eur_fam.iloc[index, :]
    df_tmp = df_tmp[~df_tmp.FID.isin(df_admix.FID.values)]
    df_tmp.to_csv(f"../data/PLINK/{name}.fam", sep="\t", index=False, header=False)

# check consistency to legacy data.
#     df_eur_fam.iloc[index, :].to_csv(f"prs/pheno/{name}.indiv", sep='\t', index=False, header=False)
# for name, index in zip(
#     ["eur_train", "eur_val", "eur_test"], [train_index, val_index, test_index]
# ):
#     df_this = df_eur_fam.iloc[index, :]
#     df_old = admix.tools.plink_read_fam(f"../data/PLINK/{name}.fam")


#     print(np.all(df_this.values.astype(str) == df_old.values.astype(str)))

# 2. Extract phenotypes

In [8]:
trait_list = ["height", "bmi", "cholesterol", "hdl_cholesterol", "ldl_direct"]
group_list = ["eur_train", "eur_val", "eur_test", "admix_test"]

In [12]:
import pandas as pd
df_pheno = pd.read_csv("pheno/all.csv")
df_covar = df_pheno[["ID", "ID", "SEX", "AGE"] + [f"PC{i}" for i in range(1, 11)]]
df_covar.columns = ["FID", "IID"] + list(df_covar.columns[2:])

for group in group_list:
    df_group_covar = dict_df_group[group].merge(df_covar, on=["FID", "IID"])
    df_group_covar.to_csv(f"prs/pheno/{group}.covar", sep='\t', index=False, na_rep="NA")
    
    for trait in trait_list:
        df_trait = df_pheno[["ID", "ID", trait]]
        df_trait.columns = ["FID", "IID", trait]
        df_group_trait = dict_df_group[group].merge(df_trait, on=["FID", "IID"])
        df_group_trait.to_csv(f"prs/pheno/{group}.{trait}.pheno", sep='\t', index=False, na_rep="NA")


ValueError: The number of quantiles cannot be greater than the number of samples used. Got 249950 quantiles and 100000 samples.

In [66]:
dict_df_covar = {group : pd.read_csv(f"prs/pheno/{group}.covar", sep='\t') for group in group_list}
for trait in trait_list:
    # load trait
    col_covar = ["SEX", "AGE"] + [f"PC{i}" for i in range(1, 11)]
    dict_df_trait = {group : pd.read_csv(f"prs/pheno/{group}.{trait}.pheno", sep='\t') for group in group_list}
    
    df_train = pd.merge(dict_df_trait["eur_train"], dict_df_covar["eur_train"], on=["FID", "IID"])
    model = sm.OLS(df_train[trait], sm.add_constant(df_train[col_covar]), missing='drop').fit()
    model.save(f"prs/pheno/eur_train.{trait}.model")
    for group in group_list:
        df_group = pd.merge(dict_df_trait[group], dict_df_covar[group], on=["FID", "IID"])
        df_group[f"{trait}-residual"] = df_group[trait] - model.predict(sm.add_constant(df_group[col_covar]))
        df_group[["FID", "IID", f"{trait}-residual"]].to_csv(f"prs/pheno/{group}.{trait}.residual_pheno", sep='\t', index=False, na_rep="NA")

# 3. Perform GWAS
```bash
for trait in height bmi cholesterol hdl_cholesterol ldl_direct; do
    qsub train_gwas.sh ${trait}
done
```

In [1]:
import admix
import numpy as np
import pandas as pd
from os.path import join
import statsmodels.api as sm

In [3]:
bim = admix.tools.plink_read_bim("../data/PLINK/eur_train/merged.bim")

In [4]:
# aggregate all the association results
for trait in ["height"]:
    assoc = pd.concat([pd.read_csv(f"../data/train_gwas/{trait}/assoc.{chr_i}.{trait}.glm.linear", delim_whitespace=True, dtype=str) for chr_i in range(1, 23)], axis=0).reset_index(drop=True)
    assert np.all(assoc['A1'] == assoc["ALT"])
    assoc = assoc[["#CHROM", "ID", "POS", "ALT", "REF", "TEST", "OBS_CT", "BETA", "SE", "L95", "U95", "T_STAT", "P"]].rename(
        columns={"#CHROM": "CHR",
                 "ID": "SNP",
                 "POS": "BP",
                 "ALT": "A1",
                 "REF": "A2",
                 "OBS_CT": "NMISS",
                 "T_STAT": "STAT"})
    assoc.to_csv(f"../data/train_gwas/{trait}/assoc.all.assoc.linear", sep='\t', index=False)

In [25]:
# aggregate all the association results
# for trait in trait_list:
#     assoc = pd.concat([pd.read_csv(f"prs/train_gwas/{trait}/assoc.{chr_i}.assoc.linear", delim_whitespace=True, dtype=str) for chr_i in range(1, 23)], axis=0).reset_index(drop=True)
#     assert np.all(assoc["A1"] == bim["A1"])
#     assert np.all(assoc["SNP"] == bim["SNP"])
#     assoc.insert(4, "A2", bim["A2"].values)
#     assoc.to_csv(f"prs/train_gwas/{trait}/assoc.all.assoc.linear", sep='\t', index=False)

In [7]:
import matplotlib.pyplot as plt

In [None]:
plt.hist(assoc.P)

(array([125284., 120879., 116959., 113676., 105636., 102449.,  97039.,
         91952.,  86280.,  80847.]),
 array([     0. ,  77738.1, 155476.2, 233214.3, 310952.4, 388690.5,
        466428.6, 544166.7, 621904.8, 699642.9, 777381. ]),
 <BarContainer object of 10 artists>)

In [35]:
assoc

Unnamed: 0,CHR,SNP,BP,A1,A2,TEST,NMISS,BETA,SE,L95,U95,STAT,P
0,1,rs115991721,767096,A,G,ADD,248916,-0.0188594,0.044303,-0.105692,0.0679728,-0.425692,0.670332
1,1,rs12562034,768448,G,A,ADD,248911,0.000238705,0.00323871,-0.00610905,0.00658646,0.0737038,0.941246
2,1,rs4040617,779322,A,G,ADD,248690,0.00186237,0.00297852,-0.00397541,0.00770015,0.625268,0.531796
3,1,rs57181708,809876,A,G,ADD,249142,0.0023127,0.0033033,-0.00416165,0.00878704,0.700118,0.483854
4,1,rs116452738,834830,G,A,ADD,249047,0.00149389,0.0122304,-0.0224773,0.025465,0.122146,0.902784
...,...,...,...,...,...,...,...,...,...,...,...,...,...
528610,22,rs5771002,51183255,A,G,ADD,245452,0.000229103,0.0021283,-0.00394228,0.00440049,0.107646,0.914276
528611,22,rs3865764,51185848,G,A,ADD,248647,0.00283037,0.00453166,-0.00605152,0.0117123,0.624576,0.53225
528612,22,rs142680588,51193629,A,G,ADD,248971,0.00297442,0.00377153,-0.00441764,0.0103665,0.78865,0.430317
528613,22,rs9616974,51217954,G,A,ADD,246103,0.00431862,0.00390438,-0.00333382,0.0119711,1.1061,0.268686


# Perform PRS analysis
```bash
git clone X

```