# Phenotype simulation from real genotypes

In [1]:
# Load the autoreload extension and use mode 2, so imported modules are
# reloaded before each cell runs (edits to local packages are picked up live).
%load_ext autoreload
%autoreload 2
# Load the lab_black extension (presumably auto-formats code cells with black).
%load_ext lab_black

In [2]:
from os.path import join
import numpy as np
import pandas as pd
import admix
import matplotlib.pyplot as plt
from os.path import join
import submitit
import dapgen
from tqdm import tqdm
import dask.array as da
import admix_prs
import os

In [3]:
# Root directory that holds one sub-directory per sample group.
PLINK_DIR = "/u/project/pasaniuc/pasaniucdata/admixture/projects/admix-prs-uncertainty/data/PLINK/"
# File prefix used inside every group sub-directory (joined as PLINK_DIR/<group>/<prefix>).
BFILE_PREFIX = "merged"
# Frequency file located in the eur_train group; passed as `freq_file` to the phenotype simulation.
FREQ_FILE = join(PLINK_DIR, "eur_train", "merged.freq")
# Number of simulation replicates; also the number of SIM_<i> phenotype columns looped over.
N_SIM = 10

# Simulate phenotype and perform GWAS

In [4]:
# Sample groups; each one has its own sub-directory under PLINK_DIR.
group_list = [
    "eur_train",
    "eur_val",
    "eur_test",
    "admix",
]
# One file prefix per group, in the same order as `group_list`.
bfile_list = [join(PLINK_DIR, grp, BFILE_PREFIX) for grp in group_list]


def simulate_pheno_gwas(hsq, causal_prop, out_dir):
    """Simulate phenotypes for every sample group, then run a GWAS per replicate.

    Relies on the notebook-level globals ``bfile_list``, ``group_list``,
    ``FREQ_FILE``, ``N_SIM``, ``PLINK_DIR`` and ``BFILE_PREFIX``.

    Files written under ``out_dir``:

    - ``sim.*``: output of ``admix_prs.simulate_quant_pheno``; the phenotype
      table ``sim.pheno.tsv`` is read back from there.
    - ``sim_{i}.*``: output of ``admix.tools.plink2.gwas`` run on the
      ``eur_train`` files for replicate ``i``.
    - ``sim_{i}.assoc.ldpred2.tsv``: the ``sim_{i}.assoc`` results converted
      by ``admix_prs.plink2_assoc_to_ldpred2``.
    - ``sim_{i}.eur_val.pheno.tsv``: the ``FID``, ``IID`` and ``SIM_{i}``
      columns of the rows whose ``GROUP`` is ``"eur_val"``.

    Parameters
    ----------
    hsq : float
        Forwarded as ``hsq`` to ``admix_prs.simulate_quant_pheno``
        (presumably the trait heritability -- confirm against admix_prs).
    causal_prop : float
        Forwarded as ``causal_prop`` to ``admix_prs.simulate_quant_pheno``
        (presumably the proportion of causal SNPs -- confirm against admix_prs).
    out_dir : str
        Output directory; created (including parents) when it does not exist.
    """
    # exist_ok=True avoids the check-then-create race when several cluster
    # jobs (or a re-run) target the same directory at the same time.
    os.makedirs(out_dir, exist_ok=True)

    # Fixed seed; presumably simulate_quant_pheno draws from NumPy's global RNG,
    # in which case repeated runs produce the same phenotypes -- TODO confirm.
    np.random.seed(42)
    admix_prs.simulate_quant_pheno(
        bfile_list=bfile_list,
        group_list=group_list,
        hsq=hsq,
        causal_prop=causal_prop,
        freq_file=FREQ_FILE,
        out_prefix=f"{out_dir}/sim",
        n_sim=N_SIM,
    )
    df_pheno = pd.read_csv(f"{out_dir}/sim.pheno.tsv", sep="\t", index_col=0)
    # Validation samples are selected once, outside the replicate loop.
    df_val_pheno = df_pheno[df_pheno.GROUP == "eur_val"]
    for sim_i in range(N_SIM):

        # perform GWAS on the training genotypes for this replicate
        admix.tools.plink2.gwas(
            bfile=join(PLINK_DIR, "eur_train", BFILE_PREFIX),
            df_sample_info=df_pheno,
            pheno_col=f"SIM_{sim_i}",
            out_prefix=f"{out_dir}/sim_{sim_i}",
        )
        # convert the association results and store them as a TSV
        admix_prs.plink2_assoc_to_ldpred2(f"{out_dir}/sim_{sim_i}.assoc").to_csv(
            f"{out_dir}/sim_{sim_i}.assoc.ldpred2.tsv", index=False, sep="\t"
        )

        # separating validation phenotype
        df_val_pheno[["FID", "IID", f"SIM_{sim_i}"]].to_csv(
            f"{out_dir}/sim_{sim_i}.eur_val.pheno.tsv", index=False, sep="\t"
        )

In [5]:
import submitit

In [7]:
# Executor whose log folder is ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Shell lines handed to the executor as `setup` (presumably run before each job).
job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]
executor.update_parameters(time_min=250, memory_g=30, setup=job_setup)

# Parallel argument lists: the i-th entries form one call
# simulate_pheno_gwas(hsq, causal_prop, out_dir).
hsq_values = [0.05]
causal_prop_values = [0.01]
out_dirs = ["out/pheno/hsq-0.05-pcausal-0.01"]

jobs = executor.map_array(
    simulate_pheno_gwas, hsq_values, causal_prop_values, out_dirs
)

# Obtain local ancestry

In [None]:
import os

import pandas as pd
import xarray as xr

# Accumulate, per individual, the sum of `lanc` values over chromosomes 1-22,
# together with the total number of SNPs seen.
n_snp = 0
df_lanc = None
for chrom in tqdm(range(1, 23)):
    dset = xr.open_zarr(
        f"/u/project/pasaniuc/pasaniucdata/admixture/projects/admix-prs-uncertainty/data/admix-analysis/dataset/chr{chrom}.zarr"
    )
    # `sizes` is the stable name -> length mapping; mapping-style lookups on
    # `Dataset.dims` are deprecated in recent xarray releases.
    n_snp += dset.sizes["snp"]
    # Reduce over axes 1 and 2, leaving one value per entry of `dset.indiv`
    # (assumes `lanc` is laid out as (indiv, snp, ploidy) -- TODO confirm).
    df_tmp = pd.DataFrame(
        {"lanc": dset.lanc.data.sum(axis=(1, 2)).compute()}, index=dset.indiv.values
    )
    if df_lanc is None:
        df_lanc = df_tmp
    else:
        # DataFrame addition aligns on the individual index.
        df_lanc += df_tmp

# Divide by 2 * total SNP count (presumably two haplotypes per individual),
# turning the accumulated sum into a per-individual average.
df_lanc = df_lanc / (n_snp * 2)

# Create the output directory here: otherwise only the asynchronously running
# simulation jobs create `out/`, so it may not exist yet on a fresh run.
os.makedirs("out", exist_ok=True)
df_lanc.to_csv("out/admix_lanc.tsv", sep="\t")