In [10]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import xarray as xr
import numpy as np
import dapgen
import pandas as pd
import dask.array as da
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import pandas as pd
import admix
from os.path import join
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


# Overview of simulation studies
- p_causal: 0.001, 1.0
- var_g: 1.0
- var_e: 1.0
- gamma: 0.5, 0.8, 1.0
- set of SNPs: phenotypes are simulated from the imputed SNPs; the non-zero effects are drawn only from SNPs with allele frequencies > 0.005 in both populations.

From S-LDXR paper: 
```
In these simulations, we randomly selected 10% of the SNPs to be causal in each population, with 80% of causal variants in each population shared with the other population, and sampled perfectly correlated causal effect sizes for shared causal variants using Eq. In these simulations, we set the variance of causal effect size of each SNP j in both populations to be proportional to $[p_{j,max}(1−p_{j,max})]^\alpha$, where pj,max is the maximum MAF of SNP j in the two populations.  We set α to − 0.38, as previously estimated for 25 UK Biobank diseases 
and complex traits in ref.
```

Here, we simulate from the imputed SNPs: the non-zero effects are restricted to SNPs with allele frequencies > 0.005 in both populations, and from these SNPs we sample 1% to carry causal effects.

In [11]:
# CONSTANTS

# chromosome used by every step of this notebook
CHROM = 22
# root directory of the dataset; the imputed files are read from <DATA_ROOT_DIR>/imputed
DATA_ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01_dataset/out/aframr"
)

# Simulation grid: one row per combination of hsq x pcausal x cor x hermodel
df_simulate_params = pd.DataFrame(
    list(
        itertools.product(
            [0.05, 0.1],  # hsq
            [0.001, 0.01],  # pcausal
            [0.8, 1.0],  # cor
            ["mafukb"],  # hermodel
        )
    ),
    columns=["hsq", "pcausal", "cor", "hermodel"],
)

# each parameter combination gets its own output prefix
df_simulate_params["out_prefix"] = [
    f"out/pheno/hsq-{row.hsq}-pcausal-{row.pcausal}-cor-{row.cor}-hermodel-{row.hermodel}"
    for row in df_simulate_params.itertuples(index=False)
]

# Step 1: Simulate phenotype

In [12]:
def calc_snp_prior_var(df_snp_info, her_model):
    """
    Derive the per-SNP prior variance from the SNP information table.

    df_snp_info: table with one row per SNP; the "FREQ" column is read for every
        model except "uniform", and "LDAK_WEIGHT" is read for "ldak"
    her_model: one of [uniform, gcta, ldak, mafukb]

    Returns an array with one value per SNP:
        uniform: 1
        gcta:    [f (1 - f)]^-1
        mafukb:  [f (1 - f)]^-0.38
        ldak:    [f (1 - f)]^-0.25 * LDAK weight
    """
    assert her_model in ["uniform", "gcta", "ldak", "mafukb"]

    if her_model == "uniform":
        return np.ones(len(df_snp_info))

    # exponent applied to f (1 - f) for the pure power-law models;
    # -0.38 is the MAF-dependent architecture estimated from meta-analysis in UKB traits
    power_law_alpha = {"gcta": -1, "mafukb": -0.38}
    if her_model in power_law_alpha:
        maf = df_snp_info["FREQ"].values
        assert np.all(maf > 0), "frequencies should be larger than zero"
        return np.float_power(maf * (1 - maf), power_law_alpha[her_model])

    if her_model == "ldak":
        maf = df_snp_info["FREQ"].values
        ldak_weight = df_snp_info["LDAK_WEIGHT"].values
        return np.float_power(maf * (1 - maf), -0.25) * ldak_weight

    raise NotImplementedError

In [13]:
def submit_simulate_pheno(
    hsq: float,
    pcausal: float,
    cor: float,
    hermodel: str,
    out_prefix: str,
):
    """
    Simulate phenotypes for chromosome CHROM and write effects and phenotypes to disk.

    hsq: forwarded as `hsq` to admix_genet_cor.simulate_continuous_pheno
    pcausal: proportion of the eligible SNPs (EUR_FREQ and AFR_FREQ both > 0.005)
        drawn as causal in each simulation
    cor: correlation between the two ancestry-specific effects of a causal SNP
    hermodel: one of [uniform, gcta, ldak, mafukb], see calc_snp_prior_var
    out_prefix: output prefix; writes <out_prefix>.beta.npy and <out_prefix>.pheno.tsv

    The random seed is fixed, so every call draws the same random stream.
    """
    N_SIM = 30
    N_ANC = 2

    # Create the output directory before the expensive work starts; otherwise a
    # missing folder is only discovered when the results are saved.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    np.random.seed(1234)

    pfile = f"{DATA_ROOT_DIR}/imputed/chr{CHROM}"
    print(
        f"hsq: {hsq}, pcausal: {pcausal}, cor: {cor}, hermodel: {hermodel}, out_prefix: {out_prefix}"
    )
    geno, df_snp, df_indiv = dapgen.read_pfile(pfile, phase=True, snp_chunk=1024)
    lanc = admix.io.read_lanc(pfile + ".lanc", snp_chunk=1024)
    df_snp_info = pd.read_csv(pfile + ".snp_info", sep="\t")

    # the SNP info table is indexed positionally below, so it must be row-aligned
    # with the genotype matrix
    assert np.all(
        df_snp_info.SNP == df_snp.index.values
    ), "SNP order in .snp_info does not match the genotype file"

    snp_prior_var = calc_snp_prior_var(df_snp_info, hermodel)

    # simulate effects: only SNPs common enough in both populations can be causal
    snp_subset = np.where(
        (df_snp_info.EUR_FREQ > 0.005) & (df_snp_info.AFR_FREQ > 0.005)
    )[0]
    beta = np.zeros((df_snp_info.shape[0], N_ANC, N_SIM))  # (n_snp, n_anc, n_sim)
    n_causal = int(len(snp_subset) * pcausal)
    for i_sim in range(N_SIM):
        cau = sorted(np.random.choice(snp_subset, size=n_causal, replace=False))

        # one pair of effects per causal SNP, correlated at `cor` across ancestries
        i_beta = np.random.multivariate_normal(
            mean=[0.0, 0.0],
            cov=np.array([[1, cor], [cor, 1]]),
            size=n_causal,
        )

        # scale the unit-variance draws by the per-SNP prior standard deviation
        i_beta = i_beta * np.sqrt(snp_prior_var[cau])[:, None]

        # beta[cau, :, i_sim] has shape (n_causal, n_anc), the same as i_beta
        beta[cau, :, i_sim] = i_beta

    sim = admix_genet_cor.simulate_continuous_pheno(
        geno=geno, lanc=lanc, hsq=hsq, beta=beta, n_sim=N_SIM, apa_center=False
    )
    np.save(out_prefix + ".beta.npy", sim["beta"])
    df_pheno = pd.DataFrame(
        sim["pheno"],
        index=df_indiv.index,
        columns=[f"SIM_{i}" for i in range(sim["pheno"].shape[1])],
    )
    df_pheno.to_csv(out_prefix + ".pheno.tsv", index=True, sep="\t")

In [14]:
# Submit the phenotype simulations as an SGE job array; submitit keeps its
# files under ./submitit-logs
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(
    time_min=250,
    memory_g=40,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# the columns are passed in the positional order of submit_simulate_pheno's arguments
jobs = executor.map_array(
    submit_simulate_pheno,
    *(
        df_simulate_params[col]
        for col in ["hsq", "pcausal", "cor", "hermodel", "out_prefix"]
    ),
)

# Step 2: Compute GRM

In [33]:
def submit_compute_grm(hermodel: str, out_prefix: str):
    """
    Compute the GRMs for chromosome CHROM and save them as .npy files.

    hermodel: one of [uniform, gcta, ldak, mafukb], see calc_snp_prior_var
    out_prefix: output prefix; writes <out_prefix>.K1.npy, <out_prefix>.K2.npy
        and <out_prefix>.K12.npy

    Only SNPs with EUR_FREQ and AFR_FREQ both > 0.005 enter the GRMs.
    """
    # Create the output directory before the expensive work starts; otherwise a
    # missing folder is only discovered when the matrices are saved.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    pfile = f"{DATA_ROOT_DIR}/imputed/chr{CHROM}"

    geno, df_snp, _ = dapgen.read_pfile(pfile, phase=True, snp_chunk=512)
    lanc = admix.io.read_lanc(pfile + ".lanc", snp_chunk=512)

    df_snp_info = pd.read_csv(pfile + ".snp_info", sep="\t")

    # the SNP info table is used to subset geno/lanc by position, so it must be
    # row-aligned with the genotype matrix (same check as in submit_simulate_pheno)
    assert np.all(
        df_snp_info.SNP == df_snp.index.values
    ), "SNP order in .snp_info does not match the genotype file"

    snp_prior_var = calc_snp_prior_var(df_snp_info, hermodel)
    print("snp_prior_var:")
    print(snp_prior_var)

    snp_subset = np.where(
        (df_snp_info.EUR_FREQ > 0.005) & (df_snp_info.AFR_FREQ > 0.005)
    )[0]

    K1, K2, K12 = admix_genet_cor.compute_grm(
        geno=geno[snp_subset, :, :],
        lanc=lanc[snp_subset, :, :],
        snp_prior_var=snp_prior_var[snp_subset],
        apa_center=True,
    )

    np.save(out_prefix + ".K1.npy", K1)
    np.save(out_prefix + ".K2.npy", K2)
    np.save(out_prefix + ".K12.npy", K12)

In [34]:
# Submit the GRM computation as an SGE job array; submitit keeps its files
# under ./submitit-logs
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(
    time_min=180,
    memory_g=40,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# one GRM set per heritability model (a single-factor grid for now)
df_grm_params = pd.DataFrame(
    list(itertools.product(["mafukb"])),
    columns=["hermodel"],
)
df_grm_params["out_prefix"] = [
    f"out/grm/hermodel-{row.hermodel}"
    for row in df_grm_params.itertuples(index=False)
]

jobs = executor.map_array(
    submit_compute_grm,
    *(df_grm_params[col] for col in ["hermodel", "out_prefix"]),
)

# Step 3: Estimation

In [6]:
# Estimation runs on the same grid as the simulation; the output prefix is the
# phenotype prefix with "pheno" swapped for "estimate"
df_estimate_params = df_simulate_params.assign(
    estimate_out_prefix=df_simulate_params["out_prefix"].str.replace(
        "pheno", "estimate", regex=False
    )
)

In [7]:
def submit_estimate(hermodel: str, pheno: str, out_prefix: str):
    """
    Run admix_genet_cor.estimate_genetic_cor on a phenotype file and save the result as JSON.

    hermodel: heritability model; selects the precomputed GRMs stored at
        out/grm/hermodel-<hermodel>.{K1,K2,K12}.npy
    pheno: path to the phenotype file (whitespace-delimited, first column is the index)
    out_prefix: prefix to the output; writes <out_prefix>.estimate.json
    """
    print(f"submit_estimate:\npheno={pheno}, out_prefix={out_prefix}")

    # Create the output directory before the expensive work starts; otherwise a
    # missing folder is only discovered when the estimates are written.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Step 1: load GRM
    grm_prefix = f"out/grm/hermodel-{hermodel}"
    K1 = np.load(grm_prefix + ".K1.npy")
    K2 = np.load(grm_prefix + ".K2.npy")
    K12 = np.load(grm_prefix + ".K12.npy")

    # Step 2: combine the GRMs into the two matrices passed to the estimator
    A1 = K1 + K2
    A2 = K12 + K12.T

    # K1, K2, K12 are no longer used, release memory for them
    del K1, K2, K12

    # Step 3: load phenotype and estimate
    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True
    df_pheno = pd.read_csv(pheno, sep=r"\s+", index_col=0)
    print(df_pheno.head())
    n_indiv = df_pheno.shape[0]

    # the only covariate is a column of ones
    rls_list = admix_genet_cor.estimate_genetic_cor(
        A1, A2, df_pheno.values, cov=np.ones((n_indiv, 1))
    )

    # NOTE(review): a 2-element result is treated as an (estimate, varcov) pair and
    # anything else as a bare estimate; confirm against estimate_genetic_cor's return type
    if len(rls_list[0]) == 2:
        data = [
            {"estimate": rls[0].tolist(), "varcov": rls[1].tolist()} for rls in rls_list
        ]
    else:
        data = [{"estimate": rls.tolist()} for rls in rls_list]

    with open(out_prefix + ".estimate.json", "w") as out:
        json.dump(data, out)

In [8]:
# Submit the estimation as an SGE job array; submitit keeps its files under
# ./submitit-logs
executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(
    time_min=30,
    memory_g=48,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# phenotype files written by the simulation step
pheno_paths = df_estimate_params["out_prefix"] + ".pheno.tsv"

jobs = executor.map_array(
    submit_estimate,
    df_estimate_params["hermodel"],
    pheno_paths,
    df_estimate_params["estimate_out_prefix"],
)

In [9]:
# Display the submitted job handles; the repr shows each job's id and state
jobs

[SgeJob<job_id=449371_1, task_id=0, state="pending">,
 SgeJob<job_id=449371_2, task_id=0, state="pending">,
 SgeJob<job_id=449371_3, task_id=0, state="pending">,
 SgeJob<job_id=449371_4, task_id=0, state="pending">]

# LEGACY versions

def submit_simulate_pheno(
    hsq: float,
    pcausal: float,
    cor: float,
    her_model: str,
    out_prefix: str,
):
    """
    Legacy phenotype simulation: the causal effects are drawn inside
    admix_genet_cor.simulate_continuous_pheno instead of being passed in.

    hsq, cor: forwarded to admix_genet_cor.simulate_continuous_pheno
    pcausal: proportion of all SNPs in the genotype file that are causal
    her_model: one of [uniform, gcta, ldak]
    out_prefix: output prefix; writes <out_prefix>.beta.npy and <out_prefix>.pheno.tsv
    """
    assert her_model in ["uniform", "gcta", "ldak"]

    np.random.seed(1234)

    pfile = f"{DATA_ROOT_DIR}/imputed/chr{CHROM}"
    print(
        f"hsq: {hsq}, pcausal: {pcausal}, cor: {cor}, hermodel: {her_model}, out_prefix: {out_prefix}"
    )

    # genotype and local ancestry are read with the same SNP chunk size
    geno, df_snp, df_indiv = dapgen.read_pfile(pfile, phase=True, snp_chunk=1024)
    lanc = admix.io.read_lanc(pfile + ".lanc", snp_chunk=1024)

    snp_prior_var = calc_snp_prior_var(
        pd.read_csv(pfile + ".snp_info", sep="\t"), her_model
    )
    print(snp_prior_var)

    # number of causal SNPs is a fraction of all SNPs (no frequency filter here)
    n_causal = int(geno.shape[0] * pcausal)
    print(f"n_causal={n_causal}")

    sim = admix_genet_cor.simulate_continuous_pheno(
        geno=geno,
        lanc=lanc,
        hsq=hsq,
        cor=cor,
        n_causal=n_causal,
        snp_prior_var=snp_prior_var,
        n_sim=30,
    )

    np.save(out_prefix + ".beta.npy", sim["beta"])

    # one column per simulation replicate, one row per individual
    sim_names = [f"SIM_{i}" for i in range(sim["pheno"].shape[1])]
    pd.DataFrame(sim["pheno"], index=df_indiv.index, columns=sim_names).to_csv(
        out_prefix + ".pheno.tsv", index=True, sep="\t"
    )