In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import dapgen
import pandas as pd
import dask.array as da
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import pandas as pd
import admix
from os.path import join
import os

In [2]:
DATA_ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01-dataset/out/aframr"
)
CHROM = 1
REAL_PFILE = f"{DATA_ROOT_DIR}/imputed/chr{CHROM}"
SAMPLE_INFO_PATH = join(DATA_ROOT_DIR, "sample_info.txt")

# Step 1: generate regions

In [8]:
geno, df_snp, df_indiv = dapgen.read_pfile(REAL_PFILE, phase=True, snp_chunk=2048)

In [6]:
np.random.seed(0)
df_region = {"CHROM": [], "START": [], "STOP": []}
n_region = 100  # 100 regions
region_size = 20  # in megabase

while len(df_region["START"]) < n_region:
    start = np.random.uniform(
        low=df_snp.POS.min(), high=df_snp.POS.max() - region_size * 1e6
    )
    stop = start + region_size * 1e6
    start, stop = int(start), int(stop)
    n_snp = np.sum((start < df_snp.POS) & (df_snp.POS < stop))
    if n_snp < 1e5:
        continue
    df_region["CHROM"].append(CHROM)
    df_region["START"].append(start)
    df_region["STOP"].append(stop)

df_region = (
    pd.DataFrame(df_region).sort_values(["CHROM", "START"]).reset_index(drop=True)
)
df_region.to_csv("out/regions.txt", sep="\t", index=False)

# Step 2: writing real regions

In [3]:
def write_region(region_i):
    df_region = pd.read_csv("out/regions.txt", sep="\t")
    dset = admix.io.read_dataset(
        REAL_PFILE, indiv_info_file=SAMPLE_INFO_PATH, snp_chunk=256
    )
    dset._indiv = dset._indiv[["geno_EV1", "geno_EV2"]]
    lanc = admix.io.read_lanc(REAL_PFILE + ".lanc")

    region_idx = np.where(
        (df_region["START"][region_i] < dset.snp.POS)
        & (dset.snp.POS < df_region["STOP"][region_i])
    )[0]
    region_start_idx, region_stop_idx = region_idx[0], region_idx[-1]

    dset = dset[region_start_idx:region_stop_idx]
    lanc = lanc[region_start_idx:region_stop_idx]
    dset.indiv["AVG_ANC"] = dset.lanc.mean(axis=[0, 2]).compute()
    prefix = f"out/real-dataset/region{region_i}"
    admix.io.write_dataset(
        geno=dset.geno,
        lanc=lanc,
        df_indiv=dset.indiv,
        df_snp=dset.snp,
        out_prefix=prefix,
    )
    admix.tools.plink2.run(f"--pfile {prefix} --make-bed --out {prefix} --memory 16000")

In [1]:
n_snp_list = []
for i in range(100):
    with open(f"out/real-dataset/region{i}.bim") as f:
        n_snp = len(f.readlines())
        n_snp_list.append(n_snp)

In [12]:
import numpy as np
print(f"mean={np.mean(n_snp_list):.1f}, std={np.std(n_snp_list):.1f}")

mean=120395.5, std=6248.3


In [7]:
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    time_min=30,
    memory_g=80,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

jobs = executor.map_array(write_region, np.arange(100))

NameError: name 'submitit' is not defined

# Step 3: writing simulated regions

In [13]:
CHROM = 1
KG_PATH = f"/u/project/pasaniuc/kangchen/DATA/plink2-1kg/out/build38.chr{CHROM}"

In [14]:
def write_simu_region(region_i, anc_props):
    df_region = pd.read_csv("out/regions.txt", sep="\t")
    ref_dset = admix.io.read_dataset(KG_PATH)
    ref_dset = ref_dset[
        (
            (ref_dset.snp.CHROM == df_region.CHROM[region_i])
            & (
                ref_dset.snp.POS.between(
                    df_region.START[region_i], df_region.STOP[region_i]
                )
            )
        ).values
    ]
    ref_dset_list = [
        ref_dset[:, (ref_dset.indiv.SuperPop == pop).values] for pop in ["EUR", "AFR"]
    ]

    mosaic_size = admix.simulate.calculate_mosaic_size(
        df_snp=ref_dset.snp, genetic_map="hg38", chrom=1, n_gen=7
    )

    np.random.seed(1)

    dset, lanc = admix.simulate.admix_geno(
        geno_list=[dset.geno for dset in ref_dset_list],
        df_snp=ref_dset.snp,
        anc_props=anc_props,
        mosaic_size=mosaic_size,
        n_indiv=20_000,
        return_sparse_lanc=True,
    )

    # save
    prefix = (
        "out/simu-dataset/"
        f"region{region_i}.eur{int(anc_props[0] * 100)}.afr{int(anc_props[1] * 100)}"
    )

    admix.io.write_dataset(
        geno=dset.geno,
        lanc=lanc,
        df_indiv=dset.indiv,
        df_snp=dset.snp,
        out_prefix=prefix,
    )
    admix.tools.plink2.run(f"--pfile {prefix} --make-bed --out {prefix} --memory 16000")

    # reload data set and calculate stats
    dset = admix.io.read_dataset(prefix)
    dset.snp[["EUR_FREQ", "AFR_FREQ"]] = dset.af_per_anc()
    dset.snp["FREQ"] = dset.geno.mean(axis=[1, 2])
    dset.indiv["AVG_ANC"] = dset.lanc.mean(axis=[0, 2]).compute()
    dset.snp[["EUR_FREQ", "AFR_FREQ", "FREQ"]].to_csv(prefix + ".snp_info", sep="\t")
    dset.indiv[["AVG_ANC"]].to_csv(prefix + ".indiv_info", sep="\t")

In [None]:
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    time_min=60,
    memory_g=80,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)
jobs = executor.map_array(write_simu_region, np.arange(100), [[0.2, 0.8]] * 100)
jobs = executor.map_array(write_simu_region, np.arange(100), [[0.8, 0.2]] * 100)