In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import admix
import dapgen
import pandas as pd
import numpy as np
import submitit

In [2]:
# Chromosome number; selects which per-chromosome reference file is loaded below.
CHROM = 1
# Path prefix of the reference dataset for chromosome CHROM
# (directory name suggests 1000 Genomes, build 38 — confirm before reusing elsewhere).
KG_PATH = f"/u/project/pasaniuc/kangchen/DATA/plink2-1kg/out/build38.chr{CHROM}"

In [3]:
def create_dataset(region_i, anc_props):
    """Simulate an admixed dataset for one region and write it to ``out/datasets``.

    The reference panel at ``KG_PATH`` is restricted to the SNPs of region
    ``region_i`` (as listed in ``../02-region-locus-simulate/out/regions.txt``),
    split into EUR and AFR reference individuals, and used to simulate
    15,000 admixed individuals. Genotypes, local ancestry and summary
    statistics are then written under a prefix derived from ``region_i``
    and ``anc_props``.

    Parameters
    ----------
    region_i : int
        Index label of the region row in ``regions.txt`` (columns ``CHROM``,
        ``START`` and ``STOP`` are read from that row).
    anc_props : sequence of two floats
        Ancestry proportions forwarded to ``admix.simulate.admix_geno``;
        presumably ordered like the reference populations (EUR, AFR) —
        confirm against the admix documentation.

    Side effects
    ------------
    Writes ``{prefix}.pgen/.psam/.pvar``, ``{prefix}.lanc``, the files that
    ``plink2 --make-bed`` produces, ``{prefix}.snp_info`` and
    ``{prefix}.indiv_info``, where
    ``prefix = out/datasets/region{region_i}_{100*p0}_{100*p1}``.
    """
    # Function-scope import keeps the function self-contained when it is
    # shipped to a cluster job.
    import os

    # `sep=r"\s+"` is the non-deprecated spelling of `delim_whitespace=True`.
    df_regions = pd.read_csv(
        "../02-region-locus-simulate/out/regions.txt", sep=r"\s+"
    )

    # Restrict the reference panel to SNPs that fall inside the requested region.
    ref_dset = admix.io.read_dataset(KG_PATH)
    in_region = (ref_dset.snp.CHROM == df_regions.CHROM[region_i]) & (
        ref_dset.snp.POS.between(
            df_regions.START[region_i], df_regions.STOP[region_i]
        )
    )
    ref_dset = ref_dset[in_region.values]

    # One reference sub-panel per source population, in the order EUR, AFR.
    ref_dset_list = [
        ref_dset[:, (ref_dset.indiv.SuperPop == pop).values] for pop in ["EUR", "AFR"]
    ]

    # Use the configured chromosome (the one KG_PATH was built from) rather
    # than a hard-coded literal, so changing CHROM keeps both in sync.
    mosaic_size = admix.simulate.calculate_mosaic_size(
        df_snp=ref_dset.snp, genetic_map="hg38", chrom=CHROM, n_gen=7
    )

    # Fixed seed: every call starts from the same NumPy global RNG state.
    # NOTE(review): assumes admix_geno draws from NumPy's global RNG — confirm.
    np.random.seed(1)

    admix_dset, admix_lanc = admix.simulate.admix_geno(
        geno_list=[dset.geno for dset in ref_dset_list],
        df_snp=ref_dset.snp,
        anc_props=anc_props,
        mosaic_size=mosaic_size,
        n_indiv=15_000,
        return_sparse_lanc=True,
    )

    # save
    # round() instead of int(): int() truncates, so e.g. 0.29 * 100
    # (28.999999999999996) would be labelled 28 instead of 29.
    pct_0 = int(round(anc_props[0] * 100))
    pct_1 = int(round(anc_props[1] * 100))
    prefix = f"out/datasets/region{region_i}_{pct_0}_{pct_1}"
    # Parallel jobs may race to create the directory; exist_ok makes that safe.
    os.makedirs(os.path.dirname(prefix), exist_ok=True)

    dapgen.write_pgen(f"{prefix}.pgen", admix_dset.geno)
    admix_dset.indiv.rename_axis("#IID").to_csv(f"{prefix}.psam", sep="\t")

    # Build the .pvar table: rename to the header names used in the file and
    # put the five fixed columns first, followed by any remaining columns.
    df_snp = admix_dset.snp.reset_index().rename(
        columns={"snp": "ID", "CHROM": "#CHROM"}
    )
    fixed_cols = ["#CHROM", "POS", "ID", "REF", "ALT"]
    df_snp = df_snp[
        fixed_cols + [col for col in df_snp.columns if col not in fixed_cols]
    ]
    df_snp.to_csv(f"{prefix}.pvar", sep="\t", index=False)

    admix.io.write_lanc(f"{prefix}.lanc", admix_lanc)

    admix.tools.plink2.run(f"--pfile {prefix} --make-bed --out {prefix}")

    # reload data set and calculate stats
    admix_dset = admix.io.read_dataset(prefix)
    admix_dset.snp[["EUR_FREQ", "AFR_FREQ"]] = admix_dset.af_per_anc()
    admix_dset.snp["FREQ"] = admix_dset.geno.mean(axis=[1, 2])
    admix_dset.indiv["AVG_ANC"] = admix_dset.lanc.mean(axis=[0, 2]).compute()
    admix_dset.snp[["EUR_FREQ", "AFR_FREQ", "FREQ"]].to_csv(
        prefix + ".snp_info", sep="\t"
    )
    admix_dset.indiv[["AVG_ANC"]].to_csv(prefix + ".indiv_info", sep="\t")

In [1]:
import admix

In [2]:
# Load one simulated dataset for inspection. The prefix matches what
# create_dataset writes for region 0 with anc_props=[0.2, 0.8], so that job
# must have finished before this cell is run.
dset = admix.io.read_dataset("out/datasets/region0_20_80")

2022-01-21 15:40.46 [info     ] admix.Dataset: read local ancestry from out/datasets/region0_20_80.lanc
2022-01-21 15:40.48 [info     ] admix.Dataset: `n_anc` is not provided, infered n_anc from the first 1,000 SNPs is 2. If this is not correct, provide `n_anc` when constructing admix.Dataset


In [6]:
# Sanity check: cohort-wide mean of the per-individual AVG_ANC column.
# For the 20/80 dataset this should land near one of the simulated proportions.
dset.indiv["AVG_ANC"].mean()

0.799543294492783

In [3]:
# Show the dataset summary: dimensions and the available snp / indiv columns.
dset

admix.Dataset object with n_snp x n_indiv = 98265 x 15000, n_anc=2
	snp: 'CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'FILTER', 'EUR_FREQ', 'AFR_FREQ', 'FREQ'
	indiv: 'AVG_ANC'

In [4]:
# Executor used to submit create_dataset runs as cluster jobs; it is given
# ./submitit-logs as its folder.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Resource request and shell setup lines passed to the executor
# (presumably 30 minutes and 60 GB per job — confirm against the executor docs).
executor.update_parameters(
    time_min=30,
    memory_g=60,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# Simulation grid: every region index 0..N_REGIONS-1 for each ancestry setting.
N_REGIONS = 100
ANC_PROPS_GRID = [[0.2, 0.8], [0.8, 0.2]]

# Keep the handles from *every* submission. Re-assigning `jobs` per setting
# would drop the handles of the earlier array, leaving no way to poll those
# jobs or collect their errors.
jobs = []
for anc_props in ANC_PROPS_GRID:
    jobs.extend(
        executor.map_array(
            create_dataset, np.arange(N_REGIONS), [anc_props] * N_REGIONS
        )
    )