In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import pandas as pd
import admix
import matplotlib.pyplot as plt
import numpy as np
import json
import pandas as pd
import seaborn as sns
import os
import dapgen
from os.path import join
import admix_genet_cor
import submitit
from scipy.stats import ttest_rel

In [2]:
# Dataset root; `submit_admix_ld` reads `{DATA_ROOT_DIR}/imputed/chr{CHROM}`.
DATA_ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects"
    "/PAGE-QC/01-dataset/out/aframr"
)
# Chromosome whose imputed file is loaded.
CHROM = 1


def GG(admix_ld, i, j):
    """
    Assemble the 2 x 2 ancestry-specific LD block between SNP i and SNP j:

        [[gi1 x gj1, gi1 x gj2],
         [gj1 x gi2, gi2 x gj2]]

    The diagonal entries are read from admix_ld["11"] and admix_ld["22"] at
    [i, j]; the off-diagonal entries are read from admix_ld["12"] at [i, j]
    (top right) and [j, i] (bottom left), so the block is in general
    not symmetric.
    """
    within_anc1 = admix_ld["11"]
    within_anc2 = admix_ld["22"]
    across_anc = admix_ld["12"]

    top_row = [within_anc1[i, j], across_anc[i, j]]
    bottom_row = [across_anc[j, i], within_anc2[i, j]]
    return np.array([top_row, bottom_row])

In [3]:
def submit_admix_ld(
    region: int,
    out: str,
    region_file="../02-region-locus-simulate/out/regions.txt",
    step=100,
):
    """
    Compute pairwise ancestry-specific LD "tagging" for one region and write
    the result to a tab-separated file.

    For every ordered pair (i, j) of retained SNPs, the 2-vector
    inv(GG(i, i)) @ GG(i, j) @ [1, 1]^T is computed from the LD matrices
    returned by `admix.data.admix_ld`; its two entries are stored as the
    `ld1` and `ld2` columns.

    region: int
        0-based row position of the region in region_file
    out: str
        path of the output TSV with columns posi, posj, ld1, ld2
        (one row per ordered SNP pair); its parent directory is created
        if it does not exist
    region_file: str
        tab-separated file providing the START and STOP columns of each region
    step: int
        keep every `step`-th SNP among those passing the filters

    """

    # Keep the integer argument intact; the selected row gets its own name.
    region_info = pd.read_csv(region_file, sep="\t").iloc[region, :]

    pfile = f"{DATA_ROOT_DIR}/imputed/chr{CHROM}"

    # read data
    dset = admix.io.read_dataset(pfile, snp_chunk=1024)

    # keep SNPs with START < POS <= STOP whose EUR_FREQ and AFR_FREQ are both
    # within [0.005, 0.995] (`between` is inclusive on both ends by default)
    dset = dset[
        (
            dset.snp.EUR_FREQ.between(0.005, 0.995)
            & dset.snp.AFR_FREQ.between(0.005, 0.995)
            & (region_info.START < dset.snp.POS)
            & (dset.snp.POS <= region_info.STOP)
        ).values
    ]

    # subsampling
    dset = dset[0::step]
    print(dset)
    dset.persist()

    admix_ld = admix.data.admix_ld(dset)

    # Positions as a plain array so that i / j are always positional.
    # `dset.snp.POS[i]` is a label lookup on an integer index (wrong row or
    # KeyError after subsetting) and a deprecated positional fallback otherwise.
    pos = dset.snp.POS.values
    n_snp = dset.n_snp
    ones = np.ones(2)[:, None]

    res = []
    for i in range(n_snp):
        # inv(GG(i, i)) depends on i only: compute it once per outer iteration
        inv_GiGi = np.linalg.inv(GG(admix_ld, i, i))
        for j in range(n_snp):
            tag = inv_GiGi @ GG(admix_ld, i, j) @ ones
            res.append([pos[i], pos[j]] + tag.flatten().tolist())
    res = pd.DataFrame(res, columns=["posi", "posj", "ld1", "ld2"])

    # `to_csv` does not create missing directories; exist_ok tolerates several
    # array tasks creating the same directory concurrently.
    out_dir = os.path.dirname(out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    res.to_csv(out, index=False, sep="\t")

In [4]:
# One row per region: the region id and the TSV path its result is written to.
df_params = pd.DataFrame({"region": np.arange(100).astype(int)}).assign(
    out=lambda params: [
        f"out/admix-ld/region{region}.tsv" for region in params["region"]
    ]
)

In [5]:
# Executor that uses ./submitit-logs as its working folder.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Per-job settings; `setup` holds the shell lines handed to the executor.
executor.update_parameters(
    time_min=60,
    memory_g=32,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# One call of `submit_admix_ld(region, out)` per row of `df_params`.
jobs = executor.map_array(submit_admix_ld, df_params["region"], df_params["out"])

In [6]:
# Pairwise tagging table for region 20, as written by `submit_admix_ld`.
# Built in one chain so that no column is assigned on a filtered frame
# (which raises SettingWithCopyWarning) and the cell can be re-run safely.
df_plot = (
    pd.read_csv("out/admix-ld/region20.tsv", sep="\t")
    # drop self-pairs, i.e. rows whose pairwise distance is 0
    .loc[lambda df: df["posi"] != df["posj"]]
    # absolute pairwise distance divided by 1e6 (plotted as Mb)
    .assign(dist=lambda df: (df["posi"] - df["posj"]).abs() / 1e6)
)

FileNotFoundError: [Errno 2] No such file or directory: 'out/admix-ld/region20.tsv'

In [None]:
# Taggability against pairwise distance: `ld1` on the left panel, `ld2` on the right.
fig, axes = plt.subplots(ncols=2, sharey=True, figsize=(6, 3), dpi=150)
for panel, column, title in zip(axes, ("ld1", "ld2"), ("European", "African")):
    panel.scatter(df_plot["dist"], df_plot[column], s=1, alpha=0.05)
    panel.set_title(title)

# y-axis is shared, so only the left panel carries the label
axes[0].set_ylabel("Taggability")

# single x-label centered under both panels
fig.text(0.5, -0.04, "Pairwise distance (Mb)", ha="center", fontsize=12)
fig.tight_layout()

In [None]:
# Paired t-test on |ld1| vs |ld2|; each row supplies one pair of values.
ttest_rel(df_plot["ld1"].abs(), df_plot["ld2"].abs())

Find `n_region=30` regions. In each region, take the center SNP as the causal SNP, select 800 SNPs by taking every 25th SNP within ±10k SNPs of it, and calculate the covariance difference.