In [1]:
%load_ext lab_black

import pandas as pd
import numpy as np
import dapgen
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import admix
from os.path import join
import os
import pandas as pd
from tqdm import tqdm
from admix_genet_cor import calc_snp_prior_var

# Compute GRM per chromosome

In [2]:
DATA_ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture/projects/PAGE-QC/01_dataset/out/aframr"
)


def compute_grm(snpset, hermodel, chrom):
    """
    Given snpset: [hm3, imputed]
    hermodel: [gcta, mafukb]
    chrom: 1-22

    Compute the GRM and store to out/admix-grm/{snpset}.{hermodel}.chr{chrom}[.A1.npy | .A2.npy | .weight.tsv]
    """
    out_prefix = f"out/admix-grm/{snpset}.{hermodel}.chr{chrom}"
    pfile = f"{DATA_ROOT_DIR}/{snpset}/chr{chrom}"

    dset = admix.dataset.read_dataset(pfile=pfile, snp_chunk=512, n_anc=2)
    df_snp_info = pd.read_csv(pfile + ".snp_info", sep="\t", index_col=0)
    dset._snp = pd.merge(dset.snp, df_snp_info, left_index=True, right_index=True)

    snp_subset = np.where((dset.snp.EUR_FREQ > 0.005) & (dset.snp.AFR_FREQ > 0.005))[0]

    dset = dset[snp_subset]
    dset.snp["PRIOR_VAR"] = calc_snp_prior_var(dset.snp, hermodel)

    K1, K2, K12 = admix_genet_cor.compute_grm(
        geno=dset.geno,
        lanc=dset.lanc,
        snp_prior_var=dset.snp.PRIOR_VAR.values,
        apa_center=False,
    )

    np.save(out_prefix + ".A1.npy", K1 + K2)
    np.save(out_prefix + ".A2.npy", K12 + K12.T)
    dset.snp[["PRIOR_VAR"]].to_csv(out_prefix + ".weight.tsv", sep="\t")

In [3]:
df_params = pd.DataFrame(
    [
        params
        for params in itertools.product(
            ["imputed"], ["mafukb", "gcta"], np.arange(1, 23)
        )
    ],
    columns=["snpset", "hermodel", "chrom"],
)

# find unfinished
df_params = df_params[
    ~df_params.apply(
        lambda p: os.path.exists(
            f"out/admix-grm/{p.snpset}.{p.hermodel}.chr{p.chrom}.weight.tsv"
        ),
        axis=1,
    )
]

In [10]:
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    time_min=600,
    memory_g=32,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

jobs = executor.map_array(
    compute_grm, df_params.snpset, df_params.hermodel, df_params.chrom
)

# Merge GRMs for all chromosomes

In [14]:
def merge_grm(snpset, hermodel):
    """
    Given snpset: [hm3, imputed]
    hermodel: [gcta, mafukb]
    chrom: 1-22

    Compute the GRM and store to out/admix-grm/{snpset}.{hermodel}.chr{chrom}[.A1.npy | .A2.npy | .weight.tsv]
    """
    prior_var_list = []
    for chrom in tqdm(range(1, 23)):
        prefix = f"out/admix-grm/{snpset}.{hermodel}.chr{chrom}"
        prior_var_list.append(
            pd.read_csv(prefix + ".weight.tsv", sep="\t", index_col=0)
        )

    def _merge(suffix):
        A = None
        weight_list = []
        for chrom in tqdm(range(1, 23)):
            prefix = f"out/admix-grm/{snpset}.{hermodel}.chr{chrom}"
            prior_var = prior_var_list[chrom - 1]
            weight = prior_var["PRIOR_VAR"].sum()
            weight_list.append(weight)
            if A is None:
                A = np.load(prefix + f".{suffix}.npy") * weight
            else:
                A += np.load(prefix + f".{suffix}.npy") * weight
        print(weight_list)
        A /= np.sum(weight_list)
        return A

    for m in ["A1", "A2"]:
        np.save(f"out/admix-grm/{snpset}.{hermodel}.all.{m}.npy", _merge(m))

    # save prior variance
    pd.concat(prior_var_list).to_csv(
        f"out/admix-grm/{snpset}.{hermodel}.all.weight.tsv", sep="\t"
    )

In [15]:
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    time_min=50,
    memory_g=32,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

jobs = executor.map_array(merge_grm, ["imputed"] * 2, ["mafukb", "gcta"])

In [5]:
def write_gcta_grm(snpset, hermodel, chrom):
    if isinstance(chrom, int) or isinstance(chrom, np.integer):
        chrom = f"chr{chrom}"
    else:
        assert chrom == "all"
    dset = admix.dataset.read_dataset(
        pfile=f"{DATA_ROOT_DIR}/{snpset}/chr22", snp_chunk=512, n_anc=2
    )
    n_indiv = dset.n_indiv

    prefix = f"out/admix-grm/{snpset}.{hermodel}.{chrom}"
    out_prefix = join(os.getcwd(), f"out/admix-grm/{snpset}.{hermodel}.{chrom}.")

    Ks = [np.load(prefix + ".A1.npy"), np.load(prefix + ".A2.npy")]

    n_snp = sum(1 for line in open(prefix + ".weight.tsv"))
    print(f"n_snp: {n_snp}")
    names = []

    for i, K in enumerate(Ks):
        name = f"K{i+1}"
        admix.tools.gcta.write_grm(
            out_prefix + name,
            K=K,
            df_id=pd.DataFrame(
                {"0": dset.indiv.index.values, "1": dset.indiv.index.values}
            ),
            n_snps=np.repeat(n_snp, n_indiv),
        )
        names.append(out_prefix + name)

    with open(out_prefix + "mgrm.txt", "w") as f:
        f.writelines("\n".join(names))

    # addition of all GRMs, used for likelihood ratio test
    K_full = sum(Ks)
    admix.tools.gcta.write_grm(
        out_prefix + "K_full",
        K=K_full,
        df_id=pd.DataFrame(
            {"0": dset.indiv.index.values, "1": dset.indiv.index.values}
        ),
        n_snps=np.repeat(n_snp, n_indiv),
    )

In [4]:
write_gcta_grm("imputed", "mafukb", "all")
write_gcta_grm("imputed", "mafukb", 1)
write_gcta_grm("imputed", "mafukb", 22)

n_snp: 7117286
n_snp: 93650
