In [1]:
%load_ext lab_black

import pandas as pd
import numpy as np
import dapgen
import itertools
import submitit
import json
import matplotlib.pyplot as plt
import admix_genet_cor
import admix
from os.path import join
import os

In [2]:
def calc_snp_prior_var(df_snp_info, her_model):
    """
    Calculate the SNP prior variance from SNP information

    Parameters
    ----------
    df_snp_info : pd.DataFrame
        Per-SNP table. Must contain a "FREQ" column for every model except
        "uniform", and additionally a "LDAK_WEIGHT" column for "ldak".
    her_model : str
        One of "uniform", "gcta", "ldak", "mafukb":
        - "uniform": every SNP gets prior variance 1
        - "gcta":    [f (1 - f)]^-1
        - "mafukb":  [f (1 - f)]^-0.38
        - "ldak":    [f (1 - f)]^-0.25 * LDAK_WEIGHT

    Returns
    -------
    np.ndarray
        One prior variance per row of `df_snp_info`.

    Raises
    ------
    AssertionError
        If `her_model` is not a supported model, or if any frequency is not
        strictly between 0 and 1 (NaN included) for a frequency-dependent model.
    """
    assert her_model in ["uniform", "gcta", "ldak", "mafukb"]
    if her_model == "uniform":
        return np.ones(len(df_snp_info))

    # Exponent applied to freq * (1 - freq) for each frequency-dependent model.
    exponents = {
        "gcta": -1,
        # MAF-dependent genetic architecture, \alpha = -0.38 estimated from meta-analysis in UKB traits
        "mafukb": -0.38,
        "ldak": -0.25,
    }
    if her_model not in exponents:
        # Only reachable when assertions are disabled (python -O).
        raise NotImplementedError

    freq = df_snp_info["FREQ"].values
    # freq * (1 - freq) is raised to a negative power, so a frequency of exactly
    # 0 or 1 would silently yield an infinite prior variance. Reject both ends,
    # for every frequency-dependent model (including "ldak").
    assert np.all(freq > 0), "frequencies should be larger than zero"
    assert np.all(freq < 1), "frequencies should be smaller than one"

    prior_var = np.float_power(freq * (1 - freq), exponents[her_model])
    if her_model == "ldak":
        prior_var = prior_var * df_snp_info["LDAK_WEIGHT"].values
    return prior_var

In [3]:
# Root directory of the input data; compute_grm reads
# "{DATA_ROOT_DIR}/{snpset}/chr{chrom}" from underneath it.
DATA_ROOT_DIR = (
    "/u/project/pasaniuc/pasaniucdata/admixture"
    "/projects/PAGE-QC/01_dataset/out/aframr"
)


def compute_grm(snpset, hermodel, chrom):
    """
    Compute and save the GRM matrices for one (SNP set, heritability model,
    chromosome) combination.

    Reads the dataset at "{DATA_ROOT_DIR}/{snpset}/chr{chrom}" together with its
    ".snp_info" table, keeps the SNPs whose EUR_FREQ and AFR_FREQ both exceed
    0.005, derives the per-SNP prior variance with `calc_snp_prior_var`, and
    passes it to `admix_genet_cor.compute_grm`.

    Parameters
    ----------
    snpset : str
        Name of the SNP-set sub-directory under `DATA_ROOT_DIR`.
    hermodel : str
        Heritability model name forwarded to `calc_snp_prior_var`.
    chrom : int
        Chromosome number used in the input and output file names.

    Outputs
    -------
    Written to "out/admix-grm/{snpset}.{hermodel}.chr{chrom}" + suffix, relative
    to the current working directory (the directory is created if missing):
    - ".A1.npy"     : K1 + K2
    - ".A2.npy"     : K12 + K12.T
    - ".weight.tsv" : the PRIOR_VAR column of the retained SNPs (tab-separated)
    """
    out_prefix = f"out/admix-grm/{snpset}.{hermodel}.chr{chrom}"
    pfile = f"{DATA_ROOT_DIR}/{snpset}/chr{chrom}"

    # Create the output directory before doing any work, so a missing directory
    # cannot make np.save fail after the expensive GRM computation. exist_ok
    # keeps this safe when several array tasks start at the same time.
    os.makedirs(os.path.dirname(out_prefix), exist_ok=True)

    dset = admix.dataset.read_dataset(pfile=pfile, snp_chunk=512, n_anc=2)
    df_snp_info = pd.read_csv(pfile + ".snp_info", sep="\t", index_col=0)
    # Attach the extra per-SNP columns by joining on the SNP index.
    # NOTE(review): this is an inner join written to a private attribute; it
    # presumably relies on .snp_info covering every SNP in the dataset so that
    # rows stay aligned with dset.geno -- confirm.
    dset._snp = pd.merge(dset.snp, df_snp_info, left_index=True, right_index=True)

    # Positional indices of SNPs with frequency above 0.005 in both EUR and AFR.
    snp_subset = np.where((dset.snp.EUR_FREQ > 0.005) & (dset.snp.AFR_FREQ > 0.005))[0]

    dset = dset[snp_subset]
    dset.snp["PRIOR_VAR"] = calc_snp_prior_var(dset.snp, hermodel)

    # NOTE(review): K1/K2 are presumably the per-ancestry GRMs and K12 the
    # cross-ancestry term -- confirm against admix_genet_cor.compute_grm.
    K1, K2, K12 = admix_genet_cor.compute_grm(
        geno=dset.geno,
        lanc=dset.lanc,
        snp_prior_var=dset.snp.PRIOR_VAR.values,
        apa_center=True,
    )

    np.save(out_prefix + ".A1.npy", K1 + K2)
    np.save(out_prefix + ".A2.npy", K12 + K12.T)
    dset.snp[["PRIOR_VAR"]].to_csv(out_prefix + ".weight.tsv", sep="\t")

In [4]:
# Job grid: one row per (SNP set, heritability model, chromosome 1-22) combination.
df_params = pd.DataFrame(
    list(itertools.product(["imputed", "hm3"], ["mafukb", "gcta"], np.arange(1, 23))),
    columns=["snpset", "hermodel", "chrom"],
)

In [7]:
# Executor whose working folder is ./submitit-logs.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Per-job resources plus shell lines passed as `setup`
# (presumably run before each job starts -- confirm against the executor).
executor.update_parameters(
    time_min=900,
    memory_g=32,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# One compute_grm call per row of df_params, with the three columns supplied
# as parallel argument sequences.
jobs = executor.map_array(
    compute_grm,
    *(df_params[column] for column in ("snpset", "hermodel", "chrom")),
)