# Evaluating properties of testing heterogeneity

In [80]:
import numpy as np
import pandas as pd
import pickle
import xarray as xr
import admix
import matplotlib.pyplot as plt
import statsmodels.api as sm
from tqdm import tqdm

In [5]:
def simulate(apa, beta, cov):
    """Draw one simulated phenotype vector.

    The phenotype is the sum of three terms: the allelic effect
    ``apa @ beta``, a covariate term ``cov @ cov_effects`` where each
    covariate effect is drawn from a normal with mean 0 and sd 0.1, and
    standard-normal noise.

    Parameters
    ----------
    apa : array whose first axis has one entry per sample
    beta : effect sizes multiplied against ``apa`` via ``np.dot``
    cov : 2-D array of covariates, one column per covariate

    Returns
    -------
    y : array with ``apa.shape[0]`` entries
    """
    n_sample = apa.shape[0]
    n_cov = cov.shape[1]

    # Covariate effects are drawn before the noise; with a fixed global seed
    # the result depends on this draw order.
    cov_effects = np.random.normal(loc=0, scale=0.1, size=n_cov)

    allelic_term = np.dot(apa, beta)
    covariate_term = np.dot(cov, cov_effects)
    noise = np.random.normal(size=n_sample)
    return allelic_term + covariate_term + noise

def test_het(apa, y, cov):
    """Test whether the first two columns of ``apa`` have equal effects on ``y``.

    Fits an OLS model of ``y`` on an intercept, the columns of ``apa`` and the
    columns of ``cov``, then F-tests the single linear contrast
    ``coef(apa[:, 0]) - coef(apa[:, 1]) == 0``.

    Parameters
    ----------
    apa : 2-D array; its first two columns are the ones being contrasted
    y : response vector
    cov : 2-D array of covariates, stacked to the right of ``apa``

    Returns
    -------
    p_ftest : float
        p-value of the F-test for the contrast.
    model : fitted statsmodels OLS results object
    """
    # has_constant="add" always prepends the intercept column. With the
    # default ("skip"), statsmodels adds nothing when the design already
    # contains a constant column (e.g. a covariate that does not vary), which
    # would shift every coefficient index by one and make the contrast below
    # silently compare the wrong coefficients.
    design = sm.add_constant(np.hstack([apa, cov]), has_constant="add")
    model = sm.OLS(y, design).fit()

    # Index 0 is the intercept, so indices 1 and 2 are the two apa columns.
    contrast = np.zeros([1, len(model.params)])
    contrast[0, 1] = 1
    contrast[0, 2] = -1
    p_ftest = model.f_test(contrast).pvalue.item()
    return p_ftest, model

In [30]:
# Load the GWAS-hit dataset and keep only SNPs that are common enough in
# every ancestry.
dset = xr.open_zarr("out/locus_het/gwas_hit.zarr/", chunks=-1)

# The return value is not used; the next line reads dset["af_per_anc"], so
# this call is expected to attach that variable to `dset`.
admix.tools.af_per_anc(dset)

# Minor allele frequency per ancestry, then the minimum across axis 1.
freq = dset["af_per_anc"].values
maf = np.minimum(freq, 1 - freq).min(axis=1)

keep = maf > 0.005
dset = dset.sel(snp=keep)

# Simulate beta heterogeneity effect sizes
Compare two approaches: testing with and without conditioning on local ancestry.

In [214]:
# Simulate phenotypes at every SNP and record the heterogeneity-test p-value
# under two models: covariates only ("w/o lanc") and covariates plus the
# SNP's local ancestry ("w lanc").
np.random.seed(1234)
n_sim = 5

dict_rls = {"snp_i": [], "effect": [], "sim_i": [], "method": [], "pval": []}

# Covariate matrix: age, sex and geno_EV1..geno_EV9, one column each.
cov_cols = ["age", "sex"] + [f"geno_EV{i}" for i in range(1, 10)]
cov = np.column_stack([dset.coords[col + "@indiv"].values for col in cov_cols])
allele_per_anc = dset["allele_per_anc"].values
# Sum local ancestry over axis 2 to get one value per (indiv, snp).
lanc = dset.lanc.sum(axis=2).values

# `Dataset.sizes` is the supported mapping from dimension name to length;
# indexing `Dataset.dims` by name is deprecated in recent xarray releases.
for snp_i in tqdm(range(dset.sizes["snp"])):
    apa_snp = allele_per_anc[:, snp_i, :]
    lanc_snp = lanc[:, snp_i]
    # Depends only on the SNP, so build it once rather than per simulation.
    cov_with_lanc = np.column_stack([cov, lanc_snp])

    # The second effect is fixed at 0.25, so effect == 0.25 is the null
    # (no heterogeneity) setting.
    for effect in [0.15, 0.2, 0.25]:
        pvals1 = []
        pvals2 = []

        for sim_i in range(n_sim):
            y_sim = simulate(apa_snp, [effect, 0.25], cov)
            # w/o conditioning on local ancestry
            pval1, model1 = test_het(apa_snp, y_sim, cov)
            pvals1.append(pval1)
            # w conditioning on local ancestry (added as an extra covariate)
            pval2, model2 = test_het(apa_snp, y_sim, cov_with_lanc)
            pvals2.append(pval2)

        for method, pvals in zip(["w/o lanc", "w lanc"], [pvals1, pvals2]):
            dict_rls["pval"].extend(pvals)
            dict_rls["method"].extend([method] * n_sim)
            dict_rls["sim_i"].extend(np.arange(n_sim))
            dict_rls["effect"].extend([effect] * n_sim)
            dict_rls["snp_i"].extend([snp_i] * n_sim)

df_rls = pd.DataFrame(dict_rls)

100%|██████████| 155/155 [05:30<00:00,  2.13s/it]


In [222]:
# For every SNP, compute how differently the two allele-per-ancestry columns
# correlate with local ancestry.
# NOTE: the seed and n_sim are not used in this cell, but both assignments
# change global notebook state, so they are kept.
np.random.seed(1234)
n_sim = 10

allele_per_anc = dset["allele_per_anc"].values
lanc = dset.lanc.sum(axis=2).values

diff_corr = []
# `Dataset.sizes` is the supported mapping from dimension name to length;
# indexing `Dataset.dims` by name is deprecated in recent xarray releases.
for snp_i in tqdm(range(dset.sizes["snp"])):
    apa_snp = allele_per_anc[:, snp_i, :]
    lanc_snp = lanc[:, snp_i]

    # 3x3 correlation matrix; the two allele columns are labelled
    # "EUR" and "AFR" in that order.
    df_corr = pd.DataFrame(
        np.column_stack([apa_snp, lanc_snp]), columns=["EUR", "AFR", "lanc"]
    ).corr()
    # corr(AFR, lanc) - corr(EUR, lanc)
    diff_corr.append(df_corr.iloc[1, 2] - df_corr.iloc[0, 2])

100%|██████████| 155/155 [00:00<00:00, 657.17it/s]


In [234]:
# Display `df_corr`; the correlation loop reassigns it on every iteration, so
# as written this is the matrix for the last SNP processed.
df_corr

Unnamed: 0,EUR,AFR,lanc
EUR,1.0,-0.374797,-0.764228
AFR,-0.374797,1.0,0.490207
lanc,-0.764228,0.490207,1.0


In [225]:
# Per-SNP allele frequencies by ancestry, joined onto the simulation results.
# The array must be bound to `af`, the name read on the next line (it was
# previously assigned to an unused name `f`).
af = dset["af_per_anc"].values
df_af = pd.DataFrame(
    {
        "snp_i": np.arange(af.shape[0]),
        "EUR_af": af[:, 0],
        "AFR_af": af[:, 1],
        "diff_corr": diff_corr,
    }
)

# One row per (snp_i, effect, sim_i, method) with the SNP-level columns attached.
df_plot = pd.merge(df_rls, df_af, on="snp_i")
df_plot["diff_af"] = df_plot["EUR_af"] - df_plot["AFR_af"]

In [226]:
# Fraction of simulations with p < 0.05 for each (method, effect).
# The simulation uses effects [effect, 0.25], so effect == 0.25 rows are the
# null setting and the other rows are non-null settings.
df_plot.groupby(["method", "effect"]).agg({"pval": lambda x : np.mean(x < 0.05)})

Unnamed: 0_level_0,Unnamed: 1_level_0,pval
method,effect,Unnamed: 2_level_1
w lanc,0.15,0.784516
w lanc,0.2,0.331613
w lanc,0.25,0.052903
w/o lanc,0.15,0.867097
w/o lanc,0.2,0.543226
w/o lanc,0.25,0.049032


In [233]:
# Fraction of simulations with p < 0.05 for each (method, effect), restricted
# to SNPs whose diff_corr is above 1.0.
df_plot[df_plot.diff_corr > 1.0].groupby(["method", "effect"]).agg({"pval": lambda x : np.mean(x < 0.05)})

Unnamed: 0_level_0,Unnamed: 1_level_0,pval
method,effect,Unnamed: 2_level_1
w lanc,0.15,0.861765
w lanc,0.2,0.352941
w lanc,0.25,0.064706
w/o lanc,0.15,0.994118
w/o lanc,0.2,0.767647
w/o lanc,0.25,0.047059


In [232]:
# Fraction of simulations with p < 0.05 for each (method, effect), restricted
# to SNPs whose diff_corr is below 1.0 (rows with diff_corr exactly 1.0 fall
# in neither this subset nor the "> 1.0" one).
df_plot[df_plot.diff_corr < 1.0].groupby(["method", "effect"]).agg({"pval": lambda x : np.mean(x < 0.05)})

Unnamed: 0_level_0,Unnamed: 1_level_0,pval
method,effect,Unnamed: 2_level_1
w lanc,0.15,0.724138
w lanc,0.2,0.314943
w lanc,0.25,0.043678
w/o lanc,0.15,0.767816
w/o lanc,0.2,0.367816
w/o lanc,0.25,0.050575


In [60]:
# Display the accumulated simulation results as a table.
# NOTE(review): the output saved under this cell shows effect and sim_i values
# that the simulation cell above does not produce — presumably from an
# earlier run; confirm before relying on it.
pd.DataFrame(dict_rls)

Unnamed: 0,snp_i,effect,sim_i,method,pval
0,0,-0.20,0,w/o lanc,1.058773e-110
1,0,-0.20,1,w/o lanc,2.846807e-114
2,0,-0.20,2,w/o lanc,7.618380e-105
3,0,-0.20,3,w/o lanc,6.552035e-102
4,0,-0.20,4,w/o lanc,4.728806e-109
...,...,...,...,...,...
395,0,0.25,95,w/o lanc,6.541897e-01
396,0,0.25,96,w/o lanc,7.188811e-01
397,0,0.25,97,w/o lanc,1.122052e-01
398,0,0.25,98,w/o lanc,3.904252e-01


In [47]:
# Display the per-ancestry allele frequencies stored in `dset_snp`.
# NOTE(review): `dset_snp` is not defined anywhere in the visible notebook —
# presumably a single-SNP subset left over from an earlier session; confirm.
dset_snp["af_per_anc"].values

array([0.40971406, 0.45859163])

In [None]:
# Stack the "gt" array of every entry in `dict_hit_info` along a third axis.
# NOTE(review): `dict_hit_info` is not defined in the visible notebook, and the
# next cell repeats this exact statement — confirm whether this cell is needed.
gt = np.dstack([dict_hit_info[snp]["gt"] for snp in dict_hit_info])

In [None]:
# Build an xarray Dataset (`dset_gwas_hit`) from the per-SNP records in
# `dict_hit_info`, then copy local ancestry and per-individual columns over
# from an existing dataset `dset`.
# NOTE(review): `dict_hit_info` and `da` are not defined/imported in the visible
# notebook (`da` is presumably dask.array) — confirm before running this cell.

# Stack the per-SNP "gt" arrays along a new third axis (one slice per SNP).
gt = np.dstack([dict_hit_info[snp]["gt"] for snp in dict_hit_info])
# Every SNP record must list the same samples; keep a single copy.
sample = [dict_hit_info[snp]["sample"] for snp in dict_hit_info]
assert all(x == sample[0] for x in sample)
sample = sample[0]

# Per-SNP "R2" and "AF" entries taken from each record's "info" mapping.
R2 = [dict_hit_info[snp]["info"]["R2"] for snp in dict_hit_info]
AF = [dict_hit_info[snp]["info"]["AF"] for snp in dict_hit_info]

dset_gwas_hit = xr.Dataset(
    data_vars={
        "geno": (
            ("indiv", "snp", "ploidy"),
            # dstack put the SNP axis last; swap axes 1 and 2 so the array
            # matches the ("indiv", "snp", "ploidy") dimension order.
            da.from_array(np.swapaxes(gt, 1, 2), chunks=-1),
        ),
    },
    coords={
        "snp": np.array(list(dict_hit_info.keys())).astype(str),
        "indiv": np.array(sample).astype(str),
        "R2@snp": ("snp", np.array(R2, dtype=float)),
        "AF@snp": ("snp", np.array(AF, dtype=float)),
    },
    attrs={"n_anc": 2},
)

# SNP identifiers are split on "_" into CHROM, POS, REF and ALT fields.
info = [s.split("_") for s in dset_gwas_hit.snp.values]
dset_gwas_hit = dset_gwas_hit.assign_coords(
    {
        "CHROM@snp": ("snp", [int(i[0]) for i in info]),
        "POS@snp": ("snp", [int(i[1]) for i in info]),
        "REF@snp": ("snp", [str(i[2]) for i in info]),
        "ALT@snp": ("snp", [str(i[3]) for i in info]),
    }
)

# Align `dset_gwas_hit` to `dset`: select individuals in the order of dset.indiv.
dset_gwas_hit = dset_gwas_hit.sel(indiv=dset.indiv.values)

# Fill in lanc: for every GWAS-hit SNP, take the local ancestry of the SNP in
# `dset` on the same chromosome whose position is closest.
df_dset = dset.snp.to_dataframe().reset_index(drop=True)
snp_index = []
for _, snp in tqdm(dset_gwas_hit.snp.to_dataframe().iterrows()):
    chrom, pos = snp["CHROM@snp"], snp["POS@snp"]
    df_tmp = df_dset[df_dset["CHROM@snp"] == chrom]
    # idxmin returns a label of df_dset's reset (0..n-1) index, so it can be
    # used as a positional index into `dset` below.
    snp_index.append((df_tmp["POS@snp"] - pos).abs().idxmin())
lanc = dset.isel(snp=snp_index).lanc.values
dset_gwas_hit = dset_gwas_hit.assign(
    lanc=(("indiv", "snp", "ploidy"), da.from_array(lanc, chunks=-1))
)

# Fill in individual information: copy every "@indiv" entry found when
# iterating `dset`.
# NOTE(review): iterating an xarray Dataset yields data-variable names; if the
# "@indiv" columns are stored as coordinates this loop copies nothing — confirm.
for col in dset:
    if col.endswith("@indiv"):
        dset_gwas_hit[col] = ("indiv", dset[col].values)
# Return value is not used; presumably adds allele-per-ancestry counts to
# `dset_gwas_hit` in place — confirm against the admix documentation.
admix.tools.allele_per_anc(dset_gwas_hit)