# Evaluating properties of testing heterogeneity with / without including local ancestries

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import numpy as np
import pandas as pd
import pickle
import admix
import matplotlib.pyplot as plt
import statsmodels.api as sm
from tqdm import tqdm

In [2]:
def simulate(apa, beta, cov):
    cov_effects = np.random.normal(loc=0, scale=0.1, size=cov.shape[1])
    y = (
        np.dot(apa, beta)
        + np.dot(cov, cov_effects)
        + np.random.normal(size=apa.shape[0])
    )
    return y


def test_het(apa, y, cov):
    design = sm.add_constant(np.hstack([apa, cov]))
    model = sm.OLS(y, design).fit()

    A = np.zeros([1, len(model.params)])
    A[0, 1] = 1
    A[0, 2] = -1
    p_ftest = model.f_test(A).pvalue.item()
    return p_ftest, model

In [45]:
pfile = f"out/real-dataset/region1"

dset = admix.io.read_dataset(pfile=pfile, n_anc=2)
dset = dset[0:10000]
dset = dset[
    (
        dset.snp.EUR_FREQ.between(0.01, 0.99) & dset.snp.AFR_FREQ.between(0.01, 0.99)
    ).values
]
dset.persist()

apa = dset.allele_per_anc().compute()
lanc = dset.lanc.sum(axis=2).compute()

2022-01-22 20:31.47 [info     ] admix.Dataset: read local ancestry from out/real-dataset/region1.lanc


# Simulate beta heterogeneity effect sizes
Use two practices with / without conditioning local ancestry

In [46]:
np.random.seed(1234)
n_sim = 5

dict_rls = {
    "snp_i": [],
    "effect": [],
    "sim_i": [],
    "method": [],
    "pval": [],
    "coef1": [],
    "coef2": [],
    "se1": [],
    "se2": [],
}

cov = np.column_stack(
    [dset.indiv[col].values for col in [f"geno_EV{i}" for i in range(1, 3)]]
)

for snp_i in tqdm(range(0, dset.n_snp, 5)):
    apa_snp = apa[snp_i, :, :]
    lanc_snp = lanc[snp_i, :]
    for effect in [0.015, 0.02, 0.025]:
        pvals1 = []
        pvals2 = []

        for sim_i in range(n_sim):
            beta = np.array([effect, 0.025])
            beta *= np.random.choice([-1, 1])
            y_sim = simulate(apa_snp, beta, cov)
            # w/o conditioning
            pval1, model1 = test_het(apa_snp, y_sim, cov)
            pvals1.append(pval1)

            dict_rls["coef1"].append(model1.params[1])
            dict_rls["coef2"].append(model1.params[2])
            dict_rls["se1"].append(model1.bse[1])
            dict_rls["se2"].append(model1.bse[2])

            # w conditioning
            #             pval2, model2 = test_het(
            #                 apa_snp, sm.OLS(y_sim, sm.add_constant(lanc_snp)).fit().resid, cov
            #             )
            pval2, model2 = test_het(apa_snp, y_sim, np.column_stack([cov, lanc_snp]))
            pvals2.append(pval2)
            dict_rls["coef1"].append(model2.params[1])
            dict_rls["coef2"].append(model2.params[2])
            dict_rls["se1"].append(model2.bse[1])
            dict_rls["se2"].append(model2.bse[2])

        for method, pvals in zip(["w/o lanc", "w lanc"], [pvals1, pvals2]):
            dict_rls["pval"].extend(pvals)
            dict_rls["method"].extend([method] * n_sim)
            dict_rls["sim_i"].extend(np.arange(n_sim))
            dict_rls["effect"].extend([effect] * n_sim)
            dict_rls["snp_i"].extend([snp_i] * n_sim)

df_rls = pd.DataFrame(dict_rls)

100%|██████████| 682/682 [04:14<00:00,  2.68it/s]


In [48]:
df_rls.groupby(["method", "effect"]).agg({"pval": lambda x: np.mean(x < 0.05)})

Unnamed: 0_level_0,Unnamed: 1_level_0,pval
method,effect,Unnamed: 2_level_1
w lanc,0.015,0.055425
w lanc,0.02,0.054252
w lanc,0.025,0.050733
w/o lanc,0.015,0.066569
w/o lanc,0.02,0.059824
w/o lanc,0.025,0.052199


In [49]:
import admix_genet_cor

In [50]:
df_rls.groupby(["method", "effect"]).apply(
    lambda x: admix_genet_cor.locus.deming_regression(
        x=x["coef1"], y=x["coef2"], sx=x["se1"], sy=x["se2"]
    )[0]
)

method    effect
w lanc    0.015     1.530235
          0.020     1.201315
          0.025     0.952477
w/o lanc  0.015     1.628569
          0.020     1.245842
          0.025     0.951579
dtype: float64

In [30]:
np.random.seed(1234)

df_corr_list = []
for snp_i in tqdm(range(dset.n_snp)):
    apa_snp = apa[snp_i, :, :]
    lanc_snp = lanc[snp_i, :]

    df_corr = pd.DataFrame(
        np.column_stack([apa_snp, lanc_snp]), columns=["EUR", "AFR", "lanc"]
    ).corr()
    df_corr_list.append(df_corr)

100%|██████████| 418/418 [00:01<00:00, 360.09it/s]


In [31]:
avg_df_corr = np.zeros_like(df_corr_list[0])
for d in df_corr_list:
    avg_df_corr += d
avg_df_corr /= len(df_corr_list)

In [32]:
avg_df_corr

Unnamed: 0,EUR,AFR,lanc
EUR,1.0,-0.214814,-0.467823
AFR,-0.214814,1.0,0.342366
lanc,-0.467823,0.342366,1.0


In [34]:
np.std(df_corr_list, axis=0) / np.sqrt(len(df_corr_list))

array([[0.        , 0.01195146, 0.01245882],
       [0.01195146, 0.        , 0.0126318 ],
       [0.01245882, 0.0126318 , 0.        ]])

In [39]:
af = dset.af_per_anc()
df_af = pd.DataFrame(
    {"snp_i": np.arange(af.shape[0]), "EUR_af": af[:, 0], "AFR_af": af[:, 1]}
)

df_plot = pd.merge(df_rls, df_af, on="snp_i")
df_plot["diff_af"] = df_plot["EUR_af"] - df_plot["AFR_af"]

Unnamed: 0_level_0,Unnamed: 1_level_0,pval
method,effect,Unnamed: 2_level_1
w lanc,0.15,0.635714
w lanc,0.2,0.257143
w lanc,0.25,0.040476
w/o lanc,0.15,0.678571
w/o lanc,0.2,0.397619
w/o lanc,0.25,0.061905
