from Squartini and Arndt:

We further want to quantify whether deviations from zero of the three indices are statistically significant when only a finite amount of sequence data is available to measure the present day nucleotide distribution. To achieve this we compare the distribution of nucleotides, $\rho_\alpha$, of a sequence of length N to the stationary distribution, $\pi_\alpha$, using a $\chi^2$ test with

$$ \chi^2 = N \sum_\alpha \frac{(\rho_\alpha - \pi_\alpha)^2}{\pi_\alpha} $$

This quantity follows a $ \chi^2 $ distribution with 3 degrees of freedom. Deviations from stationarity are significant (with 95% confidence) if $ \chi^2 $ > 7.8147.

In [None]:
from mdeq.utils import load_from_sqldb
from mdeq.stationary_pi import get_stat_pi_via_eigen

from cogent3 import open_data_store
from cogent3.core.table import Table
import pathlib

from cogent3.app import typing as c3_types
from cogent3.app.data_store import DataStoreDirectory
from cogent3.app.composable import define_app
from cogent3.app.io import write_tabular
from scipy import stats


from mdeq.bootstrap import (
    compact_bootstrap_result,
)

In [2]:
# Every .sqlitedb file found recursively beneath the fg-GSN-toe results
# directory; each one is fed to the chi-squared pipeline further down.
synthetic_GSN_fits_paths = [
    *pathlib.Path("../results/micro/toe/fg-GSN-toe/").glob("**/*.sqlitedb")
]

In [3]:
def chi_squared(pi, pi_inf, n, motif_order):
    """Chi-squared test of an observed motif distribution against a stationary one.

    The statistic is ``n * sum((pi[m] - pi_inf[i]) ** 2 / pi_inf[i])`` taken
    over the motifs ``m`` of ``motif_order`` (``i`` being the motif's
    position), and the p-value is its upper-tail probability under a
    chi-squared distribution with 3 degrees of freedom.

    Parameters
    ----------
    pi
        Observed motif probabilities, indexable by motif (``pi[motif]``).
    pi_inf
        Stationary (expected) probabilities, indexable by position; element
        ``i`` corresponds to the ``i``-th motif of ``motif_order``.
    n
        Sequence length; scales the summed deviations.
    motif_order
        Iterable of motifs giving the order of the entries in ``pi_inf``.

    Returns
    -------
    tuple
        ``(chi_sum, p)``: the test statistic and its p-value.
    """
    chi_sum = 0
    for i, nt in enumerate(motif_order):
        expected = pi_inf[i]
        chi_sum += (pi[nt] - expected) ** 2 / expected
    chi_sum *= n

    # Use the survival function instead of ``1 - cdf``: for large statistics
    # the cdf rounds to exactly 1.0 and the subtraction would report p == 0,
    # whereas ``sf`` evaluates the upper tail directly.
    p = stats.chi2.sf(chi_sum, 3)

    return chi_sum, p

In [4]:
@define_app()
def squartini_arndt_test(
    hyp_result: compact_bootstrap_result,
) -> c3_types.TabularType:
    """Apply the chi-squared stationarity test to the observed "GN" fit.

    Compares the motif probabilities on the foreground edge with the
    stationary distribution derived from that edge's substitution
    probability matrix, scaling by the alignment length.

    Returns a single-row table with columns ``name``, ``chi2`` and
    ``chisq_pval``; ``name`` is ``hyp_result.source``.
    """
    # Called before touching ``observed`` -- presumably makes the stored
    # values usable; confirm against compact_bootstrap_result.
    hyp_result.deserialised_values()

    gn_fit = hyp_result.observed["GN"]
    aln = gn_fit.alignment
    lf = gn_fit.lf

    # the foreground edge name is recorded in the alignment's info
    edge_name = aln.info["fg_edge"]

    motif_probs = lf.get_motif_probs(edge_name)
    psub = lf.get_psub_for_edge(name=edge_name)
    stationary = get_stat_pi_via_eigen(psub)

    statistic, pvalue = chi_squared(
        pi=motif_probs,
        pi_inf=stationary,
        n=len(aln),
        motif_order=psub.keys(),
    )

    row = [hyp_result.source, statistic, pvalue]
    return Table(["name", "chi2", "chisq_pval"], data=[row])

In [5]:
# open data stores for input and output 
# Run the chi-squared test over every input data store. Each input gets its
# own output directory (named after the input file) holding the .tsv tables
# written by the pipeline.
for path in synthetic_GSN_fits_paths:
    in_path = path
    in_dstore = open_data_store(in_path)

    # Path.name gives the final path component on any platform, unlike
    # splitting str(path) on "/" which only works with POSIX separators.
    out_path = pathlib.Path("../results/micro/chi2/") / in_path.name
    out_dstore = DataStoreDirectory(out_path, mode="w", suffix="tsv")

    # define apps
    loader = load_from_sqldb()
    writer = write_tabular(out_dstore)
    chi = squartini_arndt_test()

    # load each record -> compute the test -> write the resulting table
    proc = loader + chi + writer

    r = proc.apply_to(in_dstore, show_progress=True, parallel=True)

In [6]:
# NOTE: the pipeline above wrote a separate .tsv for each alignment, so the cells below merge them into one .tsv per data store.

In [7]:
# Entries named *.sqlitedb found recursively beneath the chi2 results
# directory; the consolidation step globs for .tsv files inside each one.
results_paths = [*pathlib.Path("../results/micro/chi2/").glob("**/*.sqlitedb")]

In [8]:
# Merge the per-alignment .tsv tables inside each result directory into a
# single .tsv written next to that directory.
for path in results_paths:
    # with_suffix replaces only the trailing ".sqlitedb"; splitting the whole
    # path on "." truncated any name that contained an additional dot.
    consolidated_path = path.with_suffix(".tsv")
    # mode "x" refuses to overwrite an existing consolidated file
    with open(consolidated_path, "x") as out_file:
        out_file.write('name\tchi2\tchisq_pval\n')
        # sorted so the row order is reproducible between runs
        for tsv in sorted(path.glob("*.tsv")):
            with open(tsv, 'r') as f:
                lines = f.readlines()
            # Skip the per-file header. Slicing (rather than lines[1]) copes
            # with header-only or empty files instead of raising IndexError.
            for row in lines[1:]:
                row = row.rstrip("\n")
                if row:
                    # exactly one newline per row, whether or not the source
                    # line already ended with one (no blank lines in output)
                    out_file.write(row + "\n")