# Selecting the seed alignments

We use the results from `mdeqasis microbial-gn-stats`. This command extracts core statistics from the GN fits produced by running `mdeqasis microbial-fit-gn`. Note that `microbial-gn-stats` excludes fitted models where the maximum likelihood estimates were within machine precision of the upper/lower bound (which suggests a difficult-to-fit alignment).

In [1]:
from mdeq_analysis import (
    microbial,
)  # isort: skip  # keep at top as this registers custom deserialiser
from pathlib import Path

# Supplementary figures and tables are written beneath this directory.
OUTPUT_ROOT = Path("~/repos/MutationDiseqMS").expanduser()
fig_dir = OUTPUT_ROOT.joinpath("figs_supp")
table_dir = OUTPUT_ROOT.joinpath("tables_supp")
# make sure both output directories exist before anything is written to them
for out_dir in (fig_dir, table_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

In [2]:
import plotly.express as px
from plotly.io import full_figure_for_development

from cogent3 import make_table, open_data_store
from mdeq.utils import load_from_sqldb

In [3]:
# sqlite data store holding the `microbial-gn-stats` results; the path is
# relative to the notebook's working directory
path = "../data/raw/microbial/fit_gn-stats.sqlitedb"
dstore = open_data_store(path)
# printing the store reports its member count and resolved source location
print(dstore)

9854x member DataStoreSqlite(source='/Users/gavin/repos/Honours2021/Kath/MutationDiseqAnalysis/nbks/../data/raw/microbial/fit_gn-stats.sqlitedb', members=[DataMember(data_store=/Users/gavin/repos/Honours2021/Kath/MutationDiseqAnalysis/nbks/../data/raw/microbial/fit_gn-stats.sqlitedb, unique_id=100032_52477_200580.json), DataMember(data_store=/Users/gavin/repos/Honours2021/Kath/MutationDiseqAnalysis/nbks/../data/raw/microbial/fit_gn-stats.sqlitedb, unique_id=100193_244328_133502.json)]...)


In [4]:
# display the logs recorded in the data store (when, by whom and with which command it was produced)
dstore.summary_logs

time,name,python version,who,command,composable
2022-05-07 12:04:48,logs/sql_loader-user_function-user_function-sql_writer-f502e5e2.log,3.10.0,gavin,/Users/gavin/opt/miniconda3/envs/c310dev/bin/mdeqasis microbial-gn-stats -i data/raw/microbial/fit_gn.sqlitedb -o data/raw/microbial/fit_gn-stats.sqlitedb -O -vv -p,"sql_loader(type='output') + user_function(name='mles_within_bounds',module='mdeq.model') + user_function(name='compute_stats',module='mdeq_analysis.microbial') + sql_writer(type='output',args=(PosixPath('data/raw/microbial/fit_gn-stats.sqlitedb'),), suffix='json',kwargs={'create': True, 'if_exists': 'overwrite'})"
2023-01-28 11:05:13,logs/convert_db_to_new_sqlitedb-e887e843.log,3.10.0,gavin,/Users/gavin/opt/miniconda3/envs/c310dev/bin/mdeq db-upgrade data-old/* -od data -O,


In [5]:
# Deserialise every completed member of the data store and collect the
# per-record statistics into a single table.
loader = load_from_sqldb()
records = [loader(m) for m in dstore.completed]
if not records:
    # fail with an explicit message instead of an opaque IndexError on records[0]
    raise ValueError("data store has no completed records to tabulate")

# the column names are taken from the first record
# NOTE(review): assumes every record shares that header -- confirm
header = records[0].header()
rows = [r.to_record() for r in records]
table = make_table(header=header, data=rows)

We eliminate model fits where the condition number was > 2. (The condition number is an indicator of numerical issues.)

In [6]:
def _acceptable_cond_num(cond_num):
    """Return True when a fit's condition number does not exceed 2."""
    return cond_num <= 2


# drop the fits whose condition number is above 2, then display what remains
table = table.filtered(_acceptable_cond_num, columns="cond_num")
table

source,foreground,jsd,entropy,cond_num
100193_878_164800,878,0.0005,1.6803,1.9531
100347_145636_156619,156619,0.0048,1.6972,1.9264
100514_880_75781,100514,0.0008,1.8724,1.7404
100667_334811_520123,334811,0.0011,1.7527,1.7516
100984_162981_175057,162981,0.0011,1.8738,1.7910
...,...,...,...,...
923_202620_544074,923,0.0002,1.7036,1.8173
92579_740_200580,740,0.0002,1.8228,1.7152
930_131798_350114,131798,0.0066,1.7517,1.9905
951_350595_127600,127600,0.0030,1.7017,1.8567


We selected four alignments as our seeds, corresponding to the combinations of high/low entropy and high/low JSD. These are referred to as "seed" alignments because the fits to these alignments are used to generate synthetic data for evaluating statistical measures.

In [7]:
# identifiers (values of the "source" column) of the four chosen seed alignments
seed_alignments = [
    "197113_332182_17210",
    "198257_206396_13724",
    "200580_114946_573911",
    "758_443154_73021",
]


def _is_seed(source):
    """Return True when source names one of the seed alignments."""
    return source in seed_alignments


def _is_not_seed(source):
    """Return True when source is not one of the seed alignments."""
    return source not in seed_alignments


# split the table into the seed rows and all remaining rows
seeds = table.filtered(_is_seed, columns="source")
not_seeds = table.filtered(_is_not_seed, columns="source")

We show the position of the selected seed alignments (red markers) with respect to the full distribution (blue markers).

In [8]:
x_label = "jsd"
y_label = "entropy"


def _marker_trace(stats, **marker_style):
    """Return a trace dict plotting the x_label / y_label columns of stats as markers.

    marker_style entries are appended to the dict unchanged.
    """
    return {
        "x": stats.columns[x_label],
        "y": stats.columns[y_label],
        "mode": "markers",
        **marker_style,
    }


# full distribution in blue, the selected seeds as larger red markers
nseed_plot = _marker_trace(not_seeds, marker_color="blue")
seed_plot = _marker_trace(seeds, marker_size=10, marker_color="red")
traces = [nseed_plot, seed_plot]

fig = px.scatter()
fig.add_traces(traces)

# square figure without a legend; the axis titles are LaTeX strings
size = 700
layout = {
    "showlegend": False,
    "xaxis": {"title": r"$\widehat {\textrm{JSD}}$"},
    "yaxis": {"title": r"$\hat H(\pi_\infty)$"},
    "width": size,
    "height": size,
}
_ = fig.update_layout(**layout)

In [9]:
# address plotly bug, suppress MathJax warning box
# NOTE(review): `fig` is rebound to whatever full_figure_for_development returns,
# so later cells operate on that returned object -- confirm this is intended
fig = full_figure_for_development(fig, warn=False)
# interactive display is left disabled
# fig.show()

Write the image out for inclusion in the manuscript.

In [12]:
# the PDF is written into the supplementary figures directory (fig_dir)
fig.write_image(fig_dir / "microbial-jsd_x_entropy.pdf")

The Hi/Lo entropy and JSD designations of the seed fits are written out as a LaTeX table for the manuscript.

In [11]:
# NOTE(review): `header`, `seeds` and `table` rebind names that earlier cells use
# for the statistics table; re-running those cells after this one will not work.
header = ["Identifier", "Entropy", "JSD"]
# one (identifier, entropy category, JSD category) row per seed; the
# underscores in the identifiers are backslash-escaped for the LaTeX output
seeds = [
    (r"197113\_332182\_17210", "Hi", "Hi"),
    (r"198257\_206396\_13724", "Hi", "Lo"),
    (r"200580\_114946\_573911", "Lo", "Hi"),
    (r"758\_443154\_73021", "Lo", "Lo"),
]

title = "Selected seed fits from microbial data."
legend = (
    "Fits from these alignments were used for the simulation study. "
    "Identifier is from the GreenGenes alignment. Entropy and JSD categories "
    r"are from Figure \ref{supfig:jsd-vs-entropy}."
)
table = make_table(header=header, data=seeds, title=title, legend=legend)

# written to the supplementary tables directory with three left-justified columns
table.write(
    table_dir / "microbial-seed_fits.tex",
    label="suptable:seed-categories",
    justify="lll",
)