In [1]:
import pathlib
from rich.progress import track
from statistics import median
from cogent3 import open_data_store, make_table
from mdeq.utils import load_from_sqldb

# Loader applied to each data store member in get_length_stats(); the length of
# what it returns is what gets summarised.
loader = load_from_sqldb()

def get_paths(data_dir="../data/"):
    """Find the ``*.sqlitedb`` data stores that should be summarised.

    Parameters
    ----------
    data_dir
        Directory that is searched recursively. Defaults to ``"../data/"``.

    Returns
    -------
    list[pathlib.Path]
        The retained paths, in the order produced by the recursive glob.

    Notes
    -----
    A file is skipped when any of the following holds:

    - its name lacks ``"filtered"`` and it has no ``"drosophila"`` path component
    - one of its path components is exactly ``"raw"``
    - its name contains ``"windowed"``
    """
    paths = []
    for path in pathlib.Path(data_dir).glob("**/*.sqlitedb"):
        # files under a "drosophila" component are exempt from the "filtered" naming rule
        not_filtered = "filtered" not in path.name and "drosophila" not in path.parts
        if not_filtered or "raw" in path.parts or "windowed" in path.name:
            continue
        paths.append(path)
    return paths

# Data store paths retained by get_paths(); each becomes one row of the summary table.
paths = get_paths()

In [3]:
def get_length_stats(path) -> list[int]:
    """Summarise the lengths of the completed records in one data store.

    Parameters
    ----------
    path
        Location of the data store, passed to ``open_data_store``.

    Returns
    -------
    list[int]
        ``[minimum, median, maximum, count]``. The median is passed through the
        built-in ``round`` and converted to ``int``; ``count`` is the number of
        completed records.

    Raises
    ------
    ValueError
        If the data store has no completed records.
    """
    dstore = open_data_store(path)
    lengths = [len(loader(m)) for m in dstore.completed]
    if not lengths:
        # min() would raise ValueError here anyway, but without naming the store
        raise ValueError(f"no completed records in {path}")
    return [min(lengths), int(round(median(lengths), 0)), max(lengths), len(lengths)]

def get_data_group(path):
    """Return the data set label implied by the directory components of ``path``.

    The first matching component wins, checked in the order listed below.
    Raises ``ValueError`` (carrying ``path``) when no component is recognised.
    """
    directory_labels = (
        ("micro", "Microbial"),
        ("ape", "Great Apes"),
        ("drosophila", "Drosophila"),
        ("fxy", "Rodent"),
    )
    components = path.parts
    for directory, label in directory_labels:
        if directory in components:
            return label
    raise ValueError(path)

def get_data_type(path, group):
    """Return the sequence type label for a data store.

    ``group`` decides the answer for Drosophila and Rodent data; otherwise the
    directory components of ``path`` (and, for ape data, the file name) are
    consulted. Raises ``ValueError`` carrying ``path`` and ``group`` when
    nothing matches.
    """
    if group == "Rodent":
        seq_type = "Intron"
    elif group == "Drosophila":
        seq_type = "CDS"
    elif "micro" in path.parts:
        seq_type = "16S rRNA"
    elif "ape" in path.parts:
        # ape stores are split by file name into intron and coding sequence sets
        seq_type = "CDS" if "intron" not in path.name else "Intron"
    else:
        raise ValueError(path, group)
    return seq_type

# One row per data store: group label, sequence type, then the four length statistics.
results = []
for path in track(paths):
    grp = get_data_group(path)
    data_type = get_data_type(path, grp)
    lengths = get_length_stats(path)
    results.append([grp, data_type, *lengths])

def _comma_sep(value):
    return f"{value:,}"

# Two descriptive columns followed by the four numeric summary columns.
header = [
    "Data Set",
    "Seq. Type",
    "min(length)",
    "median(length)",
    "max(length)",
    "Num. Alignments",
]
# Every numeric column shares the thousands-separator formatter.
table = make_table(
    header=header,
    data=results,
    column_templates=dict.fromkeys(header[2:], _comma_sep),
)
    

Output()

Data Set,Seq. Type,min(length),median(length),max(length),Num. Alignments
Microbial,16S rRNA,954,1140,1275,380
Great Apes,Intron,3038,13561,251535,613
Great Apes,CDS,300,576,3773,613
Rodent,Intron,377,1864,42745,9
Drosophila,CDS,300,533,8892,5680


In [5]:
# sorted() hands back a new table instead of reordering in place, so rebind the
# name; otherwise the LaTeX export further down writes the rows in whatever
# order the filesystem glob produced them.
table = table.sorted(columns="Data Set")
# Set the caption text after sorting so it is attached to the table that gets exported.
table.title = "Summary statistics of filtered data sets."
table.legend = "Range of alignment lengths and the total number of alignments are shown."
table

Data Set,Seq. Type,min(length),median(length),max(length),Num. Alignments
Drosophila,CDS,300,533,8892,5680
Great Apes,Intron,3038,13561,251535,613
Great Apes,CDS,300,576,3773,613
Microbial,16S rRNA,954,1140,1275,380
Rodent,Intron,377,1864,42745,9


In [7]:
# Render before opening the file: opening with mode="w" truncates it, so a
# failure inside to_latex() would otherwise leave an empty file behind.
latex = table.to_latex(justify="llrrrr", label="suptable:data-summary")
with open("../../MutationDiseqMS/tables_supp/data_set_summary.tex", mode="w") as out:
    out.write(latex)