Initial implementation of a Beta-Bernoulli model, based on contig-level methylation calls.

In [1]:
import methmotif.evaluate as mm
import polars as pl
import re

In [2]:
# Build the methmotif Assembly from the polished E. coli FASTA.
# `assembly` is passed to mm.score_candidates further down.
assembly = mm.Assembly("../data/ecoli/assembly.polished.fasta")

In [3]:
# Read the modkit pileup for the E. coli sample.
ecoli = mm.Pileup("../data/ecoli/modkit.pileup.bed")

# Keep only rows with mod_type "a" whose fraction_mod is strictly above 0.8;
# these rows are used as the positive methylation calls.
ecoli_6mA_80p = ecoli.pileup.filter(
    (pl.col("mod_type") == "a") & (pl.col("fraction_mod") > 0.8)
)

Select positions with a fraction of modified reads above 80% as positive methylations.

In [4]:
# Candidate motifs over the DNA alphabet, generated with k = 4.
candidate_motifs = mm.generate_kmers(4, alphabet=list("ACGT"))

# Pair every motif with each of its positions, so that each position is
# considered as the potential modified base: [[motif, position], ...].
candidate_motifs_indexed = [
    [kmer, position]
    for kmer in candidate_motifs
    for position in range(len(kmer))
]

# Transpose the pairs into two parallel lists: [motifs, positions].
candidate_motifs_indexed = [list(column) for column in zip(*candidate_motifs_indexed)]


In [5]:
# candidate_motifs_indexed holds exactly two parallel lists (motifs, positions);
# unpack them as the two positional arguments of MotifCandidates.
motif_candidates = mm.MotifCandidates(*candidate_motifs_indexed)

In [6]:
# Score every candidate (motif, modification index) pair using the filtered
# 6mA calls and the assembly. "contig_3" presumably restricts scoring to that
# single contig — TODO confirm against mm.score_candidates.
# The resulting frame carries a "posterior" column that the ranking step reads.
scored_candidates = mm.score_candidates(ecoli_6mA_80p, assembly, "contig_3", motif_candidates)

In [8]:
# Rank the candidates by two summaries of each posterior object:
#   cdf_score    - 1 - cdf(0.8), i.e. the posterior mass above 0.8
#   signal_noise - posterior mean divided by its standard deviation
# NOTE(review): newer polars releases renamed `Expr.apply` to `map_elements`;
# confirm the installed version before changing this call.
scored_candidates.with_columns(
    [
        pl.col("posterior")
        .apply(lambda posterior: 1 - posterior.cdf(0.8))
        .alias("cdf_score"),
        pl.col("posterior")
        .apply(lambda posterior: posterior.mean() / posterior.standard_deviation())
        .alias("signal_noise"),
    ]
).sort("signal_noise", descending=True)

motif,motif_mod_index,posterior,cdf_score,signal_noise
str,i64,object,f64,f64
"""GATC""",1,"BetaBernoulliModel(alpha=37562, beta=688)",1.0,1445.112081
"""C""",0,"BetaBernoulliModel(alpha=128648, beta=2228106)",0.0,368.884846
"""G""",0,"BetaBernoulliModel(alpha=113525, beta=2243229)",0.0,345.355334
"""T""",0,"BetaBernoulliModel(alpha=101413, beta=2182005)",0.0,325.770496
"""ATC""",0,"BetaBernoulliModel(alpha=37710, beta=135349)",0.0,219.58319
"""GAT""",1,"BetaBernoulliModel(alpha=37630, beta=135429)",0.0,219.285352
"""CG""",0,"BetaBernoulliModel(alpha=42045, beta=651333)",0.0,211.563615
"""A""",0,"BetaBernoulliModel(alpha=41864, beta=2241554)",0.0,206.5088
"""GA""",1,"BetaBernoulliModel(alpha=37926, beta=496692)",0.0,202.044546
"""AT""",0,"BetaBernoulliModel(alpha=38014, beta=581684)",0.0,201.242
