Initial implementation of a Beta-Bernoulli model, based on contig-level methylation calls.

In [1]:
import methmotif.evaluate as mm
import polars as pl
import re

First, load the assembly.

In [2]:
# Load the polished E. coli assembly FASTA; `assembly` is passed to
# mm.score_candidates in the scoring cells below.
assembly = mm.Assembly("../data/ecoli/assembly.polished.fasta")

Load the pileup data and set the threshold above which a position is considered methylated. Here we use a threshold of 80%, i.e. more than 80% of the mapped reads have to be methylated for the position to be considered methylated. We also only consider the 6mA methylation type here.

In [3]:
# Read the modkit pileup for the E. coli sample.
ecoli = mm.Pileup("../data/ecoli/modkit.pileup.bed")

# Keep only 6mA calls (mod_type "a") whose modified fraction is strictly
# above 0.8; both conditions are applied as a single combined predicate.
ecoli_6mA_80p = ecoli.pileup.filter(
    (pl.col("mod_type") == "a") & (pl.col("fraction_mod") > 0.8)
)

Let's generate some candidate motifs, here all k-mers up to size 5. We also generate a candidate for each possible methylation position; for example, GATC is first evaluated as if the G is methylated, then the A, then the T and finally the C.

In [13]:
# Candidate k-mers over the DNA alphabet (size argument 5).
candidate_motifs = mm.generate_kmers(5, alphabet=["A", "C", "G", "T"])

# Pair every k-mer with each of its positions, then transpose the pairs into
# two parallel lists: [motifs, modification indices].
candidate_motifs_indexed = [
    list(column)
    for column in zip(
        *(
            [kmer, position]
            for kmer in candidate_motifs
            for position in range(len(kmer))
        )
    )
]



In [14]:
# candidate_motifs_indexed holds exactly two parallel lists (motifs, then
# modification indices); unpack them as the two positional arguments.
motif_candidates = mm.MotifCandidates(*candidate_motifs_indexed)

In [6]:
# Score every motif candidate on contig "contig_3" against the filtered 6mA
# calls. The result is displayed below with the columns
# motif / motif_mod_index / posterior.
scored_candidates = mm.score_candidates(ecoli_6mA_80p, assembly, "contig_3", motif_candidates)

In [8]:
# Rank the 6mA candidates by two summaries of each posterior:
#   cdf_score    - posterior mass above 0.8, computed as 1 - cdf(0.8)
#   signal_noise - posterior mean divided by posterior standard deviation
scored_candidates.with_columns(
    [
        pl.col("posterior")
        .apply(lambda posterior: 1 - posterior.cdf(0.8))
        .alias("cdf_score"),
        pl.col("posterior")
        .apply(lambda posterior: posterior.mean() / posterior.standard_deviation())
        .alias("signal_noise"),
    ]
).sort("signal_noise", descending=True)

motif,motif_mod_index,posterior,cdf_score,signal_noise
str,i64,object,f64,f64
"""GATC""",1,"BetaBernoulliModel(alpha=37562, beta=688)",1.0,1445.112081
"""C""",0,"BetaBernoulliModel(alpha=128648, beta=2228106)",0.0,368.884846
"""G""",0,"BetaBernoulliModel(alpha=113525, beta=2243229)",0.0,345.355334
"""T""",0,"BetaBernoulliModel(alpha=101413, beta=2182005)",0.0,325.770496
"""ATC""",0,"BetaBernoulliModel(alpha=37710, beta=135349)",0.0,219.58319
"""GAT""",1,"BetaBernoulliModel(alpha=37630, beta=135429)",0.0,219.285352
"""CG""",0,"BetaBernoulliModel(alpha=42045, beta=651333)",0.0,211.563615
"""A""",0,"BetaBernoulliModel(alpha=41864, beta=2241554)",0.0,206.5088
"""GA""",1,"BetaBernoulliModel(alpha=37926, beta=496692)",0.0,202.044546
"""AT""",0,"BetaBernoulliModel(alpha=38014, beta=581684)",0.0,201.242


In [15]:
# Repeat the analysis for 5mC: keep calls with mod_type "m" whose modified
# fraction is strictly above 0.8.
ecoli_5mC_80p = ecoli.pileup.filter(
    (pl.col("mod_type") == "m") & (pl.col("fraction_mod") > 0.8)
)

# NOTE: this rebinds `scored_candidates`, replacing the 6mA scores from the
# earlier cell with the 5mC scores for the same contig and candidates.
scored_candidates = mm.score_candidates(
    ecoli_5mC_80p,
    assembly,
    "contig_3",
    motif_candidates,
)

In [16]:
# Rank the 5mC candidates by two summaries of each posterior:
#   cdf_score    - posterior mass above 0.8, computed as 1 - cdf(0.8)
#   signal_noise - posterior mean divided by posterior standard deviation
scored_candidates.with_columns(
    [
        pl.col("posterior")
        .apply(lambda posterior: 1 - posterior.cdf(0.8))
        .alias("cdf_score"),
        pl.col("posterior")
        .apply(lambda posterior: posterior.mean() / posterior.standard_deviation())
        .alias("signal_noise"),
    ]
).sort("signal_noise", descending=True)

motif,motif_mod_index,posterior,cdf_score,signal_noise
str,i64,object,f64,f64
"""CCTGG""",1,"BetaBernoulliModel(alpha=11995, beta=54)",1.0,1636.050744
"""CCAGG""",1,"BetaBernoulliModel(alpha=11959, beta=90)",1.0,1265.376316
"""G""",0,"BetaBernoulliModel(alpha=65608, beta=2291146)",0.0,259.782107
"""A""",0,"BetaBernoulliModel(alpha=62380, beta=2221038)",0.0,253.243027
"""T""",0,"BetaBernoulliModel(alpha=49992, beta=2233426)",0.0,226.077466
"""CA""",1,"BetaBernoulliModel(alpha=25655, beta=621814)",0.0,163.442722
"""C""",0,"BetaBernoulliModel(alpha=24572, beta=2332182)",0.0,157.578242
"""CC""",1,"BetaBernoulliModel(alpha=22383, beta=439228)",0.0,153.374332
"""CG""",1,"BetaBernoulliModel(alpha=22627, beta=670751)",0.0,152.938972
"""GA""",0,"BetaBernoulliModel(alpha=21719, beta=512899)",0.0,150.461775
