In [1]:
import nanomotif as nm
import numpy as np
import polars as pl
from polars import col, lit, when
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
# Force a fresh import of nanomotif. The module repr printed below points at a
# local source checkout, so this presumably exists to pick up in-progress edits
# without restarting the kernel.
importlib.reload(nm)

<module 'nanomotif' from '/user_data/sh/dark-science/nanomotif/nanomotif/__init__.py'>

In [None]:
# ecoli sample: load the polished assembly and the modkit pileup, then run the
# motif search. The complete pileup is passed (no mod_type filter is applied).
ec_assembly = nm.load_assembly("../data/ecoli/assembly.polished.fasta")
ec_pileup = nm.load_pileup("../data/ecoli/modkit.pileup.bed")
ec_motifs = nm.evaluate.process_sample(
    ec_assembly,
    ec_pileup.pileup,
    min_kl_divergence=0.2,
    min_read_methylation_fraction=0.80,
)

In [2]:
# mruber sample: load the polished assembly and the modkit pileup, then run the
# motif search on the pileup rows whose mod_type equals "a" only.
mr_assembly = nm.load_assembly("../data/mruber/assembly.polished.fasta")
mr_pileup = nm.load_pileup("../data/mruber/modkit.pileup.bed")
mr_motifs = nm.evaluate.process_sample(
    mr_assembly,
    mr_pileup.pileup.filter(pl.col("mod_type") == "a"),
    min_kl_divergence=0.2,
    min_read_methylation_fraction=0.80,
)

INFO: Processing contig_1
INFO: Processing a
DEBUG: ....................A.................... | BetaBernoulliModel(alpha=43533, beta=1090692) | 0.00 | 0
DEBUG: ....................A[TG]................... | BetaBernoulliModel(alpha=39333, beta=544799) | 0.01 | 2
DEBUG: ...................GA[TG]................... | BetaBernoulliModel(alpha=28677, beta=142812) | 0.05 | 2
DEBUG: ...................GA[TG]C.................. | BetaBernoulliModel(alpha=20734, beta=38932) | 0.17 | 2
DEBUG: ....................AT................... | BetaBernoulliModel(alpha=20948, beta=181283) | 0.02 | 2
DEBUG: ...................[AG]AT................... | BetaBernoulliModel(alpha=20271, beta=85848) | 0.05 | 4
DEBUG: ...................[AG]AT[TC].................. | BetaBernoulliModel(alpha=19535, beta=30690) | 0.20 | 6
DEBUG: ...................[AG]ATC.................. | BetaBernoulliModel(alpha=13930, beta=17802) | 0.28 | 5
DEBUG: ..................T[AG]ATC.................. | BetaBernoulliModel(alpha=50

In [2]:
# geobacillus sample: load the polished assembly and the modkit pileup, then run
# the motif search on the pileup rows whose mod_type equals "a" only.
gb_assembly = nm.load_assembly("../data/geobacillus/assembly.polished.fasta")
gb_pileup = nm.load_pileup("../data/geobacillus/modkit.pileup.bed")
gb_motifs = nm.evaluate.process_sample(
    gb_assembly,
    gb_pileup.pileup.filter(pl.col("mod_type") == "a"),
    min_read_methylation_fraction=0.8,
    min_kl_divergence=0.2,
)

INFO: Processing contig_3
INFO: Processing a
DEBUG: ....................A.................... | BetaBernoulliModel(alpha=807, beta=45886) | 0.00 | 0
DEBUG: ....................A.C.................. | BetaBernoulliModel(alpha=686, beta=8740) | 0.01 | 0
DEBUG: ...................GA.C.................. | BetaBernoulliModel(alpha=603, beta=1453) | 0.13 | 0
DEBUG: ...................GATC.................. | BetaBernoulliModel(alpha=603, beta=106) | 0.89 | 0
INFO: ...................GATC.................., 602 seqs. model: BetaBernoulliModel(alpha=603, beta=106). (25.8 % left)
INFO: Continuing search
DEBUG: ....................A.................... | BetaBernoulliModel(alpha=807, beta=45886) | 0.00 | 0
DEBUG: ...................CA.................... | BetaBernoulliModel(alpha=52, beta=10001) | -0.00 | 2
DEBUG: .................C.CA.................... | BetaBernoulliModel(alpha=48, beta=1581) | 0.00 | 2
DEBUG: .................CCCA.................... | BetaBernoulliModel(alpha=48, beta=354

In [3]:
def remove_noisy_motifs(motif_df):
    """Keep only the rows whose motif has no isolated bases.

    A ``nm.candidate.Motif`` is built from every (``motif``, ``mod_position``)
    pair in ``motif_df``; motifs for which ``have_isolated_bases()`` is truthy
    are discarded.

    Parameters
    ----------
    motif_df
        Frame with a ``motif`` and a ``mod_position`` column.

    Returns
    -------
    ``motif_df`` filtered to the rows whose ``motif`` value is among the kept
    motifs. Matching is done on the ``motif`` column only.
    """
    candidates = [
        nm.candidate.Motif(text, position)
        for text, position in zip(
            motif_df.get_column("motif").to_list(),
            motif_df.get_column("mod_position").to_list(),
        )
    ]
    kept = [candidate for candidate in candidates if not candidate.have_isolated_bases()]
    return motif_df.filter(col("motif").is_in(kept))


def remove_sub_motifs(motif_df):
    """Drop every motif that is reported as a sub-string of another motif.

    A ``nm.candidate.Motif`` is built from every (``motif``, ``mod_position``)
    pair in ``motif_df``. A motif is kept only when ``sub_string_of`` is falsy
    against every other motif in the frame.

    Parameters
    ----------
    motif_df
        Frame with a ``motif`` and a ``mod_position`` column.

    Returns
    -------
    ``motif_df`` filtered to the rows whose ``motif`` value is among the kept
    motifs. Matching is done on the ``motif`` column only.
    """
    candidates = [
        nm.candidate.Motif(text, position)
        for text, position in zip(
            motif_df.get_column("motif").to_list(),
            motif_df.get_column("mod_position").to_list(),
        )
    ]
    kept = []
    for own_index, candidate in enumerate(candidates):
        # Compare against every other motif; the list (rather than a generator)
        # makes sure every comparison is evaluated.
        hits = [
            candidate.sub_string_of(rival)
            for rival_index, rival in enumerate(candidates)
            if rival_index != own_index
        ]
        if not any(hits):
            kept.append(candidate)
    return motif_df.filter(col("motif").is_in(kept))

In [4]:
# Display the motif table returned by process_sample for the mruber sample.
mr_motifs

sequence,model,score,contig,mod_type,motif,mod_position
str,object,f64,str,str,str,i64
"""..............…","BetaBernoulliModel(alpha=1811, beta=66)",1.829449,"""contig_1""","""a""","""GGGAGC""",3
"""..............…","BetaBernoulliModel(alpha=4086, beta=553)",1.607063,"""contig_1""","""a""","""TTAA""",3
"""..............…","BetaBernoulliModel(alpha=1229, beta=9)",1.531066,"""contig_1""","""a""","""GGCA......TGG""",3
"""..............…","BetaBernoulliModel(alpha=5537, beta=644)",1.52708,"""contig_1""","""a""","""AATT""",1
"""..............…","BetaBernoulliModel(alpha=13544, beta=5875)",0.874797,"""contig_1""","""a""","""GATC""",1
"""..............…","BetaBernoulliModel(alpha=12759, beta=2306)",0.497409,"""contig_1""","""a""","""CTCGAG""",4
""".........GGG[T…","BetaBernoulliModel(alpha=8, beta=1)",0.323157,"""contig_1""","""a""","""GGG[TG]GGCGG..…",11
""".....G...GGGGG…","BetaBernoulliModel(alpha=6, beta=1)",0.298298,"""contig_1""","""a""","""G...GGGGGG.G.G…",15
"""......GGGTGGTG…","BetaBernoulliModel(alpha=5, beta=1)",0.275857,"""contig_1""","""a""","""GGGTGGTGG.G.G.…",14
"""..........GGTG…","BetaBernoulliModel(alpha=4, beta=1)",0.251845,"""contig_1""","""a""","""GGTGGTGATCA...…",10


In [104]:
# Drop the motifs that remove_sub_motifs reports as sub-strings of another motif.
# NOTE(review): the whole frame is passed in one call and rows are matched by
# motif string only -- confirm that mr_motifs holds a single contig.
mr_motifs_filt = remove_sub_motifs(mr_motifs)

0: GGGAGC
1: TTAA
2: GGCA......TGG
3: AATT
4: GATC
5: CTCGAG
6: GGG[TG]GGCGG..ATG................C
7: G...GGGGGG.G.G.A.C.G
8: GGGTGGTGG.G.G.A.C........C
9: GGTGGTGATCA...................
  - GATC
10: G.TGGGGGTGG..AGG
11: GG.CTCGAGCACC
  - CTCGAG
12: GG.GGTGGTGGG.A...............T
13: GGGGG..GG..AGGG...............[AG]
14: G..GG.G.GGGG.A.G
15: G..GGTG.TGATCATC
  - GATC
16: G....GG.GGTGGGG.AGG
17: G.....G.GG.GG.GGTGGAG
18: GGGG.GG.[TG].A.CT.........C
19: GG.GGTG.GG.GTA
20: GG.GGTGGG.A.G
21: GGTGG[TG]G.CGGG.AG..........T
22: GG.GGTGG.GGT.A.......C
23: C...GC.GGTG.[TG]GTGGA


In [105]:
# Second filtering pass: drop the motifs that contain isolated bases.
mr_motifs_filt_clean = remove_noisy_motifs(mr_motifs_filt)

GGGAGC
TTAA
GGCA......TGG
AATT
GATC
CTCGAG
GGG[TG]GGCGG..ATG................C
  - Noisy
G...GGGGGG.G.G.A.C.G
  - Noisy
GGGTGGTGG.G.G.A.C........C
  - Noisy
G.TGGGGGTGG..AGG
  - Noisy
GG.GGTGGTGGG.A...............T
  - Noisy
GGGGG..GG..AGGG...............[AG]
  - Noisy
G..GG.G.GGGG.A.G
  - Noisy
G....GG.GGTGGGG.AGG
  - Noisy
G.....G.GG.GG.GGTGGAG
  - Noisy
GGGG.GG.[TG].A.CT.........C
  - Noisy
GG.GGTG.GG.GTA
GG.GGTGGG.A.G
GGTGG[TG]G.CGGG.AG..........T
  - Noisy
GG.GGTGG.GGT.A.......C
  - Noisy
C...GC.GGTG.[TG]GTGGA
  - Noisy
Removed: 13


sequence,model,score,contig,mod_type,motif,mod_position
str,object,f64,str,str,str,i64
"""..............…","BetaBernoulliModel(alpha=1811, beta=66)",1.829449,"""contig_1""","""a""","""GGGAGC""",3
"""..............…","BetaBernoulliModel(alpha=4086, beta=553)",1.607063,"""contig_1""","""a""","""TTAA""",3
"""..............…","BetaBernoulliModel(alpha=1229, beta=9)",1.531066,"""contig_1""","""a""","""GGCA......TGG""",3
"""..............…","BetaBernoulliModel(alpha=5537, beta=644)",1.52708,"""contig_1""","""a""","""AATT""",1
"""..............…","BetaBernoulliModel(alpha=13544, beta=5875)",0.874797,"""contig_1""","""a""","""GATC""",1
"""..............…","BetaBernoulliModel(alpha=12759, beta=2306)",0.497409,"""contig_1""","""a""","""CTCGAG""",4
""".......GG.GGTG…","BetaBernoulliModel(alpha=11, beta=8)",0.115143,"""contig_1""","""a""","""GG.GGTG.GG.GTA…",13
"""..........GG.G…","BetaBernoulliModel(alpha=42, beta=57)",0.106211,"""contig_1""","""a""","""GG.GGTGGG.A.G""",10


In [15]:
# Display the motif table returned by process_sample for the geobacillus sample.
gb_motifs

sequence,model,score,contig,mod_type,motif,mod_position
str,object,f64,str,str,str,i64
"""..............…","BetaBernoulliModel(alpha=23157, beta=3280)",1.454763,"""contig_1""","""a""","""GATC""",1
"""..............…","BetaBernoulliModel(alpha=2867, beta=318)",1.371515,"""contig_1""","""a""","""CCAAAT""",4
"""..............…","BetaBernoulliModel(alpha=1834, beta=769)",0.819725,"""contig_1""","""a""","""ACCCA""",4
"""..............…","BetaBernoulliModel(alpha=1264, beta=158)",0.755262,"""contig_1""","""a""","""GA.GAAGC""",5
"""..............…","BetaBernoulliModel(alpha=2142, beta=353)",0.679272,"""contig_1""","""a""","""GA.GAAG[TC]""",5
"""..............…","BetaBernoulliModel(alpha=60, beta=18)",0.561305,"""contig_1""","""a""","""ACCCAAAT""",5
"""..............…","BetaBernoulliModel(alpha=537, beta=108)",0.537634,"""contig_1""","""a""","""GG.GAAGC""",5
"""..........CG.C…","BetaBernoulliModel(alpha=4, beta=1)",0.335793,"""contig_1""","""a""","""CG.C.CGATCA[TC…",10
"""..............…","BetaBernoulliModel(alpha=399, beta=135)",0.310102,"""contig_1""","""a""","""GG.GAAGT""",5
""".........C..C.…","BetaBernoulliModel(alpha=4, beta=1)",0.293819,"""contig_1""","""a""","""C..C..CGATCACT…",11


In [4]:
# Filter sub-motifs one contig at a time: remove_sub_motifs matches rows by
# motif string only, so rows from different contigs must not share a call.
gb_filtered_motifs = []
# maintain_order=True keeps the contigs in the order they first appear in
# gb_motifs; without it the group order (and therefore the row order of the
# concatenated result) can differ from run to run.
for contig_key, df in gb_motifs.group_by("contig", maintain_order=True):
    gb_filtered_motifs.append(remove_sub_motifs(df))
gb_motifs_filt = pl.concat(gb_filtered_motifs)
gb_motifs_filt

0: GATC
1: CCAAAT
2: ACCCA
3: GA.GAAGC
  - GA.GAAG[TC]
4: GA.GAAG[TC]
5: ACCCAAAT
  - CCAAAT
  - ACCCA
6: GG.GAAGC
7: CG.C.CGATCA[TC]G.T
  - GATC
8: GG.GAAGT
9: C..C..CGATCACTT................
  - GATC
10: CGATCAT..C[GC].GC.....G
  - GATC
11: [TC]GATCATC.....CA...G
  - GATC
12: C.[GC].G..TGATCAT.....C
  - GATC
13: GC.GGCGATCA[TC]...........G
  - GATC
14: GGA.TGATCA[TC]..C.............C
  - GATC
15: CGATCAC.CT..C
  - GATC
16: G...TGATCATG.....C......G
  - GATC
17: TT.CCGATCAT.T...A
  - GATC
18: G......[TC]GATCACG.T...CG
  - GATC
19: C.....TGATCA[TC].CTG
  - GATC
20: A.........C..AATGATCAC
  - GATC
21: G...GCGATCAT.CCG......[GC]
  - GATC
22: CGATCACGC..........C
  - GATC
23: G.............GTGATCA[TC].CC
  - GATC
24: [TG]....TGATCAT......CG.G
  - GATC
25: G...........CGATCAT....G.T..GA
  - GATC
26: G....CGATCA[TC].TT..C
  - GATC
27: C.GTGATCAT..G...C
  - GATC
28: GG...CGATCAT......C
  - GATC
29: G..G...[TC]GATCATC.C
  - GATC
30: G..T..[TC]GATCAC..C
  - GATC
31: CG...CGATCA[TC]T.T.......G


sequence,model,score,contig,mod_type,motif,mod_position
str,object,f64,str,str,str,i64
"""..............…","BetaBernoulliModel(alpha=23157, beta=3280)",1.454763,"""contig_1""","""a""","""GATC""",1
"""..............…","BetaBernoulliModel(alpha=2867, beta=318)",1.057583,"""contig_1""","""a""","""CCAAAT""",4
"""..............…","BetaBernoulliModel(alpha=1834, beta=769)",0.819725,"""contig_1""","""a""","""ACCCA""",4
"""..............…","BetaBernoulliModel(alpha=2142, beta=353)",0.679272,"""contig_1""","""a""","""GA.GAAG[TC]""",5
"""..............…","BetaBernoulliModel(alpha=537, beta=108)",0.537634,"""contig_1""","""a""","""GG.GAAGC""",5
"""..............…","BetaBernoulliModel(alpha=399, beta=135)",0.310102,"""contig_1""","""a""","""GG.GAAGT""",5
"""..............…","BetaBernoulliModel(alpha=65, beta=12)",0.620491,"""contig_2""","""a""","""CCAAAT""",4
"""..............…","BetaBernoulliModel(alpha=521, beta=240)",0.552569,"""contig_2""","""a""","""GATC""",1
"""..............…","BetaBernoulliModel(alpha=57, beta=36)",0.356419,"""contig_2""","""a""","""ACCCA""",4
"""..............…","BetaBernoulliModel(alpha=71, beta=22)",0.254243,"""contig_2""","""a""","""G[AG].GAAG[TC]…",5


In [5]:
# Second filtering pass: drop the motifs that contain isolated bases, then show
# the remaining rows ordered by score.
gb_motifs_filt_clean = remove_noisy_motifs(gb_motifs_filt)
gb_motifs_filt_clean.sort("score")

GATC
CCAAAT
ACCCA
GA.GAAG[TC]
GG.GAAGC
GG.GAAGT
CCAAAT
GATC
ACCCA
G[AG].GAAG[TC]
CCAAAT
GATC
ACCCA
GA.GAAG[TC]
C............GG.GAAGT
  - Noisy
GG.GAAGC
GG..CG..A..C.....C.....C
  - Noisy
Removed: 2


sequence,model,score,contig,mod_type,motif,mod_position
str,object,f64,str,str,str,i64
"""..............…","BetaBernoulliModel(alpha=17, beta=5)",0.176612,"""contig_3""","""a""","""GG.GAAGC""",5
"""..............…","BetaBernoulliModel(alpha=71, beta=22)",0.254243,"""contig_2""","""a""","""G[AG].GAAG[TC]…",5
"""..............…","BetaBernoulliModel(alpha=399, beta=135)",0.310102,"""contig_1""","""a""","""GG.GAAGT""",5
"""..............…","BetaBernoulliModel(alpha=45, beta=7)",0.344808,"""contig_3""","""a""","""GA.GAAG[TC]""",5
"""..............…","BetaBernoulliModel(alpha=57, beta=36)",0.356419,"""contig_2""","""a""","""ACCCA""",4
"""..............…","BetaBernoulliModel(alpha=537, beta=108)",0.537634,"""contig_1""","""a""","""GG.GAAGC""",5
"""..............…","BetaBernoulliModel(alpha=521, beta=240)",0.552569,"""contig_2""","""a""","""GATC""",1
"""..............…","BetaBernoulliModel(alpha=48, beta=18)",0.558942,"""contig_3""","""a""","""ACCCA""",4
"""..............…","BetaBernoulliModel(alpha=65, beta=12)",0.620491,"""contig_2""","""a""","""CCAAAT""",4
"""..............…","BetaBernoulliModel(alpha=2142, beta=353)",0.679272,"""contig_1""","""a""","""GA.GAAG[TC]""",5


# Motif merging
Requires a bit more thought

In [6]:
def merge_motif(motif1, motif2):
    """Merge two motifs into one that allows the bases of both at each position.

    The motifs are aligned on their ``mod_position``. The result covers the
    positions of ``motif1``; positions of ``motif1`` that fall outside
    ``motif2`` become ``"."``.

    Parameters
    ----------
    motif1, motif2
        Motif objects exposing ``split()`` and ``mod_position``. ``split()`` is
        assumed to return one element per position (for example ``"A"``,
        ``"."`` or ``"[AT]"``) -- confirm against ``nm.candidate.Motif``.

    Returns
    -------
    ``nm.candidate.Motif(merged_string, motif1.mod_position).new_stripped_motif()``
    """
    motif1_split = motif1.split()
    motif2_split = motif2.split()
    # Position i of motif1 lines up with position i - index_offset of motif2.
    index_offset = motif1.mod_position - motif2.mod_position
    merged_elements = []
    for i, base in enumerate(motif1_split):
        j = i - index_offset
        # Bounds are checked on the motif2 index itself. A negative j must not
        # reach the list lookup, where it would silently wrap around to the end.
        if j < 0 or j >= len(motif2_split):
            merged_elements.append(".")
            continue
        # Collect the concrete bases named by either element; brackets and "."
        # are discarded by the intersection.
        # NOTE(review): an explicit "." inside the overlap therefore adds no
        # bases ("." merged with "A" gives "A"), whereas a position outside
        # motif2 gives ".". Confirm this asymmetry is intended.
        new_bases = set(base + motif2_split[j]).intersection("ACGT")
        if len(new_bases) == 4 or len(new_bases) == 0:
            merged_elements.append(".")
        elif len(new_bases) == 1:
            merged_elements.append(new_bases.pop())
        else:
            # Sorted so the bracket content does not depend on set iteration
            # order, which can change between interpreter runs.
            merged_elements.append("[" + "".join(sorted(new_bases)) + "]")
    return nm.candidate.Motif("".join(merged_elements), motif1.mod_position).new_stripped_motif()




In [13]:
# Rebuild Motif objects for the contig_1 rows of the cleaned geobacillus table.
contig_1_rows = gb_motifs_filt_clean.filter(col("contig") == "contig_1")
motifs_str = contig_1_rows.get_column("motif").to_list()
motifs_pos = contig_1_rows.get_column("mod_position").to_list()
motifs = [nm.candidate.Motif(text, position) for text, position in zip(motifs_str, motifs_pos)]

# Single greedy pass: every motif that has not been absorbed yet absorbs each
# other not-yet-absorbed motif whose distance() to it is at most 2. The distance
# is always measured against the seed motif, not against the running merge.
merged_motifs = []
merged_index = []
for i, motif in enumerate(motifs):
    if i not in merged_index:
        merged = motif
        for j, other in enumerate(motifs):
            if j != i and j not in merged_index and motif.distance(other) <= 2:
                merged = merge_motif(merged, other)
                merged_index.append(j)
        merged_motifs.append(merged)

merged_motifs

['GATC', 'CCAAAT', 'ACCCA', 'G[GA].GAAG[CT]']