In [1]:
# nanomotif supplies the assembly/pileup loaders and the motif search used in this notebook.
import nanomotif as nm
import logging as log
# Lower the root logger to DEBUG; the captured output of the Geobacillus run shows
# DEBUG lines from the candidate search. Statement order relative to the nanomotif
# import is kept as-is in case that import configures logging — TODO confirm.
log.getLogger().setLevel(log.DEBUG)
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): `np` and `pd` are not referenced by name in the cells visible here.
import numpy as np
import pandas as pd
# NOTE(review): `padding` is not used by any cell visible here — confirm it is still needed.
padding = 12


# Intro

Documentation of considerations in the implementation of the candidate selection. 

The selection is based on the sequences around methylation sites on the contig. At each methylated position, a frame of n positions on each side of the methylation site is extracted, and indices within this subsequence are given relative to the methylation position (range: -n to n, or 0 to 2n when zero-based).

The initial approach for candidate selection is outlined here:

1. Sample random sequences in contig at canonical bases (e.g. A when 6mA is evaluated)
2. Calculate [PSSM](https://cs.rice.edu/~ogilvie/comp571/pssm/) (positional frequencies of bases) for contig sequences
3. Extract sequences in contig at all methylation sites
4. Set motif candidate to canonical base
5. While len(methylation sequences) > minimum sequences 
    - Calculate PSSM for methylation sequences
    - Calculate KL-divergence from methylation PSSM to contig PSSM at each position
    - Select position in sequence with highest distance
    - Select most frequent base at this position
    - Add base to motif
    - Score new motif
    - if score > threshold
        - remove sequences with motif from methylation sequences
        - keep motif
    - else
        - select sequences with motif from methylation sequences

This should grow a motif by incorporating the most informative positions and bases first. 

In [None]:
# Load the E. coli inputs: the polished assembly FASTA and the modkit pileup BED.
# Both paths are relative to the notebook's working directory.
ec_assembly = nm.load_assembly("../data/ecoli/assembly.polished.fasta")
ec_pileup = nm.load_pileup("../data/ecoli/modkit.pileup.bed")

In [None]:
# Run the nanomotif motif search on the E. coli sample.
# The pileup table is passed without a mod_type filter here.
# The two thresholds are forwarded to nanomotif unchanged; see its documentation
# for their exact semantics.
ec_motifs = nm.evaluate.process_sample(
    ec_assembly, 
    ec_pileup.pileup, 
    min_read_methylation_fraction = 0.7,
    min_kl_divergence = 0.2
)


In [None]:
# Heatmap of mean methylation per (contig, motif) for E. coli.
# "mean" is taken from each row's model object; "contig_len" is a display label
# combining the contig name with its length in kB.
ec_motifs_plot = (
    ec_motifs
    .with_columns([
        pl.col("model").apply(lambda fitted: fitted.mean()).alias("mean"),
        pl.col("contig")
        .apply(lambda contig: f"{contig}, {len(ec_assembly.assembly[contig])/ 1e3:.0f}kB")
        .alias("contig_len"),
    ])
    .sort("motif", descending=True)
)

# Wide table: one row per contig label, one column per motif.
ec_mean_table = ec_motifs_plot.to_pandas().pivot(
    values="mean", index="contig_len", columns="motif"
)

heatmap = sns.heatmap(
    ec_mean_table,
    cmap="viridis",
    square=True,
    vmin=0.0,
    vmax=1.0,
    annot=True,
)
# Cells with no value for a (contig, motif) pair show the light gray background.
heatmap.set_facecolor('lightgray')
plt.title("Mean methylation of identified motifs in E. coli")
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=90);

# M. ruber

In [None]:
# Load the M. ruber inputs: the polished assembly FASTA and the modkit pileup BED.
# Both paths are relative to the notebook's working directory.
mr_assembly = nm.load_assembly("../data/mruber/assembly.polished.fasta")
mr_pileup = nm.load_pileup("../data/mruber/modkit.pileup.bed")

In [None]:
# Motif search for M. ruber, using only pileup rows whose mod_type equals "a".
mr_motifs = nm.evaluate.process_sample(
    mr_assembly,
    mr_pileup.pileup.filter(pl.col("mod_type") == "a"),
    min_read_methylation_fraction=0.80,
    min_kl_divergence=0.2,
)

In [None]:
# Heatmap of mean methylation per (contig, motif) for M. ruber.
# "mean" is taken from each row's model object; "contig_len" is a display label
# combining the contig name with its length in kB.
mr_motifs_plot = (
    mr_motifs
    .with_columns([
        pl.col("model").apply(lambda model: model.mean()).alias("mean"),
        pl.col("contig")
        .apply(lambda contig: f"{contig}, {len(mr_assembly.assembly[contig])/ 1e3:.0f}kB")
        .alias("contig_len"),
    ])
    .sort("motif", descending=True)
)
# The heatmap uses the "viridis" colormap directly; the previously assigned
# cubehelix palette was never passed to any plot and has been removed.
heatmap = sns.heatmap(
    mr_motifs_plot.to_pandas().pivot(values="mean", index="contig_len", columns="motif"),
    cmap="viridis",
    square=True,
    vmin=0,
    vmax=1,
    annot=True,
)
# Cells with no value for a (contig, motif) pair show the light gray background.
heatmap.set_facecolor('lightgray')
plt.title("Mean methylation of identified motifs in M. ruber");

# Geobacillus

In [2]:
# Load the Geobacillus inputs: the polished assembly FASTA and the modkit pileup BED.
# Both paths are relative to the notebook's working directory.
gb_assembly = nm.load_assembly("../data/geobacillus/assembly.polished.fasta")
gb_pileup = nm.load_pileup("../data/geobacillus/modkit.pileup.bed")

In [3]:
# Motif search for Geobacillus, using only pileup rows whose mod_type equals "a".
# Unlike the other samples, this call also sets max_candidate_size and
# min_valid_coverage explicitly. With the root logger at DEBUG, the run emits
# one line per candidate evaluated (see the captured output below).
gb_motifs = nm.evaluate.process_sample(
    gb_assembly, 
    gb_pileup.pileup.filter(pl.col("mod_type") == "a"), 
    max_candidate_size = 30, 
    min_valid_coverage=5,
    min_kl_divergence=0.2, 
    min_read_methylation_fraction = 0.8
    )

INFO: Processing contig_3
INFO: Processing a


DEBUG: ...............A............... | BetaBernoulliModel(alpha=807, beta=45886) | 0.06 | 0
DEBUG: ...............A.C............. | BetaBernoulliModel(alpha=686, beta=8740) | 0.79 | 0
DEBUG: ..............GA.C............. | BetaBernoulliModel(alpha=603, beta=1453) | 2.36 | 0
DEBUG: ..............GATC............. | BetaBernoulliModel(alpha=603, beta=106) | 4.62 | 0
INFO: ..............GATC............., 602 seqs. model: BetaBernoulliModel(alpha=603, beta=106). (25.8 % left)
INFO: Continuing search
DEBUG: ...............A............... | BetaBernoulliModel(alpha=807, beta=45886) | 0.06 | 0
DEBUG: ..............CA............... | BetaBernoulliModel(alpha=52, beta=10001) | 0.00 | 2
DEBUG: .............CCA............... | BetaBernoulliModel(alpha=48, beta=2509) | 0.18 | 2
DEBUG: ............CCCA............... | BetaBernoulliModel(alpha=48, beta=354) | 1.36 | 2
DEBUG: ...........ACCCA............... | BetaBernoulliModel(alpha=48, beta=18) | 5.60 | 2
DEBUG: ...........ACCCA..........

In [None]:
# Show only the Geobacillus motifs whose "score" column exceeds 1.
gb_motifs.filter(pl.col("score") > 1)


In [None]:
log.basicConfig(level=log.INFO);
gb_motifs_plot = gb_motifs.filter(pl.col("score") > 1) \
    .with_columns([
        pl.col("model").apply(lambda model: model.mean()).alias("mean"),
        pl.col("contig").apply(lambda contig: f"{contig}, {len(gb_assembly.assembly[contig])/ 1e3:.0f}kB").alias("contig_len"),
        (pl.col("motif") + pl.col("mod_position").cast(pl.Utf8)).alias("motif_position")
    ]).sort("motif", descending = True)
cmap = sns.cubehelix_palette(as_cmap=True, light=.9)
heatmap = sns.heatmap(gb_motifs_plot.to_pandas().pivot(values = "mean", index = "contig_len", columns = "motif_position"), 
                      cmap = "viridis", square=True, vmin = 0, vmax = 1, annot = True);
heatmap.set_facecolor('lightgray')
plt.title("Mean methylation of identified motifs in Geobacillus");

# E. coli -dcm/-dam

In [None]:
# Load the E. coli -dam/-dcm inputs: the polished assembly FASTA and the modkit pileup BED.
# Both paths are relative to the notebook's working directory.
ecneg_assembly = nm.load_assembly("../data/ecoli_neg/assembly.polished.fasta")
ecneg_pileup = nm.load_pileup("../data/ecoli_neg/modkit.pileup.bed")

In [None]:
# List the keys of the loaded assembly mapping (the contig identifiers used to index it).
ecneg_assembly.assembly.keys()

In [None]:
# Motif search on the E. coli -dam/-dcm sample with the read-methylation
# fraction threshold set to 0.
ecneg_motifs = nm.evaluate.process_sample(
    ecneg_assembly,
    ecneg_pileup.pileup,
    min_read_methylation_fraction=0,
)

# Combining results

In [None]:
# One row per sample: (display name, assembly, pileup). Keeping the three together
# guarantees `assemblies` and `pileups` share the same keys in the same order.
sample_inputs = [
    ("E. coli K12", ec_assembly, ec_pileup),
    ("E. coli -dam/-dcm", ecneg_assembly, ecneg_pileup),
    ("Geobacillus", gb_assembly, gb_pileup),
    ("M. ruber", mr_assembly, mr_pileup),
]
assemblies = {name: assembly for name, assembly, _ in sample_inputs}
pileups = {name: pileup for name, _, pileup in sample_inputs}

In [None]:
# Re-score every motif found in the E. coli, M. ruber and Geobacillus runs against
# every contig of every sample, so all samples can be compared on one motif set.
# NOTE(review): `ecneg_motifs` is not part of `all_motifs`; the -dam/-dcm sample is
# only scored against motifs found elsewhere — confirm this is intended.

# Pileup rows are kept only when strictly above both thresholds.
MIN_VALID_COVERAGE = 5
MIN_FRACTION_MOD = 0.80
# The score is 1 - model.cdf(SCORE_CDF_POINT).
SCORE_CDF_POINT = 0.55

all_motifs = pl.concat([ec_motifs, mr_motifs, gb_motifs])
all_motifs_scored = {
    "sample": [],
    "contig": [],
    "type": [],
    "motif": [],
    "model": [],
    "mean": [],
    "score": [],
    "contig_length": [],
    "mod_position": [],
}
for sample, assembly in assemblies.items():
    pileup_filt = (
        pileups[sample].pileup
        .filter(pl.col("Nvalid_cov") > MIN_VALID_COVERAGE)
        .filter(pl.col("fraction_mod") > MIN_FRACTION_MOD)
    )
    for contig, contig_sequence in assembly.assembly.items():
        print(contig)  # coarse progress: one line per contig
        pileup_filt_contig = pileup_filt.filter(pl.col("contig") == contig)
        for row in all_motifs.rows():
            # The last three columns of the motif table are read positionally and
            # used as: modification type, motif, modified position.
            mod_type, motif, mod_position = row[-3], row[-2], row[-1]
            model = nm.evaluate.score_candidate(
                pileup_filt_contig.filter(pl.col("mod_type") == mod_type),
                contig_sequence.sequence,
                motif,
                mod_position,
            )
            all_motifs_scored["sample"].append(sample)
            all_motifs_scored["contig"].append(contig)
            all_motifs_scored["motif"].append(motif)
            all_motifs_scored["model"].append(model)
            all_motifs_scored["mean"].append(model.mean())
            all_motifs_scored["score"].append(1 - model.cdf(SCORE_CDF_POINT))
            all_motifs_scored["type"].append(mod_type)
            all_motifs_scored["contig_length"].append(len(contig_sequence.sequence))
            all_motifs_scored["mod_position"].append(mod_position)

# Report how many (sample, contig, motif) combinations were scored rather than
# dumping the full dict of lists into the cell output.
len(all_motifs_scored["motif"])

In [None]:
# Build the combined results table and add two display keys:
#   id             - sample name + newline + contig length in whole kb (truncated)
#   motif_mod_type - motif, modification type and modified position joined by "_"
# The kb value is cast to Int64 (not Int16) so contigs of ~32.8 Mb and longer do
# not overflow the integer cast; labels for shorter contigs are unchanged.
# Note: `id` is not guaranteed unique — two contigs of one sample with the same
# truncated kb length get the same id.
motifs_all_samples = pl.DataFrame(all_motifs_scored).with_columns(
    (
        pl.col("sample")
        + "\n"
        + (pl.col("contig_length") / 1e3).cast(pl.Int64).cast(pl.Utf8)
        + "kb"
    ).alias("id"),
    (
        pl.col("motif")
        + "_"
        + pl.col("type")
        + "_"
        + pl.col("mod_position").cast(pl.Utf8)
    ).alias("motif_mod_type"),
)
motifs_all_samples

In [None]:
# Clustered heatmap of motif score per sample/contig id.
# Rows sharing the same (id, motif_mod_type) are collapsed to the last occurrence
# so the pivot has a unique index/column pair.
# The plot uses the "viridis" colormap directly; the previously assigned
# cubehelix palette was never passed to any plot and has been removed.
heatmap = sns.clustermap(
    motifs_all_samples.to_pandas()
    .reset_index()
    .drop_duplicates(subset=['id', 'motif_mod_type'], keep='last')
    .pivot(values="score", index="id", columns="motif_mod_type"),
    cmap="viridis",
    vmin=0,
    vmax=1,
    row_cluster=True,
    col_cluster=True,
    dendrogram_ratio=0.15,
    figsize=(8, 8),
)
heatmap.ax_heatmap.yaxis.set_ticklabels(
    heatmap.ax_heatmap.yaxis.get_ticklabels(), rotation=0, ha='left'
);

In [None]:
# Display the combined per-sample, per-contig motif table for inspection.
motifs_all_samples

In [None]:
# Wide table of mean methylation: one row per sample/contig id, one column per
# motif_mod_type. Duplicate (id, motif_mod_type) pairs keep their last occurrence.
(
    motifs_all_samples
    .to_pandas()
    .reset_index()
    .drop_duplicates(subset=['id', 'motif_mod_type'], keep='last')
    .pivot(values="mean", index="id", columns="motif_mod_type")
)

In [None]:
# Annotated heatmap of mean methylation per sample/contig id and motif_mod_type.
# Duplicate (id, motif_mod_type) pairs keep their last occurrence before pivoting.
mean_by_sample = (
    motifs_all_samples
    .to_pandas()
    .reset_index()
    .drop_duplicates(subset=['id', 'motif_mod_type'], keep='last')
    .pivot(values="mean", index="id", columns="motif_mod_type")
)
sns.heatmap(mean_by_sample, cmap="viridis", annot=True, square=True)

In [None]:

# Clustered, annotated heatmap of mean methylation (rounded to one decimal) per
# sample/contig id and motif_mod_type. Duplicate (id, motif_mod_type) pairs keep
# their last occurrence before pivoting.
cm_plot = sns.clustermap(
    (
        motifs_all_samples
        .with_columns(pl.col("mean").round(1).alias("mean_rounded"))
        .to_pandas()
        .reset_index()
        .drop_duplicates(subset=['id', 'motif_mod_type'], keep='last')
        .pivot(values="mean_rounded", index="id", columns="motif_mod_type")
    ),
    cmap="viridis",
    tree_kws={"linewidths": 0.},  # zero line width hides the dendrogram lines
    cbar_kws={"label": "Degree of methylation"},
    cbar_pos=(0.84, 0.31, 0.014, 0.52),
    vmin=0,
    vmax=1,
    figsize=(10, 6),
    annot=True,
)
# Axis label spelling fixed ("modifed" -> "modified").
cm_plot.ax_heatmap.set_xlabel("motif, type, modified position")
cm_plot.ax_heatmap.set_ylabel("")
cm_plot.ax_heatmap.yaxis.tick_left()