# Modkit dmr
## Use my Filtered Reads

Based on:
https://nanoporetech.github.io/modkit/intro_dmr.html#perform-differential-methylation-scoring

Select kernal: dimelo_v2_modkit_parsing

Preparing the input data
The inputs to all modkit dmr commands are two or more bedMethyl files (created by modkit pileup) that have been compressed with bgzip and indexed with tabix. An example of how to generate the input data is shown below:


ref=grch38.fasta
threads=32

norm=normal_sample.bam
norm_pileup=normal_pileup.bed

modkit pileup ${norm} ${norm_pileup} \
  --cpg \
  --ref ${ref} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${norm_pileup}
tabix -p bed ${norm_pileup}.gz

# pileup and compression can also be done in one step
tumor=tumor_sample.bam
tumor_pileup=tumor_pileup.bed.gz

modkit pileup ${tumor} - \
  --cpg \
  --ref ${ref} \
  --threads ${threads} \
  --log-filepath log.txt | ${bgzip} -c > ${tumor_pileup}

tabix -p bed ${tumor_pileup}

In [53]:
from datetime import datetime

def current_time():
    """Returns the current date and time as a formatted string."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S") 

if __name__ == "__main__":
    print("Current Date and Time:", current_time())

Current Date and Time: 2025-11-09 23:26:37


In [54]:
%%bash
echo "hello"

hello


# Use the NEW modkit latest installed version in ipython kernel modkit_new

In [55]:
# ! python3 -m ipykernel install --user --name=modkit_new --display-name "modkit_new Python"
# ! which modkit

In [56]:
import os
os.environ["PATH"] = "/home/michalula/.cargo/bin:" + os.environ["PATH"]
! which modkit
! modkit --version

/home/michalula/.cargo/bin/modkit
modkit 0.5.1


In [57]:
! modkit

Modkit is a bioinformatics tool for working with modified bases from Oxford
Nanopore

[1m[4mUsage:[0m [1mmodkit[0m <COMMAND>

[1m[4mCommands:[0m
  [1mpileup[0m          Tabulates base modification calls across genomic positions.
                  This command produces a bedMethyl formatted file. Schema and
                  description of fields can be found in the README
  [1madjust-mods[0m     Performs various operations on BAM files containing base
                  modification information, such as converting base modification
                  codes and ignoring modification calls. Produces a BAM output
                  file
  [1mupdate-tags[0m     Renames Mm/Ml to tags to MM/ML. Also allows changing the mode
                  flag from silent '.' to explicitly '?' or '.'
  [1msample-probs[0m    Calculate an estimate of the base modification probability
                  distribution
  [1msummary[0m         Summarize the mod tags present in a BAM and get basic
 

In [58]:
! modkit --version 

modkit 0.5.1


In [59]:
import os
import pandas as pd

def load_pileup_bed(bed_path):
    # bed_path = existing[0]
    print("Reading bedMethyl file:", bed_path)

    # bedMethyl column names (18 columns as provided)
    colnames = [
        "chrom", "start", "end", "mod_code", "score", "strand",
        "start2", "end2", "color",
        "Nvalid_cov", "percent_modified", "Nmod", "Ncanonical",
        "Nother_mod", "Ndelete", "Nfail", "Ndiff", "Nnocall"
    ]

    # Configure dtypes where reasonable
    dtypes = {
        "chrom": str,
        "start": "Int64",
        "end": "Int64",
        "mod_code": str,
        "score": "Int64",
        "strand": str,
        "start2": "Int64",
        "end2": "Int64",
        "color": str,
        "Nvalid_cov": "Int64",
        "percent_modified": float,
        "Nmod": "Int64",
        "Ncanonical": "Int64",
        "Nother_mod": "Int64",
        "Ndelete": "Int64",
        "Nfail": "Int64",
        "Ndiff": "Int64",
        "Nnocall": "Int64"
    }

    compression = "gzip" if bed_path.endswith(".gz") else None

    # Read file (headerless BED-like table). If file has extra columns, keep them with automatic numeric conversion below.
    df = pd.read_csv(
        pileup_CROFF_day35_bed,
        sep="\t",
        header=None,
        comment="#",
        names=colnames,
        dtype=dtypes,
        compression=compression,
        engine="python",
        na_values=[".", "NA", ""],
        keep_default_na=True
    )

    # If file contained more than 18 columns, pandas assigned remaining data to extra columns named like col_18, col_19...
    # Ensure numeric conversion for numeric-like columns
    for c in df.columns:
        if df[c].dtype == object:
            # try safe numeric conversion where appropriate
            try:
                df[c] = pd.to_numeric(df[c], errors="ignore")
            except Exception:
                pass

    print("Loaded DataFrame shape:", df.shape)
    display(df.head())
    return df


In [60]:
import os
from IPython.display import display, HTML
from plotly import express as px
from plotly import graph_objects as go

def plot_pileup_roi_df(df_roi, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    # ensure numeric types for plotting
    df_roi['pos'] = df_roi['start'].astype(int)
    df_roi['percent_modified'] = df_roi['percent_modified'].astype(float)
    df_roi['Nvalid_cov'] = df_roi['Nvalid_cov'].astype(int)
    df_roi['Nmod'] = df_roi['Nmod'].astype(int)
    df_roi['Ncanonical'] = df_roi['Ncanonical'].astype(int)

    # Scatter: genomic position vs percent modified (point size = coverage)
    fig1 = px.scatter(
        df_roi,
        x='pos',
        y='percent_modified',
        color='strand',
        size='Nvalid_cov',
        hover_data=['Nvalid_cov','Nmod','Ncanonical','Nother_mod','Nnocall'],
        title='Percent modified across ROI (size = Nvalid_cov)',
        height=500
    )
    fig1.update_layout(xaxis_title='Genomic position (start)', yaxis_title='Percent modified')
    fig1.show()
    fig1.write_html(os.path.join(out_dir, "roi_percent_modified_scatter.html"), include_plotlyjs='cdn')

    # Histogram: coverage distribution
    fig2 = px.histogram(
        df_roi,
        x='Nvalid_cov',
        nbins=40,
        title='Distribution of Nvalid_cov (coverage) in ROI',
        height=400
    )
    fig2.update_layout(xaxis_title='Nvalid_cov', yaxis_title='Count')
    fig2.show()
    fig2.write_html(os.path.join(out_dir, "roi_nvalidcov_hist.html"), include_plotlyjs='cdn')

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked)
    topn = 30
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod / Ncanonical)',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Print simple summaries
    print("ROI rows:", df_roi.shape[0])
    print("Percent modified: median={:.2f}, mean={:.2f}".format(df_roi['percent_modified'].median(), df_roi['percent_modified'].mean()))
    print("Coverage (Nvalid_cov): min={}, median={}, max={}".format(df_roi['Nvalid_cov'].min(), df_roi['Nvalid_cov'].median(), df_roi['Nvalid_cov'].max()))

    # Display first rows table for quick inspection
    display(HTML(df_roi.head(20).to_html(index=False)))


    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked)
    topn = 274
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod / Ncanonical)',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Print simple summaries
    print("ROI rows:", df_roi.shape[0])
    print("Percent modified: median={:.2f}, mean={:.2f}".format(df_roi['percent_modified'].median(), df_roi['percent_modified'].mean()))
    print("Coverage (Nvalid_cov): min={}, median={}, max={}".format(df_roi['Nvalid_cov'].min(), df_roi['Nvalid_cov'].median(), df_roi['Nvalid_cov'].max()))

    # Display first rows table for quick inspection
    display(HTML(df_roi.head(20).to_html(index=False)))

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    topn = 30
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig4 = go.Figure()
    fig4.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig4.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig4.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig4.show()
    fig4.write_html(os.path.join(out_dir, "roi_top_sites_stacked_percentage.html"), include_plotlyjs='cdn')     


    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    topn = 277
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig4 = go.Figure()
    fig4.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig4.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig4.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig4.show()
    fig4.write_html(os.path.join(out_dir, "roi_top_sites_stacked_percentage.html"), include_plotlyjs='cdn')     

    return df_top



In [61]:
! ls /home/michalula/data/ref_genomes/t2t_v2_0/

chm13v2.0.fa	  chm13v2.0.fa.fai		   haplotype_vcf
chm13v2.0.fa.amb  chm13v2.0.fa.pac		   up_chm13v2.0.fasta
chm13v2.0.fa.ann  chm13v2.0.fa.sa		   up_chm13v2.0.fasta.fai
chm13v2.0.fa.bwt  convert_to_uppercase_fasta.bash


# Pileups 
## for CRISPRoff filtered data for Day 35 

In [62]:
! ls /home/michalula/data/cas9_nanopore/data/20250908_nCATs_T_CRoff_Day_35/5mCG/to_t2t_v2_0

align_t2t_v2_0_trim_20250908_Day35_CROFF_Tcells_2Libraries_Minion_R9.dna_r9.4.1_e8_sup@v3.3.5mCG.bam
sort_align_t2t_v2_0_trim_20250908_Day35_CROFF_Tcells_2Libraries_Minion_R9.dna_r9.4.1_e8_sup@v3.3.5mCG.bam
sort_align_t2t_v2_0_trim_20250908_Day35_CROFF_Tcells_2Libraries_Minion_R9.dna_r9.4.1_e8_sup@v3.3.5mCG.bam.bai
summary_sort_align_t2t_v2_0_trim_20250908_Day35_CROFF_Tcells_2Libraries_Minion_R9.dna_r9.4.1_e8_sup@v3.3.5mCG.tsv


In [75]:
%%bash

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref= "/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"

threads=32

date_today="20251109"
# data_folder_path="/home/michalula/data/cas9_nanopore/data/20250908_nCATs_T_CRoff_Day_35/5mCG/to_t2t_v2_0/"
# CROFF_day35_bam=${data_folder_path}"sort_align_t2t_v2_0_trim_20250908_Day35_CROFF_Tcells_2Libraries_Minion_R9.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
mkdir ${pileup_data_folder_path}

filtered_CROFF_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_CROFF_day35_bed=${pileup_data_folder_path}${date_today}"_filtered""_pileup_CROFF_Day35_Tcells.bed"
# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_CROFF_Day28/mergered_outputs/pileup_sort_merge_sort_align_t2t_v1_1_trim_20250721_nCATs_Tcells_CROFF_Day28_minion_run2_day8.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"
# norm=normal_sample.bam
# norm_pileup=normal_pileup.bed

modkit pileup ${filtered_CROFF_day35_bam} ${pileup_CROFF_day35_bed} \
  --cpg \
  --ref ${ref_genome_fa} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${pileup_CROFF_day35_bed}
tabix -p bed ${pileup_CROFF_day35_bed}.gz

printf '%s\n' "filtered_CROFF_day35_bam: $filtered_CROFF_day35_bam"
printf '%s\n' "pileup_CROFF_day35_bed: $pileup_CROFF_day35_bed"
cat "$pileup_CROFF_day35_bed"

mkdir: cannot create directory ‘/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/new_pileup/’: File exists
[0;32m>[0m calculated chunk size: 48, interval size 100000, processing 4800000 positions concurrently
[0;32m>[0m filtering to only CpG motifs
[0;32m>[0m attempting to sample 10042 reads
[0;32m>[0m Using filter threshold 0.7910156 for C.
[0;32m>[0m Done, processed 327 rows. Processed ~326 reads and skipped zero reads.
[tabix] the index file exists. Please use '-f' to overwrite.


filtered_CROFF_day35_bam: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
pileup_CROFF_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251109_filtered_pileup_CROFF_Day35_Tcells.bed
chr1	206583089	206583090	m	4	+	206583089	206583090	255,0,0	4	75.00	3	1	0	0	0	0	0
chr1	206583090	206583091	m	2	-	206583090	206583091	255,0,0	2	100.00	2	0	0	0	0	0	0
chr1	206583173	206583174	m	143	+	206583173	206583174	255,0,0	143	90.91	130	13	0	6	25	16	6
chr1	206583174	206583175	m	118	-	206583174	206583175	255,0,0	118	94.92	112	6	0	1	2	0	5
chr1	206583387	206583388	m	171	+	206583387	206583388	255,0,0	171	76.61	131	40	0	0	16	5	7
chr1	206583388	206583389	m	98	-	20

bedMethyl column descriptions.

Definitions:

Nmod - Number of calls passing filters that were classified as a residue with a specified base modification.

Ncanonical - Number of calls passing filters were classified as the canonical base rather than modified. The exact base must be inferred by the modification code. For example, if the modification code is m (5mC) then the canonical base is cytosine. If the modification code is a, the canonical base is adenine.

Nother mod - Number of calls passing filters that were classified as modified, but where the modification is different from the listed base (and the corresponding canonical base is equal). For example, for a given cytosine there may be 3 reads with h calls, 1 with a canonical call, and 2 with m calls. In the bedMethyl row for h Nother_mod would be 2. In the m row Nother_mod would be 3.

Nvalid_cov - the valid coverage. Nvalid_cov = Nmod + Nother_mod + Ncanonical, also used as the score in the bedMethyl

Ndiff - Number of reads with a base other than the canonical base for this modification. For example, in a row for h the canonical base is cytosine, if there are 2 reads with C->A substitutions, Ndiff will be 2.

Ndelete - Number of reads with a deletion at this reference position

Nfail - Number of calls where the probability of the call was below the threshold. The threshold can be set on the command line or computed from the data (usually failing the lowest 10th percentile of calls).

Nnocall - Number of reads aligned to this reference position, with the correct canonical base, but without a base modification call. This can happen, for example, if the model requires a CpG dinucleotide and the read has a CG->CH substitution such that no modification call was produced by the basecaller.


column	name	description	type

    1	chrom	name of reference sequence from BAM header	str

    2	start position	0-based start position	int

    3	end position	0-based exclusive end position	int

    4	modified base code and motif	single letter code for modified base and motif when more than one motif is used	str

    5	score	equal to Nvalid_cov	int

    6	strand	'+' for positive strand '-' for negative strand, '.' when strands are combined	str

    7	start position	included for compatibility	int

    8	end position	included for compatibility	int

    9	color	included for compatibility, always 255,0,0	str

    10	Nvalid_cov	see definitions above.	int

    11	percent modified	(Nmod / Nvalid_cov) * 100	float

    12	Nmod	see definitions above	int

    13	Ncanonical	see definitions above	int

    14	Nother_mod	see definitions above	int

    15	Ndelete	see definitions above	int

    16	Nfail	see definitions above	int

    17	Ndiff	see definitions above	int

    18	Nnocall	see definitions above	int


In [76]:
date_today="20251109"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=data_folder_path+"new_pileup/"
# mkdir ${pileup_data_folder_path}

filtered_CROFF_day35_bam=data_folder_path+"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_CROFF_day35_bed=pileup_data_folder_path+date_today+"_filtered""_pileup_CROFF_Day35_Tcells.bed"



In [77]:

pileup_CROFF_day35_df = load_pileup_bed(pileup_CROFF_day35_bed)

Reading bedMethyl file: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251109_filtered_pileup_CROFF_Day35_Tcells.bed
Loaded DataFrame shape: (327, 18)



errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead



Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,4,+,206583089,206583090,25500,4,75.0,3,1,0,0,0,0,0
1,chr1,206583090,206583091,m,2,-,206583090,206583091,25500,2,100.0,2,0,0,0,0,0,0
2,chr1,206583173,206583174,m,143,+,206583173,206583174,25500,143,90.91,130,13,0,6,25,16,6
3,chr1,206583174,206583175,m,118,-,206583174,206583175,25500,118,94.92,112,6,0,1,2,0,5
4,chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7


In [78]:
pileup_CROFF_day35_df

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,4,+,206583089,206583090,25500,4,75.00,3,1,0,0,0,0,0
1,chr1,206583090,206583091,m,2,-,206583090,206583091,25500,2,100.00,2,0,0,0,0,0,0
2,chr1,206583173,206583174,m,143,+,206583173,206583174,25500,143,90.91,130,13,0,6,25,16,6
3,chr1,206583174,206583175,m,118,-,206583174,206583175,25500,118,94.92,112,6,0,1,2,0,5
4,chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,chr1,206597139,206597140,m,1,+,206597139,206597140,25500,1,100.00,1,0,0,0,0,0,0
323,chr1,206597440,206597441,m,1,+,206597440,206597441,25500,1,100.00,1,0,0,0,0,0,0
324,chr1,206598045,206598046,m,1,+,206598045,206598046,25500,1,100.00,1,0,0,0,0,0,0
325,chr1,206598626,206598627,m,1,+,206598626,206598627,25500,1,100.00,1,0,0,0,0,0,0


# Look at CpGs within out target ROI
T2T v2.0

First CG:
206583388,206583390

Last of selected 137 CGs in the ROI:

206589746,206589748 --CpG_137

=> here have each CG position separate: so have 137*2  = 276

In [79]:
137*2, 277-5

(274, 272)

In [80]:
pileup_CROFF_day35_df[pileup_CROFF_day35_df['start'] == 206583387]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7


In [81]:
pileup_CROFF_day35_df[pileup_CROFF_day35_df['start'] == 206583388]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
5,chr1,206583388,206583389,m,98,-,206583388,206583389,25500,98,83.67,82,16,0,4,12,6,7


In [82]:
pileup_CROFF_day35_df[pileup_CROFF_day35_df['start'] == 206589746]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
277,chr1,206589746,206589747,m,102,-,206589746,206589747,25500,102,98.04,100,2,0,13,2,3,7


In [83]:
(279-5) / 2

137.0

In [84]:
pileup_CROFF_day35_df_roi = pileup_CROFF_day35_df.iloc[4:278, :]  # Display target region rows
print(pileup_CROFF_day35_df_roi.shape,pileup_CROFF_day35_df_roi.shape[0]/2)
pileup_CROFF_day35_df_roi

(274, 18) 137.0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7
5,chr1,206583388,206583389,m,98,-,206583388,206583389,25500,98,83.67,82,16,0,4,12,6,7
6,chr1,206583707,206583708,m,187,+,206583707,206583708,25500,187,93.58,175,12,0,1,3,6,2
7,chr1,206583708,206583709,m,106,-,206583708,206583709,25500,106,96.23,102,4,0,6,8,1,6
8,chr1,206583766,206583767,m,169,+,206583766,206583767,25500,169,87.57,148,21,0,3,2,20,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,114,-,206589213,206589214,25500,114,95.61,109,5,0,4,5,1,3
274,chr1,206589436,206589437,m,190,+,206589436,206589437,25500,190,92.63,176,14,0,1,2,4,2
275,chr1,206589437,206589438,m,116,-,206589437,206589438,25500,116,96.55,112,4,0,1,5,1,4
276,chr1,206589745,206589746,m,172,+,206589745,206589746,25500,172,100.00,172,0,0,7,2,6,11


<!>
> Threshold of  0.7597656 for base C is low. Consider increasing the filter-percentile or specifying a higher threshold.
> Done, processed 11762972 rows. Processed ~129977 reads and skipped ~150 reads.

In [85]:
# Plot pileup_Unedit_day35_df_roi summary plots (use existing variables/imports in the notebook)
# Saves interactive HTMLs to pileup_data_folder_path and displays inline.
out_dir = pileup_data_folder_path  # existing variable in the notebook

df_roi = pileup_CROFF_day35_df_roi.copy()

df_roi_stats = plot_pileup_roi_df(df_roi=pileup_CROFF_day35_df_roi, out_dir=pileup_data_folder_path)
df_roi_stats




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

ROI rows: 274
Percent modified: median=81.37, mean=73.62
Coverage (Nvalid_cov): min=18, median=121.0, max=197


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7,206583387
chr1,206583388,206583389,m,98,-,206583388,206583389,25500,98,83.67,82,16,0,4,12,6,7,206583388
chr1,206583707,206583708,m,187,+,206583707,206583708,25500,187,93.58,175,12,0,1,3,6,2,206583707
chr1,206583708,206583709,m,106,-,206583708,206583709,25500,106,96.23,102,4,0,6,8,1,6,206583708
chr1,206583766,206583767,m,169,+,206583766,206583767,25500,169,87.57,148,21,0,3,2,20,5,206583766
chr1,206583767,206583768,m,105,-,206583767,206583768,25500,105,94.29,99,6,0,1,13,1,7,206583767
chr1,206584104,206584105,m,175,+,206584104,206584105,25500,175,94.29,165,10,0,4,2,15,3,206584104
chr1,206584105,206584106,m,111,-,206584105,206584106,25500,111,95.5,106,5,0,2,2,4,8,206584105
chr1,206584137,206584138,m,194,+,206584137,206584138,25500,194,97.42,189,5,0,0,5,0,0,206584137
chr1,206584138,206584139,m,83,-,206584138,206584139,25500,83,85.54,71,12,0,0,44,0,0,206584138


ROI rows: 274
Percent modified: median=81.37, mean=73.62
Coverage (Nvalid_cov): min=18, median=121.0, max=197


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7,206583387
chr1,206583388,206583389,m,98,-,206583388,206583389,25500,98,83.67,82,16,0,4,12,6,7,206583388
chr1,206583707,206583708,m,187,+,206583707,206583708,25500,187,93.58,175,12,0,1,3,6,2,206583707
chr1,206583708,206583709,m,106,-,206583708,206583709,25500,106,96.23,102,4,0,6,8,1,6,206583708
chr1,206583766,206583767,m,169,+,206583766,206583767,25500,169,87.57,148,21,0,3,2,20,5,206583766
chr1,206583767,206583768,m,105,-,206583767,206583768,25500,105,94.29,99,6,0,1,13,1,7,206583767
chr1,206584104,206584105,m,175,+,206584104,206584105,25500,175,94.29,165,10,0,4,2,15,3,206584104
chr1,206584105,206584106,m,111,-,206584105,206584106,25500,111,95.5,106,5,0,2,2,4,8,206584105
chr1,206584137,206584138,m,194,+,206584137,206584138,25500,194,97.42,189,5,0,0,5,0,0,206584137
chr1,206584138,206584139,m,83,-,206584138,206584139,25500,83,85.54,71,12,0,0,44,0,0,206584138


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
276,chr1,206589745,206589746,m,172,+,206589745,206589746,25500,172,...,0,7,2,6,11,206589745,206589745:+,172,100.000000,0.000000
21,chr1,206584202,206584203,m,127,-,206584202,206584203,25500,127,...,0,0,0,0,0,206584202,206584202:-,127,98.425197,1.574803
14,chr1,206584151,206584152,m,185,+,206584151,206584152,25500,185,...,0,1,12,1,0,206584151,206584151:+,185,98.378378,1.621622
277,chr1,206589746,206589747,m,102,-,206589746,206589747,25500,102,...,0,13,2,3,7,206589746,206589746:-,102,98.039216,1.960784
18,chr1,206584178,206584179,m,188,+,206584178,206584179,25500,188,...,0,0,10,1,0,206584178,206584178:+,188,97.872340,2.127660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,chr1,206587018,206587019,m,92,-,206587018,206587019,25500,92,...,0,4,14,0,17,206587018,206587018:-,92,18.478261,81.521739
79,chr1,206586454,206586455,m,101,-,206586454,206586455,25500,101,...,0,1,8,5,12,206586454,206586454:-,101,16.831683,83.168317
192,chr1,206587149,206587150,m,176,+,206587149,206587150,25500,176,...,0,2,11,0,10,206587149,206587149:+,176,16.477273,83.522727
188,chr1,206587105,206587106,m,164,+,206587105,206587106,25500,164,...,0,3,5,1,26,206587105,206587105:+,164,15.853659,84.146341


# Unedited T cells Day 35

In [None]:
! ls "/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"

/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam


In [None]:
%%bash

date_today="20251109"
data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
mkdir ${pileup_data_folder_path}

filtered_Unedit_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_Unedit_day35_bed=${pileup_data_folder_path}${date_today}"_filtered""_pileup_NT_Day35_Tcells.bed"
cat "$pileup_Unedit_day35_bed"

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref="/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"

threads=32

modkit pileup ${filtered_Unedit_day35_bam} ${pileup_Unedit_day35_bed} \
  --cpg \
  --ref ${ref_genome_fa} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${pileup_Unedit_day35_bed}
tabix -p bed ${pileup_Unedit_day35_bed}.gz

printf '%s\n' "filtered_Unedit_day35_bam: $filtered_Unedit_day35_bam"
printf '%s\n' "pileup_Unedit_day35_bed: $pileup_Unedit_day35_bed"
cat "$pileup_Unedit_day35_bed"


mkdir: cannot create directory ‘/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/’: File exists


chr1	206583090	206583091	m	1	-	206583090	206583091	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206583173	206583174	m	78	+	206583173	206583174	255,0,0	78	84.62	66	12	0	2	17	7	4
chr1	206583174	206583175	m	84	-	206583174	206583175	255,0,0	84	96.43	81	3	0	1	3	2	2
chr1	206583387	206583388	m	81	+	206583387	206583388	255,0,0	81	71.60	58	23	0	1	15	9	3
chr1	206583388	206583389	m	71	-	206583388	206583389	255,0,0	71	85.92	61	10	0	3	12	1	5
chr1	206583707	206583708	m	99	+	206583707	206583708	255,0,0	99	95.96	95	4	0	3	3	0	4
chr1	206583708	206583709	m	77	-	206583708	206583709	255,0,0	77	96.10	74	3	0	1	7	6	1
chr1	206583766	206583767	m	95	+	206583766	206583767	255,0,0	95	93.68	89	6	0	2	1	10	1
chr1	206583767	206583768	m	75	-	206583767	206583768	255,0,0	75	93.33	70	5	0	2	11	2	2
chr1	206584104	206584105	m	97	+	206584104	206584105	255,0,0	97	95.88	93	4	0	1	1	9	1
chr1	206584105	206584106	m	88	-	206584105	206584106	255,0,0	88	93.18	82	6	0	0	1	0	3
chr1	206584137	206584138	m	104	+	206584137	206584138	255,0,0	104	100.0

[0;32m>[0m calculated chunk size: 48, interval size 100000, processing 4800000 positions concurrently
[0;32m>[0m filtering to only CpG motifs
[0;32m>[0m attempting to sample 10042 reads
[0;32m>[0m Using filter threshold 0.8496094 for C.
[0;32m>[0m Done, processed 285 rows. Processed ~201 reads and skipped zero reads.
[tabix] the index file exists. Please use '-f' to overwrite.


filtered_Unedit_day35_bam: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
pileup_Unedit_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251109_filtered_pileup_NT_Day35_Tcells.bed
chr1	206583090	206583091	m	1	-	206583090	206583091	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206583173	206583174	m	78	+	206583173	206583174	255,0,0	78	84.62	66	12	0	2	17	7	4
chr1	206583174	206583175	m	84	-	206583174	206583175	255,0,0	84	96.43	81	3	0	1	3	2	2
chr1	206583387	206583388	m	81	+	206583387	206583388	255,0,0	81	71.60	58	23	0	1	15	9	3
chr1	206583388	206583389	m	71	-	206583388	206583389	255,0,0	71	85.92	61	10	0	3	12	1	5
chr1	206583707	206583708	m	99	+	206583707	20658

In [None]:
date_today="20251109"
data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/"
pileup_data_folder_path=data_folder_path+"new_pileup/"
# mkdir ${pileup_data_folder_path}

# filtered_Unedit_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_Unedit_day35_bed=pileup_data_folder_path+date_today+"_filtered""_pileup_NT_Day35_Tcells.bed"
print("pileup_Unedit_day35_bed:", pileup_Unedit_day35_bed)

pileup_Unedit_day35_df = load_pileup_bed(pileup_Unedit_day35_bed)
pileup_Unedit_day35_df

pileup_Unedit_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251109_filtered_pileup_NT_Day35_Tcells.bed
Reading bedMethyl file: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251109_filtered_pileup_NT_Day35_Tcells.bed
Loaded DataFrame shape: (327, 18)



errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead



Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,4,+,206583089,206583090,25500,4,75.0,3,1,0,0,0,0,0
1,chr1,206583090,206583091,m,2,-,206583090,206583091,25500,2,100.0,2,0,0,0,0,0,0
2,chr1,206583173,206583174,m,143,+,206583173,206583174,25500,143,90.91,130,13,0,6,25,16,6
3,chr1,206583174,206583175,m,118,-,206583174,206583175,25500,118,94.92,112,6,0,1,2,0,5
4,chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,4,+,206583089,206583090,25500,4,75.00,3,1,0,0,0,0,0
1,chr1,206583090,206583091,m,2,-,206583090,206583091,25500,2,100.00,2,0,0,0,0,0,0
2,chr1,206583173,206583174,m,143,+,206583173,206583174,25500,143,90.91,130,13,0,6,25,16,6
3,chr1,206583174,206583175,m,118,-,206583174,206583175,25500,118,94.92,112,6,0,1,2,0,5
4,chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,chr1,206597139,206597140,m,1,+,206597139,206597140,25500,1,100.00,1,0,0,0,0,0,0
323,chr1,206597440,206597441,m,1,+,206597440,206597441,25500,1,100.00,1,0,0,0,0,0,0
324,chr1,206598045,206598046,m,1,+,206598045,206598046,25500,1,100.00,1,0,0,0,0,0,0
325,chr1,206598626,206598627,m,1,+,206598626,206598627,25500,1,100.00,1,0,0,0,0,0,0


In [None]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206583388-1]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7


In [None]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206583388]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
5,chr1,206583388,206583389,m,98,-,206583388,206583389,25500,98,83.67,82,16,0,4,12,6,7


In [None]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206589746-1]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
276,chr1,206589745,206589746,m,172,+,206589745,206589746,25500,172,100.0,172,0,0,7,2,6,11


In [None]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206589746]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
277,chr1,206589746,206589747,m,102,-,206589746,206589747,25500,102,98.04,100,2,0,13,2,3,7


In [None]:
pileup_Unedit_day35_df_roi = pileup_Unedit_day35_df.iloc[4:278, :]  # Display target region rows
print(pileup_Unedit_day35_df_roi.shape,pileup_Unedit_day35_df_roi.shape[0]/2)
pileup_Unedit_day35_df_roi

(274, 18) 137.0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7
5,chr1,206583388,206583389,m,98,-,206583388,206583389,25500,98,83.67,82,16,0,4,12,6,7
6,chr1,206583707,206583708,m,187,+,206583707,206583708,25500,187,93.58,175,12,0,1,3,6,2
7,chr1,206583708,206583709,m,106,-,206583708,206583709,25500,106,96.23,102,4,0,6,8,1,6
8,chr1,206583766,206583767,m,169,+,206583766,206583767,25500,169,87.57,148,21,0,3,2,20,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,114,-,206589213,206589214,25500,114,95.61,109,5,0,4,5,1,3
274,chr1,206589436,206589437,m,190,+,206589436,206589437,25500,190,92.63,176,14,0,1,2,4,2
275,chr1,206589437,206589438,m,116,-,206589437,206589438,25500,116,96.55,112,4,0,1,5,1,4
276,chr1,206589745,206589746,m,172,+,206589745,206589746,25500,172,100.00,172,0,0,7,2,6,11


In [None]:
pileup_Unedit_day35_df_roi

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7
5,chr1,206583388,206583389,m,98,-,206583388,206583389,25500,98,83.67,82,16,0,4,12,6,7
6,chr1,206583707,206583708,m,187,+,206583707,206583708,25500,187,93.58,175,12,0,1,3,6,2
7,chr1,206583708,206583709,m,106,-,206583708,206583709,25500,106,96.23,102,4,0,6,8,1,6
8,chr1,206583766,206583767,m,169,+,206583766,206583767,25500,169,87.57,148,21,0,3,2,20,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,114,-,206589213,206589214,25500,114,95.61,109,5,0,4,5,1,3
274,chr1,206589436,206589437,m,190,+,206589436,206589437,25500,190,92.63,176,14,0,1,2,4,2
275,chr1,206589437,206589438,m,116,-,206589437,206589438,25500,116,96.55,112,4,0,1,5,1,4
276,chr1,206589745,206589746,m,172,+,206589745,206589746,25500,172,100.00,172,0,0,7,2,6,11


In [None]:
pileup_Unedit_day35_df_roi

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7
5,chr1,206583388,206583389,m,98,-,206583388,206583389,25500,98,83.67,82,16,0,4,12,6,7
6,chr1,206583707,206583708,m,187,+,206583707,206583708,25500,187,93.58,175,12,0,1,3,6,2
7,chr1,206583708,206583709,m,106,-,206583708,206583709,25500,106,96.23,102,4,0,6,8,1,6
8,chr1,206583766,206583767,m,169,+,206583766,206583767,25500,169,87.57,148,21,0,3,2,20,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,114,-,206589213,206589214,25500,114,95.61,109,5,0,4,5,1,3
274,chr1,206589436,206589437,m,190,+,206589436,206589437,25500,190,92.63,176,14,0,1,2,4,2
275,chr1,206589437,206589438,m,116,-,206589437,206589438,25500,116,96.55,112,4,0,1,5,1,4
276,chr1,206589745,206589746,m,172,+,206589745,206589746,25500,172,100.00,172,0,0,7,2,6,11


In [None]:
pileup_data_folder_path

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/'

In [None]:

# data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/"
# pileup_data_folder_path=data_folder_path+"new_pileup/"
# # mkdir ${pileup_data_folder_path}

# # Plot pileup_Unedit_day35_df_roi summary plots (use existing variables/imports in the notebook)
# # Saves interactive HTMLs to pileup_data_folder_path and displays inline.
# out_dir = pileup_data_folder_path  # existing variable in the notebook

# df_roi = pileup_Unedit_day35_df_roi.copy()

df_roi_stats = plot_pileup_roi_df(df_roi=pileup_Unedit_day35_df_roi, out_dir=pileup_data_folder_path)
df_roi_stats




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

ROI rows: 274
Percent modified: median=81.37, mean=73.62
Coverage (Nvalid_cov): min=18, median=121.0, max=197


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7,206583387
chr1,206583388,206583389,m,98,-,206583388,206583389,25500,98,83.67,82,16,0,4,12,6,7,206583388
chr1,206583707,206583708,m,187,+,206583707,206583708,25500,187,93.58,175,12,0,1,3,6,2,206583707
chr1,206583708,206583709,m,106,-,206583708,206583709,25500,106,96.23,102,4,0,6,8,1,6,206583708
chr1,206583766,206583767,m,169,+,206583766,206583767,25500,169,87.57,148,21,0,3,2,20,5,206583766
chr1,206583767,206583768,m,105,-,206583767,206583768,25500,105,94.29,99,6,0,1,13,1,7,206583767
chr1,206584104,206584105,m,175,+,206584104,206584105,25500,175,94.29,165,10,0,4,2,15,3,206584104
chr1,206584105,206584106,m,111,-,206584105,206584106,25500,111,95.5,106,5,0,2,2,4,8,206584105
chr1,206584137,206584138,m,194,+,206584137,206584138,25500,194,97.42,189,5,0,0,5,0,0,206584137
chr1,206584138,206584139,m,83,-,206584138,206584139,25500,83,85.54,71,12,0,0,44,0,0,206584138


ROI rows: 274
Percent modified: median=81.37, mean=73.62
Coverage (Nvalid_cov): min=18, median=121.0, max=197


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,171,+,206583387,206583388,25500,171,76.61,131,40,0,0,16,5,7,206583387
chr1,206583388,206583389,m,98,-,206583388,206583389,25500,98,83.67,82,16,0,4,12,6,7,206583388
chr1,206583707,206583708,m,187,+,206583707,206583708,25500,187,93.58,175,12,0,1,3,6,2,206583707
chr1,206583708,206583709,m,106,-,206583708,206583709,25500,106,96.23,102,4,0,6,8,1,6,206583708
chr1,206583766,206583767,m,169,+,206583766,206583767,25500,169,87.57,148,21,0,3,2,20,5,206583766
chr1,206583767,206583768,m,105,-,206583767,206583768,25500,105,94.29,99,6,0,1,13,1,7,206583767
chr1,206584104,206584105,m,175,+,206584104,206584105,25500,175,94.29,165,10,0,4,2,15,3,206584104
chr1,206584105,206584106,m,111,-,206584105,206584106,25500,111,95.5,106,5,0,2,2,4,8,206584105
chr1,206584137,206584138,m,194,+,206584137,206584138,25500,194,97.42,189,5,0,0,5,0,0,206584137
chr1,206584138,206584139,m,83,-,206584138,206584139,25500,83,85.54,71,12,0,0,44,0,0,206584138


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
276,chr1,206589745,206589746,m,172,+,206589745,206589746,25500,172,...,0,7,2,6,11,206589745,206589745:+,172,100.000000,0.000000
21,chr1,206584202,206584203,m,127,-,206584202,206584203,25500,127,...,0,0,0,0,0,206584202,206584202:-,127,98.425197,1.574803
14,chr1,206584151,206584152,m,185,+,206584151,206584152,25500,185,...,0,1,12,1,0,206584151,206584151:+,185,98.378378,1.621622
277,chr1,206589746,206589747,m,102,-,206589746,206589747,25500,102,...,0,13,2,3,7,206589746,206589746:-,102,98.039216,1.960784
18,chr1,206584178,206584179,m,188,+,206584178,206584179,25500,188,...,0,0,10,1,0,206584178,206584178:+,188,97.872340,2.127660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,chr1,206587018,206587019,m,92,-,206587018,206587019,25500,92,...,0,4,14,0,17,206587018,206587018:-,92,18.478261,81.521739
79,chr1,206586454,206586455,m,101,-,206586454,206586455,25500,101,...,0,1,8,5,12,206586454,206586454:-,101,16.831683,83.168317
192,chr1,206587149,206587150,m,176,+,206587149,206587150,25500,176,...,0,2,11,0,10,206587149,206587149:+,176,16.477273,83.522727
188,chr1,206587105,206587106,m,164,+,206587105,206587106,25500,164,...,0,3,5,1,26,206587105,206587105:+,164,15.853659,84.146341


# Todo: fix the percentage plots

3. Detecting differential modification at single base positions
The modkit dmr pair command has the ability to score individual bases (e.g. differentially methylated CpGs). To run single-base analysis on one or more paired samples, simply omit the --regions (-r) option when running modkit dmr pair. When performing single-base analysis the likelihood ratio score and a MAP-based p-value are available. For details on the likelihood ratio score and the MAP-based p-value, see the scoring details section. For example the above command becomes:

dmr_result=single_base_haplotype_dmr.bed

modkit dmr pair \
  -a ${hp1_pileup}.gz \
  -b ${hp2_pileup}.gz \
  -o ${dmr_result} \
  --ref ${ref} \
  --base C \
  --threads ${threads} \
  --log-filepath dmr.log

In [86]:
# ! ls "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG"

In [87]:
# ! ls /home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup@v3.3.5mCG.bam.gz

In [88]:
# /home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup.5mCG.bam.gz

In [89]:
# %%bash
# ls -ld /home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs
# ls -l /home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup.5mCG.bam.gz

In [90]:
%%bash

pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/"
pileup_Unedit_day35_bed=${pileup_data_folder_path}"20251109_filtered_pileup_NT_Day35_Tcells.bed"
# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/to_t2t_v2_0/pileup/pileup_Unedit_Day28_Tcells_20250721.bam"

echo "pileup_Unedit_day35_bed: ${pileup_Unedit_day35_bed}.gz"

ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_Unedit_day35_bed}.gz"
# ls -l "${pileup_Unedit_day28_bam}.gz"

chmod u+rwx "${pileup_data_folder_path}"
chmod u+rwx "${pileup_Unedit_day35_bed}.gz"

# ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_Unedit_day35_bed}.gz"


pileup_Unedit_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251109_filtered_pileup_NT_Day35_Tcells.bed.gz
20251109_filtered_pileup_NT_Day35_Tcells.bam
20251109_filtered_pileup_NT_Day35_Tcells.bam.gz
20251109_filtered_pileup_NT_Day35_Tcells.bam.gz.tbi
20251109_filtered_pileup_NT_Day35_Tcells.bed
20251109_filtered_pileup_NT_Day35_Tcells.bed.gz
20251109_filtered_pileup_NT_Day35_Tcells.bed.gz.tbi
roi_nvalidcov_hist.html
roi_percent_modified_scatter.html
roi_top_sites_stacked_counts.html
roi_top_sites_stacked_percentage.html
drwxrwxr-x 2 michalula michalula 4096 Nov  9 21:41 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/
-rwxrw-r-- 1 michalula michalula 5585 Nov  9 21:27 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2librari

In [91]:
%%bash

# pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/pileup/"
# pileup_Unedit_day35_bam=${pileup_data_folder_path}"20251109_filtered_pileup_NT_Day35_Tcells.bam"
# # "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/to_t2t_v2_0/pileup/pileup_Unedit_Day28_Tcells_20250721.bam"
pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/new_pileup/"
pileup_CROFF_day35_bed=${pileup_data_folder_path}"20251109_filtered_pileup_CROFF_Day35_Tcells.bed"

echo "pileup_CROFF_day35_bed: ${pileup_CROFF_day35_bed}.gz"

ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_CROFF_day35_bed}.gz"
# ls -l "${pileup_Unedit_day28_bam}.gz"

chmod u+rwx "${pileup_data_folder_path}"
chmod u+rwx "${pileup_CROFF_day35_bed}.gz"


# ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_CROFF_day35_bed}.gz"


pileup_CROFF_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251109_filtered_pileup_CROFF_Day35_Tcells.bed.gz
20251109_filtered_pileup_CROFF_Day35_Tcells.bed
20251109_filtered_pileup_CROFF_Day35_Tcells.bed.gz
20251109_filtered_pileup_CROFF_Day35_Tcells.bed.gz.tbi
roi_nvalidcov_hist.html
roi_percent_modified_scatter.html
roi_top_sites_stacked_counts.html
roi_top_sites_stacked_percentage.html
drwxrwxr-x 2 michalula michalula 4096 Nov  9 22:03 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/new_pileup/
-rwxrw-r-- 1 michalula michalula 7115 Nov  9 20:51 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251109_filtered_pileup_CROFF_Day35_Tcells.bed.gz
drwxrwxr-x 2 michalula michalula 4096 Nov  9 22:03 /

In [92]:
# %%bash

#    chmod u+rwx "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/to_t2t_v2_0/pileup/"
#    chmod u+rwx "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_CROFF_Day28/mergered_outputs/5mCG/to_t2t_v2_0/pileup_CROFF_Day28_Tcells_20250721.bam.gz"

In [93]:
# %%bash

# ls -ld /home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs
# ls -ld "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_CROFF_Day28/mergered_outputs/5mCG/to_t2t_v2_0/pileup_CROFF_Day28_Tcells_20250721.bam.gz"

In [94]:
# ls /home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup.5mCG.bam.gz

In [117]:
%%bash
 
# 3. Detecting differential modification at single base positions
# The modkit dmr pair command has the ability to score individual bases (e.g. differentially methylated CpGs). To run single-base analysis on one or more paired samples, simply omit the --regions (-r) option when running modkit dmr pair. When performing single-base analysis the likelihood ratio score and a MAP-based p-value are available. For details on the likelihood ratio score and the MAP-based p-value, see the scoring details section. For example the above command becomes:
date_today="20251109"

experiment_codition="day35_CRoff_vs_Unedit"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/"
dmr_result=${dmr_output_path}${date_today}"_single_base_"${experiment_codition}".bed"

# pileup_Unedit_day28_bam="/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"
# removed the @ from the file name
# pileup_Unedit_day28_bam="/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup.5mCG.bam"
# pileup_Unedit_day28_bam="/home/michalula/code/epiCausality/epiCode/differential_methyl/pileups/t2t_v1_1/pileup_Unedit_Day28_Tcells_20250721.bam"

pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/"
pileup_Unedit_day35_bed=${pileup_data_folder_path}"20251109_filtered_pileup_NT_Day35_Tcells.bed"
# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/to_t2t_v2_0/pileup/pileup_Unedit_Day28_Tcells_20250721.bam"

echo "pileup_Unedit_day35_bed: ${pileup_Unedit_day35_bed}.gz"
ls -l "${pileup_Unedit_day35_bed}.gz"
# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup.5mCG.bam"
# pileup_Unedit_day28_bam_gz="/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup.5mCG.bam.gz"
  
# pileup_CROFF_day28_bam="/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_CROFF_Day28/mergered_outputs/pileup_sort_merge_sort_align_t2t_v1_1_trim_20250721_nCATs_Tcells_CROFF_Day28_minion_run2_day8.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"
# pileup_CROFF_day28_bam="/home/michalula/code/epiCausality/epiCode/differential_methyl/pileups/t2t_v1_1/pileup_CROFF_Day28_Tcells_20250721.bam"
pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/new_pileup/"
pileup_CROFF_day35_bed=${pileup_data_folder_path}"20251109_filtered_pileup_CROFF_Day35_Tcells.bed"

echo "pileup_CROFF_day35_bed: ${pileup_CROFF_day35_bed}.gz"
ls -l "${pileup_CROFF_day35_bed}.gz"

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref="/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"
threads=32
 
cd ${dmr_output_path}
# '/home/michalula/code/epiCausality/epiCode/differential_methyl'

modkit dmr pair \
  -a ${pileup_CROFF_day35_bed}.gz \
  -b ${pileup_Unedit_day35_bed}.gz \
  -o ${dmr_result} \
  --ref ${ref_genome_fa} \
  --base C \
  --threads ${threads} \
  --log-filepath dmr.log


echo "dmr_result: $dmr_result"
ls -lah $dmr_result

pileup_Unedit_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251109_filtered_pileup_NT_Day35_Tcells.bed.gz
-rwxrw-r-- 1 michalula michalula 5585 Nov  9 21:27 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251109_filtered_pileup_NT_Day35_Tcells.bed.gz
pileup_CROFF_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251109_filtered_pileup_CROFF_Day35_Tcells.bed.gz
-rwxrw-r-- 1 michalula michalula 7115 Nov  9 20:51 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251109_filtered_pileup_CROFF_Day35_Tcells.bed.gz


[0;32m>[0m reading reference FASTA at "/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
[0;32m>[0m 1 common sequence(s) between FASTA and both samples
[0;32m>[0m running single-site analysis
[0;32m>[0m using default prior, Beta(α: 0.55, β: 0.55)
[0;32m>[0m estimating max coverages from data
[0;32m>[0m sampled 327 a records and 285 b records, calculating max coverages for 95th percentile
[0;32m>[0m calculated max coverage for a: 183 and b: 103
[0;32m>[0m calculated max coverage 183 is greater than maximum allowed (100), setting to 100
[0;32m>[0m calculated max coverage 103 is greater than maximum allowed (100), setting to 100
[0;31;1m>[0m errors:
+--------------------------+-------+
| error                    | count |
+--------------------------+-------+
| missing-in-one-condition | 42    |
+--------------------------+-------+

[0;32m>[0m finished, processed 285 sites successfully, 42 failed


dmr_result: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/20251109_single_base_day35_CRoff_vs_Unedit.bed
-rw-rw-r-- 1 michalula michalula 57K Nov  9 23:40 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/20251109_single_base_day35_CRoff_vs_Unedit.bed


In [118]:
%%bash

date_today="20251109"
experiment_codition="day35_CRoff_vs_Unedit"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/"
dmr_result=${dmr_output_path}${date_today}"single_base_"${experiment_codition}".bed"

echo "dmr_result: $dmr_result"
# ls -lah $dmr_result
ls -lah $dmr_output_path
# cat $dmr_result

dmr_result: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/20251109single_base_day35_CRoff_vs_Unedit.bed
total 72K
drwxrwxr-x 2 michalula michalula 4.0K Nov  9 23:39 .
drwxrwxr-x 3 michalula michalula 4.0K Nov  9 23:39 ..
-rw-rw-r-- 1 michalula michalula  57K Nov  9 23:40 20251109_single_base_day35_CRoff_vs_Unedit.bed
-rw-rw-r-- 1 michalula michalula 2.4K Nov  9 23:40 dmr.log


In [119]:
pwd

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr'

## modkit dmr explore output

The full table when performing single-site analysis with equal numbers of samples in groups, when running modkit dmr pair, will have the following schema:

column	name	description	type
1	chrom	name of reference sequence from bedMethyl input samples	str
2	start position	0-based start position, from --regions argument	int
3	end position	0-based exclusive end position, from --regions argument	int
4	name	name column from --regions BED, or chr:start-stop if absent, "." for single sites	str
5	score	difference score, more positive values have increased difference	float
6	strand	strand for the region or single-base position	str
7	samplea counts	counts of each base modification in the region, comma-separated, for sample A	str
8	samplea total	total number of base modification calls in the region, including unmodified, for sample A	int
9	sampleb counts	counts of each base modification in the region, comma-separated, for sample B	str
10	sampleb total	total number of base modification calls in the region, including unmodified, for sample B	int
11	samplea percents	percent of calls for each base modification in the region, comma-separated, for sample A	str
12	sampleb percents	percent of calls for each base modification in the region, comma-separated, for sample B	str
13	samplea fraction modified	fraction modification (of any kind) in sample A	float
14	sampleb fraction modified	fraction modification (of any kind) in sample B	float
15	MAP-based p-value	ratio of the posterior probability of observing the effect size over zero effect size	float
16	effect size	percent modified in sample A (col 12) minus percent modified in sample B (col 13)	float
<!-- 17	balanced MAP-based p-value	MAP-based p-value when all replicates are balanced	float
18	balanced effect size	effect size when all replicates are balanced	float -->


17	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
18	cohen_h_low	95% confidence interval lower bound	float
19	cohen_h_high	95% confidence interval upper bound	float

<!-- Differential methylation output format
The output from modkit dmr pair (and for each pairwise comparison with modkit dmr multi) is (roughly) a BED file with the following schema: -->
<!-- 
column	name	description	type
        1	chrom	name of reference sequence from bedMethyl input samples	str
        2	start position	0-based start position, from --regions argument	int
        3	end position	0-based exclusive end position, from --regions argument	int
        4	name	name column from --regions BED, or chr:start-stop if absent, "." for single sites	str
        5	score	difference score, more positive values have increased difference	float
        6	strand	strand for the region or single-base position	str
        7	samplea counts	counts of each base modification in the region, comma-separated, for sample A	str
        8	samplea total	total number of base modification calls in the region, including unmodified, for sample A	int
        9	sampleb counts	counts of each base modification in the region, comma-separated, for sample B	str
        10	sampleb total	total number of base modification calls in the region, including unmodified, for sample B	int
        11	samplea percents	percent of calls for each base modification in the region, comma-separated, for sample A	str
        12	sampleb percents	percent of calls for each base modification in the region, comma-separated, for sample B	str
        13	samplea fraction modified	fraction modification (of any kind) in sample A	float
        14	sampleb fraction modified	fraction modification (of any kind) in sample B	float
        15	MAP-based p-value	ratio of the posterior probability of observing the effect size over zero effect size	float
        16	effect size	percent modified in sample A (col 12) minus percent modified in sample B (col 13)	float
        17	balanced MAP-based p-value	MAP-based p-value when all replicates are balanced	float
        18	balanced effect size	effect size when all replicates are balanced	float
        19	pct_a_samples	percent of 'a' samples used in statistical test	float
        20	pct_b_samples	percent of 'b' samples used in statistical test	float
        21	per-replicate p-values	MAP-based p-values for matched replicate pairs	float
        22	per-replicate effect sizes	effect sizes matched replicate pairs	float
        23	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
        24	cohen_h_low	95% confidence interval lower bound	float
        25	cohen_h_high	95% confidence interval upper bound	float
        Columns 16-19 are only produced when multiple samples are provided, columns 20 and 21 are only produced when there is an equal number of 'a' and 'b' samples. When using multiple samples, it is possible that not every sample will have a modification fraction at a position. When this happens, the statistical test is still performed and the values of pct_a_samples and pct_b_samples reflect the percent of samples from each condition used in the test. 


     (15)	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
    (16)	cohen_h_low	95% confidence interval lower bound	float
    (17)	cohen_h_high	95% confidence interval upper bound	float
    
    n.b. Columns 15, 16, and 17 are present when the --regions option is passed, but these columns are on the right side of the table when performing single-site analysis (below). It is generally recommended to use the --header flag and standard CSV parsing to make sure the schema's between experiments are maintained.

When performing single-site analysis, the following additional columns are added:

column	name	description	type
Columns 20 and 21 have the replicate pairwise MAP-based p-values and effect sizes which are calculated based on their order provided on the command line. For example in the abbreviated command below:

In [120]:
# Path to the DMR BED output (adjust if you use a different file)
date_today="20251109"
experiment_codition="day35_CRoff_vs_Unedit"
dmr_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/"
dmr_path=dmr_folder_path+date_today+"_single_base_"+experiment_codition+".bed"

out_dir = dmr_folder_path  # save artifacts alongside the DMR file
os.makedirs(out_dir, exist_ok=True)

# # Read the DMR file (header or no-header)
# try:
#     df = pd.read_csv(dmr_path, sep='	', comment='#', header=0, engine='python')
#     if df.shape[1] < 3:
#         df = pd.read_csv(dmr_path, sep='	', comment='#', header=None, engine='python')
# except Exception as e:
#     print("Failed to read DMR file:", e)
#     raise

df

Unnamed: 0,chr1,206583090,206583091,.,-0.2776317365982791,-,m:1,1,m:2,2,m:100.00,m:100.00.1,1.1,1.2,1.3,0,0.1,-2.4004558381776544,2.4004558381776544
0,chr1,206583173,206583174,.,0.618367,+,m:66,78,m:130,143,m:84.62,m:90.91,0.846154,0.909091,0.438954,-0.063846,-0.193559,-0.082326,0.469445
1,chr1,206583174,206583175,.,-0.202477,-,m:81,84,m:112,118,m:96.43,m:94.92,0.964286,0.949152,1.000000,0.014286,0.074649,-0.205148,0.354446
2,chr1,206583387,206583388,.,0.017219,+,m:58,81,m:131,171,m:71.60,m:76.61,0.716049,0.766082,1.000000,-0.053951,-0.114335,-0.150032,0.378703
3,chr1,206583388,206583389,.,-0.262716,-,m:61,71,m:82,98,m:85.92,m:83.67,0.859155,0.836735,1.000000,0.022420,0.062478,-0.242979,0.367934
4,chr1,206583707,206583708,.,0.023835,+,m:95,99,m:175,187,m:95.96,m:93.58,0.959596,0.935829,1.000000,0.019596,0.107449,-0.136160,0.351058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,chr1,206589931,206589932,.,-0.327891,-,m:5,5,m:5,5,m:100.00,m:100.00,1.000000,1.000000,1.000000,0.000000,0.000000,-1.239590,1.239590
280,chr1,206589955,206589956,.,-0.326868,+,m:4,4,m:6,6,m:100.00,m:100.00,1.000000,1.000000,1.000000,0.000000,0.000000,-1.265151,1.265151
281,chr1,206589956,206589957,.,-0.319263,-,m:3,3,m:4,4,m:100.00,m:100.00,1.000000,1.000000,1.000000,0.000000,0.000000,-1.496947,1.496947
282,chr1,206590032,206590033,.,1.323236,+,m:5,6,m:2,6,m:83.33,m:33.33,0.833333,0.333333,0.186500,0.500000,1.069564,-0.062021,2.201150


In [121]:
dmr_path

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/20251109_single_base_day35_CRoff_vs_Unedit.bed'

In [122]:
# Read DMR BED (robust to header/no-header) and assign canonical column names (uses existing vars: dmr_path, out_dir, date_today, pd, os)
canonical_cols = [
    "chrom", "start", "end", "name", "score", "strand",
    "samplea_counts", "samplea_total", "sampleb_counts", "sampleb_total",
    "samplea_percents", "sampleb_percents",
    "samplea_fraction_modified", "sampleb_fraction_modified",
    "map_pvalue", "effect_size",
    "cohen_h", "cohen_h_low", "cohen_h_high",
]
    # "balanced_map_pvalue", "balanced_effect_size"

# read file with header and fallback to header=None when headers look numeric or columns are unexpected
try:
    dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")

    # dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", engine="python") # , header=0
    # # heuristic: if too many numeric-looking column names, re-read as headerless
    # numeric_headers = sum(1 for c in dmr_df.columns if str(c).strip().isdigit())
    # if numeric_headers >= (len(dmr_df.columns) / 2) or dmr_df.shape[1] < 3:
    #     dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")
except Exception:
    dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")

# assign canonical names up to number of columns present, add generic names for extras
ncols = dmr_df.shape[1]
if ncols <= len(canonical_cols):
    dmr_df.columns = canonical_cols[:ncols]
else:
    extras = [f"col_{i}" for i in range(ncols - len(canonical_cols))]
    dmr_df.columns = canonical_cols + extras

# coerce obvious numeric columns to numeric where present
num_cols_to_try = [
    "start", "end", "score",
    "samplea_total", "sampleb_total",
    "samplea_fraction_modified", "sampleb_fraction_modified",
    "map_pvalue", "effect_size",
    "balanced_map_pvalue", "balanced_effect_size"
]
for c in num_cols_to_try:
    if c in dmr_df.columns:
        dmr_df[c] = pd.to_numeric(dmr_df[c], errors="coerce")

# ensure output directory exists and save parsed table (parquet preferred)
os.makedirs(out_dir, exist_ok=True)
parsed_path = os.path.join(out_dir, f"{date_today}_dmr_parsed.parquet")
try:
    dmr_df.to_parquet(parsed_path, index=False)
    print("Saved parquet:", parsed_path)
except Exception:
    csv_path = os.path.join(out_dir, f"{date_today}_dmr_parsed.csv")
    dmr_df.to_csv(csv_path, index=False)
    print("Parquet not available, saved csv:", csv_path)

print("Loaded DMR:", dmr_path)
print("Assigned columns:", dmr_df.columns.tolist())
print("Shape:", dmr_df.shape)
dmr_df.head()

Parquet not available, saved csv: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/20251109_dmr_parsed.csv
Loaded DMR: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/20251109_single_base_day35_CRoff_vs_Unedit.bed
Assigned columns: ['chrom', 'start', 'end', 'name', 'score', 'strand', 'samplea_counts', 'samplea_total', 'sampleb_counts', 'sampleb_total', 'samplea_percents', 'sampleb_percents', 'samplea_fraction_modified', 'sampleb_fraction_modified', 'map_pvalue', 'effect_size', 'cohen_h', 'cohen_h_low', 'cohen_h_high']
Shape: (285, 19)


Unnamed: 0,chrom,start,end,name,score,strand,samplea_counts,samplea_total,sampleb_counts,sampleb_total,samplea_percents,sampleb_percents,samplea_fraction_modified,sampleb_fraction_modified,map_pvalue,effect_size,cohen_h,cohen_h_low,cohen_h_high
0,chr1,206583090,206583091,.,-0.277632,-,m:2,2,m:1,1,m:100.00,m:100.00,1.0,1.0,1.0,0.0,0.0,-2.400456,2.400456
1,chr1,206583173,206583174,.,0.618367,+,m:130,143,m:66,78,m:90.91,m:84.62,0.909091,0.846154,0.438954,0.063846,0.193559,-0.082326,0.469445
2,chr1,206583174,206583175,.,-0.202477,-,m:112,118,m:81,84,m:94.92,m:96.43,0.949152,0.964286,1.0,-0.014286,-0.074649,-0.205148,0.354446
3,chr1,206583387,206583388,.,0.017219,+,m:131,171,m:58,81,m:76.61,m:71.60,0.766082,0.716049,1.0,0.053951,0.114335,-0.150032,0.378703
4,chr1,206583388,206583389,.,-0.262716,-,m:82,98,m:61,71,m:83.67,m:85.92,0.836735,0.859155,1.0,-0.02242,-0.062478,-0.242979,0.367934


In [123]:
import os
from IPython.display import display, HTML

# Visualize all columns from dmr_df and save interactive HTMLs to out_dir
import plotly.express as px
import plotly.graph_objects as go

os.makedirs(out_dir, exist_ok=True)

# Save a table summary
summary = dmr_df.describe(include='all').transpose()
summary_path = os.path.join(out_dir, f"{date_today}_dmr_column_summary.csv")
summary.to_csv(summary_path)

numcols = dmr_df.select_dtypes(include=['number']).columns.tolist()

def _safe_name(name):
    return str(name).replace(os.sep, "_").replace(" ", "_").replace("\t", "_")

# Per-column visualizations
for col in dmr_df.columns:
    safe = _safe_name(col)
    try:
        if col in numcols:
            # Histogram
            fig_h = px.histogram(dmr_df, x=col, nbins=80, title=f"Histogram: {col}")
            fig_h.write_html(os.path.join(out_dir, f"{date_today}_dmr_hist_{safe}.html"), include_plotlyjs='cdn')
            fig_h.show()

            # Boxplot
            fig_b = px.box(dmr_df, y=col, points="outliers", title=f"Boxplot: {col}")
            fig_b.write_html(os.path.join(out_dir, f"{date_today}_dmr_box_{safe}.html"), include_plotlyjs='cdn')
            fig_b.show()
        else:
            # Categorical / text: show top value counts (up to 50)
            vc = dmr_df[col].fillna("NA").astype(str).value_counts().head(50)
            if len(vc):
                fig_c = px.bar(x=vc.values[::-1], y=vc.index.astype(str)[::-1], orientation='h',
                               title=f"Top value counts: {col}", labels={'x':'count','y':col})
                fig_c.update_layout(yaxis={'categoryorder':'array','categoryarray':vc.index[::-1].astype(str).tolist()})
                fig_c.write_html(os.path.join(out_dir, f"{date_today}_dmr_valcounts_{safe}.html"), include_plotlyjs='cdn')
                fig_c.show()
            else:
                # fallback: display empty info
                display(HTML(f"<b>{col}</b>: no values to plot"))
    except Exception as e:
        print(f"Skipped plotting column {col!r} due to error: {e}")

# Correlation heatmap for numeric columns
if len(numcols) >= 2:
    try:
        corr = dmr_df[numcols].corr()
        fig_corr = px.imshow(corr, text_auto=True, aspect="auto", title="Correlation matrix (numeric columns)")
        fig_corr.write_html(os.path.join(out_dir, f"{date_today}_dmr_correlation_numeric.html"), include_plotlyjs='cdn')
        fig_corr.show()
    except Exception as e:
        print("Failed to create correlation heatmap:", e)

print("Saved summary:", summary_path)
print("Plots saved to:", out_dir)

Saved summary: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/20251109_dmr_column_summary.csv
Plots saved to: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/new_dmr_output/


In [None]:
# ! python3 -m pip install plotly

In [None]:
# ! python3 -m pip install matplotlib

In [None]:
# ! python3 -m pip install nbformat>=4.2.0
