# Modkit dmr - Unedit K562 vs unedit T cells day 35 data
## Use my Filtered Reads (epiCG collection .bam, but NO quality condtrol additional reads filtering)

Based on:
https://nanoporetech.github.io/modkit/intro_dmr.html#perform-differential-methylation-scoring

Select kernal: dimelo_v2_modkit_parsing

In [None]:
from datetime import datetime
def current_time():
    """Returns the current date and time as a formatted string."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S") 
    
print("Current Date and Time:", current_time())

Preparing the input data
The inputs to all modkit dmr commands are two or more bedMethyl files (created by modkit pileup) that have been compressed with bgzip and indexed with tabix. An example of how to generate the input data is shown below:


ref=grch38.fasta
threads=32

norm=normal_sample.bam
norm_pileup=normal_pileup.bed

modkit pileup ${norm} ${norm_pileup} \
  --cpg \
  --ref ${ref} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${norm_pileup}
tabix -p bed ${norm_pileup}.gz

# pileup and compression can also be done in one step
tumor=tumor_sample.bam
tumor_pileup=tumor_pileup.bed.gz

modkit pileup ${tumor} - \
  --cpg \
  --ref ${ref} \
  --threads ${threads} \
  --log-filepath log.txt | ${bgzip} -c > ${tumor_pileup}

tabix -p bed ${tumor_pileup}

In [None]:
%%bash
echo "hello"

# Use the NEW modkit latest installed version in ipython kernel modkit_new

In [None]:
# ! python3 -m ipykernel install --user --name=modkit_new --display-name "modkit_new Python"
# ! which modkit

In [None]:
import os
os.environ["PATH"] = "/home/michalula/.cargo/bin:" + os.environ["PATH"]
! which modkit
! modkit --version

In [None]:
# ! modkit

In [None]:
! modkit --version 

In [None]:
1+1

In [None]:
import os
import pandas as pd

def load_pileup_bed(bed_path):
    # bed_path = existing[0]
    print("Reading bedMethyl file:", bed_path)

    # bedMethyl column names (18 columns as provided)
    colnames = [
        "chrom", "start", "end", "mod_code", "score", "strand",
        "start2", "end2", "color",
        "Nvalid_cov", "percent_modified", "Nmod", "Ncanonical",
        "Nother_mod", "Ndelete", "Nfail", "Ndiff", "Nnocall"
    ]

    # Configure dtypes where reasonable
    dtypes = {
        "chrom": str,
        "start": "Int64",
        "end": "Int64",
        "mod_code": str,
        "score": "Int64",
        "strand": str,
        "start2": "Int64",
        "end2": "Int64",
        "color": str,
        "Nvalid_cov": "Int64",
        "percent_modified": float,
        "Nmod": "Int64",
        "Ncanonical": "Int64",
        "Nother_mod": "Int64",
        "Ndelete": "Int64",
        "Nfail": "Int64",
        "Ndiff": "Int64",
        "Nnocall": "Int64"
    }

    compression = "gzip" if bed_path.endswith(".gz") else None

    # Read file (headerless BED-like table). If file has extra columns, keep them with automatic numeric conversion below.
    df = pd.read_csv(
        bed_path,
        sep="\t",
        header=None,
        comment="#",
        names=colnames,
        dtype=dtypes,
        compression=compression,
        engine="python",
        na_values=[".", "NA", ""],
        keep_default_na=True
    )

    # If file contained more than 18 columns, pandas assigned remaining data to extra columns named like col_18, col_19...
    # Ensure numeric conversion for numeric-like columns
    for c in df.columns:
        if df[c].dtype == object:
            # try safe numeric conversion where appropriate
            try:
                df[c] = pd.to_numeric(df[c], errors="ignore")
            except Exception:
                pass

    print("Loaded DataFrame shape:", df.shape)
    display(df.head())
    return df


In [None]:
import os
from IPython.display import display, HTML
from plotly import express as px
from plotly import graph_objects as go

# ! python3 -m pip install plotly
# ! python3 -m pip install matplotlib
# ! python3 -m pip install nbformat>=4.2.0

def plot_pileup_roi_df(df_roi, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    # ensure numeric types for plotting
    df_roi['pos'] = df_roi['start'].astype(int)
    df_roi['percent_modified'] = df_roi['percent_modified'].astype(float)
    df_roi['Nvalid_cov'] = df_roi['Nvalid_cov'].astype(int)
    df_roi['Nmod'] = df_roi['Nmod'].astype(int)
    df_roi['Ncanonical'] = df_roi['Ncanonical'].astype(int)

    # Scatter: genomic position vs percent modified (point size = coverage)
    fig1 = px.scatter(
        df_roi,
        x='pos',
        y='percent_modified',
        color='strand',
        size='Nvalid_cov',
        hover_data=['Nvalid_cov','Nmod','Ncanonical','Nother_mod','Nnocall'],
        title='Percent modified across ROI (size = Nvalid_cov)',
        height=500
    )
    fig1.update_layout(xaxis_title='Genomic position (start)', yaxis_title='Percent modified')
    fig1.show()
    # fig1.write_html(os.path.join(out_dir, "roi_percent_modified_scatter.html"), include_plotlyjs='cdn')

    # Histogram: coverage distribution
    fig2 = px.histogram(
        df_roi,
        x='Nvalid_cov',
        nbins=40,
        title='Distribution of Nvalid_cov (coverage) in ROI',
        height=400
    )
    fig2.update_layout(xaxis_title='Nvalid_cov', yaxis_title='Count')
    fig2.show()
    # fig2.write_html(os.path.join(out_dir, "roi_nvalidcov_hist.html"), include_plotlyjs='cdn')

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked)
    topn = 30
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'Sorted Top {topn} sites by percent_modified (stacked Nmod / Ncanonical)',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Print simple summaries
    print("ROI rows:", df_roi.shape[0])
    print("Percent modified: median={:.2f}, mean={:.2f}".format(df_roi['percent_modified'].median(), df_roi['percent_modified'].mean()))
    print("Coverage (Nvalid_cov): min={}, median={}, max={}".format(df_roi['Nvalid_cov'].min(), df_roi['Nvalid_cov'].median(), df_roi['Nvalid_cov'].max()))

    # Display first rows table for quick inspection
    display(HTML(df_roi.head(20).to_html(index=False)))


    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked)
    topn = df_roi.shape[0]
    # df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_roi.copy() #.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'All {topn} CpG sites by percent_modified (stacked Nmod / Ncanonical) [ordered=not s]',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked, NOT SORTED)
    topn = df_roi.shape[0]
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'Sorted Top {topn} sites by percent_modified (stacked Nmod / Ncanonical)',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Print simple summaries
    print("ROI rows:", df_roi.shape[0])
    print("Percent modified: median={:.2f}, mean={:.2f}".format(df_roi['percent_modified'].median(), df_roi['percent_modified'].mean()))
    print("Coverage (Nvalid_cov): min={}, median={}, max={}".format(df_roi['Nvalid_cov'].min(), df_roi['Nvalid_cov'].median(), df_roi['Nvalid_cov'].max()))

    # Display first rows table for quick inspection
    display(HTML(df_roi.head(20).to_html(index=False)))

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    topn = 30
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig4 = go.Figure()
    fig4.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig4.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig4.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig4.show()
    # fig4.write_html(os.path.join(out_dir, "roi_top_sites_stacked_percentage.html"), include_plotlyjs='cdn')     


    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    topn = 277
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig4 = go.Figure()
    fig4.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig4.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig4.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig4.show()
    # fig4.write_html(os.path.join(out_dir, "roi_top_sites_stacked_percentage.html"), include_plotlyjs='cdn')     

    # Bar: Unsorted sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    df_top = df_roi.copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig5 = go.Figure()
    fig5.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig5.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig5.update_layout(barmode='stack', title=f'All sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig5.show()
    # fig5.write_html(os.path.join(out_dir, "roi_all_sites_stacked_percentage.html"), include_plotlyjs='cdn')    


    return df_top



In [None]:
! ls /home/michalula/data/ref_genomes/t2t_v2_0/

# K562 Unedited  - postSort Low:

In [None]:
! ls "/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/"

In [None]:
%%bash

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref= "/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"

threads=32

date_today="20251118"
# data_folder_path="/home/michalula/data/cas9_nanopore/data/20250908_nCATs_T_CRoff_Day_35/5mCG/to_t2t_v2_0/"
# CROFF_day35_bam=${data_folder_path}"sort_align_t2t_v2_0_trim_20250908_Day35_CROFF_Tcells_2Libraries_Minion_R9.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_dmr_pileup/"
mkdir ${pileup_data_folder_path}

# mC >70% = 07 Filter 
K562_unedit_Filter_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_K562_unedited_Day2_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
K562_unedit_Filter_bed=${pileup_data_folder_path}${date_today}"_pileup""_unedit_K562_Filter_mC07.bed"
# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_CROFF_Day28/mergered_outputs/pileup_sort_merge_sort_align_t2t_v1_1_trim_20250721_nCATs_Tcells_CROFF_Day28_minion_run2_day8.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"
# norm=normal_sample.bam
# norm_pileup=normal_pileup.bed

modkit pileup ${K562_unedit_Filter_bam} ${K562_unedit_Filter_bed} \
  --cpg \
  --ref ${ref_genome_fa} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${K562_unedit_Filter_bed}
tabix -p bed ${K562_unedit_Filter_bed}.gz

printf '%s\n' "K562_unedit_Filter_bam: $K562_unedit_Filter_bam"
printf '%s\n' "K562_unedit_Filter_bed: $K562_unedit_Filter_bed"
cat "$K562_unedit_Filter_bed"


## Pileup columns explore

In [None]:
date_today="20251118"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=data_folder_path+"new_dmr_pileup/"
# mkdir ${pileup_data_folder_path}

K562_unedit_Filter_bam="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/filtered_reads_overlap_MORE_than_0.9_K562_unedited_Day2_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
K562_unedit_Filter_bed="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_pileup_unedit_K562_Filter_mC07.bed"

K562_unedit_Filter_bam, K562_unedit_Filter_bed

In [17]:
pileup_K562_unedit_Filter_pileup_df = load_pileup_bed(K562_unedit_Filter_bed)
pileup_K562_unedit_Filter_pileup_df

Reading bedMethyl file: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_pileup_unedit_K562_Filter_mC07.bed
Loaded DataFrame shape: (286, 18)


  df[c] = pd.to_numeric(df[c], errors="ignore")


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,6,+,206583089,206583090,25500,6,0.0,0,6,0,0,0,0,3
1,chr1,206583090,206583091,m,2,-,206583090,206583091,25500,2,0.0,0,2,0,1,0,0,0
2,chr1,206583173,206583174,m,88,+,206583173,206583174,25500,88,78.41,69,19,0,0,14,8,9
3,chr1,206583174,206583175,m,74,-,206583174,206583175,25500,74,89.19,66,8,0,2,2,0,6
4,chr1,206583387,206583388,m,92,+,206583387,206583388,25500,92,4.35,4,88,0,2,14,8,5


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,6,+,206583089,206583090,25500,6,0.00,0,6,0,0,0,0,3
1,chr1,206583090,206583091,m,2,-,206583090,206583091,25500,2,0.00,0,2,0,1,0,0,0
2,chr1,206583173,206583174,m,88,+,206583173,206583174,25500,88,78.41,69,19,0,0,14,8,9
3,chr1,206583174,206583175,m,74,-,206583174,206583175,25500,74,89.19,66,8,0,2,2,0,6
4,chr1,206583387,206583388,m,92,+,206583387,206583388,25500,92,4.35,4,88,0,2,14,8,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,chr1,206589931,206589932,m,6,-,206589931,206589932,25500,6,83.33,5,1,0,0,1,1,0
282,chr1,206589955,206589956,m,17,+,206589955,206589956,25500,17,88.24,15,2,0,0,1,1,1
283,chr1,206589956,206589957,m,6,-,206589956,206589957,25500,6,83.33,5,1,0,1,0,0,0
284,chr1,206590032,206590033,m,18,+,206590032,206590033,25500,18,94.44,17,1,0,0,0,1,1


# Look at CpGs within out target ROI
T2T v2.0

First CG:
206583388,206583390

Last of selected 137 CGs in the ROI:

206589746,206589748 --CpG_137

=> here have each CG position separate: so have 137*2  = 276

In [18]:
137*2, 277-5

(274, 272)

In [19]:
pileup_K562_unedit_Filter_pileup_df[pileup_K562_unedit_Filter_pileup_df['start'] == 206583387]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583387,206583388,m,92,+,206583387,206583388,25500,92,4.35,4,88,0,2,14,8,5


In [20]:
pileup_K562_unedit_Filter_pileup_df[pileup_K562_unedit_Filter_pileup_df['start'] == 206583388]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
5,chr1,206583388,206583389,m,47,-,206583388,206583389,25500,47,17.02,8,39,0,5,3,1,29


In [21]:
pileup_K562_unedit_Filter_pileup_df[pileup_K562_unedit_Filter_pileup_df['start'] == 206589746]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
277,chr1,206589746,206589747,m,61,-,206589746,206589747,25500,61,98.36,60,1,0,11,3,3,7


In [22]:
(279-5) / 2

137.0

In [23]:
pileup_K562_unedit_Filter_pileup_df_roi = pileup_K562_unedit_Filter_pileup_df.iloc[4:278, :]  # Display target region rows
print(pileup_K562_unedit_Filter_pileup_df_roi.shape, pileup_K562_unedit_Filter_pileup_df_roi.shape[0]/2)
pileup_K562_unedit_Filter_pileup_df_roi

(274, 18) 137.0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583387,206583388,m,92,+,206583387,206583388,25500,92,4.35,4,88,0,2,14,8,5
5,chr1,206583388,206583389,m,47,-,206583388,206583389,25500,47,17.02,8,39,0,5,3,1,29
6,chr1,206583707,206583708,m,96,+,206583707,206583708,25500,96,12.50,12,84,0,6,4,2,13
7,chr1,206583708,206583709,m,66,-,206583708,206583709,25500,66,7.58,5,61,0,1,2,8,8
8,chr1,206583766,206583767,m,84,+,206583766,206583767,25500,84,17.86,15,69,0,3,1,6,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,79,-,206589213,206589214,25500,79,88.61,70,9,0,0,3,0,3
274,chr1,206589436,206589437,m,113,+,206589436,206589437,25500,113,84.07,95,18,0,1,2,4,1
275,chr1,206589437,206589438,m,70,-,206589437,206589438,25500,70,82.86,58,12,0,1,2,5,7
276,chr1,206589745,206589746,m,99,+,206589745,206589746,25500,99,98.99,98,1,0,8,2,3,9


<!>
> Threshold of  0.7597656 for base C is low. Consider increasing the filter-percentile or specifying a higher threshold.
> Done, processed 11762972 rows. Processed ~129977 reads and skipped ~150 reads.

In [24]:
# Plot pileup_Unedit_day35_df_roi summary plots (use existing variables/imports in the notebook)
# Saves interactive HTMLs to pileup_data_folder_path and displays inline.

pileup_K562_unedit_Filter_pileup_df_roi_stats = plot_pileup_roi_df(df_roi=pileup_K562_unedit_Filter_pileup_df_roi, out_dir=pileup_data_folder_path)
pileup_K562_unedit_Filter_pileup_df_roi_stats


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_roi['pos'] = df_roi['start'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_roi['percent_modified'] = df_roi['percent_modified'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_roi['Nvalid_cov'] = df_roi['Nvalid_cov'].astype(int)
A value is trying to be set on a co

ROI rows: 274
Percent modified: median=3.62, mean=17.31
Coverage (Nvalid_cov): min=10, median=81.0, max=118


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,92,+,206583387,206583388,25500,92,4.35,4,88,0,2,14,8,5,206583387
chr1,206583388,206583389,m,47,-,206583388,206583389,25500,47,17.02,8,39,0,5,3,1,29,206583388
chr1,206583707,206583708,m,96,+,206583707,206583708,25500,96,12.5,12,84,0,6,4,2,13,206583707
chr1,206583708,206583709,m,66,-,206583708,206583709,25500,66,7.58,5,61,0,1,2,8,8,206583708
chr1,206583766,206583767,m,84,+,206583766,206583767,25500,84,17.86,15,69,0,3,1,6,27,206583766
chr1,206583767,206583768,m,72,-,206583767,206583768,25500,72,31.94,23,49,0,0,7,2,4,206583767
chr1,206584104,206584105,m,104,+,206584104,206584105,25500,104,71.15,74,30,0,0,0,10,7,206584104
chr1,206584105,206584106,m,75,-,206584105,206584106,25500,75,78.67,59,16,0,0,3,3,4,206584105
chr1,206584137,206584138,m,117,+,206584137,206584138,25500,117,73.5,86,31,0,0,3,1,0,206584137
chr1,206584138,206584139,m,54,-,206584138,206584139,25500,54,79.63,43,11,0,1,29,1,0,206584138


ROI rows: 274
Percent modified: median=3.62, mean=17.31
Coverage (Nvalid_cov): min=10, median=81.0, max=118


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,92,+,206583387,206583388,25500,92,4.35,4,88,0,2,14,8,5,206583387
chr1,206583388,206583389,m,47,-,206583388,206583389,25500,47,17.02,8,39,0,5,3,1,29,206583388
chr1,206583707,206583708,m,96,+,206583707,206583708,25500,96,12.5,12,84,0,6,4,2,13,206583707
chr1,206583708,206583709,m,66,-,206583708,206583709,25500,66,7.58,5,61,0,1,2,8,8,206583708
chr1,206583766,206583767,m,84,+,206583766,206583767,25500,84,17.86,15,69,0,3,1,6,27,206583766
chr1,206583767,206583768,m,72,-,206583767,206583768,25500,72,31.94,23,49,0,0,7,2,4,206583767
chr1,206584104,206584105,m,104,+,206584104,206584105,25500,104,71.15,74,30,0,0,0,10,7,206584104
chr1,206584105,206584106,m,75,-,206584105,206584106,25500,75,78.67,59,16,0,0,3,3,4,206584105
chr1,206584137,206584138,m,117,+,206584137,206584138,25500,117,73.5,86,31,0,0,3,1,0,206584137
chr1,206584138,206584139,m,54,-,206584138,206584139,25500,54,79.63,43,11,0,1,29,1,0,206584138


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
4,chr1,206583387,206583388,m,92,+,206583387,206583388,25500,92,...,0,2,14,8,5,206583387,206583387:+,92,4.347826,95.652174
5,chr1,206583388,206583389,m,47,-,206583388,206583389,25500,47,...,0,5,3,1,29,206583388,206583388:-,47,17.021277,82.978723
6,chr1,206583707,206583708,m,96,+,206583707,206583708,25500,96,...,0,6,4,2,13,206583707,206583707:+,96,12.500000,87.500000
7,chr1,206583708,206583709,m,66,-,206583708,206583709,25500,66,...,0,1,2,8,8,206583708,206583708:-,66,7.575758,92.424242
8,chr1,206583766,206583767,m,84,+,206583766,206583767,25500,84,...,0,3,1,6,27,206583766,206583766:+,84,17.857143,82.142857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,79,-,206589213,206589214,25500,79,...,0,0,3,0,3,206589213,206589213:-,79,88.607595,11.392405
274,chr1,206589436,206589437,m,113,+,206589436,206589437,25500,113,...,0,1,2,4,1,206589436,206589436:+,113,84.070796,15.929204
275,chr1,206589437,206589438,m,70,-,206589437,206589438,25500,70,...,0,1,2,5,7,206589437,206589437:-,70,82.857143,17.142857
276,chr1,206589745,206589746,m,99,+,206589745,206589746,25500,99,...,0,8,2,3,9,206589745,206589745:+,99,98.989899,1.010101


# Unedited T cells Day 35

In [27]:
! ls "/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output"

CG_137_padded_reads_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.7_T2Tv2_0_chr1:206583354-206589854_2025-11-09_units_combined_numFWD87_numRVS115.npy
CG_137_padded_reads_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.995_T2Tv2_0_chr1:206583354-206589854_2025-11-09_units_combined_numFWD87_numRVS114.npy
extracted_reads
filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam.bai
filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_thre

In [28]:
%%bash

date_today="20251118"
data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_dmr_pileup/"
mkdir ${pileup_data_folder_path}

# mC >70% = 07 Filter   
Filter_Unedit_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_Unedit_day35_bed=${pileup_data_folder_path}${date_today}"_unedit_T_Filter_mC07""_pileup_NT_Day35_Tcells.bed"
cat "$pileup_Unedit_day35_bed"

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref="/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"

threads=32

modkit pileup ${Filter_Unedit_day35_bam} ${pileup_Unedit_day35_bed} \
  --cpg \
  --ref ${ref_genome_fa} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${pileup_Unedit_day35_bed}
tabix -p bed ${pileup_Unedit_day35_bed}.gz

printf '%s\n' "Filter_Unedit_day35_bam: $Filter_Unedit_day35_bam"
printf '%s\n' "pileup_Unedit_day35_bed: $pileup_Unedit_day35_bed"
cat "$pileup_Unedit_day35_bed"


mkdir: cannot create directory ‘/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/’: File exists
cat: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed: No such file or directory
[0;32m>[0m calculated chunk size: 48, interval size 100000, processing 4800000 positions concurrently
[0;32m>[0m filtering to only CpG motifs
[0;32m>[0m attempting to sample 10042 reads
[0;32m>[0m Using filter threshold 0.8496094 for C.
[0;32m>[0m Done, processed 285 rows. Processed ~202 reads and skipped zero reads.


Filter_Unedit_day35_bam: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
pileup_Unedit_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed
chr1	206583090	206583091	m	1	-	206583090	206583091	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206583173	206583174	m	78	+	206583173	206583174	255,0,0	78	84.62	66	12	0	2	17	7	4
chr1	206583174	206583175	m	84	-	206583174	206583175	255,0,0	84	96.43	81	3	0	1	3	2	2
chr1	206583387	206583388	m	81	+	206583387	206583388	255,0,0	81	71.60	58	23	0	1	15	9	3
chr1	206583388	206583389	m	71	-	206583388	206583389	255,0,0	71	85.92	61	10	0	3	12	1	5
chr1	206583707	206583708	m	99	+	206

In [29]:
date_today="20251118"
# data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/"
# pileup_data_folder_path=data_folder_path+"new_dmr_pileup/"
# # mkdir ${pileup_data_folder_path}

pileup_Unedit_day35_bed="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed"

pileup_Unedit_day35_df = load_pileup_bed(pileup_Unedit_day35_bed)
pileup_Unedit_day35_df

Reading bedMethyl file: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed
Loaded DataFrame shape: (285, 18)



errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead



Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583090,206583091,m,1,-,206583090,206583091,25500,1,100.0,1,0,0,0,0,0,0
1,chr1,206583173,206583174,m,78,+,206583173,206583174,25500,78,84.62,66,12,0,2,17,7,4
2,chr1,206583174,206583175,m,84,-,206583174,206583175,25500,84,96.43,81,3,0,1,3,2,2
3,chr1,206583387,206583388,m,81,+,206583387,206583388,25500,81,71.6,58,23,0,1,15,9,3
4,chr1,206583388,206583389,m,71,-,206583388,206583389,25500,71,85.92,61,10,0,3,12,1,5


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583090,206583091,m,1,-,206583090,206583091,25500,1,100.00,1,0,0,0,0,0,0
1,chr1,206583173,206583174,m,78,+,206583173,206583174,25500,78,84.62,66,12,0,2,17,7,4
2,chr1,206583174,206583175,m,84,-,206583174,206583175,25500,84,96.43,81,3,0,1,3,2,2
3,chr1,206583387,206583388,m,81,+,206583387,206583388,25500,81,71.60,58,23,0,1,15,9,3
4,chr1,206583388,206583389,m,71,-,206583388,206583389,25500,71,85.92,61,10,0,3,12,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,chr1,206589931,206589932,m,5,-,206589931,206589932,25500,5,100.00,5,0,0,0,0,0,0
281,chr1,206589955,206589956,m,4,+,206589955,206589956,25500,4,100.00,4,0,0,0,2,0,0
282,chr1,206589956,206589957,m,3,-,206589956,206589957,25500,3,100.00,3,0,0,0,0,0,1
283,chr1,206590032,206590033,m,6,+,206590032,206590033,25500,6,83.33,5,1,0,0,0,0,0


In [30]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206583388-1]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
3,chr1,206583387,206583388,m,81,+,206583387,206583388,25500,81,71.6,58,23,0,1,15,9,3


In [31]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206583388]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583388,206583389,m,71,-,206583388,206583389,25500,71,85.92,61,10,0,3,12,1,5


In [32]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206589746-1]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
275,chr1,206589745,206589746,m,98,+,206589745,206589746,25500,98,98.98,97,1,0,7,0,0,4


In [33]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206589746]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
276,chr1,206589746,206589747,m,72,-,206589746,206589747,25500,72,100.0,72,0,0,7,7,3,4


In [34]:
pileup_Unedit_day35_df_roi = pileup_Unedit_day35_df.iloc[3:277, :]  # Display target region rows
print(pileup_Unedit_day35_df_roi.shape,pileup_Unedit_day35_df_roi.shape[0]/2)
pileup_Unedit_day35_df_roi

(274, 18) 137.0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
3,chr1,206583387,206583388,m,81,+,206583387,206583388,25500,81,71.60,58,23,0,1,15,9,3
4,chr1,206583388,206583389,m,71,-,206583388,206583389,25500,71,85.92,61,10,0,3,12,1,5
5,chr1,206583707,206583708,m,99,+,206583707,206583708,25500,99,95.96,95,4,0,3,3,0,4
6,chr1,206583708,206583709,m,77,-,206583708,206583709,25500,77,96.10,74,3,0,1,7,6,2
7,chr1,206583766,206583767,m,95,+,206583766,206583767,25500,95,93.68,89,6,0,2,1,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,chr1,206589213,206589214,m,84,-,206589213,206589214,25500,84,77.38,65,19,0,2,5,0,2
273,chr1,206589436,206589437,m,104,+,206589436,206589437,25500,104,95.19,99,5,0,0,4,1,0
274,chr1,206589437,206589438,m,86,-,206589437,206589438,25500,86,95.35,82,4,0,0,4,2,1
275,chr1,206589745,206589746,m,98,+,206589745,206589746,25500,98,98.98,97,1,0,7,0,0,4


In [35]:
pileup_Unedit_day35_df_roi

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
3,chr1,206583387,206583388,m,81,+,206583387,206583388,25500,81,71.60,58,23,0,1,15,9,3
4,chr1,206583388,206583389,m,71,-,206583388,206583389,25500,71,85.92,61,10,0,3,12,1,5
5,chr1,206583707,206583708,m,99,+,206583707,206583708,25500,99,95.96,95,4,0,3,3,0,4
6,chr1,206583708,206583709,m,77,-,206583708,206583709,25500,77,96.10,74,3,0,1,7,6,2
7,chr1,206583766,206583767,m,95,+,206583766,206583767,25500,95,93.68,89,6,0,2,1,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,chr1,206589213,206589214,m,84,-,206589213,206589214,25500,84,77.38,65,19,0,2,5,0,2
273,chr1,206589436,206589437,m,104,+,206589436,206589437,25500,104,95.19,99,5,0,0,4,1,0
274,chr1,206589437,206589438,m,86,-,206589437,206589438,25500,86,95.35,82,4,0,0,4,2,1
275,chr1,206589745,206589746,m,98,+,206589745,206589746,25500,98,98.98,97,1,0,7,0,0,4


In [36]:
pileup_Unedit_day35_df_roi

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
3,chr1,206583387,206583388,m,81,+,206583387,206583388,25500,81,71.60,58,23,0,1,15,9,3
4,chr1,206583388,206583389,m,71,-,206583388,206583389,25500,71,85.92,61,10,0,3,12,1,5
5,chr1,206583707,206583708,m,99,+,206583707,206583708,25500,99,95.96,95,4,0,3,3,0,4
6,chr1,206583708,206583709,m,77,-,206583708,206583709,25500,77,96.10,74,3,0,1,7,6,2
7,chr1,206583766,206583767,m,95,+,206583766,206583767,25500,95,93.68,89,6,0,2,1,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,chr1,206589213,206589214,m,84,-,206589213,206589214,25500,84,77.38,65,19,0,2,5,0,2
273,chr1,206589436,206589437,m,104,+,206589436,206589437,25500,104,95.19,99,5,0,0,4,1,0
274,chr1,206589437,206589438,m,86,-,206589437,206589438,25500,86,95.35,82,4,0,0,4,2,1
275,chr1,206589745,206589746,m,98,+,206589745,206589746,25500,98,98.98,97,1,0,7,0,0,4


In [37]:
pileup_data_folder_path

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/'

In [38]:

# data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/"
# pileup_data_folder_path=data_folder_path+"new_dmr_pileup/"
# # mkdir ${pileup_data_folder_path}

# # Plot pileup_Unedit_day35_df_roi summary plots (use existing variables/imports in the notebook)
# # Saves interactive HTMLs to pileup_data_folder_path and displays inline.
# out_dir = pileup_data_folder_path  # existing variable in the notebook

# df_roi = pileup_Unedit_day35_df_roi.copy()

df_roi_stats = plot_pileup_roi_df(df_roi=pileup_Unedit_day35_df_roi, out_dir=pileup_data_folder_path)
df_roi_stats




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

ROI rows: 274
Percent modified: median=1.98, mean=21.90
Coverage (Nvalid_cov): min=13, median=86.0, max=107


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,81,+,206583387,206583388,25500,81,71.6,58,23,0,1,15,9,3,206583387
chr1,206583388,206583389,m,71,-,206583388,206583389,25500,71,85.92,61,10,0,3,12,1,5,206583388
chr1,206583707,206583708,m,99,+,206583707,206583708,25500,99,95.96,95,4,0,3,3,0,4,206583707
chr1,206583708,206583709,m,77,-,206583708,206583709,25500,77,96.1,74,3,0,1,7,6,2,206583708
chr1,206583766,206583767,m,95,+,206583766,206583767,25500,95,93.68,89,6,0,2,1,10,1,206583766
chr1,206583767,206583768,m,75,-,206583767,206583768,25500,75,93.33,70,5,0,2,12,2,2,206583767
chr1,206584104,206584105,m,97,+,206584104,206584105,25500,97,95.88,93,4,0,1,1,9,1,206584104
chr1,206584105,206584106,m,89,-,206584105,206584106,25500,89,92.13,82,7,0,0,1,0,3,206584105
chr1,206584137,206584138,m,104,+,206584137,206584138,25500,104,100.0,104,0,0,1,3,0,1,206584137
chr1,206584138,206584139,m,52,-,206584138,206584139,25500,52,92.31,48,4,0,0,41,0,0,206584138


ROI rows: 274
Percent modified: median=1.98, mean=21.90
Coverage (Nvalid_cov): min=13, median=86.0, max=107


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,81,+,206583387,206583388,25500,81,71.6,58,23,0,1,15,9,3,206583387
chr1,206583388,206583389,m,71,-,206583388,206583389,25500,71,85.92,61,10,0,3,12,1,5,206583388
chr1,206583707,206583708,m,99,+,206583707,206583708,25500,99,95.96,95,4,0,3,3,0,4,206583707
chr1,206583708,206583709,m,77,-,206583708,206583709,25500,77,96.1,74,3,0,1,7,6,2,206583708
chr1,206583766,206583767,m,95,+,206583766,206583767,25500,95,93.68,89,6,0,2,1,10,1,206583766
chr1,206583767,206583768,m,75,-,206583767,206583768,25500,75,93.33,70,5,0,2,12,2,2,206583767
chr1,206584104,206584105,m,97,+,206584104,206584105,25500,97,95.88,93,4,0,1,1,9,1,206584104
chr1,206584105,206584106,m,89,-,206584105,206584106,25500,89,92.13,82,7,0,0,1,0,3,206584105
chr1,206584137,206584138,m,104,+,206584137,206584138,25500,104,100.0,104,0,0,1,3,0,1,206584137
chr1,206584138,206584139,m,52,-,206584138,206584139,25500,52,92.31,48,4,0,0,41,0,0,206584138


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
3,chr1,206583387,206583388,m,81,+,206583387,206583388,25500,81,...,0,1,15,9,3,206583387,206583387:+,81,71.604938,28.395062
4,chr1,206583388,206583389,m,71,-,206583388,206583389,25500,71,...,0,3,12,1,5,206583388,206583388:-,71,85.915493,14.084507
5,chr1,206583707,206583708,m,99,+,206583707,206583708,25500,99,...,0,3,3,0,4,206583707,206583707:+,99,95.959596,4.040404
6,chr1,206583708,206583709,m,77,-,206583708,206583709,25500,77,...,0,1,7,6,2,206583708,206583708:-,77,96.103896,3.896104
7,chr1,206583766,206583767,m,95,+,206583766,206583767,25500,95,...,0,2,1,10,1,206583766,206583766:+,95,93.684211,6.315789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,chr1,206589213,206589214,m,84,-,206589213,206589214,25500,84,...,0,2,5,0,2,206589213,206589213:-,84,77.380952,22.619048
273,chr1,206589436,206589437,m,104,+,206589436,206589437,25500,104,...,0,0,4,1,0,206589436,206589436:+,104,95.192308,4.807692
274,chr1,206589437,206589438,m,86,-,206589437,206589438,25500,86,...,0,0,4,2,1,206589437,206589437:-,86,95.348837,4.651163
275,chr1,206589745,206589746,m,98,+,206589745,206589746,25500,98,...,0,7,0,0,4,206589745,206589745:+,98,98.979592,1.020408


# dmr modkit Unedited K562 vs T cells (NT)

3. Detecting differential modification at single base positions
The modkit dmr pair command has the ability to score individual bases (e.g. differentially methylated CpGs). To run single-base analysis on one or more paired samples, simply omit the --regions (-r) option when running modkit dmr pair. When performing single-base analysis the likelihood ratio score and a MAP-based p-value are available. For details on the likelihood ratio score and the MAP-based p-value, see the scoring details section. For example the above command becomes:

dmr_result=single_base_haplotype_dmr.bed

modkit dmr pair \
  -a ${hp1_pileup}.gz \
  -b ${hp2_pileup}.gz \
  -o ${dmr_result} \
  --ref ${ref} \
  --base C \
  --threads ${threads} \
  --log-filepath dmr.log

In [39]:
%%bash

pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/"
# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/to_t2t_v2_0/pileup/pileup_Unedit_Day28_Tcells_20250721.bam"
pileup_Unedit_day35_bed="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed"

echo "pileup_Unedit_day35_bed: ${pileup_Unedit_day35_bed}.gz"

ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_Unedit_day35_bed}.gz"
# ls -l "${pileup_Unedit_day28_bam}.gz"

chmod u+rwx "${pileup_data_folder_path}"
chmod u+rwx "${pileup_Unedit_day35_bed}.gz"

# ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_Unedit_day35_bed}.gz"


pileup_Unedit_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed.gz
20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed
20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed.gz
20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed.gz.tbi
20251118_unedit_T_noFilter_mC07_pileup_NT_Day35_Tcells.bed
20251118_unedit_T_noFilter_mC07_pileup_NT_Day35_Tcells.bed.gz
20251118_unedit_T_noFilter_mC07_pileup_NT_Day35_Tcells.bed.gz.tbi
drwxrwxr-x 2 michalula michalula 4096 Nov 18 09:26 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/
-rw-rw-r-- 1 michalula michalula 5543 Nov 18 09:26 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pile

In [40]:
%%bash

pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/"

# K562_unedit_Filter_bam="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/pre_filtered_ROI_reads_K562_unedited_Day2_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
K562_unedit_Filter_bed="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_pileup_unedit_K562_Filter_mC07.bed"

echo "K562_unedit_Filter_bed: ${K562_unedit_Filter_bed}.gz"

ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${K562_unedit_Filter_bed}.gz"
# ls -l "${pileup_Unedit_day28_bam}.gz"

chmod u+rwx "${pileup_data_folder_path}"
chmod u+rwx "${K562_unedit_Filter_beded}.gz"


# ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${K562_unedit_Filter_bed}.gz"


K562_unedit_Filter_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_pileup_unedit_K562_Filter_mC07.bed.gz
20251118_pileup_unedit_K562_Filter_mC07.bed
20251118_pileup_unedit_K562_Filter_mC07.bed.gz
20251118_pileup_unedit_K562_Filter_mC07.bed.gz.tbi
20251118_pileup_unedit_K562_noFilter_mC07.bed
20251118_pileup_unedit_K562_noFilter_mC07.bed.gz
20251118_pileup_unedit_K562_noFilter_mC07.bed.gz.tbi
drwxrwxr-x 2 michalula michalula 4096 Nov 18 09:18 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/
-rw-rw-r-- 1 michalula michalula 5922 Nov 18 09:18 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_pileup_unedit_K562_Filter_mC07.bed.gz


chmod: cannot access '.gz': No such file or directory


drwxrwxr-x 2 michalula michalula 4096 Nov 18 09:18 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/
-rw-rw-r-- 1 michalula michalula 5922 Nov 18 09:18 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_pileup_unedit_K562_Filter_mC07.bed.gz


In [44]:
%%bash
 
# 3. Detecting differential modification at single base positions
# The modkit dmr pair command has the ability to score individual bases (e.g. differentially methylated CpGs). To run single-base analysis on one or more paired samples, simply omit the --regions (-r) option when running modkit dmr pair. When performing single-base analysis the likelihood ratio score and a MAP-based p-value are available. For details on the likelihood ratio score and the MAP-based p-value, see the scoring details section. For example the above command becomes:
date_today="20251118"

experiment_codition="unedit_K562_v_T_day35_Filter_mC07"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/"
dmr_result=${dmr_output_path}${date_today}"_single_base_"${experiment_codition}".bed"

# pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/"
pileup_Unedit_day35_bed="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed"
# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/to_t2t_v2_0/pileup/pileup_Unedit_Day28_Tcells_20250721.bam"

echo "pileup_Unedit_day35_bed: ${pileup_Unedit_day35_bed}.gz"
ls -l "${pileup_Unedit_day35_bed}.gz"

K562_unedit_Filter_bed="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_pileup_unedit_K562_Filter_mC07.bed"

echo "K562_unedit_Filter_bed: ${K562_unedit_Filter_bed}.gz"
ls -l "${K562_unedit_Filter_bed}.gz"

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref="/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"
threads=32
 
cd ${dmr_output_path}
# '/home/michalula/code/epiCausality/epiCode/differential_methyl'

modkit dmr pair \
  -a ${K562_unedit_Filter_bed}.gz \
  -b ${pileup_Unedit_day35_bed}.gz \
  -o ${dmr_result} \
  --ref ${ref_genome_fa} \
  --base C \
  --threads ${threads} \
  --log-filepath dmr.log


echo "dmr_result: $dmr_result"
ls -lah $dmr_result

pileup_Unedit_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed.gz
-rwxrw-r-- 1 michalula michalula 5543 Nov 18 09:26 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_dmr_pileup/20251118_unedit_T_Filter_mC07_pileup_NT_Day35_Tcells.bed.gz
K562_unedit_Filter_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_pileup_unedit_K562_Filter_mC07.bed.gz
-rw-rw-r-- 1 michalula michalula 5922 Nov 18 09:18 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/unedited/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_pileup_unedit_K562_Filter_mC07.bed.gz


bash: line 26: cd: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/: No such file or directory
[0;32m>[0m creating directory at "/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35"
[0;32m>[0m reading reference FASTA at "/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
[0;32m>[0m 1 common sequence(s) between FASTA and both samples
[0;32m>[0m running single-site analysis
[0;32m>[0m using default prior, Beta(α: 0.55, β: 0.55)
[0;32m>[0m estimating max coverages from data
[0;32m>[0m sampled 286 a records and 285 b records, calculating max coverages for 95th percentile
[0;32m>[0m calculated max coverage for a: 113 and b: 103
[0;32m>[0m calculated max coverage 113 is greater than maximum allowed (100), setting to 100
[0;32m>[0m calculated max coverage 103 is greater than maximum allowed (100), setting to 100
[0;31;1m>[0m errors:
+--------

dmr_result: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/20251118_single_base_unedit_K562_v_T_day35_Filter_mC07.bed
-rw-rw-r-- 1 michalula michalula 50K Nov 18 09:31 /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/20251118_single_base_unedit_K562_v_T_day35_Filter_mC07.bed


In [45]:
%%bash

date_today="20251118"
experiment_codition="unedit_K562_v_T_day35_Filter_mC07"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/"
# "/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_Filter/new_dmr_output/"
dmr_result=${dmr_output_path}"_"${date_today}"single_base_"${experiment_codition}".bed"

echo "dmr_result: $dmr_result"
# ls -lah $dmr_result
ls -lah $dmr_output_path
# cat $dmr_result

dmr_result: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/_20251118single_base_unedit_K562_v_T_day35_Filter_mC07.bed
total 60K
drwxrwxr-x 2 michalula michalula 4.0K Nov 18 09:31 .
drwxrwxr-x 3 michalula michalula 4.0K Nov 18 09:31 ..
-rw-rw-r-- 1 michalula michalula  50K Nov 18 09:31 20251118_single_base_unedit_K562_v_T_day35_Filter_mC07.bed


In [46]:
pwd

'/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered'

## modkit dmr explore output

The full table when performing single-site analysis with equal numbers of samples in groups, when running modkit dmr pair, will have the following schema:

column	name	description	type
1	chrom	name of reference sequence from bedMethyl input samples	str
2	start position	0-based start position, from --regions argument	int
3	end position	0-based exclusive end position, from --regions argument	int
4	name	name column from --regions BED, or chr:start-stop if absent, "." for single sites	str
5	score	difference score, more positive values have increased difference	float
6	strand	strand for the region or single-base position	str
7	samplea counts	counts of each base modification in the region, comma-separated, for sample A	str
8	samplea total	total number of base modification calls in the region, including unmodified, for sample A	int
9	sampleb counts	counts of each base modification in the region, comma-separated, for sample B	str
10	sampleb total	total number of base modification calls in the region, including unmodified, for sample B	int
11	samplea percents	percent of calls for each base modification in the region, comma-separated, for sample A	str
12	sampleb percents	percent of calls for each base modification in the region, comma-separated, for sample B	str
13	samplea fraction modified	fraction modification (of any kind) in sample A	float
14	sampleb fraction modified	fraction modification (of any kind) in sample B	float
15	MAP-based p-value	ratio of the posterior probability of observing the effect size over zero effect size	float
16	effect size	percent modified in sample A (col 12) minus percent modified in sample B (col 13)	float
<!-- 17	balanced MAP-based p-value	MAP-based p-value when all replicates are balanced	float
18	balanced effect size	effect size when all replicates are balanced	float -->


17	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
18	cohen_h_low	95% confidence interval lower bound	float
19	cohen_h_high	95% confidence interval upper bound	float

<!-- Differential methylation output format
The output from modkit dmr pair (and for each pairwise comparison with modkit dmr multi) is (roughly) a BED file with the following schema: -->
<!-- 
column	name	description	type
        1	chrom	name of reference sequence from bedMethyl input samples	str
        2	start position	0-based start position, from --regions argument	int
        3	end position	0-based exclusive end position, from --regions argument	int
        4	name	name column from --regions BED, or chr:start-stop if absent, "." for single sites	str
        5	score	difference score, more positive values have increased difference	float
        6	strand	strand for the region or single-base position	str
        7	samplea counts	counts of each base modification in the region, comma-separated, for sample A	str
        8	samplea total	total number of base modification calls in the region, including unmodified, for sample A	int
        9	sampleb counts	counts of each base modification in the region, comma-separated, for sample B	str
        10	sampleb total	total number of base modification calls in the region, including unmodified, for sample B	int
        11	samplea percents	percent of calls for each base modification in the region, comma-separated, for sample A	str
        12	sampleb percents	percent of calls for each base modification in the region, comma-separated, for sample B	str
        13	samplea fraction modified	fraction modification (of any kind) in sample A	float
        14	sampleb fraction modified	fraction modification (of any kind) in sample B	float
        15	MAP-based p-value	ratio of the posterior probability of observing the effect size over zero effect size	float
        16	effect size	percent modified in sample A (col 12) minus percent modified in sample B (col 13)	float
        17	balanced MAP-based p-value	MAP-based p-value when all replicates are balanced	float
        18	balanced effect size	effect size when all replicates are balanced	float
        19	pct_a_samples	percent of 'a' samples used in statistical test	float
        20	pct_b_samples	percent of 'b' samples used in statistical test	float
        21	per-replicate p-values	MAP-based p-values for matched replicate pairs	float
        22	per-replicate effect sizes	effect sizes matched replicate pairs	float
        23	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
        24	cohen_h_low	95% confidence interval lower bound	float
        25	cohen_h_high	95% confidence interval upper bound	float
        Columns 16-19 are only produced when multiple samples are provided, columns 20 and 21 are only produced when there is an equal number of 'a' and 'b' samples. When using multiple samples, it is possible that not every sample will have a modification fraction at a position. When this happens, the statistical test is still performed and the values of pct_a_samples and pct_b_samples reflect the percent of samples from each condition used in the test. 


     (15)	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
    (16)	cohen_h_low	95% confidence interval lower bound	float
    (17)	cohen_h_high	95% confidence interval upper bound	float
    
    n.b. Columns 15, 16, and 17 are present when the --regions option is passed, but these columns are on the right side of the table when performing single-site analysis (below). It is generally recommended to use the --header flag and standard CSV parsing to make sure the schema's between experiments are maintained.

When performing single-site analysis, the following additional columns are added:

column	name	description	type
Columns 20 and 21 have the replicate pairwise MAP-based p-values and effect sizes which are calculated based on their order provided on the command line. For example in the abbreviated command below:

In [47]:
experiment_codition="unedit_K562_v_T_day35_Filter_mC07"

In [48]:
dmr_path="/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/20251118_single_base_unedit_K562_v_T_day35_Filter_mC07.bed"


In [49]:
dmr_path

'/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/20251118_single_base_unedit_K562_v_T_day35_Filter_mC07.bed'

In [50]:
out_dir = "/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/"

In [51]:
# Read DMR BED (robust to header/no-header) and assign canonical column names (uses existing vars: dmr_path, out_dir, date_today, pd, os)
canonical_cols = [
    "chrom", "start", "end", "name", "score", "strand",
    "samplea_counts", "samplea_total", "sampleb_counts", "sampleb_total",
    "samplea_percents", "sampleb_percents",
    "samplea_fraction_modified", "sampleb_fraction_modified",
    "map_pvalue", "effect_size",
    "cohen_h", "cohen_h_low", "cohen_h_high",
]
    # "balanced_map_pvalue", "balanced_effect_size"

# read file with header and fallback to header=None when headers look numeric or columns are unexpected
try:
    dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")

    # dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", engine="python") # , header=0
    # # heuristic: if too many numeric-looking column names, re-read as headerless
    # numeric_headers = sum(1 for c in dmr_df.columns if str(c).strip().isdigit())
    # if numeric_headers >= (len(dmr_df.columns) / 2) or dmr_df.shape[1] < 3:
    #     dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")
except Exception:
    dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")

# assign canonical names up to number of columns present, add generic names for extras
ncols = dmr_df.shape[1]
if ncols <= len(canonical_cols):
    dmr_df.columns = canonical_cols[:ncols]
else:
    extras = [f"col_{i}" for i in range(ncols - len(canonical_cols))]
    dmr_df.columns = canonical_cols + extras

# coerce obvious numeric columns to numeric where present
num_cols_to_try = [
    "start", "end", "score",
    "samplea_total", "sampleb_total",
    "samplea_fraction_modified", "sampleb_fraction_modified",
    "map_pvalue", "effect_size",
    "balanced_map_pvalue", "balanced_effect_size"
]
for c in num_cols_to_try:
    if c in dmr_df.columns:
        dmr_df[c] = pd.to_numeric(dmr_df[c], errors="coerce")

# ensure output directory exists and save parsed table (parquet preferred)
os.makedirs(out_dir, exist_ok=True)
parsed_path = os.path.join(out_dir, f"{date_today}_dmr_parsed.parquet")
try:
    dmr_df.to_parquet(parsed_path, index=False)
    print("Saved parquet:", parsed_path)
except Exception:
    csv_path = os.path.join(out_dir, f"{date_today}_dmr_parsed.csv")
    dmr_df.to_csv(csv_path, index=False)
    print("Parquet not available, saved csv:", csv_path)

print("Loaded DMR:", dmr_path)
print("Assigned columns:", dmr_df.columns.tolist())
print("Shape:", dmr_df.shape)
dmr_df.head()

Parquet not available, saved csv: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/20251118_dmr_parsed.csv
Loaded DMR: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/20251118_single_base_unedit_K562_v_T_day35_Filter_mC07.bed
Assigned columns: ['chrom', 'start', 'end', 'name', 'score', 'strand', 'samplea_counts', 'samplea_total', 'sampleb_counts', 'sampleb_total', 'samplea_percents', 'sampleb_percents', 'samplea_fraction_modified', 'sampleb_fraction_modified', 'map_pvalue', 'effect_size', 'cohen_h', 'cohen_h_low', 'cohen_h_high']
Shape: (285, 19)


Unnamed: 0,chrom,start,end,name,score,strand,samplea_counts,samplea_total,sampleb_counts,sampleb_total,samplea_percents,sampleb_percents,samplea_fraction_modified,sampleb_fraction_modified,map_pvalue,effect_size,cohen_h,cohen_h_low,cohen_h_high
0,chr1,206583090,206583091,.,1.609438,-,m:0,2,m:1,1,m:0.00,m:100.00,0.0,1.0,0.2615765,-1.0,-3.141593,0.741137,5.542048
1,chr1,206583173,206583174,.,0.185934,+,m:69,88,m:66,78,m:78.41,m:84.62,0.784091,0.846154,0.5817092,-0.062063,-0.160387,-0.144412,0.465186
2,chr1,206583174,206583175,.,1.291342,-,m:66,74,m:81,84,m:89.19,m:96.43,0.891892,0.964286,0.205079,-0.072394,-0.28981,-0.022669,0.602289
3,chr1,206583387,206583388,.,47.757516,+,m:4,92,m:58,81,m:4.35,m:71.60,0.043478,0.716049,0.0,-0.672571,-1.597504,1.298873,1.896134
4,chr1,206583388,206583389,.,29.444003,-,m:8,47,m:61,71,m:17.02,m:85.92,0.170213,0.859155,9.23e-14,-0.688942,-1.521622,1.15306,1.890185


In [52]:
import os
from IPython.display import display, HTML

# Visualize all columns from dmr_df and save interactive HTMLs to out_dir
import plotly.express as px
import plotly.graph_objects as go

os.makedirs(out_dir, exist_ok=True)

# Save a table summary
summary = dmr_df.describe(include='all').transpose()
summary_path = os.path.join(out_dir, f"{date_today}_dmr_column_summary.csv")
summary.to_csv(summary_path)

numcols = dmr_df.select_dtypes(include=['number']).columns.tolist()

def _safe_name(name):
    return str(name).replace(os.sep, "_").replace(" ", "_").replace("\t", "_")

# Per-column visualizations
for col in dmr_df.columns:
    safe = _safe_name(col)
    try:
        if col in numcols:
            # Histogram
            fig_h = px.histogram(dmr_df, x=col, nbins=80, title=f"Histogram: {col}")
            # fig_h.write_html(os.path.join(out_dir, f"{date_today}_dmr_hist_{safe}.html"), include_plotlyjs='cdn')
            fig_h.show()

            # Boxplot
            fig_b = px.box(dmr_df, y=col, points="outliers", title=f"Boxplot: {col}")
            # fig_b.write_html(os.path.join(out_dir, f"{date_today}_dmr_box_{safe}.html"), include_plotlyjs='cdn')
            fig_b.show()
        else:
            # Categorical / text: show top value counts (up to 50)
            vc = dmr_df[col].fillna("NA").astype(str).value_counts().head(50)
            if len(vc):
                fig_c = px.bar(x=vc.values[::-1], y=vc.index.astype(str)[::-1], orientation='h',
                               title=f"Top value counts: {col}", labels={'x':'count','y':col})
                fig_c.update_layout(yaxis={'categoryorder':'array','categoryarray':vc.index[::-1].astype(str).tolist()})
                # fig_c.write_html(os.path.join(out_dir, f"{date_today}_dmr_valcounts_{safe}.html"), include_plotlyjs='cdn')
                fig_c.show()
            else:
                # fallback: display empty info
                display(HTML(f"<b>{col}</b>: no values to plot"))
    except Exception as e:
        print(f"Skipped plotting column {col!r} due to error: {e}")

# Correlation heatmap for numeric columns
if len(numcols) >= 2:
    try:
        corr = dmr_df[numcols].corr()
        fig_corr = px.imshow(corr, text_auto=True, aspect="auto", title="Correlation matrix (numeric columns)")
        # fig_corr.write_html(os.path.join(out_dir, f"{date_today}_dmr_correlation_numeric.html"), include_plotlyjs='cdn')
        fig_corr.show()
    except Exception as e:
        print("Failed to create correlation heatmap:", e)

print("Saved summary:", summary_path)
print("Plots saved to:", out_dir)

Saved summary: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/20251118_dmr_column_summary.csv
Plots saved to: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/


In [53]:
# Select significant CG pairs from DMR results and plot them (new cell at index 69).
# Uses existing notebook variables: dmr_df (parsed modkit dmr), df_roi_stats (pileup ROI stats),
# plotly (px) and out_dir/dmr_folder_path for saving. Does not re-import modules.

# Parameters
pvalue_thresh = 0.05

# ensure numeric columns
dmr_df['map_pvalue'] = pd.to_numeric(dmr_df['map_pvalue'], errors='coerce')
dmr_df['effect_size'] = pd.to_numeric(dmr_df['effect_size'], errors='coerce')
dmr_df['samplea_fraction_modified'] = pd.to_numeric(dmr_df['samplea_fraction_modified'], errors='coerce')
dmr_df['sampleb_fraction_modified'] = pd.to_numeric(dmr_df['sampleb_fraction_modified'], errors='coerce')

# filter significant by MAP-based p-value
sig = dmr_df[dmr_df['map_pvalue'] <= pvalue_thresh].copy()

dmr_df['map_pval_less005'] = dmr_df['map_pvalue'] <= 0.05


# # restrict to ROI positions if df_roi_stats exists
# if 'df_roi_stats' in globals():
#     roi_positions = set(df_roi_stats['start'].astype(int).tolist())
#     sig = sig[sig['start'].isin(roi_positions)].copy()

# quick exit if none
if sig.shape[0] == 0:
    print(f"No significant CG pairs found in ROI at map_pvalue <= {pvalue_thresh}")
else:
    # add convenience cols
    sig['pos'] = sig['start'].astype(str)
    sig['a_perc'] = sig['samplea_fraction_modified'] * 100
    sig['b_perc'] = sig['sampleb_fraction_modified'] * 100
    sig['total_reads'] = sig.get('samplea_total', 0).fillna(0).astype(int) + sig.get('sampleb_total', 0).fillna(0).astype(int)

    # save a table of significant sites
    os.makedirs(out_dir, exist_ok=True)
    sig_table_path = os.path.join(out_dir, f"dmr_significant_p{pvalue_thresh:.3f}_roi.tsv")
    sig.to_csv(sig_table_path, sep='\t', index=False)
    print("Saved significant sites table:", sig_table_path)
    display(sig[['chrom','start','end','strand','map_pvalue','effect_size','a_perc','b_perc','total_reads']].reset_index(drop=True))

    # plot the map_pval_less005 distribution which corresponds to significant sites
    fig_mappval_hist = px.histogram(
        dmr_df,
        x='map_pval_less005',
        nbins=80,
        title=f"MAP-based p-value distribution (highlighting p <= {pvalue_thresh}) <br>{experiment_codition}",
        labels={'map_pval_less005':'MAP-based p-value'}
    )
    fig_mappval_hist.update_layout(height=520)
    mappval_hist_path = os.path.join(out_dir, f"dmr_map_pval_distribution.html")
    # fig_mappval_hist.write_html(mappval_hist_path, include_plotlyjs='cdn')
    fig_mappval_hist.show()
    # print("Saved MAP-based p-value distribution histogram:", mappval_hist_path)

    # plot the percent of significant sites where map_pvalue <= pvalue_thresh is colored red, others blue (color not working)
    # Check https://plotly.com/python/pie-charts/ for coloring instructions
    percent_significant = (sig.shape[0] / dmr_df.shape[0]) * 100
    fig_mappval_pie = px.pie(
        dmr_df,
        names=['Not Significant (p > {})'.format(pvalue_thresh), 'Significant (p <= {})'.format(pvalue_thresh)],
        values=[dmr_df.shape[0] - sig.shape[0],sig.shape[0]],
        title=f"Percentage of significant CGs (map_pvalue <= {pvalue_thresh}): {percent_significant:.2f}% <br>{experiment_codition}",
        # color_discrete_map={'Not Significant (p > {})'.format(pvalue_thresh): 'blue',
        #                     'Significant (p <= {})'.format(pvalue_thresh): 'red'},
        # colors=['blue','red']
    )
    fig_mappval_pie.update_layout(height=520)
    mappval_pie_path = os.path.join(out_dir, f"dmr_map_pval_percentage.html")
    # fig_mappval_pie.write_html(mappval_pie_path, include_plotlyjs='cdn')
    fig_mappval_pie.show()
    # print("Saved MAP-based p-value percentage pie chart:", mappval_pie_path)


    # plot effect size distribution of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh
    fig_effectsize_hist = px.histogram(
        dmr_df,
        x='effect_size',
        nbins=80,
        # add a line break and write experiment_codition into the title
        title=f"Effect size distribution (highlighting significant sites with map_pvalue <= {pvalue_thresh})<br>{experiment_codition}",
        color=(dmr_df['map_pvalue'] <= pvalue_thresh), 
        color_discrete_map={True: 'red', False: 'blue'},
    )
            # labels={'effect_size':'Effect size (A - B)'}

    fig_effectsize_hist.update_layout(height=520)
    effectsize_hist_path = os.path.join(out_dir, f"dmr_effect_size_distribution.html")
    # fig_effectsize_hist.write_html(effectsize_hist_path, include_plotlyjs='cdn')
    fig_effectsize_hist.show()
    # print("Saved effect size distribution histogram:", effectsize_hist_path)        

    # plot effect sizes of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh 
    # add color legend names as 'Significant: map_pvalue <= pvalue_thresh' and 'Not Significant: map_pvalue > pvalue_thresh'  
    fig_effectsize_scatter = px.scatter(
        dmr_df,
        x=dmr_df.index,
        y='effect_size',
        color_discrete_map={True: 'red', False: 'blue'},
        color=(dmr_df['map_pvalue'] <= pvalue_thresh), 
        labels={'effect_size':'Effect size (A - B)','index':'Index',
                'color':f'Significant: map_pvalue <= {pvalue_thresh}'},
        title=f"Effect sizes for all CGs (highlighting significant sites with map_pvalue <= {pvalue_thresh}) <br>{experiment_codition}",
    )
    fig_effectsize_scatter.update_layout(height=520)
    effectsize_scatter_path = os.path.join(out_dir, f"dmr_effect_size_scatter.html")
    # fig_effectsize_scatter.write_html(effectsize_scatter_path, include_plotlyjs='cdn')
    fig_effectsize_scatter.show()
    # print("Saved effect size scatter plot:", effectsize_scatter_path)       

    # bar plot effect sizes of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh 
    fig_effectsize_bar = px.bar(
        dmr_df,
        x=dmr_df.index,         
        y='effect_size',
        color=(dmr_df['map_pvalue'] <= pvalue_thresh),
        labels={'effect_size':'Effect size (A - B)','index':'Index',
                'color':f'Significant: map_pvalue <= {pvalue_thresh}'},
        color_discrete_map={True: 'red', False: 'blue'},
        title=f"Effect sizes for all CGs (n={len(dmr_df)}) (highlighting significant sites with map_pvalue <= {pvalue_thresh}) <br>{experiment_codition}",
    )
    fig_effectsize_bar.update_layout(height=520)
    effectsize_bar_path = os.path.join(out_dir, f"dmr_effect_size_bar.html")
    # fig_effectsize_bar.write_html(effectsize_bar_path, include_plotlyjs='cdn')
    fig_effectsize_bar.show()
    # print("Saved effect size bar plot:", effectsize_bar_path) 


    # Bar: effect size per position (without sorting) with effect size colors 
    # dmr_df['label'] = dmr_df['pos'] + ":" + dmr_df['strand'].astype(str)
    fig_bar_unsorted = px.bar(
        dmr_df,
        x=dmr_df.index, 
        y='effect_size',        
        color='effect_size',
        title=f"Effect size for all CGs (n={len(dmr_df)}) <br>{experiment_codition}",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
            # hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
            # x='label',

    fig_bar_unsorted.update_layout(xaxis_tickangle=45, height=520)
    bar_unsorted_path = os.path.join(out_dir, f"dmr_sig_effectsize_unsorted_p{pvalue_thresh:.3f}.html")
    # fig_bar_unsorted.write_html(bar_unsorted_path, include_plotlyjs='cdn')
    fig_bar_unsorted.show()
    # print("Saved unsorted effect-size bar plot:", bar_unsorted_path)    
        
    # Bar: effect size per position (without sorting)
    sig['label'] = sig['pos'] + ":" + sig['strand'].astype(str)
    fig_bar_unsorted = px.bar(
        sig,
        x='label',
        y='effect_size',        
        color='effect_size',
        hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
        title=f"Effect size for significant CGs (n={len(sig)}) with map_pvalue <= {pvalue_thresh} <br>{experiment_codition}",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
    fig_bar_unsorted.update_layout(xaxis_tickangle=45, height=520)
    bar_unsorted_path = os.path.join(out_dir, f"dmr_sig_effectsize_unsorted_p{pvalue_thresh:.3f}.html")
    # fig_bar_unsorted.write_html(bar_unsorted_path, include_plotlyjs='cdn')
    fig_bar_unsorted.show()
    # print("Saved unsorted effect-size bar plot:", bar_unsorted_path)    
        

    # Bar: effect size per position (sorted)
    sig_sorted = sig.sort_values('effect_size', ascending=False).copy()
    sig_sorted['label'] = sig_sorted['pos'] + ":" + sig_sorted['strand'].astype(str)
    fig_bar = px.bar(
        sig_sorted,
        x='label',
        y='effect_size',
        color='effect_size',
        hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
        title=f"Effect size for significant CGs (n={len(sig_sorted)}) <br>{experiment_codition}",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
    fig_bar.update_layout(xaxis_tickangle=45, height=520)
    bar_path = os.path.join(out_dir, f"dmr_sig_effectsize_p{pvalue_thresh:.3f}.html")
    # fig_bar.write_html(bar_path, include_plotlyjs='cdn')
    fig_bar.show()
    # print("Saved effect-size bar plot:", bar_path)



    # Scatter: sample A vs sample B percent modified (size = total reads, color = effect size)
    fig_scatter = px.scatter(
        sig,
        x='a_perc',
        y='b_perc',
        color='effect_size',
        size='total_reads',
        hover_data=['pos','start','map_pvalue','effect_size','cohen_h'],
        title=f"Significant CGs (map_pvalue <= {pvalue_thresh}) — sample A vs B percent modified <br>{experiment_codition}",
        labels={'a_perc':'Sample A % modified','b_perc':'Sample B % modified'}
    )
    fig_scatter.update_layout(height=520)
    scatter_path = os.path.join(out_dir, f"dmr_sig_scatter_p{pvalue_thresh:.3f}.html")
    # fig_scatter.write_html(scatter_path, include_plotlyjs='cdn')
    fig_scatter.show()
    # print("Saved scatter plot:", scatter_path)


Saved significant sites table: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/filtered/unedit_K526_v_T_d35/dmr_significant_p0.050_roi.tsv


Unnamed: 0,chrom,start,end,strand,map_pvalue,effect_size,a_perc,b_perc,total_reads
0,chr1,206583387,206583388,+,0.0,-0.672571,4.347826,71.60494,173
1,chr1,206583388,206583389,-,9.23e-14,-0.688942,17.021276,85.915494,118
2,chr1,206583707,206583708,+,0.0,-0.834596,12.5,95.9596,195
3,chr1,206583708,206583709,-,0.0,-0.885281,7.575758,96.103895,143
4,chr1,206583766,206583767,+,0.0,-0.758271,17.857143,93.68421,179
5,chr1,206583767,206583768,-,3.2e-15,-0.613889,31.944445,93.333334,147
6,chr1,206584104,206584105,+,5.268853e-06,-0.248763,71.153843,95.87629,201
7,chr1,206584105,206584106,-,0.04663361,-0.134682,78.66667,92.134833,164
8,chr1,206584137,206584138,+,3.775514e-09,-0.26,73.504275,100.0,221
9,chr1,206584151,206584152,+,0.0269676,-0.079583,90.51724,98.95833,212


# TODO: check
- are there really NO diffs between the reads selected with the mC > 70 and mC > 99.5% filtering ??

could be as the mC calles are automatically selected

and in the CRoff the auto threshold 
* in mC > 70 was to 0.79
> Using filter threshold 0.7910156 for C.
* in mC > 99.5 was to  0.79
> Using filter threshold 0.7910156 for C.


and in the Unediter the auto threshold 
* in mC > 70 was to 0.8496
> Using filter threshold 0.8496094 for C.
* in mC > 99.5 was to 0.849
> Using filter threshold 0.8496094 for C.


SAME per condition AUTOMATIC modkit filtering threshold per mC run were set

(NOT 0.995 and not 0.7)

# TODO: check
- are there really NO diffs between the reads selected with the mC > 70 and mC > 99.5% filtering ??

could be as the mC calles are automatically selected

and in the CRoff the auto threshold 
* in mC > 70 was to 0.79
> Using filter threshold 0.7910156 for C.
* in mC > 99.5 was to  0.79
> Using filter threshold 0.7910156 for C.


and in the Unediter the auto threshold 
* in mC > 70 was to 0.8496
> Using filter threshold 0.8496094 for C.
* in mC > 99.5 was to 0.849
> Using filter threshold 0.8496094 for C.


SAME per condition AUTOMATIC modkit filtering threshold per mC run were set

(NOT 0.995 and not 0.7)