# Modkit dmr - K562 ZFPoff High (ZFPoff silenced cells) vs CRoff day 35 data
## Use my NOT noFilter Reads (epiCG collection .bam, but NO quality condtrol additional reads filtering)

Based on:
https://nanoporetech.github.io/modkit/intro_dmr.html#perform-differential-methylation-scoring

Select kernal: dimelo_v2_modkit_parsing

In [1]:
from datetime import datetime
def current_time():
    """Returns the current date and time as a formatted string."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S") 
    
print("Current Date and Time:", current_time())

Current Date and Time: 2025-11-18 10:18:06


Preparing the input data
The inputs to all modkit dmr commands are two or more bedMethyl files (created by modkit pileup) that have been compressed with bgzip and indexed with tabix. An example of how to generate the input data is shown below:


ref=grch38.fasta
threads=32

norm=normal_sample.bam
norm_pileup=normal_pileup.bed

modkit pileup ${norm} ${norm_pileup} \
  --cpg \
  --ref ${ref} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${norm_pileup}
tabix -p bed ${norm_pileup}.gz

# pileup and compression can also be done in one step
tumor=tumor_sample.bam
tumor_pileup=tumor_pileup.bed.gz

modkit pileup ${tumor} - \
  --cpg \
  --ref ${ref} \
  --threads ${threads} \
  --log-filepath log.txt | ${bgzip} -c > ${tumor_pileup}

tabix -p bed ${tumor_pileup}

In [2]:
%%bash
echo "hello"

hello


# Use the NEW modkit latest installed version in ipython kernel modkit_new

In [3]:
# ! python3 -m ipykernel install --user --name=modkit_new --display-name "modkit_new Python"
# ! which modkit

In [4]:
import os
os.environ["PATH"] = "/home/michalula/.cargo/bin:" + os.environ["PATH"]
! which modkit
! modkit --version

/home/michalula/.cargo/bin/modkit
modkit 0.5.1


In [5]:
# ! modkit

In [6]:
! modkit --version 

modkit 0.5.1


In [7]:
1+1

2

In [8]:
import os
import pandas as pd

def load_pileup_bed(bed_path):
    # bed_path = existing[0]
    print("Reading bedMethyl file:", bed_path)

    # bedMethyl column names (18 columns as provided)
    colnames = [
        "chrom", "start", "end", "mod_code", "score", "strand",
        "start2", "end2", "color",
        "Nvalid_cov", "percent_modified", "Nmod", "Ncanonical",
        "Nother_mod", "Ndelete", "Nfail", "Ndiff", "Nnocall"
    ]

    # Configure dtypes where reasonable
    dtypes = {
        "chrom": str,
        "start": "Int64",
        "end": "Int64",
        "mod_code": str,
        "score": "Int64",
        "strand": str,
        "start2": "Int64",
        "end2": "Int64",
        "color": str,
        "Nvalid_cov": "Int64",
        "percent_modified": float,
        "Nmod": "Int64",
        "Ncanonical": "Int64",
        "Nother_mod": "Int64",
        "Ndelete": "Int64",
        "Nfail": "Int64",
        "Ndiff": "Int64",
        "Nnocall": "Int64"
    }

    compression = "gzip" if bed_path.endswith(".gz") else None

    # Read file (headerless BED-like table). If file has extra columns, keep them with automatic numeric conversion below.
    df = pd.read_csv(
        bed_path,
        sep="\t",
        header=None,
        comment="#",
        names=colnames,
        dtype=dtypes,
        compression=compression,
        engine="python",
        na_values=[".", "NA", ""],
        keep_default_na=True
    )

    # If file contained more than 18 columns, pandas assigned remaining data to extra columns named like col_18, col_19...
    # Ensure numeric conversion for numeric-like columns
    for c in df.columns:
        if df[c].dtype == object:
            # try safe numeric conversion where appropriate
            try:
                df[c] = pd.to_numeric(df[c], errors="ignore")
            except Exception:
                pass

    print("Loaded DataFrame shape:", df.shape)
    display(df.head())
    return df


In [9]:
import os
from IPython.display import display, HTML
from plotly import express as px
from plotly import graph_objects as go

# ! python3 -m pip install plotly
# ! python3 -m pip install matplotlib
# ! python3 -m pip install nbformat>=4.2.0

def plot_pileup_roi_df(df_roi, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    # ensure numeric types for plotting
    df_roi['pos'] = df_roi['start'].astype(int)
    df_roi['percent_modified'] = df_roi['percent_modified'].astype(float)
    df_roi['Nvalid_cov'] = df_roi['Nvalid_cov'].astype(int)
    df_roi['Nmod'] = df_roi['Nmod'].astype(int)
    df_roi['Ncanonical'] = df_roi['Ncanonical'].astype(int)

    # Scatter: genomic position vs percent modified (point size = coverage)
    fig1 = px.scatter(
        df_roi,
        x='pos',
        y='percent_modified',
        color='strand',
        size='Nvalid_cov',
        hover_data=['Nvalid_cov','Nmod','Ncanonical','Nother_mod','Nnocall'],
        title='Percent modified across ROI (size = Nvalid_cov)',
        height=500
    )
    fig1.update_layout(xaxis_title='Genomic position (start)', yaxis_title='Percent modified')
    fig1.show()
    # fig1.write_html(os.path.join(out_dir, "roi_percent_modified_scatter.html"), include_plotlyjs='cdn')

    # Histogram: coverage distribution
    fig2 = px.histogram(
        df_roi,
        x='Nvalid_cov',
        nbins=40,
        title='Distribution of Nvalid_cov (coverage) in ROI',
        height=400
    )
    fig2.update_layout(xaxis_title='Nvalid_cov', yaxis_title='Count')
    fig2.show()
    # fig2.write_html(os.path.join(out_dir, "roi_nvalidcov_hist.html"), include_plotlyjs='cdn')

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked)
    topn = 30
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'Sorted Top {topn} sites by percent_modified (stacked Nmod / Ncanonical)',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Print simple summaries
    print("ROI rows:", df_roi.shape[0])
    print("Percent modified: median={:.2f}, mean={:.2f}".format(df_roi['percent_modified'].median(), df_roi['percent_modified'].mean()))
    print("Coverage (Nvalid_cov): min={}, median={}, max={}".format(df_roi['Nvalid_cov'].min(), df_roi['Nvalid_cov'].median(), df_roi['Nvalid_cov'].max()))

    # Display first rows table for quick inspection
    display(HTML(df_roi.head(20).to_html(index=False)))


    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked)
    topn = df_roi.shape[0]
    # df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_roi.copy() #.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'All {topn} CpG sites by percent_modified (stacked Nmod / Ncanonical) [ordered=not s]',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked, NOT SORTED)
    topn = df_roi.shape[0]
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'Sorted Top {topn} sites by percent_modified (stacked Nmod / Ncanonical)',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Print simple summaries
    print("ROI rows:", df_roi.shape[0])
    print("Percent modified: median={:.2f}, mean={:.2f}".format(df_roi['percent_modified'].median(), df_roi['percent_modified'].mean()))
    print("Coverage (Nvalid_cov): min={}, median={}, max={}".format(df_roi['Nvalid_cov'].min(), df_roi['Nvalid_cov'].median(), df_roi['Nvalid_cov'].max()))

    # Display first rows table for quick inspection
    display(HTML(df_roi.head(20).to_html(index=False)))

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    topn = 30
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig4 = go.Figure()
    fig4.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig4.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig4.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig4.show()
    # fig4.write_html(os.path.join(out_dir, "roi_top_sites_stacked_percentage.html"), include_plotlyjs='cdn')     


    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    topn = 277
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig4 = go.Figure()
    fig4.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig4.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig4.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig4.show()
    # fig4.write_html(os.path.join(out_dir, "roi_top_sites_stacked_percentage.html"), include_plotlyjs='cdn')     

    # Bar: Unsorted sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    df_top = df_roi.copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig5 = go.Figure()
    fig5.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig5.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig5.update_layout(barmode='stack', title=f'All sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig5.show()
    # fig5.write_html(os.path.join(out_dir, "roi_all_sites_stacked_percentage.html"), include_plotlyjs='cdn')    


    return df_top



In [10]:
! ls /home/michalula/data/ref_genomes/t2t_v2_0/

chm13v2.0.fa	  chm13v2.0.fa.fai		   haplotype_vcf
chm13v2.0.fa.amb  chm13v2.0.fa.pac		   up_chm13v2.0.fasta
chm13v2.0.fa.ann  chm13v2.0.fa.sa		   up_chm13v2.0.fasta.fai
chm13v2.0.fa.bwt  convert_to_uppercase_fasta.bash


# Pileups 
## for CRISPRoff noFilter data for Day 35 
# Replica 2b:

In [11]:
! ls /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/

replica_1  replica_2b


In [12]:
! ls /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output

CG_137_padded_reads_replica2B_Day35_CRISPRoff_Tcells_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.7_T2Tv2_0_chr1:206583354-206589854_2025-11-10_units_combined_numFWD78_numRVS167.npy
CG_137_padded_reads_replica2B_Day35_CRISPRoff_Tcells_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.995_T2Tv2_0_chr1:206583354-206589854_2025-11-10_units_combined_numFWD78_numRVS166.npy
extracted_reads
filtered_reads_overlap_MORE_than_0.9_replica2B_Day35_CRISPRoff_Tcells_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
filtered_reads_overlap_MORE_than_0.9_replica2B_Day35_CRISPRoff_Tcells_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam.bai
filtered_reads_overlap_MORE_than_0.9_replica2B_Day35_CRISPRoff_Tcells_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7

In [13]:
%%bash

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref= "/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"

threads=32

date_today="20251118"
# data_folder_path="/home/michalula/data/cas9_nanopore/data/20250908_nCATs_T_CRoff_Day_35/5mCG/to_t2t_v2_0/"
# CROFF_day35_bam=${data_folder_path}"sort_align_t2t_v2_0_trim_20250908_Day35_CROFF_Tcells_2Libraries_Minion_R9.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/"
# "/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_dmr_pileup/"
mkdir ${pileup_data_folder_path}

# mC > 99.5% = 0995 noFilter 
# noFilter_CROFF_day35_bam=${data_folder_path}"noFilter_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# mC >70% = 07 noFilter 
repl2b_noFilter_CROFF_day35_bam=${data_folder_path}"pre_filtered_ROI_reads_replica2B_Day35_CRISPRoff_Tcells_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# "noFilter_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
repl2b_pileup_CROFF_day35_bed=${pileup_data_folder_path}${date_today}"_repl2b""_noFilter_mC07""_pileup_CROFF_Day35_Tcells.bed"
# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_CROFF_Day28/mergered_outputs/pileup_sort_merge_sort_align_t2t_v1_1_trim_20250721_nCATs_Tcells_CROFF_Day28_minion_run2_day8.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"
# norm=normal_sample.bam
# norm_pileup=normal_pileup.bed

modkit pileup ${repl2b_noFilter_CROFF_day35_bam} ${repl2b_pileup_CROFF_day35_bed} \
  --cpg \
  --ref ${ref_genome_fa} \
  --threads ${threads} \
  --log-filepath log.txt


bgzip -k ${repl2b_pileup_CROFF_day35_bed}
tabix -p bed ${repl2b_pileup_CROFF_day35_bed}.gz


printf '%s\n' "repl2b_noFilter_CROFF_day35_bam: $repl2b_noFilter_CROFF_day35_bam"
printf '%s\n' "repl2b_pileup_CROFF_day35_bed: $repl2b_pileup_CROFF_day35_bed"
cat "$repl2b_pileup_CROFF_day35_bed"


mkdir: cannot create directory ‘/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/’: File exists
[0;32m>[0m calculated chunk size: 48, interval size 100000, processing 4800000 positions concurrently
[0;32m>[0m filtering to only CpG motifs
[0;32m>[0m attempting to sample 10042 reads
[0;32m>[0m Using filter threshold 0.8058593 for C.
[0;32m>[0m Done, processed 286 rows. Processed ~262 reads and skipped zero reads.


repl2b_noFilter_CROFF_day35_bam: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/pre_filtered_ROI_reads_replica2B_Day35_CRISPRoff_Tcells_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
repl2b_pileup_CROFF_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed
chr1	206583089	206583090	m	1	+	206583089	206583090	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206583090	206583091	m	1	-	206583090	206583091	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206583173	206583174	m	122	+	206583173	206583174	255,0,0	122	88.52	108	14	0	1	25	11	5
chr1	206583174	206583175	m	88	-	206583174	206583175	255,0,0	88	97.73	86	2	0	0	1	1	2
chr1	206583387	206583388	m	125	+	206583387	206583388	255,0,0	125	82.40	103	22

In [14]:
date_today="20251118"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=data_folder_path+"new_dmr_pileup/"
# mkdir ${pileup_data_folder_path}

# mC > 99.5% = 0995 noFilter 
# noFilter_CROFF_day35_bam=${data_folder_path}"noFilter_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# noFilter_CROFF_day35_bam=data_folder_path+"noFilter_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# mC >70% = 07 noFilter 
# noFilter_CROFF_day35_bam=data_folder_path+"noFilter_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"

repl2b_noFilter_CROFF_day35_bam=data_folder_path+"noFilter_reads_overlap_MORE_than_0.9_replica2B_Day35_CRISPRoff_Tcells_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
repl2b_pileup_CROFF_day35_bed=pileup_data_folder_path+date_today+"_repl2b""_noFilter_mC07""_pileup_CROFF_Day35_Tcells.bed"

repl2b_noFilter_CROFF_day35_bam, repl2b_pileup_CROFF_day35_bed

('/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/noFilter_reads_overlap_MORE_than_0.9_replica2B_Day35_CRISPRoff_Tcells_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam',
 '/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed')

In [15]:
repl2b_pileup_CROFF_day35_df = load_pileup_bed(repl2b_pileup_CROFF_day35_bed)
repl2b_pileup_CROFF_day35_df

Reading bedMethyl file: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed
Loaded DataFrame shape: (286, 18)


  df[c] = pd.to_numeric(df[c], errors="ignore")


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,1,+,206583089,206583090,25500,1,100.0,1,0,0,0,0,0,0
1,chr1,206583090,206583091,m,1,-,206583090,206583091,25500,1,100.0,1,0,0,0,0,0,0
2,chr1,206583173,206583174,m,122,+,206583173,206583174,25500,122,88.52,108,14,0,1,25,11,5
3,chr1,206583174,206583175,m,88,-,206583174,206583175,25500,88,97.73,86,2,0,0,1,1,2
4,chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,82.4,103,22,0,1,23,11,7


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,1,+,206583089,206583090,25500,1,100.00,1,0,0,0,0,0,0
1,chr1,206583090,206583091,m,1,-,206583090,206583091,25500,1,100.00,1,0,0,0,0,0,0
2,chr1,206583173,206583174,m,122,+,206583173,206583174,25500,122,88.52,108,14,0,1,25,11,5
3,chr1,206583174,206583175,m,88,-,206583174,206583175,25500,88,97.73,86,2,0,0,1,1,2
4,chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,82.40,103,22,0,1,23,11,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,chr1,206589931,206589932,m,6,-,206589931,206589932,25500,6,100.00,6,0,0,0,0,3,0
282,chr1,206589955,206589956,m,15,+,206589955,206589956,25500,15,100.00,15,0,0,0,2,4,0
283,chr1,206589956,206589957,m,5,-,206589956,206589957,25500,5,100.00,5,0,0,0,0,0,1
284,chr1,206590032,206590033,m,18,+,206590032,206590033,25500,18,61.11,11,7,0,2,0,0,1


In [16]:
# repl2b_pileup_CROFF_day35_df

In [17]:
repl2b_pileup_CROFF_day35_df_stats = plot_pileup_roi_df(df_roi=repl2b_pileup_CROFF_day35_df, out_dir=pileup_data_folder_path)
repl2b_pileup_CROFF_day35_df_stats

ROI rows: 286
Percent modified: median=80.56, mean=73.93
Coverage (Nvalid_cov): min=1, median=89.0, max=164


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583089,206583090,m,1,+,206583089,206583090,25500,1,100.0,1,0,0,0,0,0,0,206583089
chr1,206583090,206583091,m,1,-,206583090,206583091,25500,1,100.0,1,0,0,0,0,0,0,206583090
chr1,206583173,206583174,m,122,+,206583173,206583174,25500,122,88.52,108,14,0,1,25,11,5,206583173
chr1,206583174,206583175,m,88,-,206583174,206583175,25500,88,97.73,86,2,0,0,1,1,2,206583174
chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,82.4,103,22,0,1,23,11,7,206583387
chr1,206583388,206583389,m,73,-,206583388,206583389,25500,73,84.93,62,11,0,6,5,2,7,206583388
chr1,206583707,206583708,m,153,+,206583707,206583708,25500,153,92.81,142,11,0,2,2,5,5,206583707
chr1,206583708,206583709,m,75,-,206583708,206583709,25500,75,92.0,69,6,0,4,4,7,3,206583708
chr1,206583766,206583767,m,145,+,206583766,206583767,25500,145,91.03,132,13,0,3,3,10,6,206583766
chr1,206583767,206583768,m,78,-,206583767,206583768,25500,78,97.44,76,2,0,4,7,2,2,206583767


ROI rows: 286
Percent modified: median=80.56, mean=73.93
Coverage (Nvalid_cov): min=1, median=89.0, max=164


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583089,206583090,m,1,+,206583089,206583090,25500,1,100.0,1,0,0,0,0,0,0,206583089
chr1,206583090,206583091,m,1,-,206583090,206583091,25500,1,100.0,1,0,0,0,0,0,0,206583090
chr1,206583173,206583174,m,122,+,206583173,206583174,25500,122,88.52,108,14,0,1,25,11,5,206583173
chr1,206583174,206583175,m,88,-,206583174,206583175,25500,88,97.73,86,2,0,0,1,1,2,206583174
chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,82.4,103,22,0,1,23,11,7,206583387
chr1,206583388,206583389,m,73,-,206583388,206583389,25500,73,84.93,62,11,0,6,5,2,7,206583388
chr1,206583707,206583708,m,153,+,206583707,206583708,25500,153,92.81,142,11,0,2,2,5,5,206583707
chr1,206583708,206583709,m,75,-,206583708,206583709,25500,75,92.0,69,6,0,4,4,7,3,206583708
chr1,206583766,206583767,m,145,+,206583766,206583767,25500,145,91.03,132,13,0,3,3,10,6,206583766
chr1,206583767,206583768,m,78,-,206583767,206583768,25500,78,97.44,76,2,0,4,7,2,2,206583767


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
0,chr1,206583089,206583090,m,1,+,206583089,206583090,25500,1,...,0,0,0,0,0,206583089,206583089:+,1,100.000000,0.000000
1,chr1,206583090,206583091,m,1,-,206583090,206583091,25500,1,...,0,0,0,0,0,206583090,206583090:-,1,100.000000,0.000000
2,chr1,206583173,206583174,m,122,+,206583173,206583174,25500,122,...,0,1,25,11,5,206583173,206583173:+,122,88.524590,11.475410
3,chr1,206583174,206583175,m,88,-,206583174,206583175,25500,88,...,0,0,1,1,2,206583174,206583174:-,88,97.727273,2.272727
4,chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,...,0,1,23,11,7,206583387,206583387:+,125,82.400000,17.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,chr1,206589931,206589932,m,6,-,206589931,206589932,25500,6,...,0,0,0,3,0,206589931,206589931:-,6,100.000000,0.000000
282,chr1,206589955,206589956,m,15,+,206589955,206589956,25500,15,...,0,0,2,4,0,206589955,206589955:+,15,100.000000,0.000000
283,chr1,206589956,206589957,m,5,-,206589956,206589957,25500,5,...,0,0,0,0,1,206589956,206589956:-,5,100.000000,0.000000
284,chr1,206590032,206590033,m,18,+,206590032,206590033,25500,18,...,0,2,0,0,1,206590032,206590032:+,18,61.111111,38.888889


# Look at CpGs within out target ROI
T2T v2.0

First CG:
206583388,206583390

Last of selected 137 CGs in the ROI:

206589746,206589748 --CpG_137

=> here have each CG position separate: so have 137*2  = 276

In [18]:
137*2, 277-5

(274, 272)

In [19]:
repl2b_pileup_CROFF_day35_df

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
0,chr1,206583089,206583090,m,1,+,206583089,206583090,25500,1,100.00,1,0,0,0,0,0,0,206583089
1,chr1,206583090,206583091,m,1,-,206583090,206583091,25500,1,100.00,1,0,0,0,0,0,0,206583090
2,chr1,206583173,206583174,m,122,+,206583173,206583174,25500,122,88.52,108,14,0,1,25,11,5,206583173
3,chr1,206583174,206583175,m,88,-,206583174,206583175,25500,88,97.73,86,2,0,0,1,1,2,206583174
4,chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,82.40,103,22,0,1,23,11,7,206583387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,chr1,206589931,206589932,m,6,-,206589931,206589932,25500,6,100.00,6,0,0,0,0,3,0,206589931
282,chr1,206589955,206589956,m,15,+,206589955,206589956,25500,15,100.00,15,0,0,0,2,4,0,206589955
283,chr1,206589956,206589957,m,5,-,206589956,206589957,25500,5,100.00,5,0,0,0,0,0,1,206589956
284,chr1,206590032,206590033,m,18,+,206590032,206590033,25500,18,61.11,11,7,0,2,0,0,1,206590032


In [20]:
repl2b_pileup_CROFF_day35_df[repl2b_pileup_CROFF_day35_df['start'] == 206583387]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
4,chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,82.4,103,22,0,1,23,11,7,206583387


In [21]:
repl2b_pileup_CROFF_day35_df[repl2b_pileup_CROFF_day35_df['start'] == 206583388]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
5,chr1,206583388,206583389,m,73,-,206583388,206583389,25500,73,84.93,62,11,0,6,5,2,7,206583388


In [22]:
repl2b_pileup_CROFF_day35_df[repl2b_pileup_CROFF_day35_df['start'] == 206589746]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
277,chr1,206589746,206589747,m,75,-,206589746,206589747,25500,75,96.0,72,3,0,6,5,3,5,206589746


In [23]:
(279-5) / 2

137.0

In [24]:
repl2b_pileup_CROFF_day35_df_roi = repl2b_pileup_CROFF_day35_df.iloc[4:278, :]  # Display target region rows
print(repl2b_pileup_CROFF_day35_df_roi.shape,repl2b_pileup_CROFF_day35_df_roi.shape[0]/2)
repl2b_pileup_CROFF_day35_df_roi

(274, 19) 137.0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
4,chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,82.40,103,22,0,1,23,11,7,206583387
5,chr1,206583388,206583389,m,73,-,206583388,206583389,25500,73,84.93,62,11,0,6,5,2,7,206583388
6,chr1,206583707,206583708,m,153,+,206583707,206583708,25500,153,92.81,142,11,0,2,2,5,5,206583707
7,chr1,206583708,206583709,m,75,-,206583708,206583709,25500,75,92.00,69,6,0,4,4,7,3,206583708
8,chr1,206583766,206583767,m,145,+,206583766,206583767,25500,145,91.03,132,13,0,3,3,10,6,206583766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,85,-,206589213,206589214,25500,85,97.65,83,2,0,0,5,2,2,206589213
274,chr1,206589436,206589437,m,158,+,206589436,206589437,25500,158,95.57,151,7,0,0,4,3,3,206589436
275,chr1,206589437,206589438,m,90,-,206589437,206589438,25500,90,96.67,87,3,0,2,0,2,0,206589437
276,chr1,206589745,206589746,m,139,+,206589745,206589746,25500,139,99.28,138,1,0,12,4,2,11,206589745


In [25]:
repl2b_pileup_CROFF_day35_df_roi_stats = plot_pileup_roi_df(df_roi=repl2b_pileup_CROFF_day35_df_roi, out_dir=pileup_data_folder_path)
repl2b_pileup_CROFF_day35_df_roi_stats



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

ROI rows: 274
Percent modified: median=80.00, mean=73.11
Coverage (Nvalid_cov): min=19, median=90.0, max=164


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,82.4,103,22,0,1,23,11,7,206583387
chr1,206583388,206583389,m,73,-,206583388,206583389,25500,73,84.93,62,11,0,6,5,2,7,206583388
chr1,206583707,206583708,m,153,+,206583707,206583708,25500,153,92.81,142,11,0,2,2,5,5,206583707
chr1,206583708,206583709,m,75,-,206583708,206583709,25500,75,92.0,69,6,0,4,4,7,3,206583708
chr1,206583766,206583767,m,145,+,206583766,206583767,25500,145,91.03,132,13,0,3,3,10,6,206583766
chr1,206583767,206583768,m,78,-,206583767,206583768,25500,78,97.44,76,2,0,4,7,2,2,206583767
chr1,206584104,206584105,m,152,+,206584104,206584105,25500,152,96.05,146,6,0,3,1,11,1,206584104
chr1,206584105,206584106,m,81,-,206584105,206584106,25500,81,97.53,79,2,0,3,4,3,2,206584105
chr1,206584137,206584138,m,162,+,206584137,206584138,25500,162,99.38,161,1,0,0,5,1,0,206584137
chr1,206584138,206584139,m,66,-,206584138,206584139,25500,66,90.91,60,6,0,0,26,0,1,206584138


ROI rows: 274
Percent modified: median=80.00, mean=73.11
Coverage (Nvalid_cov): min=19, median=90.0, max=164


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,82.4,103,22,0,1,23,11,7,206583387
chr1,206583388,206583389,m,73,-,206583388,206583389,25500,73,84.93,62,11,0,6,5,2,7,206583388
chr1,206583707,206583708,m,153,+,206583707,206583708,25500,153,92.81,142,11,0,2,2,5,5,206583707
chr1,206583708,206583709,m,75,-,206583708,206583709,25500,75,92.0,69,6,0,4,4,7,3,206583708
chr1,206583766,206583767,m,145,+,206583766,206583767,25500,145,91.03,132,13,0,3,3,10,6,206583766
chr1,206583767,206583768,m,78,-,206583767,206583768,25500,78,97.44,76,2,0,4,7,2,2,206583767
chr1,206584104,206584105,m,152,+,206584104,206584105,25500,152,96.05,146,6,0,3,1,11,1,206584104
chr1,206584105,206584106,m,81,-,206584105,206584106,25500,81,97.53,79,2,0,3,4,3,2,206584105
chr1,206584137,206584138,m,162,+,206584137,206584138,25500,162,99.38,161,1,0,0,5,1,0,206584137
chr1,206584138,206584139,m,66,-,206584138,206584139,25500,66,90.91,60,6,0,0,26,0,1,206584138


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
4,chr1,206583387,206583388,m,125,+,206583387,206583388,25500,125,...,0,1,23,11,7,206583387,206583387:+,125,82.400000,17.600000
5,chr1,206583388,206583389,m,73,-,206583388,206583389,25500,73,...,0,6,5,2,7,206583388,206583388:-,73,84.931507,15.068493
6,chr1,206583707,206583708,m,153,+,206583707,206583708,25500,153,...,0,2,2,5,5,206583707,206583707:+,153,92.810458,7.189542
7,chr1,206583708,206583709,m,75,-,206583708,206583709,25500,75,...,0,4,4,7,3,206583708,206583708:-,75,92.000000,8.000000
8,chr1,206583766,206583767,m,145,+,206583766,206583767,25500,145,...,0,3,3,10,6,206583766,206583766:+,145,91.034483,8.965517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,85,-,206589213,206589214,25500,85,...,0,0,5,2,2,206589213,206589213:-,85,97.647059,2.352941
274,chr1,206589436,206589437,m,158,+,206589436,206589437,25500,158,...,0,0,4,3,3,206589436,206589436:+,158,95.569620,4.430380
275,chr1,206589437,206589438,m,90,-,206589437,206589438,25500,90,...,0,2,0,2,0,206589437,206589437:-,90,96.666667,3.333333
276,chr1,206589745,206589746,m,139,+,206589745,206589746,25500,139,...,0,12,4,2,11,206589745,206589745:+,139,99.280576,0.719424


# K562 ZFPoff  - postSort High: (ZFPoff silenced cells)

In [26]:
! ls /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output

CG_137_padded_reads_K562_ZFPoff_PostSort_HIGH_Day16_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.7_T2Tv2_0_chr1:206583354-206589854_2025-11-18_units_combined_numFWD636_numRVS799.npy
CG_137_padded_reads_K562_ZFPoff_PostSort_HIGH_Day16_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.995_T2Tv2_0_chr1:206583354-206589854_2025-11-18_units_combined_numFWD635_numRVS798.npy
extracted_reads
filtered_reads_overlap_MORE_than_0.9_K562_ZFPoff_PostSort_HIGH_Day16_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
filtered_reads_overlap_MORE_than_0.9_K562_ZFPoff_PostSort_HIGH_Day16_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam.bai
filtered_reads_overlap_MORE_than_0.9_K562_ZFPoff_PostSort_HIGH_Day16_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_ma

In [27]:
%%bash

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref= "/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"

threads=32

date_today="20251118"
# data_folder_path="/home/michalula/data/cas9_nanopore/data/20250908_nCATs_T_CRoff_Day_35/5mCG/to_t2t_v2_0/"
# CROFF_day35_bam=${data_folder_path}"sort_align_t2t_v2_0_trim_20250908_Day35_CROFF_Tcells_2Libraries_Minion_R9.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_dmr_pileup/"
mkdir ${pileup_data_folder_path}

# mC > 99.5% = 0995 noFilter 
# mC >70% = 07 noFilter 
K562_Zoff_noFilter_bam=${data_folder_path}"pre_filtered_ROI_reads_K562_ZFPoff_PostSort_HIGH_Day16_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
K562_Zoff_noFilter_bed=${pileup_data_folder_path}${date_today}"_K562_ZoffHigh_v_T_Coff_noFilter_mC07""_pileup_CROFF_Day35_Tcells.bed"
# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_CROFF_Day28/mergered_outputs/pileup_sort_merge_sort_align_t2t_v1_1_trim_20250721_nCATs_Tcells_CROFF_Day28_minion_run2_day8.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"
# norm=normal_sample.bam
# norm_pileup=normal_pileup.bed

modkit pileup ${K562_Zoff_noFilter_bam} ${K562_Zoff_noFilter_bed} \
  --cpg \
  --ref ${ref_genome_fa} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${K562_Zoff_noFilter_bed}
tabix -p bed ${K562_Zoff_noFilter_bed}.gz

printf '%s\n' "K562_Zoff_noFilter_bam: $K562_Zoff_noFilter_bam"
printf '%s\n' "K562_Zoff_noFilter_bed: $K562_Zoff_noFilter_bed"
cat "$K562_Zoff_noFilter_bed"


[0;32m>[0m calculated chunk size: 48, interval size 100000, processing 4800000 positions concurrently
[0;32m>[0m filtering to only CpG motifs
[0;32m>[0m attempting to sample 10042 reads
[0;32m>[0m Using filter threshold 0.7597656 for C.
[0;32m>[0m Done, processed 444 rows. Processed ~1676 reads and skipped zero reads.


K562_Zoff_noFilter_bam: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/pre_filtered_ROI_reads_K562_ZFPoff_PostSort_HIGH_Day16_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
K562_Zoff_noFilter_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed
chr1	206572114	206572115	m	1	-	206572114	206572115	255,0,0	1	0.00	0	1	0	0	0	0	0
chr1	206572174	206572175	m	1	-	206572174	206572175	255,0,0	1	0.00	0	1	0	0	0	0	0
chr1	206572712	206572713	m	1	-	206572712	206572713	255,0,0	1	0.00	0	1	0	0	0	0	0
chr1	206572753	206572754	m	1	-	206572753	206572754	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206573039	206573040	m	1	-	206573039	206573040	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206573044	2

## Pileup columns explore

In [29]:
date_today="20251118"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortLOW/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=data_folder_path+"new_dmr_pileup/"
# mkdir ${pileup_data_folder_path}

# K562_Zoff_noFilter_bam="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortLOW/analyze_single_reads/dimelo_v2_output/pre_filtered_ROI_reads_K562_ZFPoff_PostSort_LOW_Day16_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_avgBaseQ20.bam"
# K562_Zoff_noFilter_bed="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortLOW/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed"

K562_Zoff_noFilter_bam="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/pre_filtered_ROI_reads_K562_ZFPoff_PostSort_HIGH_Day16_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
K562_Zoff_noFilter_bed="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed"

K562_Zoff_noFilter_bam, K562_Zoff_noFilter_bed

('/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/pre_filtered_ROI_reads_K562_ZFPoff_PostSort_HIGH_Day16_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam',
 '/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed')

In [30]:
pileup_K562_Zoff_noFilter_pileup_df = load_pileup_bed(K562_Zoff_noFilter_bed)
pileup_K562_Zoff_noFilter_pileup_df

Reading bedMethyl file: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed
Loaded DataFrame shape: (444, 18)



errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead



Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206572114,206572115,m,1,-,206572114,206572115,25500,1,0.0,0,1,0,0,0,0,0
1,chr1,206572174,206572175,m,1,-,206572174,206572175,25500,1,0.0,0,1,0,0,0,0,0
2,chr1,206572712,206572713,m,1,-,206572712,206572713,25500,1,0.0,0,1,0,0,0,0,0
3,chr1,206572753,206572754,m,1,-,206572753,206572754,25500,1,100.0,1,0,0,0,0,0,0
4,chr1,206573039,206573040,m,1,-,206573039,206573040,25500,1,100.0,1,0,0,0,0,0,0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206572114,206572115,m,1,-,206572114,206572115,25500,1,0.0,0,1,0,0,0,0,0
1,chr1,206572174,206572175,m,1,-,206572174,206572175,25500,1,0.0,0,1,0,0,0,0,0
2,chr1,206572712,206572713,m,1,-,206572712,206572713,25500,1,0.0,0,1,0,0,0,0,0
3,chr1,206572753,206572754,m,1,-,206572753,206572754,25500,1,100.0,1,0,0,0,0,0,0
4,chr1,206573039,206573040,m,1,-,206573039,206573040,25500,1,100.0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439,chr1,206606725,206606726,m,1,+,206606725,206606726,25500,1,0.0,0,1,0,0,0,0,0
440,chr1,206607006,206607007,m,1,+,206607006,206607007,25500,1,100.0,1,0,0,0,0,0,0
441,chr1,206607206,206607207,m,1,+,206607206,206607207,25500,1,100.0,1,0,0,0,0,0,0
442,chr1,206607468,206607469,m,1,+,206607468,206607469,25500,1,100.0,1,0,0,0,0,0,0


# Look at CpGs within out target ROI
T2T v2.0

First CG:
206583388,206583390

Last of selected 137 CGs in the ROI:

206589746,206589748 --CpG_137

=> here have each CG position separate: so have 137*2  = 276

In [31]:
137*2, 277-5

(274, 272)

In [32]:
pileup_K562_Zoff_noFilter_pileup_df[pileup_K562_Zoff_noFilter_pileup_df['start'] == 206583387]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
69,chr1,206583387,206583388,m,710,+,206583387,206583388,25500,710,8.03,57,653,0,13,65,52,60


In [33]:
pileup_K562_Zoff_noFilter_pileup_df[pileup_K562_Zoff_noFilter_pileup_df['start'] == 206583388]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
70,chr1,206583388,206583389,m,426,-,206583388,206583389,25500,426,14.79,63,363,0,46,59,25,211


In [34]:
pileup_K562_Zoff_noFilter_pileup_df[pileup_K562_Zoff_noFilter_pileup_df['start'] == 206589746]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
342,chr1,206589746,206589747,m,586,-,206589746,206589747,25500,586,97.95,574,12,0,64,26,35,57


In [None]:
(279-69) / 2

137.0

In [44]:
pileup_K562_Zoff_noFilter_pileup_df_roi = pileup_K562_Zoff_noFilter_pileup_df.iloc[69:343, :]  # Display target region rows
print(pileup_K562_Zoff_noFilter_pileup_df_roi.shape, pileup_K562_Zoff_noFilter_pileup_df_roi.shape[0]/2)
pileup_K562_Zoff_noFilter_pileup_df_roi

(274, 18) 137.0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
69,chr1,206583387,206583388,m,710,+,206583387,206583388,25500,710,8.03,57,653,0,13,65,52,60
70,chr1,206583388,206583389,m,426,-,206583388,206583389,25500,426,14.79,63,363,0,46,59,25,211
71,chr1,206583707,206583708,m,679,+,206583707,206583708,25500,679,11.93,81,598,0,39,26,24,135
72,chr1,206583708,206583709,m,545,-,206583708,206583709,25500,545,9.36,51,494,0,23,19,45,135
73,chr1,206583766,206583767,m,629,+,206583766,206583767,25500,629,25.12,158,471,0,30,17,51,176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,chr1,206589213,206589214,m,661,-,206589213,206589214,25500,661,93.04,615,46,0,28,31,9,41
339,chr1,206589436,206589437,m,830,+,206589436,206589437,25500,830,89.64,744,86,0,16,11,15,32
340,chr1,206589437,206589438,m,675,-,206589437,206589438,25500,675,90.52,611,64,0,12,13,31,39
341,chr1,206589745,206589746,m,722,+,206589745,206589746,25500,722,99.45,718,4,0,61,22,30,66


<!>
> Threshold of  0.7597656 for base C is low. Consider increasing the filter-percentile or specifying a higher threshold.
> Done, processed 11762972 rows. Processed ~129977 reads and skipped ~150 reads.

In [45]:
# Plot pileup_Unedit_day35_df_roi summary plots (use existing variables/imports in the notebook)
# Saves interactive HTMLs to pileup_data_folder_path and displays inline.

pileup_K562_Zoff_noFilter_pileup_df_roi_stats = plot_pileup_roi_df(df_roi=pileup_K562_Zoff_noFilter_pileup_df_roi, out_dir=pileup_data_folder_path)
pileup_K562_Zoff_noFilter_pileup_df_roi_stats




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

ROI rows: 274
Percent modified: median=70.49, mean=66.98
Coverage (Nvalid_cov): min=216, median=674.0, max=869


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,710,+,206583387,206583388,25500,710,8.03,57,653,0,13,65,52,60,206583387
chr1,206583388,206583389,m,426,-,206583388,206583389,25500,426,14.79,63,363,0,46,59,25,211,206583388
chr1,206583707,206583708,m,679,+,206583707,206583708,25500,679,11.93,81,598,0,39,26,24,135,206583707
chr1,206583708,206583709,m,545,-,206583708,206583709,25500,545,9.36,51,494,0,23,19,45,135,206583708
chr1,206583766,206583767,m,629,+,206583766,206583767,25500,629,25.12,158,471,0,30,17,51,176,206583766
chr1,206583767,206583768,m,575,-,206583767,206583768,25500,575,37.04,213,362,0,20,83,23,66,206583767
chr1,206584104,206584105,m,696,+,206584104,206584105,25500,696,76.87,535,161,0,17,9,123,58,206584104
chr1,206584105,206584106,m,633,-,206584105,206584106,25500,633,73.3,464,169,0,43,19,23,50,206584105
chr1,206584137,206584138,m,864,+,206584137,206584138,25500,864,86.57,748,116,0,2,24,3,10,206584137
chr1,206584138,206584139,m,540,-,206584138,206584139,25500,540,81.11,438,102,0,1,224,1,2,206584138


ROI rows: 274
Percent modified: median=70.49, mean=66.98
Coverage (Nvalid_cov): min=216, median=674.0, max=869


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,710,+,206583387,206583388,25500,710,8.03,57,653,0,13,65,52,60,206583387
chr1,206583388,206583389,m,426,-,206583388,206583389,25500,426,14.79,63,363,0,46,59,25,211,206583388
chr1,206583707,206583708,m,679,+,206583707,206583708,25500,679,11.93,81,598,0,39,26,24,135,206583707
chr1,206583708,206583709,m,545,-,206583708,206583709,25500,545,9.36,51,494,0,23,19,45,135,206583708
chr1,206583766,206583767,m,629,+,206583766,206583767,25500,629,25.12,158,471,0,30,17,51,176,206583766
chr1,206583767,206583768,m,575,-,206583767,206583768,25500,575,37.04,213,362,0,20,83,23,66,206583767
chr1,206584104,206584105,m,696,+,206584104,206584105,25500,696,76.87,535,161,0,17,9,123,58,206584104
chr1,206584105,206584106,m,633,-,206584105,206584106,25500,633,73.3,464,169,0,43,19,23,50,206584105
chr1,206584137,206584138,m,864,+,206584137,206584138,25500,864,86.57,748,116,0,2,24,3,10,206584137
chr1,206584138,206584139,m,540,-,206584138,206584139,25500,540,81.11,438,102,0,1,224,1,2,206584138


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
69,chr1,206583387,206583388,m,710,+,206583387,206583388,25500,710,...,0,13,65,52,60,206583387,206583387:+,710,8.028169,91.971831
70,chr1,206583388,206583389,m,426,-,206583388,206583389,25500,426,...,0,46,59,25,211,206583388,206583388:-,426,14.788732,85.211268
71,chr1,206583707,206583708,m,679,+,206583707,206583708,25500,679,...,0,39,26,24,135,206583707,206583707:+,679,11.929308,88.070692
72,chr1,206583708,206583709,m,545,-,206583708,206583709,25500,545,...,0,23,19,45,135,206583708,206583708:-,545,9.357798,90.642202
73,chr1,206583766,206583767,m,629,+,206583766,206583767,25500,629,...,0,30,17,51,176,206583766,206583766:+,629,25.119237,74.880763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,chr1,206589213,206589214,m,661,-,206589213,206589214,25500,661,...,0,28,31,9,41,206589213,206589213:-,661,93.040847,6.959153
339,chr1,206589436,206589437,m,830,+,206589436,206589437,25500,830,...,0,16,11,15,32,206589436,206589436:+,830,89.638554,10.361446
340,chr1,206589437,206589438,m,675,-,206589437,206589438,25500,675,...,0,12,13,31,39,206589437,206589437:-,675,90.518519,9.481481
341,chr1,206589745,206589746,m,722,+,206589745,206589746,25500,722,...,0,61,22,30,66,206589745,206589745:+,722,99.445983,0.554017


# ===============================
# dmr modkit K562 ZFPoff vs T cells CRoff 

In [47]:
pileup_data_folder_path

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortLOW/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/'

In [48]:
K562_Zoff_noFilter_bed

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed'

In [None]:
%%bash

pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/"
K562_Zoff_noFilter_bed=${pileup_data_folder_path}"20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed"

echo "K562_Zoff_noFilter_bed: ${K562_Zoff_noFilter_bed}.gz"

ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${K562_Zoff_noFilter_bed}.gz"


chmod u+rwx "${pileup_data_folder_path}"
chmod u+rwx "${K562_Zoff_noFilter_bed}.gz"


ls -ld "${pileup_data_folder_path}"
ls -ld "${K562_Zoff_noFilter_bed}.gz"


K562_Zoff_noFilter_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz
20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed
20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz
20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz.tbi
drwxrwxr-x 2 michalula michalula 4096 Nov 18 10:22 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/
-rw-rw-r-- 1 michalula michalula 9546 Nov 18 10:22 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz


In [52]:
repl2b_pileup_CROFF_day35_bed

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed'

In [56]:
%%bash

pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/"
repl2b_pileup_CROFF_day35_bed=${pileup_data_folder_path}"20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed"

echo "repl2b_pileup_CROFF_day35_bed: ${repl2b_pileup_CROFF_day35_bed}.gz"


ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${repl2b_pileup_CROFF_day35_bed}.gz"


chmod u+rwx "${pileup_data_folder_path}"
chmod u+rwx "${repl2b_pileup_CROFF_day35_bed}.gz"


ls -ld "${pileup_data_folder_path}"
ls -ld "${repl2b_pileup_CROFF_day35_bed}.gz"


repl2b_pileup_CROFF_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz
20251110_repl2b_filtered_mC07_pileup_CROFF_Day35_Tcells.bed
20251110_repl2b_filtered_mC07_pileup_CROFF_Day35_Tcells.bed.gz
20251110_repl2b_filtered_mC07_pileup_CROFF_Day35_Tcells.bed.gz.tbi
20251110_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed
20251110_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz
20251110_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz.tbi
20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed
20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz
20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz.tbi
drwxrwxr-x 2 michalula michalula 4096 Nov 18 10:20 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_rea

In [62]:
%%bash
 
# 3. Detecting differential modification at single base positions
# The modkit dmr pair command has the ability to score individual bases (e.g. differentially methylated CpGs). To run single-base analysis on one or more paired samples, simply omit the --regions (-r) option when running modkit dmr pair. When performing single-base analysis the likelihood ratio score and a MAP-based p-value are available. For details on the likelihood ratio score and the MAP-based p-value, see the scoring details section. For example the above command becomes:
date_today="20251118"

experiment_codition="K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/"
dmr_result=${dmr_output_path}${date_today}"_single_base_"${experiment_codition}".bed"

echo "pileup_Unedit_day35_bed: ${pileup_Unedit_day35_bed}.gz"
ls -l "${pileup_Unedit_day35_bed}.gz"

pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/"
K562_Zoff_noFilter_bed=${pileup_data_folder_path}"20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed"
echo "K562_Zoff_noFilter_bed: ${K562_Zoff_noFilter_bed}.gz"
ls -l "${K562_Zoff_noFilter_bed}.gz"


pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/"
repl2b_pileup_CROFF_day35_bed=${pileup_data_folder_path}"20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed"
echo "repl2b_pileup_CROFF_day35_bed: ${repl2b_pileup_CROFF_day35_bed}.gz"
ls -l "${repl2b_pileup_CROFF_day35_bed}.gz"


ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"

threads=32
 
cd ${dmr_output_path}

modkit dmr pair \
  -a ${K562_Zoff_noFilter_bed}.gz \
  -b ${repl2b_pileup_CROFF_day35_bed}.gz \
  -o ${dmr_result} \
  --ref ${ref_genome_fa} \
  --base C \
  --threads ${threads} \
  --log-filepath dmr.log


echo "dmr_result: $dmr_result"
ls -lah $dmr_result

pileup_Unedit_day35_bed: .gz


ls: cannot access '.gz': No such file or directory


K562_Zoff_noFilter_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz
-rwxrw-r-- 1 michalula michalula 9546 Nov 18 10:22 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/K562/post_sort/ZFPoff_sortHIGH/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_K562_ZoffHigh_v_T_Coff_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz
repl2b_pileup_CROFF_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output/new_dmr_pileup/20251118_repl2b_noFilter_mC07_pileup_CROFF_Day35_Tcells.bed.gz
-rwxrw-r-- 1 michalula michalula 6545 Nov 18 10:20 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_2b/analyze_single_reads/dimelo_v2_output

[0;32m>[0m reading reference FASTA at "/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
[0;32m>[0m 1 common sequence(s) between FASTA and both samples
[0;32m>[0m running single-site analysis
[0;32m>[0m using default prior, Beta(α: 0.55, β: 0.55)
[0;32m>[0m estimating max coverages from data
[0;32m>[0m sampled 444 a records and 286 b records, calculating max coverages for 95th percentile
[0;32m>[0m calculated max coverage for a: 804 and b: 157
[0;32m>[0m calculated max coverage 804 is greater than maximum allowed (100), setting to 100
[0;32m>[0m calculated max coverage 157 is greater than maximum allowed (100), setting to 100
[0;31;1m>[0m errors:
+--------------------------+-------+
| error                    | count |
+--------------------------+-------+
| missing-in-one-condition | 111   |
+--------------------------+-------+

[0;32m>[0m finished, processed 286 sites successfully, 111 failed


dmr_result: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/20251118_single_base_K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07.bed
-rw-rw-r-- 1 michalula michalula 56K Nov 18 10:56 /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/20251118_single_base_K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07.bed


In [63]:
%%bash

date_today="20251118"

experiment_codition="K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/"

# "/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_noFilter/rel12_croff_dmr_output/"
dmr_result=${dmr_output_path}${date_today}"_single_base_"${experiment_codition}".bed"

echo "dmr_result: $dmr_result"
# ls -lah $dmr_result
ls -lah $dmr_output_path
# cat $dmr_result

dmr_result: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/20251118_single_base_K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07.bed
total 1.2M
drwxrwxr-x 2 michalula michalula 4.0K Nov 18 10:56 .
drwxrwxr-x 4 michalula michalula 4.0K Nov 18 10:55 ..
-rw-rw-r-- 1 michalula michalula 1.1M Nov 18 10:36 20251118_K562_ZFPoff_T_CRoff_d35_noFilter_mC07_dmr_modkit.ipynb
-rw-rw-r-- 1 michalula michalula  56K Nov 18 10:56 20251118_single_base_K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07.bed
-rw-rw-r-- 1 michalula michalula 7.4K Nov 18 10:56 dmr.log
-rw-rw-r-- 1 michalula michalula 5.5K Nov 18 10:22 log.txt


In [64]:
pwd

'/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff'

## modkit dmr explore output

In [65]:

date_today="20251118"

experiment_codition="K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/"

# dmr_result=dmr_output_path+date_today+"_single_base_noFilter_mC07_"+experiment_codition+".bed"
dmr_path="/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/20251118_single_base_K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07.bed"

# "/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_noFilter/new_dmr_output/20251109_single_base_noFilter_mC07_day35_CRoff_vs_Unedit.bed"
dmr_path

'/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/20251118_single_base_K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07.bed'

In [66]:
dmr_path

'/home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/20251118_single_base_K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07.bed'

In [67]:
# Read DMR BED (robust to header/no-header) and assign canonical column names (uses existing vars: dmr_path, out_dir, date_today, pd, os)
canonical_cols = [
    "chrom", "start", "end", "name", "score", "strand",
    "samplea_counts", "samplea_total", "sampleb_counts", "sampleb_total",
    "samplea_percents", "sampleb_percents",
    "samplea_fraction_modified", "sampleb_fraction_modified",
    "map_pvalue", "effect_size",
    "cohen_h", "cohen_h_low", "cohen_h_high",
]
    # "balanced_map_pvalue", "balanced_effect_size"

# read file with header and fallback to header=None when headers look numeric or columns are unexpected
try:
    dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")

    # dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", engine="python") # , header=0
    # # heuristic: if too many numeric-looking column names, re-read as headerless
    # numeric_headers = sum(1 for c in dmr_df.columns if str(c).strip().isdigit())
    # if numeric_headers >= (len(dmr_df.columns) / 2) or dmr_df.shape[1] < 3:
    #     dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")
except Exception:
    dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")

# assign canonical names up to number of columns present, add generic names for extras
ncols = dmr_df.shape[1]
if ncols <= len(canonical_cols):
    dmr_df.columns = canonical_cols[:ncols]
else:
    extras = [f"col_{i}" for i in range(ncols - len(canonical_cols))]
    dmr_df.columns = canonical_cols + extras

# coerce obvious numeric columns to numeric where present
num_cols_to_try = [
    "start", "end", "score",
    "samplea_total", "sampleb_total",
    "samplea_fraction_modified", "sampleb_fraction_modified",
    "map_pvalue", "effect_size",
    "balanced_map_pvalue", "balanced_effect_size"
]
for c in num_cols_to_try:
    if c in dmr_df.columns:
        dmr_df[c] = pd.to_numeric(dmr_df[c], errors="coerce")

# ensure output directory exists and save parsed table (parquet preferred)
os.makedirs(dmr_output_path, exist_ok=True)
parsed_path = os.path.join(dmr_output_path, f"{date_today}_dmr_parsed.parquet")
try:
    dmr_df.to_parquet(parsed_path, index=False)
    print("Saved parquet:", parsed_path)
except Exception:
    csv_path = os.path.join(dmr_output_path, f"{date_today}_dmr_parsed.csv")
    dmr_df.to_csv(csv_path, index=False)
    print("Parquet not available, saved csv:", csv_path)

print("Loaded DMR:", dmr_path)
print("Assigned columns:", dmr_df.columns.tolist())
print("Shape:", dmr_df.shape)
dmr_df.head()

Parquet not available, saved csv: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/20251118_dmr_parsed.csv
Loaded DMR: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/20251118_single_base_K562_ZFPoff_vs_T_CRoff_repl2_day35_noFilter_mC07.bed
Assigned columns: ['chrom', 'start', 'end', 'name', 'score', 'strand', 'samplea_counts', 'samplea_total', 'sampleb_counts', 'sampleb_total', 'samplea_percents', 'sampleb_percents', 'samplea_fraction_modified', 'sampleb_fraction_modified', 'map_pvalue', 'effect_size', 'cohen_h', 'cohen_h_low', 'cohen_h_high']
Shape: (286, 19)


Unnamed: 0,chrom,start,end,name,score,strand,samplea_counts,samplea_total,sampleb_counts,sampleb_total,samplea_percents,sampleb_percents,samplea_fraction_modified,sampleb_fraction_modified,map_pvalue,effect_size,cohen_h,cohen_h_low,cohen_h_high
0,chr1,206583089,206583090,.,3.218876,+,m:0,12,m:1,1,m:0.00,m:100.00,0.0,1.0,0.039828,-1.0,-3.141593,1.101597,5.181588
1,chr1,206583090,206583091,.,1.860752,-,m:1,12,m:1,1,m:8.33,m:100.00,0.083333,1.0,0.129896,-0.916667,-2.555907,0.515912,4.595902
2,chr1,206583173,206583174,.,-0.194336,+,m:496,572,m:108,122,m:86.71,m:88.52,0.867133,0.885246,1.0,-0.02,-0.055028,-0.140428,0.250484
3,chr1,206583174,206583175,.,2.189273,-,m:610,664,m:86,88,m:91.87,m:97.73,0.918675,0.977273,0.193191,-0.057273,-0.275715,0.053368,0.498062
4,chr1,206583387,206583388,.,151.028229,+,m:57,710,m:103,125,m:8.03,m:82.40,0.080282,0.824,0.0,-0.74,-1.701201,1.51109,1.891312


In [68]:
import os
from IPython.display import display, HTML

# Visualize all columns from dmr_df and save interactive HTMLs to out_dir
import plotly.express as px
import plotly.graph_objects as go

out_dir = dmr_output_path
print("out_dir: ", out_dir)

os.makedirs(out_dir, exist_ok=True)

# Save a table summary
summary = dmr_df.describe(include='all').transpose()
summary_path = os.path.join(out_dir, f"{date_today}_dmr_column_summary.csv")
summary.to_csv(summary_path)

numcols = dmr_df.select_dtypes(include=['number']).columns.tolist()

def _safe_name(name):
    return str(name).replace(os.sep, "_").replace(" ", "_").replace("\t", "_")

# Per-column visualizations
for col in dmr_df.columns:
    safe = _safe_name(col)
    try:
        if col in numcols:
            # Histogram
            fig_h = px.histogram(dmr_df, x=col, nbins=80, title=f"Histogram: {col}")
            # fig_h.write_html(os.path.join(out_dir, f"{date_today}_dmr_hist_{safe}.html"), include_plotlyjs='cdn')
            fig_h.show()

            # Boxplot
            fig_b = px.box(dmr_df, y=col, points="outliers", title=f"Boxplot: {col}")
            # fig_b.write_html(os.path.join(out_dir, f"{date_today}_dmr_box_{safe}.html"), include_plotlyjs='cdn')
            fig_b.show()
        else:
            # Categorical / text: show top value counts (up to 50)
            vc = dmr_df[col].fillna("NA").astype(str).value_counts().head(50)
            if len(vc):
                fig_c = px.bar(x=vc.values[::-1], y=vc.index.astype(str)[::-1], orientation='h',
                               title=f"Top value counts: {col}", labels={'x':'count','y':col})
                fig_c.update_layout(yaxis={'categoryorder':'array','categoryarray':vc.index[::-1].astype(str).tolist()})
                # fig_c.write_html(os.path.join(out_dir, f"{date_today}_dmr_valcounts_{safe}.html"), include_plotlyjs='cdn')
                fig_c.show()
            else:
                # fallback: display empty info
                display(HTML(f"<b>{col}</b>: no values to plot"))
    except Exception as e:
        print(f"Skipped plotting column {col!r} due to error: {e}")

# Correlation heatmap for numeric columns
if len(numcols) >= 2:
    try:
        corr = dmr_df[numcols].corr()
        fig_corr = px.imshow(corr, text_auto=True, aspect="auto", title="Correlation matrix (numeric columns)")
        # fig_corr.write_html(os.path.join(out_dir, f"{date_today}_dmr_correlation_numeric.html"), include_plotlyjs='cdn')
        fig_corr.show()
    except Exception as e:
        print("Failed to create correlation heatmap:", e)

print("Saved summary:", summary_path)
print("Plots saved to:", out_dir)

out_dir:  /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/


Saved summary: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/20251118_dmr_column_summary.csv
Plots saved to: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/


In [69]:
# Select significant CG pairs from DMR results and plot them (new cell at index 69).
# Uses existing notebook variables: dmr_df (parsed modkit dmr), df_roi_stats (pileup ROI stats),
# plotly (px) and out_dir/dmr_folder_path for saving. Does not re-import modules.

# Parameters
pvalue_thresh = 0.05

# ensure numeric columns
dmr_df['map_pvalue'] = pd.to_numeric(dmr_df['map_pvalue'], errors='coerce')
dmr_df['effect_size'] = pd.to_numeric(dmr_df['effect_size'], errors='coerce')
dmr_df['samplea_fraction_modified'] = pd.to_numeric(dmr_df['samplea_fraction_modified'], errors='coerce')
dmr_df['sampleb_fraction_modified'] = pd.to_numeric(dmr_df['sampleb_fraction_modified'], errors='coerce')

# filter significant by MAP-based p-value
sig = dmr_df[dmr_df['map_pvalue'] <= pvalue_thresh].copy()

dmr_df['map_pval_less005'] = dmr_df['map_pvalue'] <= 0.05


# # restrict to ROI positions if df_roi_stats exists
# if 'df_roi_stats' in globals():
#     roi_positions = set(df_roi_stats['start'].astype(int).tolist())
#     sig = sig[sig['start'].isin(roi_positions)].copy()

# quick exit if none
if sig.shape[0] == 0:
    print(f"No significant CG pairs found in ROI at map_pvalue <= {pvalue_thresh}")
else:
    # add convenience cols
    sig['pos'] = sig['start'].astype(str)
    sig['a_perc'] = sig['samplea_fraction_modified'] * 100
    sig['b_perc'] = sig['sampleb_fraction_modified'] * 100
    sig['total_reads'] = sig.get('samplea_total', 0).fillna(0).astype(int) + sig.get('sampleb_total', 0).fillna(0).astype(int)

    # save a table of significant sites
    os.makedirs(out_dir, exist_ok=True)
    sig_table_path = os.path.join(out_dir, f"dmr_significant_p{pvalue_thresh:.3f}_roi.tsv")
    sig.to_csv(sig_table_path, sep='\t', index=False)
    print("Saved significant sites table:", sig_table_path)
    display(sig[['chrom','start','end','strand','map_pvalue','effect_size','a_perc','b_perc','total_reads']].reset_index(drop=True))

    # plot the map_pval_less005 distribution which corresponds to significant sites
    fig_mappval_hist = px.histogram(
        dmr_df,
        x='map_pval_less005',
        nbins=80,
        title=f"MAP-based p-value distribution (highlighting p <= {pvalue_thresh}) <br>{experiment_codition}",
        labels={'map_pval_less005':'MAP-based p-value'}
    )
    fig_mappval_hist.update_layout(height=520)
    mappval_hist_path = os.path.join(out_dir, f"dmr_map_pval_distribution.html")
    # fig_mappval_hist.write_html(mappval_hist_path, include_plotlyjs='cdn')
    fig_mappval_hist.show()
    # print("Saved MAP-based p-value distribution histogram:", mappval_hist_path)

    # plot the percent of significant sites where map_pvalue <= pvalue_thresh is colored red, others blue (color not working)
    # Check https://plotly.com/python/pie-charts/ for coloring instructions
    percent_significant = (sig.shape[0] / dmr_df.shape[0]) * 100
    fig_mappval_pie = px.pie(
        dmr_df,
        names=['Not Significant (p > {})'.format(pvalue_thresh), 'Significant (p <= {})'.format(pvalue_thresh)],
        values=[dmr_df.shape[0] - sig.shape[0],sig.shape[0]],
        title=f"Percentage of significant CGs (map_pvalue <= {pvalue_thresh}): {percent_significant:.2f}% <br>{experiment_codition}",
        # color_discrete_map={'Not Significant (p > {})'.format(pvalue_thresh): 'blue',
        #                     'Significant (p <= {})'.format(pvalue_thresh): 'red'},
        # colors=['blue','red']
    )
    fig_mappval_pie.update_layout(height=520)
    mappval_pie_path = os.path.join(out_dir, f"dmr_map_pval_percentage.html")
    # fig_mappval_pie.write_html(mappval_pie_path, include_plotlyjs='cdn')
    fig_mappval_pie.show()
    # print("Saved MAP-based p-value percentage pie chart:", mappval_pie_path)


    # plot effect size distribution of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh
    fig_effectsize_hist = px.histogram(
        dmr_df,
        x='effect_size',
        nbins=80,
        # add a line break and write experiment_codition into the title
        title=f"Effect size distribution (highlighting significant sites with map_pvalue <= {pvalue_thresh})<br>{experiment_codition}",
        color=(dmr_df['map_pvalue'] <= pvalue_thresh), 
        color_discrete_map={True: 'red', False: 'blue'},
    )
            # labels={'effect_size':'Effect size (A - B)'}

    fig_effectsize_hist.update_layout(height=520)
    effectsize_hist_path = os.path.join(out_dir, f"dmr_effect_size_distribution.html")
    # fig_effectsize_hist.write_html(effectsize_hist_path, include_plotlyjs='cdn')
    fig_effectsize_hist.show()
    # print("Saved effect size distribution histogram:", effectsize_hist_path)        

    # plot effect sizes of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh 
    # add color legend names as 'Significant: map_pvalue <= pvalue_thresh' and 'Not Significant: map_pvalue > pvalue_thresh'  
    fig_effectsize_scatter = px.scatter(
        dmr_df,
        x=dmr_df.index,
        y='effect_size',
        color_discrete_map={True: 'red', False: 'blue'},
        color=(dmr_df['map_pvalue'] <= pvalue_thresh), 
        labels={'effect_size':'Effect size (A - B)','index':'Index',
                'color':f'Significant: map_pvalue <= {pvalue_thresh}'},
        title=f"Effect sizes for all CGs (highlighting significant sites with map_pvalue <= {pvalue_thresh}) <br>{experiment_codition}",
    )
    fig_effectsize_scatter.update_layout(height=520)
    effectsize_scatter_path = os.path.join(out_dir, f"dmr_effect_size_scatter.html")
    # fig_effectsize_scatter.write_html(effectsize_scatter_path, include_plotlyjs='cdn')
    fig_effectsize_scatter.show()
    # print("Saved effect size scatter plot:", effectsize_scatter_path)       

    # bar plot effect sizes of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh 
    fig_effectsize_bar = px.bar(
        dmr_df,
        x=dmr_df.index,         
        y='effect_size',
        color=(dmr_df['map_pvalue'] <= pvalue_thresh),
        labels={'effect_size':'Effect size (A - B)','index':'Index',
                'color':f'Significant: map_pvalue <= {pvalue_thresh}'},
        color_discrete_map={True: 'red', False: 'blue'},
        title=f"Effect sizes for all CGs (n={len(dmr_df)}) (highlighting significant sites with map_pvalue <= {pvalue_thresh}) <br>{experiment_codition}",
    )
    fig_effectsize_bar.update_layout(height=520)
    effectsize_bar_path = os.path.join(out_dir, f"dmr_effect_size_bar.html")
    # fig_effectsize_bar.write_html(effectsize_bar_path, include_plotlyjs='cdn')
    fig_effectsize_bar.show()
    # print("Saved effect size bar plot:", effectsize_bar_path) 


    # Bar: effect size per position (without sorting) with effect size colors 
    # dmr_df['label'] = dmr_df['pos'] + ":" + dmr_df['strand'].astype(str)
    fig_bar_unsorted = px.bar(
        dmr_df,
        x=dmr_df.index, 
        y='effect_size',        
        color='effect_size',
        title=f"Effect size for all CGs (n={len(dmr_df)}) <br>{experiment_codition}",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
            # hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
            # x='label',

    fig_bar_unsorted.update_layout(xaxis_tickangle=45, height=520)
    bar_unsorted_path = os.path.join(out_dir, f"dmr_sig_effectsize_unsorted_p{pvalue_thresh:.3f}.html")
    # fig_bar_unsorted.write_html(bar_unsorted_path, include_plotlyjs='cdn')
    fig_bar_unsorted.show()
    # print("Saved unsorted effect-size bar plot:", bar_unsorted_path)    
        
    # Bar: effect size per position (without sorting)
    sig['label'] = sig['pos'] + ":" + sig['strand'].astype(str)
    fig_bar_unsorted = px.bar(
        sig,
        x='label',
        y='effect_size',        
        color='effect_size',
        hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
        title=f"Effect size for significant CGs (n={len(sig)}) with map_pvalue <= {pvalue_thresh} <br>{experiment_codition}",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
    fig_bar_unsorted.update_layout(xaxis_tickangle=45, height=520)
    bar_unsorted_path = os.path.join(out_dir, f"dmr_sig_effectsize_unsorted_p{pvalue_thresh:.3f}.html")
    # fig_bar_unsorted.write_html(bar_unsorted_path, include_plotlyjs='cdn')
    fig_bar_unsorted.show()
    # print("Saved unsorted effect-size bar plot:", bar_unsorted_path)    
        

    # Bar: effect size per position (sorted)
    sig_sorted = sig.sort_values('effect_size', ascending=False).copy()
    sig_sorted['label'] = sig_sorted['pos'] + ":" + sig_sorted['strand'].astype(str)
    fig_bar = px.bar(
        sig_sorted,
        x='label',
        y='effect_size',
        color='effect_size',
        hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
        title=f"Effect size for significant CGs (n={len(sig_sorted)}) <br>{experiment_codition}",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
    fig_bar.update_layout(xaxis_tickangle=45, height=520)
    bar_path = os.path.join(out_dir, f"dmr_sig_effectsize_p{pvalue_thresh:.3f}.html")
    # fig_bar.write_html(bar_path, include_plotlyjs='cdn')
    fig_bar.show()
    # print("Saved effect-size bar plot:", bar_path)



    # Scatter: sample A vs sample B percent modified (size = total reads, color = effect size)
    fig_scatter = px.scatter(
        sig,
        x='a_perc',
        y='b_perc',
        color='effect_size',
        size='total_reads',
        hover_data=['pos','start','map_pvalue','effect_size','cohen_h'],
        title=f"Significant CGs (map_pvalue <= {pvalue_thresh}) — sample A vs B percent modified <br>{experiment_codition}",
        labels={'a_perc':'Sample A % modified','b_perc':'Sample B % modified'}
    )
    fig_scatter.update_layout(height=520)
    scatter_path = os.path.join(out_dir, f"dmr_sig_scatter_p{pvalue_thresh:.3f}.html")
    # fig_scatter.write_html(scatter_path, include_plotlyjs='cdn')
    fig_scatter.show()
    # print("Saved scatter plot:", scatter_path)


Saved significant sites table: /home/michalula/code/epiCausality/epiCode/modkit_differential_methyl/K562_v_T/mc_07/notFiltered/K562_Zoff_v_T_Coff/dmr_significant_p0.050_roi.tsv


Unnamed: 0,chrom,start,end,strand,map_pvalue,effect_size,a_perc,b_perc,total_reads
0,chr1,206583089,206583090,+,3.982798e-02,-1.000000,0.000000,100.000000,13
1,chr1,206583387,206583388,+,0.000000e+00,-0.740000,8.028169,82.400000,835
2,chr1,206583388,206583389,-,0.000000e+00,-0.699315,14.788732,84.931505,499
3,chr1,206583707,206583708,+,0.000000e+00,-0.810000,11.929308,92.810460,832
4,chr1,206583708,206583709,-,0.000000e+00,-0.830000,9.357798,92.000000,620
...,...,...,...,...,...,...,...,...,...
114,chr1,206588704,206588705,+,3.322291e-05,-0.310000,45.392954,76.388890,882
115,chr1,206588705,206588706,-,8.328278e-07,-0.372326,43.023255,80.232560,688
116,chr1,206588908,206588909,+,1.674631e-06,-0.260000,69.860893,96.212120,779
117,chr1,206588909,206588910,-,3.213615e-08,-0.342025,62.307690,96.202534,599


# TODO: check
- are there really NO diffs between the reads selected with the mC > 70 and mC > 99.5% filtering ??

could be as the mC calles are automatically selected

and in the CRoff the auto threshold 
* in mC > 70 was to 0.79
> Using filter threshold 0.7910156 for C.
* in mC > 99.5 was to  0.79
> Using filter threshold 0.7910156 for C.


and in the Unediter the auto threshold 
* in mC > 70 was to 0.8496
> Using filter threshold 0.8496094 for C.
* in mC > 99.5 was to 0.849
> Using filter threshold 0.8496094 for C.


SAME per condition AUTOMATIC modkit filtering threshold per mC run were set

(NOT 0.995 and not 0.7)

# TODO: check
- are there really NO diffs between the reads selected with the mC > 70 and mC > 99.5% filtering ??

could be as the mC calles are automatically selected

and in the CRoff the auto threshold 
* in mC > 70 was to 0.79
> Using filter threshold 0.7910156 for C.
* in mC > 99.5 was to  0.79
> Using filter threshold 0.7910156 for C.


and in the Unediter the auto threshold 
* in mC > 70 was to 0.8496
> Using filter threshold 0.8496094 for C.
* in mC > 99.5 was to 0.849
> Using filter threshold 0.8496094 for C.


SAME per condition AUTOMATIC modkit filtering threshold per mC run were set

(NOT 0.995 and not 0.7)