# Modkit dmr
## Use my Filtered Reads

In [6]:
from datetime import datetime

def current_time():
    """Returns the current date and time as a formatted string."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S") 

print("Current Date and Time:", current_time())

Current Date and Time: 2025-11-10 16:23:03


Based on:
https://nanoporetech.github.io/modkit/intro_dmr.html#perform-differential-methylation-scoring

Select kernal: dimelo_v2_modkit_parsing

Preparing the input data
The inputs to all modkit dmr commands are two or more bedMethyl files (created by modkit pileup) that have been compressed with bgzip and indexed with tabix. An example of how to generate the input data is shown below:


ref=grch38.fasta
threads=32

norm=normal_sample.bam
norm_pileup=normal_pileup.bed

modkit pileup ${norm} ${norm_pileup} \
  --cpg \
  --ref ${ref} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${norm_pileup}
tabix -p bed ${norm_pileup}.gz

# pileup and compression can also be done in one step
tumor=tumor_sample.bam
tumor_pileup=tumor_pileup.bed.gz

modkit pileup ${tumor} - \
  --cpg \
  --ref ${ref} \
  --threads ${threads} \
  --log-filepath log.txt | ${bgzip} -c > ${tumor_pileup}

tabix -p bed ${tumor_pileup}

In [7]:
%%bash
echo "hello"

hello


# Use the NEW modkit latest installed version in ipython kernel modkit_new

In [8]:
# ! python3 -m ipykernel install --user --name=modkit_new --display-name "modkit_new Python"
# ! which modkit

In [9]:
import os
os.environ["PATH"] = "/home/michalula/.cargo/bin:" + os.environ["PATH"]
! which modkit
! modkit --version

/home/michalula/.cargo/bin/modkit
modkit 0.5.1


In [10]:
# ! modkit

In [11]:
! modkit --version 

modkit 0.5.1


In [12]:
import os
import pandas as pd

def load_pileup_bed(bed_path):
    # bed_path = existing[0]
    print("Reading bedMethyl file:", bed_path)

    # bedMethyl column names (18 columns as provided)
    colnames = [
        "chrom", "start", "end", "mod_code", "score", "strand",
        "start2", "end2", "color",
        "Nvalid_cov", "percent_modified", "Nmod", "Ncanonical",
        "Nother_mod", "Ndelete", "Nfail", "Ndiff", "Nnocall"
    ]

    # Configure dtypes where reasonable
    dtypes = {
        "chrom": str,
        "start": "Int64",
        "end": "Int64",
        "mod_code": str,
        "score": "Int64",
        "strand": str,
        "start2": "Int64",
        "end2": "Int64",
        "color": str,
        "Nvalid_cov": "Int64",
        "percent_modified": float,
        "Nmod": "Int64",
        "Ncanonical": "Int64",
        "Nother_mod": "Int64",
        "Ndelete": "Int64",
        "Nfail": "Int64",
        "Ndiff": "Int64",
        "Nnocall": "Int64"
    }

    compression = "gzip" if bed_path.endswith(".gz") else None

    # Read file (headerless BED-like table). If file has extra columns, keep them with automatic numeric conversion below.
    df = pd.read_csv(
        bed_path,
        sep="\t",
        header=None,
        comment="#",
        names=colnames,
        dtype=dtypes,
        compression=compression,
        engine="python",
        na_values=[".", "NA", ""],
        keep_default_na=True
    )

    # If file contained more than 18 columns, pandas assigned remaining data to extra columns named like col_18, col_19...
    # Ensure numeric conversion for numeric-like columns
    for c in df.columns:
        if df[c].dtype == object:
            # try safe numeric conversion where appropriate
            try:
                df[c] = pd.to_numeric(df[c], errors="ignore")
            except Exception:
                pass

    print("Loaded DataFrame shape:", df.shape)
    display(df.head())
    return df


In [13]:
import os
from IPython.display import display, HTML
from plotly import express as px
from plotly import graph_objects as go

# ! python3 -m pip install plotly
# ! python3 -m pip install matplotlib
# ! python3 -m pip install nbformat>=4.2.0

def plot_pileup_roi_df(df_roi, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    # ensure numeric types for plotting
    df_roi['pos'] = df_roi['start'].astype(int)
    df_roi['percent_modified'] = df_roi['percent_modified'].astype(float)
    df_roi['Nvalid_cov'] = df_roi['Nvalid_cov'].astype(int)
    df_roi['Nmod'] = df_roi['Nmod'].astype(int)
    df_roi['Ncanonical'] = df_roi['Ncanonical'].astype(int)

    # Scatter: genomic position vs percent modified (point size = coverage)
    fig1 = px.scatter(
        df_roi,
        x='pos',
        y='percent_modified',
        color='strand',
        size='Nvalid_cov',
        hover_data=['Nvalid_cov','Nmod','Ncanonical','Nother_mod','Nnocall'],
        title='Percent modified across ROI (size = Nvalid_cov)',
        height=500
    )
    fig1.update_layout(xaxis_title='Genomic position (start)', yaxis_title='Percent modified')
    fig1.show()
    # fig1.write_html(os.path.join(out_dir, "roi_percent_modified_scatter.html"), include_plotlyjs='cdn')

    # Histogram: coverage distribution
    fig2 = px.histogram(
        df_roi,
        x='Nvalid_cov',
        nbins=40,
        title='Distribution of Nvalid_cov (coverage) in ROI',
        height=400
    )
    fig2.update_layout(xaxis_title='Nvalid_cov', yaxis_title='Count')
    fig2.show()
    # fig2.write_html(os.path.join(out_dir, "roi_nvalidcov_hist.html"), include_plotlyjs='cdn')

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked)
    topn = 30
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod / Ncanonical)',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Print simple summaries
    print("ROI rows:", df_roi.shape[0])
    print("Percent modified: median={:.2f}, mean={:.2f}".format(df_roi['percent_modified'].median(), df_roi['percent_modified'].mean()))
    print("Coverage (Nvalid_cov): min={}, median={}, max={}".format(df_roi['Nvalid_cov'].min(), df_roi['Nvalid_cov'].median(), df_roi['Nvalid_cov'].max()))

    # Display first rows table for quick inspection
    display(HTML(df_roi.head(20).to_html(index=False)))


    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked)
    topn = 274
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod / Ncanonical)',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Print simple summaries
    print("ROI rows:", df_roi.shape[0])
    print("Percent modified: median={:.2f}, mean={:.2f}".format(df_roi['percent_modified'].median(), df_roi['percent_modified'].mean()))
    print("Coverage (Nvalid_cov): min={}, median={}, max={}".format(df_roi['Nvalid_cov'].min(), df_roi['Nvalid_cov'].median(), df_roi['Nvalid_cov'].max()))

    # Display first rows table for quick inspection
    display(HTML(df_roi.head(20).to_html(index=False)))

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    topn = 30
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig4 = go.Figure()
    fig4.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig4.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig4.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig4.show()
    # fig4.write_html(os.path.join(out_dir, "roi_top_sites_stacked_percentage.html"), include_plotlyjs='cdn')     


    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    topn = 277
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig4 = go.Figure()
    fig4.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig4.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig4.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig4.show()
    # fig4.write_html(os.path.join(out_dir, "roi_top_sites_stacked_percentage.html"), include_plotlyjs='cdn')     

    # Bar: Unsorted sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    df_top = df_roi.copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig5 = go.Figure()
    fig5.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig5.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig5.update_layout(barmode='stack', title=f'All sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig5.show()
    # fig5.write_html(os.path.join(out_dir, "roi_all_sites_stacked_percentage.html"), include_plotlyjs='cdn')    


    return df_top



In [14]:
! ls /home/michalula/data/ref_genomes/t2t_v2_0/

chm13v2.0.fa	  chm13v2.0.fa.fai		   haplotype_vcf
chm13v2.0.fa.amb  chm13v2.0.fa.pac		   up_chm13v2.0.fasta
chm13v2.0.fa.ann  chm13v2.0.fa.sa		   up_chm13v2.0.fasta.fai
chm13v2.0.fa.bwt  convert_to_uppercase_fasta.bash


# Pileups 
## for CRISPRoff filtered data for Day 35 

In [15]:
# ! ls /home/michalula/data/cas9_nanopore/data/20250908_nCATs_T_CRoff_Day_35/5mCG/to_t2t_v2_0/
# ! ls /home/michalula/data/cas9_nanopore/data/20250908_nCATs_T_CRoff_Day_35/5mCG/to_t2t_v2_0/

In [23]:
! ls /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/

CG_137_padded_reads_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.7_T2Tv2_0_chr1:206583354-206589854_2025-11-09_units_combined_numFWD104_numRVS222.npy
CG_137_padded_reads_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.995_T2Tv2_0_chr1:206583354-206589854_2025-11-09_units_combined_numFWD104_numRVS222.npy
extracted_reads
filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam.bai
filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
filtered_reads_overlap_MORE_than_0.9

In [24]:
%%bash
date_today="20251110"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
mkdir ${pileup_data_folder_path}

# mC > 99.5% = 0995 filtered 
# filtered_CROFF_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# mC >70% = 07 filtered 
# filtered_CROFF_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# pileup_CROFF_day35_bed=${pileup_data_folder_path}${date_today}"_filtered_mC07""_pileup_CROFF_Day35_Tcells.bed"

# Pre-filtered mC > 70%::
CROFF_day35_bam=${data_folder_path}"pre_filtered_ROI_reads_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_CROFF_day35_bed=${pileup_data_folder_path}${date_today}"_noFilter_repl1_mC07""_pileup_CROFF_Day35_Tcells.bed"


# use full data unfiltered .BAM file
# data_folder_path="/home/michalula/data/cas9_nanopore/data/20250908_nCATs_T_CRoff_Day_35/5mCG/to_t2t_v2_0/"
# CROFF_day35_bam=${data_folder_path}"sort_align_t2t_v2_0_trim_20250908_Day35_CROFF_Tcells_2Libraries_Minion_R9.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"
# pileup_CROFF_day35_bed=${data_folder_path}${date_today}"_full_data""_pileup_CROFF_Day35_Tcells.bed"


ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref= "/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"
threads=32


modkit pileup ${CROFF_day35_bam} ${pileup_CROFF_day35_bed} \
  --cpg \
  --ref ${ref_genome_fa} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${pileup_CROFF_day35_bed}
tabix -p bed ${pileup_CROFF_day35_bed}.gz

printf '%s\n' "filtered_CROFF_day35_bam: $filtered_CROFF_day35_bam"
printf '%s\n' "pileup_CROFF_day35_bed: $pileup_CROFF_day35_bed"
cat "$pileup_CROFF_day35_bed"

mkdir: cannot create directory ‘/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/new_pileup/’: File exists
[0;32m>[0m calculated chunk size: 48, interval size 100000, processing 4800000 positions concurrently
[0;32m>[0m filtering to only CpG motifs
[0;32m>[0m attempting to sample 10042 reads
[0;32m>[0m Using filter threshold 0.7910156 for C.


ERROR! Session/line number was not unique in database. History logging moved to new session 1344


[0;32m>[0m Done, processed 327 rows. Processed ~355 reads and skipped zero reads.


filtered_CROFF_day35_bam: 
pileup_CROFF_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_repl1_mC07_pileup_CROFF_Day35_Tcells.bed
chr1	206583089	206583090	m	4	+	206583089	206583090	255,0,0	4	75.00	3	1	0	0	0	0	0
chr1	206583090	206583091	m	2	-	206583090	206583091	255,0,0	2	100.00	2	0	0	0	0	0	0
chr1	206583173	206583174	m	152	+	206583173	206583174	255,0,0	152	91.45	139	13	0	6	26	17	7
chr1	206583174	206583175	m	126	-	206583174	206583175	255,0,0	126	95.24	120	6	0	1	2	1	10
chr1	206583387	206583388	m	176	+	206583387	206583388	255,0,0	176	76.14	134	42	0	1	17	10	8
chr1	206583388	206583389	m	107	-	206583388	206583389	255,0,0	107	85.05	91	16	0	6	14	6	9
chr1	206583707	206583708	m	197	+	206583707	206583708	255,0,0	197	92.89	183	14	0	2	4	6	3
chr1	206583708	206583709	m	117	-	206583708	206583709	255,0,0	117	96.58	113	4	0	8	8	2	8
chr1	206583766	206583767	m	177	+	20658376

## Pileup columns explore

bedMethyl column descriptions.

Definitions:

Nmod - Number of calls passing filters that were classified as a residue with a specified base modification.

Ncanonical - Number of calls passing filters were classified as the canonical base rather than modified. The exact base must be inferred by the modification code. For example, if the modification code is m (5mC) then the canonical base is cytosine. If the modification code is a, the canonical base is adenine.

Nother mod - Number of calls passing filters that were classified as modified, but where the modification is different from the listed base (and the corresponding canonical base is equal). For example, for a given cytosine there may be 3 reads with h calls, 1 with a canonical call, and 2 with m calls. In the bedMethyl row for h Nother_mod would be 2. In the m row Nother_mod would be 3.

Nvalid_cov - the valid coverage. Nvalid_cov = Nmod + Nother_mod + Ncanonical, also used as the score in the bedMethyl

Ndiff - Number of reads with a base other than the canonical base for this modification. For example, in a row for h the canonical base is cytosine, if there are 2 reads with C->A substitutions, Ndiff will be 2.

Ndelete - Number of reads with a deletion at this reference position

Nfail - Number of calls where the probability of the call was below the threshold. The threshold can be set on the command line or computed from the data (usually failing the lowest 10th percentile of calls).

Nnocall - Number of reads aligned to this reference position, with the correct canonical base, but without a base modification call. This can happen, for example, if the model requires a CpG dinucleotide and the read has a CG->CH substitution such that no modification call was produced by the basecaller.


column	name	description	type

    1	chrom	name of reference sequence from BAM header	str

    2	start position	0-based start position	int

    3	end position	0-based exclusive end position	int

    4	modified base code and motif	single letter code for modified base and motif when more than one motif is used	str

    5	score	equal to Nvalid_cov	int

    6	strand	'+' for positive strand '-' for negative strand, '.' when strands are combined	str

    7	start position	included for compatibility	int

    8	end position	included for compatibility	int

    9	color	included for compatibility, always 255,0,0	str

    10	Nvalid_cov	see definitions above.	int

    11	percent modified	(Nmod / Nvalid_cov) * 100	float

    12	Nmod	see definitions above	int

    13	Ncanonical	see definitions above	int

    14	Nother_mod	see definitions above	int

    15	Ndelete	see definitions above	int

    16	Nfail	see definitions above	int

    17	Ndiff	see definitions above	int

    18	Nnocall	see definitions above	int


In [25]:
date_today="20251110"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=data_folder_path+"new_pileup/"
# mkdir ${pileup_data_folder_path}


data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=data_folder_path+"new_pileup/"

# mkdir ${pileup_data_folder_path}
# mC > 99.5% = 0995 filtered 
# filtered_CROFF_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# mC >70% = 07 filtered 
# filtered_CROFF_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# pileup_CROFF_day35_bed=${pileup_data_folder_path}${date_today}"_filtered_mC07""_pileup_CROFF_Day35_Tcells.bed"

# mC > 99.5% = 0995 filtered 
# filtered_CROFF_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# filtered_CROFF_day35_bam=data_folder_path+"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# mC >70% = 07 filtered 
filtered_CROFF_day35_bam=data_folder_path+"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_CROFF_day35_bed=pileup_data_folder_path+date_today+"_filtered_mC07""_pileup_CROFF_Day35_Tcells.bed"

# Pre-filtered mC > 70%::
# CROFF_day35_bam=${data_folder_path}"pre_filtered_ROI_reads_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# pileup_CROFF_day35_bed=${pileup_data_folder_path}${date_today}"_noFilter_repl1_mC07""_pileup_CROFF_Day35_Tcells.bed"

# Pre-filtered mC > 70%::
CROFF_day35_bam=data_folder_path+"pre_filtered_ROI_reads_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_CROFF_day35_bed=pileup_data_folder_path+date_today+"_noFilter_repl1_mC07""_pileup_CROFF_Day35_Tcells.bed"

pileup_CROFF_day35_bed


'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_repl1_mC07_pileup_CROFF_Day35_Tcells.bed'

In [26]:
pileup_CROFF_day35_df = load_pileup_bed(pileup_CROFF_day35_bed)
pileup_CROFF_day35_df

Reading bedMethyl file: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_repl1_mC07_pileup_CROFF_Day35_Tcells.bed
Loaded DataFrame shape: (327, 18)


  df[c] = pd.to_numeric(df[c], errors="ignore")


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,4,+,206583089,206583090,25500,4,75.0,3,1,0,0,0,0,0
1,chr1,206583090,206583091,m,2,-,206583090,206583091,25500,2,100.0,2,0,0,0,0,0,0
2,chr1,206583173,206583174,m,152,+,206583173,206583174,25500,152,91.45,139,13,0,6,26,17,7
3,chr1,206583174,206583175,m,126,-,206583174,206583175,25500,126,95.24,120,6,0,1,2,1,10
4,chr1,206583387,206583388,m,176,+,206583387,206583388,25500,176,76.14,134,42,0,1,17,10,8


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,4,+,206583089,206583090,25500,4,75.00,3,1,0,0,0,0,0
1,chr1,206583090,206583091,m,2,-,206583090,206583091,25500,2,100.00,2,0,0,0,0,0,0
2,chr1,206583173,206583174,m,152,+,206583173,206583174,25500,152,91.45,139,13,0,6,26,17,7
3,chr1,206583174,206583175,m,126,-,206583174,206583175,25500,126,95.24,120,6,0,1,2,1,10
4,chr1,206583387,206583388,m,176,+,206583387,206583388,25500,176,76.14,134,42,0,1,17,10,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,chr1,206597139,206597140,m,1,+,206597139,206597140,25500,1,100.00,1,0,0,0,0,0,0
323,chr1,206597440,206597441,m,1,+,206597440,206597441,25500,1,100.00,1,0,0,0,0,0,0
324,chr1,206598045,206598046,m,1,+,206598045,206598046,25500,1,100.00,1,0,0,0,0,0,0
325,chr1,206598626,206598627,m,1,+,206598626,206598627,25500,1,100.00,1,0,0,0,0,0,0


# Look at CpGs within out target ROI
T2T v2.0

First CG:
206583388,206583390

Last of selected 137 CGs in the ROI:

206589746,206589748 --CpG_137

=> here have each CG position separate: so have 137*2  = 276

In [27]:
137*2, 277-5

(274, 272)

In [28]:
pileup_CROFF_day35_df[pileup_CROFF_day35_df['start'] == 206583387]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583387,206583388,m,176,+,206583387,206583388,25500,176,76.14,134,42,0,1,17,10,8


In [29]:
pileup_CROFF_day35_df[pileup_CROFF_day35_df['start'] == 206583388]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
5,chr1,206583388,206583389,m,107,-,206583388,206583389,25500,107,85.05,91,16,0,6,14,6,9


In [30]:
pileup_CROFF_day35_df[pileup_CROFF_day35_df['start'] == 206589746]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
277,chr1,206589746,206589747,m,108,-,206589746,206589747,25500,108,98.15,106,2,0,18,3,4,10


In [31]:
(279-5) / 2

137.0

In [32]:
pileup_CROFF_day35_df_roi = pileup_CROFF_day35_df.iloc[4:278, :]  # Display target region rows
print(pileup_CROFF_day35_df_roi.shape,pileup_CROFF_day35_df_roi.shape[0]/2)
pileup_CROFF_day35_df_roi

(274, 18) 137.0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583387,206583388,m,176,+,206583387,206583388,25500,176,76.14,134,42,0,1,17,10,8
5,chr1,206583388,206583389,m,107,-,206583388,206583389,25500,107,85.05,91,16,0,6,14,6,9
6,chr1,206583707,206583708,m,197,+,206583707,206583708,25500,197,92.89,183,14,0,2,4,6,3
7,chr1,206583708,206583709,m,117,-,206583708,206583709,25500,117,96.58,113,4,0,8,8,2,8
8,chr1,206583766,206583767,m,177,+,206583766,206583767,25500,177,88.14,156,21,0,4,3,23,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,128,-,206589213,206589214,25500,128,96.09,123,5,0,6,5,1,3
274,chr1,206589436,206589437,m,198,+,206589436,206589437,25500,198,92.93,184,14,0,1,3,4,2
275,chr1,206589437,206589438,m,129,-,206589437,206589438,25500,129,96.90,125,4,0,2,5,1,6
276,chr1,206589745,206589746,m,177,+,206589745,206589746,25500,177,100.00,177,0,0,8,3,7,12


<!>
> Threshold of  0.7597656 for base C is low. Consider increasing the filter-percentile or specifying a higher threshold.
> Done, processed 11762972 rows. Processed ~129977 reads and skipped ~150 reads.

In [33]:
# Plot pileup_Unedit_day35_df_roi summary plots (use existing variables/imports in the notebook)
# Saves interactive HTMLs to pileup_data_folder_path and displays inline.
out_dir = pileup_data_folder_path  # existing variable in the notebook

df_roi = pileup_CROFF_day35_df_roi.copy()

df_roi_stats = plot_pileup_roi_df(df_roi=pileup_CROFF_day35_df_roi, out_dir=pileup_data_folder_path)
df_roi_stats


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_roi['pos'] = df_roi['start'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_roi['percent_modified'] = df_roi['percent_modified'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_roi['Nvalid_cov'] = df_roi['Nvalid_cov'].astype(int)
A value is trying to be set on a co

ROI rows: 274
Percent modified: median=81.28, mean=73.92
Coverage (Nvalid_cov): min=18, median=133.5, max=209


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,176,+,206583387,206583388,25500,176,76.14,134,42,0,1,17,10,8,206583387
chr1,206583388,206583389,m,107,-,206583388,206583389,25500,107,85.05,91,16,0,6,14,6,9,206583388
chr1,206583707,206583708,m,197,+,206583707,206583708,25500,197,92.89,183,14,0,2,4,6,3,206583707
chr1,206583708,206583709,m,117,-,206583708,206583709,25500,117,96.58,113,4,0,8,8,2,8,206583708
chr1,206583766,206583767,m,177,+,206583766,206583767,25500,177,88.14,156,21,0,4,3,23,5,206583766
chr1,206583767,206583768,m,118,-,206583767,206583768,25500,118,94.92,112,6,0,1,15,1,8,206583767
chr1,206584104,206584105,m,184,+,206584104,206584105,25500,184,94.57,174,10,0,5,3,17,3,206584104
chr1,206584105,206584106,m,124,-,206584105,206584106,25500,124,95.97,119,5,0,2,2,6,9,206584105
chr1,206584137,206584138,m,205,+,206584137,206584138,25500,205,97.56,200,5,0,0,6,0,1,206584137
chr1,206584138,206584139,m,94,-,206584138,206584139,25500,94,86.17,81,13,0,0,49,0,0,206584138


ROI rows: 274
Percent modified: median=81.28, mean=73.92
Coverage (Nvalid_cov): min=18, median=133.5, max=209


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,176,+,206583387,206583388,25500,176,76.14,134,42,0,1,17,10,8,206583387
chr1,206583388,206583389,m,107,-,206583388,206583389,25500,107,85.05,91,16,0,6,14,6,9,206583388
chr1,206583707,206583708,m,197,+,206583707,206583708,25500,197,92.89,183,14,0,2,4,6,3,206583707
chr1,206583708,206583709,m,117,-,206583708,206583709,25500,117,96.58,113,4,0,8,8,2,8,206583708
chr1,206583766,206583767,m,177,+,206583766,206583767,25500,177,88.14,156,21,0,4,3,23,5,206583766
chr1,206583767,206583768,m,118,-,206583767,206583768,25500,118,94.92,112,6,0,1,15,1,8,206583767
chr1,206584104,206584105,m,184,+,206584104,206584105,25500,184,94.57,174,10,0,5,3,17,3,206584104
chr1,206584105,206584106,m,124,-,206584105,206584106,25500,124,95.97,119,5,0,2,2,6,9,206584105
chr1,206584137,206584138,m,205,+,206584137,206584138,25500,205,97.56,200,5,0,0,6,0,1,206584137
chr1,206584138,206584139,m,94,-,206584138,206584139,25500,94,86.17,81,13,0,0,49,0,0,206584138


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
4,chr1,206583387,206583388,m,176,+,206583387,206583388,25500,176,...,0,1,17,10,8,206583387,206583387:+,176,76.136364,23.863636
5,chr1,206583388,206583389,m,107,-,206583388,206583389,25500,107,...,0,6,14,6,9,206583388,206583388:-,107,85.046729,14.953271
6,chr1,206583707,206583708,m,197,+,206583707,206583708,25500,197,...,0,2,4,6,3,206583707,206583707:+,197,92.893401,7.106599
7,chr1,206583708,206583709,m,117,-,206583708,206583709,25500,117,...,0,8,8,2,8,206583708,206583708:-,117,96.581197,3.418803
8,chr1,206583766,206583767,m,177,+,206583766,206583767,25500,177,...,0,4,3,23,5,206583766,206583766:+,177,88.135593,11.864407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,128,-,206589213,206589214,25500,128,...,0,6,5,1,3,206589213,206589213:-,128,96.093750,3.906250
274,chr1,206589436,206589437,m,198,+,206589436,206589437,25500,198,...,0,1,3,4,2,206589436,206589436:+,198,92.929293,7.070707
275,chr1,206589437,206589438,m,129,-,206589437,206589438,25500,129,...,0,2,5,1,6,206589437,206589437:-,129,96.899225,3.100775
276,chr1,206589745,206589746,m,177,+,206589745,206589746,25500,177,...,0,8,3,7,12,206589745,206589745:+,177,100.000000,0.000000


# Unedited T cells Day 35

In [34]:
! ls "/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output"


CG_137_padded_reads_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.7_T2Tv2_0_chr1:206583354-206589854_2025-11-09_units_combined_numFWD87_numRVS115.npy
CG_137_padded_reads_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_mCthresh0.995_T2Tv2_0_chr1:206583354-206589854_2025-11-09_units_combined_numFWD87_numRVS114.npy
extracted_reads
filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam.bai
filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_thre

In [None]:
%%bash

date_today="20251110"
data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
mkdir ${pileup_data_folder_path}

# mC > 99.5% = 0995 filtered 
# filtered_Unedit_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# mC >70% = 07 filtered   
# filtered_Unedit_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# pileup_Unedit_day35_bed=${pileup_data_folder_path}${date_today}"_filtered_mC07""_pileup_NT_Day35_Tcells.bed"
# cat "$pileup_Unedit_day35_bed"

# pileup_CROFF_day35_bed=${pileup_data_folder_path}${date_today}"_noFilter_repl1_mC07""_pileup_CROFF_Day35_Tcells.bed"
not_filtered_Unedit_day35_bam=${data_folder_path}"pre_filtered_ROI_reads_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_Unedit_day35_bed=${pileup_data_folder_path}${date_today}"_noFilter_mC07""_pileup_NT_Day35_Tcells.bed"
cat "$pileup_Unedit_day35_bed"

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref="/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"
threads=32

modkit pileup ${not_filtered_Unedit_day35_bam} ${pileup_Unedit_day35_bed} \
  --cpg \
  --ref ${ref_genome_fa} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${pileup_Unedit_day35_bed}
tabix -p bed ${pileup_Unedit_day35_bed}.gz

printf '%s\n' "not_filtered_Unedit_day35_bam: $not_filtered_Unedit_day35_bam"
printf '%s\n' "pileup_Unedit_day35_bed: $pileup_Unedit_day35_bed"
cat "$pileup_Unedit_day35_bed"


mkdir: cannot create directory ‘/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/’: File exists
cat: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed: No such file or directory
[0;32m>[0m calculated chunk size: 48, interval size 100000, processing 4800000 positions concurrently
[0;32m>[0m filtering to only CpG motifs
[0;32m>[0m attempting to sample 10042 reads
[0;32m>[0m Using filter threshold 0.8496094 for C.


ERROR! Session/line number was not unique in database. History logging moved to new session 1345


[0;32m>[0m Done, processed 285 rows. Processed ~215 reads and skipped zero reads.


not_filtered_Unedit_day35_bam: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/pre_filtered_ROI_reads_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam
pileup_Unedit_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed
chr1	206583090	206583091	m	1	-	206583090	206583091	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206583173	206583174	m	80	+	206583173	206583174	255,0,0	80	85.00	68	12	0	3	18	8	4
chr1	206583174	206583175	m	90	-	206583174	206583175	255,0,0	90	95.56	86	4	0	1	3	2	2
chr1	206583387	206583388	m	85	+	206583387	206583388	255,0,0	85	72.94	62	23	0	1	15	10	4
chr1	206583388	206583389	m	75	-	206583388	206583389	255,0,0	75	86.67	65	10	0	3	14	1	5
chr1	206583707	206583708	m	104	+	206583707	206583708	

In [37]:
# date_today="20251110"
# data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/"
# pileup_data_folder_path=data_folder_path+"new_pileup/"
# # mkdir ${pileup_data_folder_path}

# # filtered_Unedit_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# pileup_Unedit_day35_bed=pileup_data_folder_path+date_today+"_filtered_mC07""_pileup_NT_Day35_Tcells.bed"
# print("pileup_Unedit_day35_bed:", pileup_Unedit_day35_bed)


date_today="20251110"
data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/"
pileup_data_folder_path=data_folder_path+"new_pileup/"
# mkdir ${pileup_data_folder_path}

# mC > 99.5% = 0995 filtered 
# filtered_Unedit_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# mC >70% = 07 filtered   
# filtered_Unedit_day35_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# pileup_Unedit_day35_bed=${pileup_data_folder_path}${date_today}"_filtered_mC07""_pileup_NT_Day35_Tcells.bed"
# cat "$pileup_Unedit_day35_bed"

# pileup_CROFF_day35_bed=${pileup_data_folder_path}${date_today}"_noFilter_repl1_mC07""_pileup_CROFF_Day35_Tcells.bed"
# not_filtered_Unedit_day35_bam=${data_folder_path}"pre_filtered_ROI_reads_Tcells_NT_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_Unedit_day35_bed=pileup_data_folder_path+date_today+"_noFilter_mC07""_pileup_NT_Day35_Tcells.bed"
# cat "$pileup_Unedit_day35_bed"


pileup_Unedit_day35_bed

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed'

In [38]:
pileup_Unedit_day35_df = load_pileup_bed(pileup_Unedit_day35_bed)
pileup_Unedit_day35_df

Reading bedMethyl file: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed
Loaded DataFrame shape: (285, 18)



errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead



Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583090,206583091,m,1,-,206583090,206583091,25500,1,100.0,1,0,0,0,0,0,0
1,chr1,206583173,206583174,m,80,+,206583173,206583174,25500,80,85.0,68,12,0,3,18,8,4
2,chr1,206583174,206583175,m,90,-,206583174,206583175,25500,90,95.56,86,4,0,1,3,2,2
3,chr1,206583387,206583388,m,85,+,206583387,206583388,25500,85,72.94,62,23,0,1,15,10,4
4,chr1,206583388,206583389,m,75,-,206583388,206583389,25500,75,86.67,65,10,0,3,14,1,5


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583090,206583091,m,1,-,206583090,206583091,25500,1,100.00,1,0,0,0,0,0,0
1,chr1,206583173,206583174,m,80,+,206583173,206583174,25500,80,85.00,68,12,0,3,18,8,4
2,chr1,206583174,206583175,m,90,-,206583174,206583175,25500,90,95.56,86,4,0,1,3,2,2
3,chr1,206583387,206583388,m,85,+,206583387,206583388,25500,85,72.94,62,23,0,1,15,10,4
4,chr1,206583388,206583389,m,75,-,206583388,206583389,25500,75,86.67,65,10,0,3,14,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,chr1,206589931,206589932,m,6,-,206589931,206589932,25500,6,100.00,6,0,0,0,0,0,0
281,chr1,206589955,206589956,m,6,+,206589955,206589956,25500,6,100.00,6,0,0,0,2,0,0
282,chr1,206589956,206589957,m,3,-,206589956,206589957,25500,3,100.00,3,0,0,0,0,0,2
283,chr1,206590032,206590033,m,6,+,206590032,206590033,25500,6,83.33,5,1,0,0,0,2,0


In [39]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206583388-1]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
3,chr1,206583387,206583388,m,85,+,206583387,206583388,25500,85,72.94,62,23,0,1,15,10,4


In [40]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206583388]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
4,chr1,206583388,206583389,m,75,-,206583388,206583389,25500,75,86.67,65,10,0,3,14,1,5


In [41]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206589746-1]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
275,chr1,206589745,206589746,m,101,+,206589745,206589746,25500,101,99.01,100,1,0,7,0,0,7


In [42]:
pileup_Unedit_day35_df[pileup_Unedit_day35_df['start'] == 206589746]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
276,chr1,206589746,206589747,m,74,-,206589746,206589747,25500,74,100.0,74,0,0,9,7,4,6


In [43]:
pileup_Unedit_day35_df_roi = pileup_Unedit_day35_df.iloc[3:277, :]  # Display target region rows
print(pileup_Unedit_day35_df_roi.shape,pileup_Unedit_day35_df_roi.shape[0]/2)
pileup_Unedit_day35_df_roi

(274, 18) 137.0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
3,chr1,206583387,206583388,m,85,+,206583387,206583388,25500,85,72.94,62,23,0,1,15,10,4
4,chr1,206583388,206583389,m,75,-,206583388,206583389,25500,75,86.67,65,10,0,3,14,1,5
5,chr1,206583707,206583708,m,104,+,206583707,206583708,25500,104,96.15,100,4,0,3,3,0,5
6,chr1,206583708,206583709,m,80,-,206583708,206583709,25500,80,96.25,77,3,0,1,9,6,3
7,chr1,206583766,206583767,m,97,+,206583766,206583767,25500,97,92.78,90,7,0,3,1,12,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,chr1,206589213,206589214,m,87,-,206589213,206589214,25500,87,78.16,68,19,0,2,8,0,3
273,chr1,206589436,206589437,m,110,+,206589436,206589437,25500,110,95.45,105,5,0,0,4,1,0
274,chr1,206589437,206589438,m,92,-,206589437,206589438,25500,92,95.65,88,4,0,0,4,3,1
275,chr1,206589745,206589746,m,101,+,206589745,206589746,25500,101,99.01,100,1,0,7,0,0,7


In [44]:
pileup_Unedit_day35_df_roi

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
3,chr1,206583387,206583388,m,85,+,206583387,206583388,25500,85,72.94,62,23,0,1,15,10,4
4,chr1,206583388,206583389,m,75,-,206583388,206583389,25500,75,86.67,65,10,0,3,14,1,5
5,chr1,206583707,206583708,m,104,+,206583707,206583708,25500,104,96.15,100,4,0,3,3,0,5
6,chr1,206583708,206583709,m,80,-,206583708,206583709,25500,80,96.25,77,3,0,1,9,6,3
7,chr1,206583766,206583767,m,97,+,206583766,206583767,25500,97,92.78,90,7,0,3,1,12,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,chr1,206589213,206589214,m,87,-,206589213,206589214,25500,87,78.16,68,19,0,2,8,0,3
273,chr1,206589436,206589437,m,110,+,206589436,206589437,25500,110,95.45,105,5,0,0,4,1,0
274,chr1,206589437,206589438,m,92,-,206589437,206589438,25500,92,95.65,88,4,0,0,4,3,1
275,chr1,206589745,206589746,m,101,+,206589745,206589746,25500,101,99.01,100,1,0,7,0,0,7


In [45]:
pileup_Unedit_day35_df_roi

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
3,chr1,206583387,206583388,m,85,+,206583387,206583388,25500,85,72.94,62,23,0,1,15,10,4
4,chr1,206583388,206583389,m,75,-,206583388,206583389,25500,75,86.67,65,10,0,3,14,1,5
5,chr1,206583707,206583708,m,104,+,206583707,206583708,25500,104,96.15,100,4,0,3,3,0,5
6,chr1,206583708,206583709,m,80,-,206583708,206583709,25500,80,96.25,77,3,0,1,9,6,3
7,chr1,206583766,206583767,m,97,+,206583766,206583767,25500,97,92.78,90,7,0,3,1,12,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,chr1,206589213,206589214,m,87,-,206589213,206589214,25500,87,78.16,68,19,0,2,8,0,3
273,chr1,206589436,206589437,m,110,+,206589436,206589437,25500,110,95.45,105,5,0,0,4,1,0
274,chr1,206589437,206589438,m,92,-,206589437,206589438,25500,92,95.65,88,4,0,0,4,3,1
275,chr1,206589745,206589746,m,101,+,206589745,206589746,25500,101,99.01,100,1,0,7,0,0,7


In [46]:
pileup_data_folder_path

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/'

In [47]:

# data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/"
# pileup_data_folder_path=data_folder_path+"new_pileup/"
# # mkdir ${pileup_data_folder_path}

# # Plot pileup_Unedit_day35_df_roi summary plots (use existing variables/imports in the notebook)
# # Saves interactive HTMLs to pileup_data_folder_path and displays inline.
# out_dir = pileup_data_folder_path  # existing variable in the notebook

# df_roi = pileup_Unedit_day35_df_roi.copy()

df_roi_stats = plot_pileup_roi_df(df_roi=pileup_Unedit_day35_df_roi, out_dir=pileup_data_folder_path)
df_roi_stats




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

ROI rows: 274
Percent modified: median=2.08, mean=22.27
Coverage (Nvalid_cov): min=14, median=91.0, max=113


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,85,+,206583387,206583388,25500,85,72.94,62,23,0,1,15,10,4,206583387
chr1,206583388,206583389,m,75,-,206583388,206583389,25500,75,86.67,65,10,0,3,14,1,5,206583388
chr1,206583707,206583708,m,104,+,206583707,206583708,25500,104,96.15,100,4,0,3,3,0,5,206583707
chr1,206583708,206583709,m,80,-,206583708,206583709,25500,80,96.25,77,3,0,1,9,6,3,206583708
chr1,206583766,206583767,m,97,+,206583766,206583767,25500,97,92.78,90,7,0,3,1,12,2,206583766
chr1,206583767,206583768,m,78,-,206583767,206583768,25500,78,93.59,73,5,0,3,12,2,4,206583767
chr1,206584104,206584105,m,100,+,206584104,206584105,25500,100,96.0,96,4,0,1,1,12,1,206584104
chr1,206584105,206584106,m,92,-,206584105,206584106,25500,92,92.39,85,7,0,1,2,1,3,206584105
chr1,206584137,206584138,m,109,+,206584137,206584138,25500,109,100.0,109,0,0,1,4,0,1,206584137
chr1,206584138,206584139,m,55,-,206584138,206584139,25500,55,90.91,50,5,0,1,42,0,1,206584138


ROI rows: 274
Percent modified: median=2.08, mean=22.27
Coverage (Nvalid_cov): min=14, median=91.0, max=113


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,85,+,206583387,206583388,25500,85,72.94,62,23,0,1,15,10,4,206583387
chr1,206583388,206583389,m,75,-,206583388,206583389,25500,75,86.67,65,10,0,3,14,1,5,206583388
chr1,206583707,206583708,m,104,+,206583707,206583708,25500,104,96.15,100,4,0,3,3,0,5,206583707
chr1,206583708,206583709,m,80,-,206583708,206583709,25500,80,96.25,77,3,0,1,9,6,3,206583708
chr1,206583766,206583767,m,97,+,206583766,206583767,25500,97,92.78,90,7,0,3,1,12,2,206583766
chr1,206583767,206583768,m,78,-,206583767,206583768,25500,78,93.59,73,5,0,3,12,2,4,206583767
chr1,206584104,206584105,m,100,+,206584104,206584105,25500,100,96.0,96,4,0,1,1,12,1,206584104
chr1,206584105,206584106,m,92,-,206584105,206584106,25500,92,92.39,85,7,0,1,2,1,3,206584105
chr1,206584137,206584138,m,109,+,206584137,206584138,25500,109,100.0,109,0,0,1,4,0,1,206584137
chr1,206584138,206584139,m,55,-,206584138,206584139,25500,55,90.91,50,5,0,1,42,0,1,206584138


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
3,chr1,206583387,206583388,m,85,+,206583387,206583388,25500,85,...,0,1,15,10,4,206583387,206583387:+,85,72.941176,27.058824
4,chr1,206583388,206583389,m,75,-,206583388,206583389,25500,75,...,0,3,14,1,5,206583388,206583388:-,75,86.666667,13.333333
5,chr1,206583707,206583708,m,104,+,206583707,206583708,25500,104,...,0,3,3,0,5,206583707,206583707:+,104,96.153846,3.846154
6,chr1,206583708,206583709,m,80,-,206583708,206583709,25500,80,...,0,1,9,6,3,206583708,206583708:-,80,96.250000,3.750000
7,chr1,206583766,206583767,m,97,+,206583766,206583767,25500,97,...,0,3,1,12,2,206583766,206583766:+,97,92.783505,7.216495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,chr1,206589213,206589214,m,87,-,206589213,206589214,25500,87,...,0,2,8,0,3,206589213,206589213:-,87,78.160920,21.839080
273,chr1,206589436,206589437,m,110,+,206589436,206589437,25500,110,...,0,0,4,1,0,206589436,206589436:+,110,95.454545,4.545455
274,chr1,206589437,206589438,m,92,-,206589437,206589438,25500,92,...,0,0,4,3,1,206589437,206589437:-,92,95.652174,4.347826
275,chr1,206589745,206589746,m,101,+,206589745,206589746,25500,101,...,0,7,0,0,7,206589745,206589745:+,101,99.009901,0.990099


# dmr modkit CRoff vs Unedited (NT)

3. Detecting differential modification at single base positions
The modkit dmr pair command has the ability to score individual bases (e.g. differentially methylated CpGs). To run single-base analysis on one or more paired samples, simply omit the --regions (-r) option when running modkit dmr pair. When performing single-base analysis the likelihood ratio score and a MAP-based p-value are available. For details on the likelihood ratio score and the MAP-based p-value, see the scoring details section. For example the above command becomes:

dmr_result=single_base_haplotype_dmr.bed

modkit dmr pair \
  -a ${hp1_pileup}.gz \
  -b ${hp2_pileup}.gz \
  -o ${dmr_result} \
  --ref ${ref} \
  --base C \
  --threads ${threads} \
  --log-filepath dmr.log

In [48]:
# ! ls "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG"

In [49]:
# ! ls /home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup@v3.3.5mCG.bam.gz

In [50]:
# /home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup.5mCG.bam.gz

In [51]:
# %%bash
# ls -ld /home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs
# ls -l /home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup.5mCG.bam.gz

In [None]:
pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/"


In [None]:
%%bash

# pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/"
# pileup_Unedit_day35_bed=${pileup_data_folder_path}"20251109_filtered_mC07_pileup_NT_Day35_Tcells.bed"
pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/"
pileup_Unedit_day35_bed=${pileup_data_folder_path}"20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed"

# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/to_t2t_v2_0/pileup/pileup_Unedit_Day28_Tcells_20250721.bam"

echo "pileup_Unedit_day35_bed: ${pileup_Unedit_day35_bed}.gz"

ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_Unedit_day35_bed}.gz"
# ls -l "${pileup_Unedit_day28_bam}.gz"

chmod u+rwx "${pileup_data_folder_path}"
chmod u+rwx "${pileup_Unedit_day35_bed}.gz"

# ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_Unedit_day35_bed}.gz"


pileup_Unedit_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed.gz
20251109_filtered_mC07_pileup_NT_Day35_Tcells.bed
20251109_filtered_mC07_pileup_NT_Day35_Tcells.bed.gz
20251109_filtered_mC07_pileup_NT_Day35_Tcells.bed.gz.tbi
20251109_filtered_mC0995_pileup_NT_Day35_Tcells.bed
20251109_filtered_mC0995_pileup_NT_Day35_Tcells.bed.gz
20251109_filtered_mC0995_pileup_NT_Day35_Tcells.bed.gz.tbi
20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed
20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed.gz
20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed.gz.tbi
20251110_noFilter_repl1_mC077_pileup_NT_Day35_Tcells.bed
drwxrwxr-x 2 michalula michalula 4096 Nov 10 15:49 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/
-rwxrw-r-- 1 michalula michalula 5672

In [62]:
%%bash

date_today="20251110"

# pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/pileup/"
# pileup_Unedit_day35_bam=${pileup_data_folder_path}"20251109_filtered_pileup_NT_Day35_Tcells.bam"
# # "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/to_t2t_v2_0/pileup/pileup_Unedit_Day28_Tcells_20250721.bam"
# pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/new_pileup/"
# pileup_CROFF_day35_bed=${pileup_data_folder_path}"20251109_filtered_mC07_pileup_CROFF_Day35_Tcells.bed"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
# mkdir ${pileup_data_folder_path}

# Pre-filtered mC > 70%::
CROFF_day35_bam=${data_folder_path}"pre_filtered_ROI_reads_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_CROFF_day35_bed=${pileup_data_folder_path}${date_today}"_noFilter_repl1_mC07""_pileup_CROFF_Day35_Tcells.bed"


echo "pileup_CROFF_day35_bed: ${pileup_CROFF_day35_bed}.gz"

ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_CROFF_day35_bed}.gz"
# ls -l "${pileup_Unedit_day28_bam}.gz"

chmod u+rwx "${pileup_data_folder_path}"
chmod u+rwx "${pileup_CROFF_day35_bed}.gz"


# ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_CROFF_day35_bed}.gz"


pileup_CROFF_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_repl1_mC07_pileup_CROFF_Day35_Tcells.bed.gz
20251109_dmr_column_summary.csv
20251109_dmr_parsed.csv
20251109_filtered_mC07_pileup_CROFF_Day35_Tcells.bed
20251109_filtered_mC07_pileup_CROFF_Day35_Tcells.bed.gz
20251109_filtered_mC07_pileup_CROFF_Day35_Tcells.bed.gz.tbi
20251109_filtered_mC0995_pileup_CROFF_Day35_Tcells.bed
20251109_filtered_mC0995_pileup_CROFF_Day35_Tcells.bed.gz
20251109_filtered_mC0995_pileup_CROFF_Day35_Tcells.bed.gz.tbi
20251109_filtered_pileup_CROFF_Day35_Tcells.bed
20251109_filtered_pileup_CROFF_Day35_Tcells.bed.gz
20251109_filtered_pileup_CROFF_Day35_Tcells.bed.gz.tbi
20251109_noFilter_repl1_mC07_pileup_CROFF_Day35_Tcells.bed
20251110_noFilter_repl1_mC07_pileup_CROFF_Day35_Tcells.bed
20251110_noFilter_repl1_mC07_pileup_CROFF_Day35_Tcells.bed.gz
20251110_noFilter_repl1_m

In [63]:
%%bash
 
# 3. Detecting differential modification at single base positions
# The modkit dmr pair command has the ability to score individual bases (e.g. differentially methylated CpGs). To run single-base analysis on one or more paired samples, simply omit the --regions (-r) option when running modkit dmr pair. When performing single-base analysis the likelihood ratio score and a MAP-based p-value are available. For details on the likelihood ratio score and the MAP-based p-value, see the scoring details section. For example the above command becomes:
date_today="20251110"

experiment_codition="noFilter_day35_CRoff_vs_Unedit"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/"
dmr_result=${dmr_output_path}${date_today}"_single_base_noFiltered_mC07_"${experiment_codition}".bed"

# pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/"
# pileup_Unedit_day35_bed=${pileup_data_folder_path}"20251109_filtered_mC07_pileup_NT_Day35_Tcells.bed"
# # "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/to_t2t_v2_0/pileup/pileup_Unedit_Day28_Tcells_20250721.bam"
pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/"
pileup_Unedit_day35_bed=${pileup_data_folder_path}"20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed"

echo "pileup_Unedit_day35_bed: ${pileup_Unedit_day35_bed}.gz"
ls -l "${pileup_Unedit_day35_bed}.gz"
# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup.5mCG.bam"
# pileup_Unedit_day28_bam_gz="/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/pileup_sorted_align_t2t_v1_1_trim_20250721_nCATs_Tcells_UNEDITED_Day28_minion_merged.dna_r9.4.1_e8_sup.5mCG.bam.gz"
# pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/new_pileup/"
# pileup_CROFF_day35_bed=${pileup_data_folder_path}"20251109_filtered_mC07_pileup_CROFF_Day35_Tcells.bed"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
# Pre-filtered mC > 70%::
CROFF_day35_bam=${data_folder_path}"pre_filtered_ROI_reads_Tcells_CRISPRoff_Day35_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
pileup_CROFF_day35_bed=${pileup_data_folder_path}${date_today}"_noFilter_repl1_mC07""_pileup_CROFF_Day35_Tcells.bed"

echo "pileup_CROFF_day35_bed: ${pileup_CROFF_day35_bed}.gz"
ls -l "${pileup_CROFF_day35_bed}.gz"

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref="/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"
threads=32
 
cd ${dmr_output_path}
# '/home/michalula/code/epiCausality/epiCode/differential_methyl'

modkit dmr pair \
  -a ${pileup_CROFF_day35_bed}.gz \
  -b ${pileup_Unedit_day35_bed}.gz \
  -o ${dmr_result} \
  --ref ${ref_genome_fa} \
  --base C \
  --threads ${threads} \
  --log-filepath dmr.log


echo "dmr_result: $dmr_result"
ls -lah $dmr_result

pileup_Unedit_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed.gz
-rwxrw-r-- 1 michalula michalula 5672 Nov 10 15:49 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/unedited/merged_2libraries/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_NT_Day35_Tcells.bed.gz
pileup_CROFF_day35_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_repl1_mC07_pileup_CROFF_Day35_Tcells.bed.gz
-rwxrw-r-- 1 michalula michalula 7251 Nov 10 15:36 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/croff/replica_1/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_repl1_mC07_pileup_CROFF_Day35_Tcells.bed.gz


[0;32m>[0m reading reference FASTA at "/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
[0;32m>[0m 1 common sequence(s) between FASTA and both samples
[0;32m>[0m running single-site analysis
[0;32m>[0m using default prior, Beta(α: 0.55, β: 0.55)
[0;32m>[0m estimating max coverages from data
[0;32m>[0m sampled 327 a records and 285 b records, calculating max coverages for 95th percentile
[0;32m>[0m calculated max coverage for a: 194 and b: 108
[0;32m>[0m calculated max coverage 194 is greater than maximum allowed (100), setting to 100
[0;32m>[0m calculated max coverage 108 is greater than maximum allowed (100), setting to 100
[0;31;1m>[0m errors:
+--------------------------+-------+
| error                    | count |
+--------------------------+-------+
| missing-in-one-condition | 42    |
+--------------------------+-------+

[0;32m>[0m finished, processed 285 sites successfully, 42 failed


dmr_result: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day35_CRoff_vs_Unedit.bed
-rw-rw-r-- 1 michalula michalula 57K Nov 10 16:19 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day35_CRoff_vs_Unedit.bed


In [1]:
%%bash
date_today="20251110"

experiment_codition="noFilter_day35_CRoff_vs_Unedit"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/"
dmr_result=${dmr_output_path}${date_today}"_single_base_noFiltered_mC07_"${experiment_codition}".bed"

# date_today="20251109"
# experiment_codition="day35_CRoff_vs_Unedit"
# dmr_output_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/"
# dmr_result=${dmr_output_path}"_"${date_today}"single_base_filtered_mC07_"${experiment_codition}".bed"

echo "dmr_result: $dmr_result"
# ls -lah $dmr_result
ls -lah $dmr_output_path
# cat $dmr_result

dmr_result: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day35_CRoff_vs_Unedit.bed
total 136K
drwxrwxr-x 2 michalula michalula 4.0K Nov 10 16:19 .
drwxrwxr-x 4 michalula michalula 4.0K Nov 10 14:55 ..
-rw-rw-r-- 1 michalula michalula  57K Nov 10 01:55 20251109_single_base_filtered_mC07_day35_CRoff_vs_Unedit.bed
-rw-rw-r-- 1 michalula michalula  57K Nov 10 16:19 20251110_single_base_noFiltered_mC07_noFilter_day35_CRoff_vs_Unedit.bed
-rw-rw-r-- 1 michalula michalula 4.8K Nov 10 16:19 dmr.log


In [2]:
pwd

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/ROI_no_filtering/mc_07'

## modkit dmr explore output

The full table when performing single-site analysis with equal numbers of samples in groups, when running modkit dmr pair, will have the following schema:

column	name	description	type
1	chrom	name of reference sequence from bedMethyl input samples	str
2	start position	0-based start position, from --regions argument	int
3	end position	0-based exclusive end position, from --regions argument	int
4	name	name column from --regions BED, or chr:start-stop if absent, "." for single sites	str
5	score	difference score, more positive values have increased difference	float
6	strand	strand for the region or single-base position	str
7	samplea counts	counts of each base modification in the region, comma-separated, for sample A	str
8	samplea total	total number of base modification calls in the region, including unmodified, for sample A	int
9	sampleb counts	counts of each base modification in the region, comma-separated, for sample B	str
10	sampleb total	total number of base modification calls in the region, including unmodified, for sample B	int
11	samplea percents	percent of calls for each base modification in the region, comma-separated, for sample A	str
12	sampleb percents	percent of calls for each base modification in the region, comma-separated, for sample B	str
13	samplea fraction modified	fraction modification (of any kind) in sample A	float
14	sampleb fraction modified	fraction modification (of any kind) in sample B	float
15	MAP-based p-value	ratio of the posterior probability of observing the effect size over zero effect size	float
16	effect size	percent modified in sample A (col 12) minus percent modified in sample B (col 13)	float
<!-- 17	balanced MAP-based p-value	MAP-based p-value when all replicates are balanced	float
18	balanced effect size	effect size when all replicates are balanced	float -->


17	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
18	cohen_h_low	95% confidence interval lower bound	float
19	cohen_h_high	95% confidence interval upper bound	float

<!-- Differential methylation output format
The output from modkit dmr pair (and for each pairwise comparison with modkit dmr multi) is (roughly) a BED file with the following schema: -->
<!-- 
column	name	description	type
        1	chrom	name of reference sequence from bedMethyl input samples	str
        2	start position	0-based start position, from --regions argument	int
        3	end position	0-based exclusive end position, from --regions argument	int
        4	name	name column from --regions BED, or chr:start-stop if absent, "." for single sites	str
        5	score	difference score, more positive values have increased difference	float
        6	strand	strand for the region or single-base position	str
        7	samplea counts	counts of each base modification in the region, comma-separated, for sample A	str
        8	samplea total	total number of base modification calls in the region, including unmodified, for sample A	int
        9	sampleb counts	counts of each base modification in the region, comma-separated, for sample B	str
        10	sampleb total	total number of base modification calls in the region, including unmodified, for sample B	int
        11	samplea percents	percent of calls for each base modification in the region, comma-separated, for sample A	str
        12	sampleb percents	percent of calls for each base modification in the region, comma-separated, for sample B	str
        13	samplea fraction modified	fraction modification (of any kind) in sample A	float
        14	sampleb fraction modified	fraction modification (of any kind) in sample B	float
        15	MAP-based p-value	ratio of the posterior probability of observing the effect size over zero effect size	float
        16	effect size	percent modified in sample A (col 12) minus percent modified in sample B (col 13)	float
        17	balanced MAP-based p-value	MAP-based p-value when all replicates are balanced	float
        18	balanced effect size	effect size when all replicates are balanced	float
        19	pct_a_samples	percent of 'a' samples used in statistical test	float
        20	pct_b_samples	percent of 'b' samples used in statistical test	float
        21	per-replicate p-values	MAP-based p-values for matched replicate pairs	float
        22	per-replicate effect sizes	effect sizes matched replicate pairs	float
        23	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
        24	cohen_h_low	95% confidence interval lower bound	float
        25	cohen_h_high	95% confidence interval upper bound	float
        Columns 16-19 are only produced when multiple samples are provided, columns 20 and 21 are only produced when there is an equal number of 'a' and 'b' samples. When using multiple samples, it is possible that not every sample will have a modification fraction at a position. When this happens, the statistical test is still performed and the values of pct_a_samples and pct_b_samples reflect the percent of samples from each condition used in the test. 


     (15)	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
    (16)	cohen_h_low	95% confidence interval lower bound	float
    (17)	cohen_h_high	95% confidence interval upper bound	float
    
    n.b. Columns 15, 16, and 17 are present when the --regions option is passed, but these columns are on the right side of the table when performing single-site analysis (below). It is generally recommended to use the --header flag and standard CSV parsing to make sure the schema's between experiments are maintained.

When performing single-site analysis, the following additional columns are added:

column	name	description	type
Columns 20 and 21 have the replicate pairwise MAP-based p-values and effect sizes which are calculated based on their order provided on the command line. For example in the abbreviated command below:

In [3]:
# dmr_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/20251109_single_base_filtered_mC07_day35_CRoff_vs_Unedit.bed"

date_today="20251110"

experiment_codition="noFilter_day35_CRoff_vs_Unedit"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/"
dmr_result=dmr_output_path+date_today+"_single_base_noFiltered_mC07_"+experiment_codition+".bed"

dmr_path=dmr_result
dmr_path


'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day35_CRoff_vs_Unedit.bed'

In [4]:
dmr_path

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day35_CRoff_vs_Unedit.bed'

In [19]:
out_dir = dmr_output_path
print("out_dir:", out_dir)

# Read DMR BED (robust to header/no-header) and assign canonical column names (uses existing vars: dmr_path, out_dir, date_today, pd, os)
canonical_cols = [
    "chrom", "start", "end", "name", "score", "strand",
    "samplea_counts", "samplea_total", "sampleb_counts", "sampleb_total",
    "samplea_percents", "sampleb_percents",
    "samplea_fraction_modified", "sampleb_fraction_modified",
    "map_pvalue", "effect_size",
    "cohen_h", "cohen_h_low", "cohen_h_high",
]
    # "balanced_map_pvalue", "balanced_effect_size"

# read file with header and fallback to header=None when headers look numeric or columns are unexpected
try:
    dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")

    # dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", engine="python") # , header=0
    # # heuristic: if too many numeric-looking column names, re-read as headerless
    # numeric_headers = sum(1 for c in dmr_df.columns if str(c).strip().isdigit())
    # if numeric_headers >= (len(dmr_df.columns) / 2) or dmr_df.shape[1] < 3:
    #     dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")
except Exception:
    dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")

# assign canonical names up to number of columns present, add generic names for extras
ncols = dmr_df.shape[1]
if ncols <= len(canonical_cols):
    dmr_df.columns = canonical_cols[:ncols]
else:
    extras = [f"col_{i}" for i in range(ncols - len(canonical_cols))]
    dmr_df.columns = canonical_cols + extras

# coerce obvious numeric columns to numeric where present
num_cols_to_try = [
    "start", "end", "score",
    "samplea_total", "sampleb_total",
    "samplea_fraction_modified", "sampleb_fraction_modified",
    "map_pvalue", "effect_size",
    "balanced_map_pvalue", "balanced_effect_size"
]
for c in num_cols_to_try:
    if c in dmr_df.columns:
        dmr_df[c] = pd.to_numeric(dmr_df[c], errors="coerce")

# ensure output directory exists and save parsed table (parquet preferred)
os.makedirs(out_dir, exist_ok=True)
parsed_path = os.path.join(out_dir, f"{date_today}_dmr_parsed.parquet")
try:
    dmr_df.to_parquet(parsed_path, index=False)
    print("Saved parquet:", parsed_path)
except Exception:
    csv_path = os.path.join(out_dir, f"{date_today}_dmr_parsed.csv")
    dmr_df.to_csv(csv_path, index=False)
    print("Parquet not available, saved csv:", csv_path)

print("Loaded DMR:", dmr_path)
print("Assigned columns:", dmr_df.columns.tolist())
print("Shape:", dmr_df.shape)
dmr_df.head()

out_dir: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/
Parquet not available, saved csv: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/20251110_dmr_parsed.csv
Loaded DMR: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day35_CRoff_vs_Unedit.bed
Assigned columns: ['chrom', 'start', 'end', 'name', 'score', 'strand', 'samplea_counts', 'samplea_total', 'sampleb_counts', 'sampleb_total', 'samplea_percents', 'sampleb_percents', 'samplea_fraction_modified', 'sampleb_fraction_modified', 'map_pvalue', 'effect_size', 'cohen_h', 'cohen_h_low', 'cohen_h_high']
Shape: (285, 19)


Unnamed: 0,chrom,start,end,name,score,strand,samplea_counts,samplea_total,sampleb_counts,sampleb_total,samplea_percents,sampleb_percents,samplea_fraction_modified,sampleb_fraction_modified,map_pvalue,effect_size,cohen_h,cohen_h_low,cohen_h_high
0,chr1,206583090,206583091,.,-0.277632,-,m:2,2,m:1,1,m:100.00,m:100.00,1.0,1.0,1.0,0.0,0.0,-2.400456,2.400456
1,chr1,206583173,206583174,.,0.744937,+,m:139,152,m:68,80,m:91.45,m:85.00,0.914474,0.85,0.47346,0.06,0.201826,-0.068897,0.472549
2,chr1,206583174,206583175,.,-0.333086,-,m:120,126,m:86,90,m:95.24,m:95.56,0.952381,0.955556,1.0,-0.005556,-0.015152,-0.255349,0.285652
3,chr1,206583387,206583388,.,-0.189543,+,m:134,176,m:62,85,m:76.14,m:72.94,0.761364,0.729412,1.0,0.030588,0.073376,-0.185506,0.332259
4,chr1,206583388,206583389,.,-0.295326,-,m:91,107,m:65,75,m:85.05,m:86.67,0.850467,0.866667,1.0,-0.016667,-0.046505,-0.248657,0.341668


In [20]:
import os
from IPython.display import display, HTML

# Visualize all columns from dmr_df and save interactive HTMLs to out_dir
import plotly.express as px
import plotly.graph_objects as go

os.makedirs(out_dir, exist_ok=True)

# Save a table summary
summary = dmr_df.describe(include='all').transpose()
summary_path = os.path.join(out_dir, f"{date_today}_dmr_column_summary.csv")
summary.to_csv(summary_path)

numcols = dmr_df.select_dtypes(include=['number']).columns.tolist()

def _safe_name(name):
    return str(name).replace(os.sep, "_").replace(" ", "_").replace("\t", "_")

# Per-column visualizations
for col in dmr_df.columns:
    safe = _safe_name(col)
    try:
        if col in numcols:
            # Histogram
            fig_h = px.histogram(dmr_df, x=col, nbins=80, title=f"Histogram: {col}")
            # fig_h.write_html(os.path.join(out_dir, f"{date_today}_dmr_hist_{safe}.html"), include_plotlyjs='cdn')
            fig_h.show()

            # Boxplot
            fig_b = px.box(dmr_df, y=col, points="outliers", title=f"Boxplot: {col}")
            # fig_b.write_html(os.path.join(out_dir, f"{date_today}_dmr_box_{safe}.html"), include_plotlyjs='cdn')
            fig_b.show()
        else:
            # Categorical / text: show top value counts (up to 50)
            vc = dmr_df[col].fillna("NA").astype(str).value_counts().head(50)
            if len(vc):
                fig_c = px.bar(x=vc.values[::-1], y=vc.index.astype(str)[::-1], orientation='h',
                               title=f"Top value counts: {col}", labels={'x':'count','y':col})
                fig_c.update_layout(yaxis={'categoryorder':'array','categoryarray':vc.index[::-1].astype(str).tolist()})
                # fig_c.write_html(os.path.join(out_dir, f"{date_today}_dmr_valcounts_{safe}.html"), include_plotlyjs='cdn')
                fig_c.show()
            else:
                # fallback: display empty info
                display(HTML(f"<b>{col}</b>: no values to plot"))
    except Exception as e:
        print(f"Skipped plotting column {col!r} due to error: {e}")

# Correlation heatmap for numeric columns
if len(numcols) >= 2:
    try:
        corr = dmr_df[numcols].corr()
        fig_corr = px.imshow(corr, text_auto=True, aspect="auto", title="Correlation matrix (numeric columns)")
        # fig_corr.write_html(os.path.join(out_dir, f"{date_today}_dmr_correlation_numeric.html"), include_plotlyjs='cdn')
        fig_corr.show()
    except Exception as e:
        print("Failed to create correlation heatmap:", e)

print("Saved summary:", summary_path)
print("Plots saved to:", out_dir)

Saved summary: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/20251110_dmr_column_summary.csv
Plots saved to: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/


In [21]:
# Select significant CG pairs from DMR results and plot them (new cell at index 69).
# Uses existing notebook variables: dmr_df (parsed modkit dmr), df_roi_stats (pileup ROI stats),
# plotly (px) and out_dir/dmr_folder_path for saving. Does not re-import modules.

# Parameters
pvalue_thresh = 0.05

# ensure numeric columns
dmr_df['map_pvalue'] = pd.to_numeric(dmr_df['map_pvalue'], errors='coerce')
dmr_df['effect_size'] = pd.to_numeric(dmr_df['effect_size'], errors='coerce')
dmr_df['samplea_fraction_modified'] = pd.to_numeric(dmr_df['samplea_fraction_modified'], errors='coerce')
dmr_df['sampleb_fraction_modified'] = pd.to_numeric(dmr_df['sampleb_fraction_modified'], errors='coerce')

# filter significant by MAP-based p-value
sig = dmr_df[dmr_df['map_pvalue'] <= pvalue_thresh].copy()

dmr_df['map_pval_less005'] = dmr_df['map_pvalue'] <= 0.05


# # restrict to ROI positions if df_roi_stats exists
# if 'df_roi_stats' in globals():
#     roi_positions = set(df_roi_stats['start'].astype(int).tolist())
#     sig = sig[sig['start'].isin(roi_positions)].copy()

# quick exit if none
if sig.shape[0] == 0:
    print(f"No significant CG pairs found in ROI at map_pvalue <= {pvalue_thresh}")
else:
    # add convenience cols
    sig['pos'] = sig['start'].astype(str)
    sig['a_perc'] = sig['samplea_fraction_modified'] * 100
    sig['b_perc'] = sig['sampleb_fraction_modified'] * 100
    sig['total_reads'] = sig.get('samplea_total', 0).fillna(0).astype(int) + sig.get('sampleb_total', 0).fillna(0).astype(int)

    # save a table of significant sites
    os.makedirs(out_dir, exist_ok=True)
    sig_table_path = os.path.join(out_dir, f"dmr_significant_p{pvalue_thresh:.3f}_roi.tsv")
    sig.to_csv(sig_table_path, sep='\t', index=False)
    print("Saved significant sites table:", sig_table_path)
    display(sig[['chrom','start','end','strand','map_pvalue','effect_size','a_perc','b_perc','total_reads']].reset_index(drop=True))

    # plot the map_pval_less005 distribution which corresponds to significant sites
    fig_mappval_hist = px.histogram(
        dmr_df,
        x='map_pval_less005',
        nbins=80,
        title=f"MAP-based p-value distribution (highlighting p <= {pvalue_thresh})",
        labels={'map_pval_less005':'MAP-based p-value'}
    )
    fig_mappval_hist.update_layout(height=520)
    mappval_hist_path = os.path.join(out_dir, f"dmr_map_pval_distribution.html")
    # fig_mappval_hist.write_html(mappval_hist_path, include_plotlyjs='cdn')
    fig_mappval_hist.show()
    # print("Saved MAP-based p-value distribution histogram:", mappval_hist_path)

    # plot the percent of significant sites where map_pvalue <= pvalue_thresh is colored red, others blue (color not working)
    # Check https://plotly.com/python/pie-charts/ for coloring instructions
    percent_significant = (sig.shape[0] / dmr_df.shape[0]) * 100
    fig_mappval_pie = px.pie(
        dmr_df,
        names=['Not Significant (p > {})'.format(pvalue_thresh), 'Significant (p <= {})'.format(pvalue_thresh)],
        values=[dmr_df.shape[0] - sig.shape[0],sig.shape[0]],
        title=f"Percentage of significant CGs (map_pvalue <= {pvalue_thresh}): {percent_significant:.2f}%",
        # color_discrete_map={'Not Significant (p > {})'.format(pvalue_thresh): 'blue',
        #                     'Significant (p <= {})'.format(pvalue_thresh): 'red'},
        # colors=['blue','red']
    )
    fig_mappval_pie.update_layout(height=520)
    mappval_pie_path = os.path.join(out_dir, f"dmr_map_pval_percentage.html")
    # fig_mappval_pie.write_html(mappval_pie_path, include_plotlyjs='cdn')
    fig_mappval_pie.show()
    # print("Saved MAP-based p-value percentage pie chart:", mappval_pie_path)


    # plot effect size distribution of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh
    fig_effectsize_hist = px.histogram(
        dmr_df,
        x='effect_size',
        nbins=80,
        title=f"Effect size distribution (highlighting significant sites with map_pvalue <= {pvalue_thresh})",
        color=(dmr_df['map_pvalue'] <= pvalue_thresh), 
        color_discrete_map={True: 'red', False: 'blue'},
    )
            # labels={'effect_size':'Effect size (A - B)'}

    fig_effectsize_hist.update_layout(height=520)
    effectsize_hist_path = os.path.join(out_dir, f"dmr_effect_size_distribution.html")
    # fig_effectsize_hist.write_html(effectsize_hist_path, include_plotlyjs='cdn')
    fig_effectsize_hist.show()
    # print("Saved effect size distribution histogram:", effectsize_hist_path)        

    # plot effect sizes of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh 
    # add color legend names as 'Significant: map_pvalue <= pvalue_thresh' and 'Not Significant: map_pvalue > pvalue_thresh'  
    fig_effectsize_scatter = px.scatter(
        dmr_df,
        x=dmr_df.index,
        y='effect_size',
        color_discrete_map={True: 'red', False: 'blue'},
        color=(dmr_df['map_pvalue'] <= pvalue_thresh), 
        labels={'effect_size':'Effect size (A - B)','index':'Index',
                'color':f'Significant: map_pvalue <= {pvalue_thresh}'},
        title=f"Effect sizes for all CGs (highlighting significant sites with map_pvalue <= {pvalue_thresh})",
    )
    fig_effectsize_scatter.update_layout(height=520)
    effectsize_scatter_path = os.path.join(out_dir, f"dmr_effect_size_scatter.html")
    # fig_effectsize_scatter.write_html(effectsize_scatter_path, include_plotlyjs='cdn')
    fig_effectsize_scatter.show()
    # print("Saved effect size scatter plot:", effectsize_scatter_path)       

    # bar plot effect sizes of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh 
    fig_effectsize_bar = px.bar(
        dmr_df,
        x=dmr_df.index,         
        y='effect_size',
        color=(dmr_df['map_pvalue'] <= pvalue_thresh),
        labels={'effect_size':'Effect size (A - B)','index':'Index',
                'color':f'Significant: map_pvalue <= {pvalue_thresh}'},
        color_discrete_map={True: 'red', False: 'blue'},
        title=f"Effect sizes for all CGs (n={len(dmr_df)}) (highlighting significant sites with map_pvalue <= {pvalue_thresh})",
    )
    fig_effectsize_bar.update_layout(height=520)
    effectsize_bar_path = os.path.join(out_dir, f"dmr_effect_size_bar.html")
    # fig_effectsize_bar.write_html(effectsize_bar_path, include_plotlyjs='cdn')
    fig_effectsize_bar.show()
    # print("Saved effect size bar plot:", effectsize_bar_path) 


    # Bar: effect size per position (without sorting)
    sig['label'] = sig['pos'] + ":" + sig['strand'].astype(str)
    fig_bar_unsorted = px.bar(
        sig,
        x='label',
        y='effect_size',        
        color='effect_size',
        hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
        title=f"Effect size for significant CGs (n={len(sig)}) with map_pvalue <= {pvalue_thresh}",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
    fig_bar_unsorted.update_layout(xaxis_tickangle=45, height=520)
    bar_unsorted_path = os.path.join(out_dir, f"dmr_sig_effectsize_unsorted_p{pvalue_thresh:.3f}.html")
    # fig_bar_unsorted.write_html(bar_unsorted_path, include_plotlyjs='cdn')
    fig_bar_unsorted.show()
    # print("Saved unsorted effect-size bar plot:", bar_unsorted_path)    
        

    # Bar: effect size per position (sorted)
    sig_sorted = sig.sort_values('effect_size', ascending=False).copy()
    sig_sorted['label'] = sig_sorted['pos'] + ":" + sig_sorted['strand'].astype(str)
    fig_bar = px.bar(
        sig_sorted,
        x='label',
        y='effect_size',
        color='effect_size',
        hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
        title=f"Effect size for significant CGs (n={len(sig_sorted)})",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
    fig_bar.update_layout(xaxis_tickangle=45, height=520)
    bar_path = os.path.join(out_dir, f"dmr_sig_effectsize_p{pvalue_thresh:.3f}.html")
    # fig_bar.write_html(bar_path, include_plotlyjs='cdn')
    fig_bar.show()
    # print("Saved effect-size bar plot:", bar_path)



    # Scatter: sample A vs sample B percent modified (size = total reads, color = effect size)
    fig_scatter = px.scatter(
        sig,
        x='a_perc',
        y='b_perc',
        color='effect_size',
        size='total_reads',
        hover_data=['pos','start','map_pvalue','effect_size','cohen_h'],
        title=f"Significant CGs (map_pvalue <= {pvalue_thresh}) — sample A vs B percent modified",
        labels={'a_perc':'Sample A % modified','b_perc':'Sample B % modified'}
    )
    fig_scatter.update_layout(height=520)
    scatter_path = os.path.join(out_dir, f"dmr_sig_scatter_p{pvalue_thresh:.3f}.html")
    # fig_scatter.write_html(scatter_path, include_plotlyjs='cdn')
    fig_scatter.show()
    # print("Saved scatter plot:", scatter_path)


Saved significant sites table: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_35/modkit_dmr/mc_07_filtered/new_dmr_output/dmr_significant_p0.050_roi.tsv


Unnamed: 0,chrom,start,end,strand,map_pvalue,effect_size,a_perc,b_perc,total_reads
0,chr1,206584947,206584948,-,0.026001,-0.087952,89.516130,98.795180,207
1,chr1,206585798,206585799,-,0.000189,0.297626,89.411765,59.649120,142
2,chr1,206586020,206586021,+,0.000010,0.342174,69.047620,34.782610,260
3,chr1,206586021,206586022,-,0.002923,0.248780,70.338980,45.121950,200
4,chr1,206586120,206586121,+,0.045312,0.148283,32.417583,17.171717,281
...,...,...,...,...,...,...,...,...,...
222,chr1,206588705,206588706,-,0.003542,0.234286,77.235770,53.571427,207
223,chr1,206588908,206588909,+,0.000009,0.279398,93.373495,65.060240,249
224,chr1,206588909,206588910,-,0.009648,0.156190,92.920350,77.380955,197
225,chr1,206589212,206589213,+,0.001087,0.193736,92.857140,73.626375,273


# Todo:
- plot the raw pileups together too - next too each other (have the separat individual pileup values plots above from the funcs on top)

-- chech how to look at VARIANCE per CG unit position

# TODO: check
- are there really NO diffs between the reads selected with the mC > 70 and mC > 99.5% filtering ??

could be as the mC calles are automatically selected

and in the CRoff the auto threshold 
* in mC > 70 was to 0.79
> Using filter threshold 0.7910156 for C.
* in mC > 99.5 was to  0.79
> Using filter threshold 0.7910156 for C.


and in the Unediter the auto threshold 
* in mC > 70 was to 0.8496
> Using filter threshold 0.8496094 for C.
* in mC > 99.5 was to 0.849
> Using filter threshold 0.8496094 for C.


SAME per condition AUTOMATIC modkit filtering threshold per mC run were set

(NOT 0.995 and not 0.7)