# Modkit dmr
# Day 6 data
## Use my Filtered Reads

In [1]:
from datetime import datetime

def current_time():
    """Returns the current date and time as a formatted string."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S") 

print("Current Date and Time:", current_time())

Current Date and Time: 2025-11-10 20:26:45


Based on:
https://nanoporetech.github.io/modkit/intro_dmr.html#perform-differential-methylation-scoring

Select kernal: dimelo_v2_modkit_parsing

Preparing the input data
The inputs to all modkit dmr commands are two or more bedMethyl files (created by modkit pileup) that have been compressed with bgzip and indexed with tabix. An example of how to generate the input data is shown below:


ref=grch38.fasta
threads=32

norm=normal_sample.bam
norm_pileup=normal_pileup.bed

modkit pileup ${norm} ${norm_pileup} \
  --cpg \
  --ref ${ref} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${norm_pileup}
tabix -p bed ${norm_pileup}.gz

# pileup and compression can also be done in one step
tumor=tumor_sample.bam
tumor_pileup=tumor_pileup.bed.gz

modkit pileup ${tumor} - \
  --cpg \
  --ref ${ref} \
  --threads ${threads} \
  --log-filepath log.txt | ${bgzip} -c > ${tumor_pileup}

tabix -p bed ${tumor_pileup}

In [2]:
%%bash
echo "hello"

hello


# Use the NEW modkit latest installed version in ipython kernel modkit_new

In [3]:
# ! python3 -m ipykernel install --user --name=modkit_new --display-name "modkit_new Python"
# ! which modkit

In [4]:
import os
os.environ["PATH"] = "/home/michalula/.cargo/bin:" + os.environ["PATH"]
! which modkit
! modkit --version

/home/michalula/.cargo/bin/modkit
modkit 0.5.1


In [5]:
# ! modkit

In [6]:
! modkit --version 

modkit 0.5.1


In [7]:
import os
import pandas as pd

def load_pileup_bed(bed_path):
    # bed_path = existing[0]
    print("Reading bedMethyl file:", bed_path)

    # bedMethyl column names (18 columns as provided)
    colnames = [
        "chrom", "start", "end", "mod_code", "score", "strand",
        "start2", "end2", "color",
        "Nvalid_cov", "percent_modified", "Nmod", "Ncanonical",
        "Nother_mod", "Ndelete", "Nfail", "Ndiff", "Nnocall"
    ]

    # Configure dtypes where reasonable
    dtypes = {
        "chrom": str,
        "start": "Int64",
        "end": "Int64",
        "mod_code": str,
        "score": "Int64",
        "strand": str,
        "start2": "Int64",
        "end2": "Int64",
        "color": str,
        "Nvalid_cov": "Int64",
        "percent_modified": float,
        "Nmod": "Int64",
        "Ncanonical": "Int64",
        "Nother_mod": "Int64",
        "Ndelete": "Int64",
        "Nfail": "Int64",
        "Ndiff": "Int64",
        "Nnocall": "Int64"
    }

    compression = "gzip" if bed_path.endswith(".gz") else None

    # Read file (headerless BED-like table). If file has extra columns, keep them with automatic numeric conversion below.
    df = pd.read_csv(
        bed_path,
        sep="\t",
        header=None,
        comment="#",
        names=colnames,
        dtype=dtypes,
        compression=compression,
        engine="python",
        na_values=[".", "NA", ""],
        keep_default_na=True
    )

    # If file contained more than 18 columns, pandas assigned remaining data to extra columns named like col_18, col_19...
    # Ensure numeric conversion for numeric-like columns
    for c in df.columns:
        if df[c].dtype == object:
            # try safe numeric conversion where appropriate
            try:
                df[c] = pd.to_numeric(df[c], errors="ignore")
            except Exception:
                pass

    print("Loaded DataFrame shape:", df.shape)
    display(df.head())
    return df


In [8]:
import os
from IPython.display import display, HTML
from plotly import express as px
from plotly import graph_objects as go

# ! python3 -m pip install plotly
# ! python3 -m pip install matplotlib
# ! python3 -m pip install nbformat>=4.2.0

def plot_pileup_roi_df(df_roi, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    # ensure numeric types for plotting
    df_roi['pos'] = df_roi['start'].astype(int)
    df_roi['percent_modified'] = df_roi['percent_modified'].astype(float)
    df_roi['Nvalid_cov'] = df_roi['Nvalid_cov'].astype(int)
    df_roi['Nmod'] = df_roi['Nmod'].astype(int)
    df_roi['Ncanonical'] = df_roi['Ncanonical'].astype(int)

    # Scatter: genomic position vs percent modified (point size = coverage)
    fig1 = px.scatter(
        df_roi,
        x='pos',
        y='percent_modified',
        color='strand',
        size='Nvalid_cov',
        hover_data=['Nvalid_cov','Nmod','Ncanonical','Nother_mod','Nnocall'],
        title='Percent modified across ROI (size = Nvalid_cov)',
        height=500
    )
    fig1.update_layout(xaxis_title='Genomic position (start)', yaxis_title='Percent modified')
    fig1.show()
    # fig1.write_html(os.path.join(out_dir, "roi_percent_modified_scatter.html"), include_plotlyjs='cdn')

    # Histogram: coverage distribution
    fig2 = px.histogram(
        df_roi,
        x='Nvalid_cov',
        nbins=40,
        title='Distribution of Nvalid_cov (coverage) in ROI',
        height=400
    )
    fig2.update_layout(xaxis_title='Nvalid_cov', yaxis_title='Count')
    fig2.show()
    # fig2.write_html(os.path.join(out_dir, "roi_nvalidcov_hist.html"), include_plotlyjs='cdn')

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked)
    topn = 30
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'Sorted Top {topn} sites by percent_modified (stacked Nmod / Ncanonical)',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Print simple summaries
    print("ROI rows:", df_roi.shape[0])
    print("Percent modified: median={:.2f}, mean={:.2f}".format(df_roi['percent_modified'].median(), df_roi['percent_modified'].mean()))
    print("Coverage (Nvalid_cov): min={}, median={}, max={}".format(df_roi['Nvalid_cov'].min(), df_roi['Nvalid_cov'].median(), df_roi['Nvalid_cov'].max()))

    # Display first rows table for quick inspection
    display(HTML(df_roi.head(20).to_html(index=False)))


    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked)
    topn = df_roi.shape[0]
    # df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_roi.copy() #.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'All {topn} CpG sites by percent_modified (stacked Nmod / Ncanonical) [ordered=not s]',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked, NOT SORTED)
    topn = df_roi.shape[0]
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(name='Nmod', x=df_top['label'], y=df_top['Nmod']))
    fig3.add_trace(go.Bar(name='Ncanonical', x=df_top['label'], y=df_top['Ncanonical']))
    fig3.update_layout(barmode='stack', title=f'Sorted Top {topn} sites by percent_modified (stacked Nmod / Ncanonical)',
                    xaxis_title='position:strand', yaxis_title='reads', height=520)
    fig3.show()
    # fig3.write_html(os.path.join(out_dir, "roi_top_sites_stacked_counts.html"), include_plotlyjs='cdn')

    # Print simple summaries
    print("ROI rows:", df_roi.shape[0])
    print("Percent modified: median={:.2f}, mean={:.2f}".format(df_roi['percent_modified'].median(), df_roi['percent_modified'].mean()))
    print("Coverage (Nvalid_cov): min={}, median={}, max={}".format(df_roi['Nvalid_cov'].min(), df_roi['Nvalid_cov'].median(), df_roi['Nvalid_cov'].max()))

    # Display first rows table for quick inspection
    display(HTML(df_roi.head(20).to_html(index=False)))

    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    topn = 30
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig4 = go.Figure()
    fig4.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig4.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig4.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig4.show()
    # fig4.write_html(os.path.join(out_dir, "roi_top_sites_stacked_percentage.html"), include_plotlyjs='cdn')     


    # Bar: top sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    topn = 277
    df_top = df_roi.sort_values('percent_modified', ascending=False).head(topn).copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig4 = go.Figure()
    fig4.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig4.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig4.update_layout(barmode='stack', title=f'Top {topn} sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig4.show()
    # fig4.write_html(os.path.join(out_dir, "roi_top_sites_stacked_percentage.html"), include_plotlyjs='cdn')     

    # Bar: Unsorted sites by percent_modified showing Nmod vs Ncanonical (stacked) percentages
    df_top = df_roi.copy()
    df_top = df_top.assign(label=df_top['pos'].astype(str) + ":" + df_top['strand'])
    df_top = df_top.assign(Ntotal=df_top['Nmod'] + df_top['Ncanonical'])
    df_top = df_top.assign(Nmod_perc=(df_top['Nmod'] / df_top['Ntotal']) * 100)
    df_top = df_top.assign(Ncanonical_perc=(df_top['Ncanonical'] / df_top['Ntotal']) * 100)
    fig5 = go.Figure()
    fig5.add_trace(go.Bar(name='Nmod %', x=df_top['label'], y=df_top['Nmod_perc']))
    fig5.add_trace(go.Bar(name='Ncanonical %', x=df_top['label'], y=df_top['Ncanonical_perc']))
    fig5.update_layout(barmode='stack', title=f'All sites by percent_modified (stacked Nmod % / Ncanonical %)',
                    xaxis_title='position:strand', yaxis_title='percentage', height=520)
    fig5.show()
    # fig5.write_html(os.path.join(out_dir, "roi_all_sites_stacked_percentage.html"), include_plotlyjs='cdn')    


    return df_top



In [9]:
! ls /home/michalula/data/ref_genomes/t2t_v2_0/

chm13v2.0.fa	  chm13v2.0.fa.fai		   haplotype_vcf
chm13v2.0.fa.amb  chm13v2.0.fa.pac		   up_chm13v2.0.fasta
chm13v2.0.fa.ann  chm13v2.0.fa.sa		   up_chm13v2.0.fasta.fai
chm13v2.0.fa.bwt  convert_to_uppercase_fasta.bash


# Pileups 
## for CRISPRoff filtered data for Day 6

In [10]:
! ls /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/

CG_137_padded_reads_day6_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_avgBaseQ10_mCthresh0.7_t2t_v2_0_chr1:206583354-206589854_2025-09-24_units_combined_numFWD1098_numRVS2130.npy
CG_137_padded_reads_day6_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_avgBaseQ20_mCthresh0.7_t2t_v2_0_chr1:206583354-206589854_2025-09-13_units_combined_numFWD659_numRVS1149.npy
CG_137_padded_reads_day6_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_avgBaseQ20_mCthresh0.7_t2t_v2_0_chr1:206583354-206589854_2025-09-24_units_combined_numFWD659_numRVS1149.npy
CG_137_padded_reads_day6_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10_mCthresh0.7_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD802_numRVS1480.npy
CG_137_padded_reads_day6_CRoff_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_avgBaseQ10_mCthresh0.995_t2t_v2_0_chr1:206583354-206589854_2025-09-27_units_combined_numFWD1087_numRVS2083.npy

In [11]:
%%bash
date_today="20251110"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
mkdir ${pileup_data_folder_path}

# mC > 99.5% = 0995 filtered 
# filtered_CROFF_day6_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day6_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# mC >70% = 07 filtered 
# filtered_CROFF_day6_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_Tcells_CRISPRoff_Day6_postEP_R9minion_threshold_mC0.995_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# pileup_CROFF_day6_bed=${pileup_data_folder_path}${date_today}"_noFilter_mC07""_pileup_CROFF_Day6_Tcells.bed"

# Pre-filtered mC > 70%::
CROFF_day6_bam=${data_folder_path}"pre_filtered_ROI_reads_day6_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam"
pileup_CROFF_day6_bed=${pileup_data_folder_path}${date_today}"_noFilter_mC07""_pileup_CROFF_Day6_Tcells.bed"


# use full data unfiltered .BAM file
# data_folder_path="/home/michalula/data/cas9_nanopore/data/20250908_nCATs_T_CRoff_Day_6/5mCG/to_t2t_v2_0/"
# CROFF_day6_bam=${data_folder_path}"sort_align_t2t_v2_0_trim_20250908_Day6_CROFF_Tcells_2Libraries_Minion_R9.dna_r9.4.1_e8_sup@v3.3.5mCG.bam"
# pileup_CROFF_day6_bed=${data_folder_path}${date_today}"_full_data""_pileup_CROFF_Day6_Tcells.bed"


ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref= "/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"
threads=32


modkit pileup ${CROFF_day6_bam} ${pileup_CROFF_day6_bed} \
  --cpg \
  --ref ${ref_genome_fa} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${pileup_CROFF_day6_bed}
tabix -p bed ${pileup_CROFF_day6_bed}.gz

printf '%s\n' "CROFF_day6_bam: $CROFF_day6_bam"
printf '%s\n' "pileup_CROFF_day6_bed: $pileup_CROFF_day6_bed"
cat "$pileup_CROFF_day6_bed"

mkdir: cannot create directory ‘/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/new_pileup/’: File exists
[0;32m>[0m calculated chunk size: 48, interval size 100000, processing 4800000 positions concurrently
[0;32m>[0m filtering to only CpG motifs
[0;32m>[0m attempting to sample 10042 reads
[0;32m>[0m Using filter threshold 0.7519531 for C.
[0;32m>[0m Done, processed 439 rows. Processed ~3440 reads and skipped zero reads.
[tabix] the index file exists. Please use '-f' to overwrite.


CROFF_day6_bam: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/pre_filtered_ROI_reads_day6_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam
pileup_CROFF_day6_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_CROFF_Day6_Tcells.bed
chr1	206581919	206581920	m	1	-	206581919	206581920	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206582330	206582331	m	1	-	206582330	206582331	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206582445	206582446	m	1	-	206582445	206582446	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206582473	206582474	m	1	-	206582473	206582474	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206582573	206582574	m	1	-	206582573	206582574	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206582903	206582904	m	1	-	206582903	206582904	255,0,0	1	100.00	1	0	0	0	0	0	0
chr1	206583089	206583090	m	27	+

## Pileup columns explore

bedMethyl column descriptions.

Definitions:

Nmod - Number of calls passing filters that were classified as a residue with a specified base modification.

Ncanonical - Number of calls passing filters were classified as the canonical base rather than modified. The exact base must be inferred by the modification code. For example, if the modification code is m (5mC) then the canonical base is cytosine. If the modification code is a, the canonical base is adenine.

Nother mod - Number of calls passing filters that were classified as modified, but where the modification is different from the listed base (and the corresponding canonical base is equal). For example, for a given cytosine there may be 3 reads with h calls, 1 with a canonical call, and 2 with m calls. In the bedMethyl row for h Nother_mod would be 2. In the m row Nother_mod would be 3.

Nvalid_cov - the valid coverage. Nvalid_cov = Nmod + Nother_mod + Ncanonical, also used as the score in the bedMethyl

Ndiff - Number of reads with a base other than the canonical base for this modification. For example, in a row for h the canonical base is cytosine, if there are 2 reads with C->A substitutions, Ndiff will be 2.

Ndelete - Number of reads with a deletion at this reference position

Nfail - Number of calls where the probability of the call was below the threshold. The threshold can be set on the command line or computed from the data (usually failing the lowest 10th percentile of calls).

Nnocall - Number of reads aligned to this reference position, with the correct canonical base, but without a base modification call. This can happen, for example, if the model requires a CpG dinucleotide and the read has a CG->CH substitution such that no modification call was produced by the basecaller.


column	name	description	type

    1	chrom	name of reference sequence from BAM header	str

    2	start position	0-based start position	int

    3	end position	0-based exclusive end position	int

    4	modified base code and motif	single letter code for modified base and motif when more than one motif is used	str

    5	score	equal to Nvalid_cov	int

    6	strand	'+' for positive strand '-' for negative strand, '.' when strands are combined	str

    7	start position	included for compatibility	int

    8	end position	included for compatibility	int

    9	color	included for compatibility, always 255,0,0	str

    10	Nvalid_cov	see definitions above.	int

    11	percent modified	(Nmod / Nvalid_cov) * 100	float

    12	Nmod	see definitions above	int

    13	Ncanonical	see definitions above	int

    14	Nother_mod	see definitions above	int

    15	Ndelete	see definitions above	int

    16	Nfail	see definitions above	int

    17	Ndiff	see definitions above	int

    18	Nnocall	see definitions above	int


In [13]:

date_today="20251110"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=data_folder_path+"new_pileup/"

CROFF_day6_bam=data_folder_path+"filtered_reads_overlap_MORE_than_0.9_day6_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam"
pileup_CROFF_day6_bed=pileup_data_folder_path+date_today+"_noFilter_mC07""_pileup_CROFF_Day6_Tcells.bed"

pileup_CROFF_day6_bed

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_CROFF_Day6_Tcells.bed'

In [16]:
pileup_CROFF_day6_df = load_pileup_bed(pileup_CROFF_day6_bed)
pileup_CROFF_day6_df

Reading bedMethyl file: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_CROFF_Day6_Tcells.bed
Loaded DataFrame shape: (439, 18)


  df[c] = pd.to_numeric(df[c], errors="ignore")


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206581919,206581920,m,1,-,206581919,206581920,25500,1,100.0,1,0,0,0,0,0,0
1,chr1,206582330,206582331,m,1,-,206582330,206582331,25500,1,100.0,1,0,0,0,0,0,0
2,chr1,206582445,206582446,m,1,-,206582445,206582446,25500,1,100.0,1,0,0,0,0,0,0
3,chr1,206582473,206582474,m,1,-,206582473,206582474,25500,1,100.0,1,0,0,0,0,0,0
4,chr1,206582573,206582574,m,1,-,206582573,206582574,25500,1,100.0,1,0,0,0,0,0,0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206581919,206581920,m,1,-,206581919,206581920,25500,1,100.0,1,0,0,0,0,0,0
1,chr1,206582330,206582331,m,1,-,206582330,206582331,25500,1,100.0,1,0,0,0,0,0,0
2,chr1,206582445,206582446,m,1,-,206582445,206582446,25500,1,100.0,1,0,0,0,0,0,0
3,chr1,206582473,206582474,m,1,-,206582473,206582474,25500,1,100.0,1,0,0,0,0,0,0
4,chr1,206582573,206582574,m,1,-,206582573,206582574,25500,1,100.0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,chr1,206650134,206650135,m,1,+,206650134,206650135,25500,1,0.0,0,1,0,0,0,0,0
435,chr1,206651057,206651058,m,1,+,206651057,206651058,25500,1,100.0,1,0,0,0,0,0,0
436,chr1,206653446,206653447,m,1,+,206653446,206653447,25500,1,100.0,1,0,0,0,0,0,0
437,chr1,206653875,206653876,m,1,+,206653875,206653876,25500,1,100.0,1,0,0,0,0,0,0


In [17]:
pileup_CROFF_day6_df_stats = plot_pileup_roi_df(df_roi=pileup_CROFF_day6_df, out_dir=pileup_data_folder_path)
pileup_CROFF_day6_df_stats


ROI rows: 439
Percent modified: median=87.79, mean=73.46
Coverage (Nvalid_cov): min=1, median=1034.0, max=1792


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206581919,206581920,m,1,-,206581919,206581920,25500,1,100.0,1,0,0,0,0,0,0,206581919
chr1,206582330,206582331,m,1,-,206582330,206582331,25500,1,100.0,1,0,0,0,0,0,0,206582330
chr1,206582445,206582446,m,1,-,206582445,206582446,25500,1,100.0,1,0,0,0,0,0,0,206582445
chr1,206582473,206582474,m,1,-,206582473,206582474,25500,1,100.0,1,0,0,0,0,0,0,206582473
chr1,206582573,206582574,m,1,-,206582573,206582574,25500,1,100.0,1,0,0,0,0,0,0,206582573
chr1,206582903,206582904,m,1,-,206582903,206582904,25500,1,100.0,1,0,0,0,0,0,0,206582903
chr1,206583089,206583090,m,27,+,206583089,206583090,25500,27,77.78,21,6,0,0,1,3,2,206583089
chr1,206583090,206583091,m,19,-,206583090,206583091,25500,19,100.0,19,0,0,2,0,2,2,206583090
chr1,206583173,206583174,m,1112,+,206583173,206583174,25500,1112,89.66,997,115,0,76,202,268,101,206583173
chr1,206583174,206583175,m,1187,-,206583174,206583175,25500,1187,97.22,1154,33,0,19,41,54,95,206583174


ROI rows: 439
Percent modified: median=87.79, mean=73.46
Coverage (Nvalid_cov): min=1, median=1034.0, max=1792


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206581919,206581920,m,1,-,206581919,206581920,25500,1,100.0,1,0,0,0,0,0,0,206581919
chr1,206582330,206582331,m,1,-,206582330,206582331,25500,1,100.0,1,0,0,0,0,0,0,206582330
chr1,206582445,206582446,m,1,-,206582445,206582446,25500,1,100.0,1,0,0,0,0,0,0,206582445
chr1,206582473,206582474,m,1,-,206582473,206582474,25500,1,100.0,1,0,0,0,0,0,0,206582473
chr1,206582573,206582574,m,1,-,206582573,206582574,25500,1,100.0,1,0,0,0,0,0,0,206582573
chr1,206582903,206582904,m,1,-,206582903,206582904,25500,1,100.0,1,0,0,0,0,0,0,206582903
chr1,206583089,206583090,m,27,+,206583089,206583090,25500,27,77.78,21,6,0,0,1,3,2,206583089
chr1,206583090,206583091,m,19,-,206583090,206583091,25500,19,100.0,19,0,0,2,0,2,2,206583090
chr1,206583173,206583174,m,1112,+,206583173,206583174,25500,1112,89.66,997,115,0,76,202,268,101,206583173
chr1,206583174,206583175,m,1187,-,206583174,206583175,25500,1187,97.22,1154,33,0,19,41,54,95,206583174


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
0,chr1,206581919,206581920,m,1,-,206581919,206581920,25500,1,...,0,0,0,0,0,206581919,206581919:-,1,100.0,0.0
1,chr1,206582330,206582331,m,1,-,206582330,206582331,25500,1,...,0,0,0,0,0,206582330,206582330:-,1,100.0,0.0
2,chr1,206582445,206582446,m,1,-,206582445,206582446,25500,1,...,0,0,0,0,0,206582445,206582445:-,1,100.0,0.0
3,chr1,206582473,206582474,m,1,-,206582473,206582474,25500,1,...,0,0,0,0,0,206582473,206582473:-,1,100.0,0.0
4,chr1,206582573,206582574,m,1,-,206582573,206582574,25500,1,...,0,0,0,0,0,206582573,206582573:-,1,100.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,chr1,206650134,206650135,m,1,+,206650134,206650135,25500,1,...,0,0,0,0,0,206650134,206650134:+,1,0.0,100.0
435,chr1,206651057,206651058,m,1,+,206651057,206651058,25500,1,...,0,0,0,0,0,206651057,206651057:+,1,100.0,0.0
436,chr1,206653446,206653447,m,1,+,206653446,206653447,25500,1,...,0,0,0,0,0,206653446,206653446:+,1,100.0,0.0
437,chr1,206653875,206653876,m,1,+,206653875,206653876,25500,1,...,0,0,0,0,0,206653875,206653875:+,1,100.0,0.0


#TODO:
# WHAT is going on with the lack of C calls at the target dip????

# <!> WHY is the total number of mC+C reads SO LOW????

# WHY is the last figure with precentages SO MUCH cleaner and hides the LACK of called based info there????

in that C called as something else???

Is that not a real ROI Cas9 dip??

WHAT Are the bases there?????

AAA those are SORTED???? !!!!

based on which column????

# Look at CpGs within out target ROI
T2T v2.0

First CG:
206583388,206583390

Last of selected 137 CGs in the ROI:

206589746,206589748 --CpG_137

=> here have each CG position separate: so have 137*2  = 276

In [18]:
137*2, 277-5

(274, 272)

In [19]:
pileup_CROFF_day6_df[pileup_CROFF_day6_df['start'] == 206583387]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
10,chr1,206583387,206583388,m,1309,+,206583387,206583388,25500,1309,63.87,836,473,0,52,190,195,127,206583387


In [20]:
pileup_CROFF_day6_df[pileup_CROFF_day6_df['start'] == 206583388]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
11,chr1,206583388,206583389,m,938,-,206583388,206583389,25500,938,81.98,769,169,0,153,81,78,229,206583388


In [21]:
pileup_CROFF_day6_df[pileup_CROFF_day6_df['start'] == 206589746]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
283,chr1,206589746,206589747,m,866,-,206589746,206589747,25500,866,94.57,819,47,0,203,64,116,232,206589746


In [22]:
(284-10) / 2

137.0

In [23]:
pileup_CROFF_day6_df_roi = pileup_CROFF_day6_df.iloc[10:284, :]  # Display target region rows
print(pileup_CROFF_day6_df_roi.shape,pileup_CROFF_day6_df_roi.shape[0]/2)
pileup_CROFF_day6_df_roi

(274, 19) 137.0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
10,chr1,206583387,206583388,m,1309,+,206583387,206583388,25500,1309,63.87,836,473,0,52,190,195,127,206583387
11,chr1,206583388,206583389,m,938,-,206583388,206583389,25500,938,81.98,769,169,0,153,81,78,229,206583388
12,chr1,206583707,206583708,m,1536,+,206583707,206583708,25500,1536,94.27,1448,88,0,81,44,101,119,206583707
13,chr1,206583708,206583709,m,1067,-,206583708,206583709,25500,1067,94.75,1011,56,0,85,85,137,120,206583708
14,chr1,206583766,206583767,m,1208,+,206583766,206583767,25500,1208,88.33,1067,141,0,97,77,307,193,206583766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,chr1,206589213,206589214,m,1213,-,206589213,206589214,25500,1213,96.54,1171,42,0,85,63,49,88,206589213
280,chr1,206589436,206589437,m,1584,+,206589436,206589437,25500,1584,95.27,1509,75,0,69,36,97,73,206589436
281,chr1,206589437,206589438,m,1198,-,206589437,206589438,25500,1198,94.32,1130,68,0,49,24,104,115,206589437
282,chr1,206589745,206589746,m,1341,+,206589745,206589746,25500,1341,98.14,1316,25,0,181,22,117,188,206589745


<!>
From day 6 data:
Threshold of  0.767

From day 35 data:
> Threshold of  0.7597656 for base C is low. Consider increasing the filter-percentile or specifying a higher threshold.
> Done, processed 11762972 rows. Processed ~129977 reads and skipped ~150 reads.

In [24]:
# Plot pileup_Unedit_day6_df_roi summary plots (use existing variables/imports in the notebook)
# Saves interactive HTMLs to pileup_data_folder_path and displays inline.
out_dir = pileup_data_folder_path  # existing variable in the notebook

df_roi = pileup_CROFF_day6_df_roi.copy()

df_roi_stats = plot_pileup_roi_df(df_roi=pileup_CROFF_day6_df_roi, out_dir=pileup_data_folder_path)
df_roi_stats




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

ROI rows: 274
Percent modified: median=75.31, mean=68.76
Coverage (Nvalid_cov): min=467, median=1226.0, max=1792


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,1309,+,206583387,206583388,25500,1309,63.87,836,473,0,52,190,195,127,206583387
chr1,206583388,206583389,m,938,-,206583388,206583389,25500,938,81.98,769,169,0,153,81,78,229,206583388
chr1,206583707,206583708,m,1536,+,206583707,206583708,25500,1536,94.27,1448,88,0,81,44,101,119,206583707
chr1,206583708,206583709,m,1067,-,206583708,206583709,25500,1067,94.75,1011,56,0,85,85,137,120,206583708
chr1,206583766,206583767,m,1208,+,206583766,206583767,25500,1208,88.33,1067,141,0,97,77,307,193,206583766
chr1,206583767,206583768,m,1111,-,206583767,206583768,25500,1111,93.88,1043,68,0,70,118,96,100,206583767
chr1,206584104,206584105,m,1362,+,206584104,206584105,25500,1362,94.93,1293,69,0,78,22,340,89,206584104
chr1,206584105,206584106,m,1126,-,206584105,206584106,25500,1126,95.29,1073,53,0,117,27,87,149,206584105
chr1,206584137,206584138,m,1792,+,206584137,206584138,25500,1792,96.26,1725,67,0,3,82,11,11,206584137
chr1,206584138,206584139,m,1030,-,206584138,206584139,25500,1030,89.22,919,111,0,1,447,21,12,206584138


ROI rows: 274
Percent modified: median=75.31, mean=68.76
Coverage (Nvalid_cov): min=467, median=1226.0, max=1792


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,1309,+,206583387,206583388,25500,1309,63.87,836,473,0,52,190,195,127,206583387
chr1,206583388,206583389,m,938,-,206583388,206583389,25500,938,81.98,769,169,0,153,81,78,229,206583388
chr1,206583707,206583708,m,1536,+,206583707,206583708,25500,1536,94.27,1448,88,0,81,44,101,119,206583707
chr1,206583708,206583709,m,1067,-,206583708,206583709,25500,1067,94.75,1011,56,0,85,85,137,120,206583708
chr1,206583766,206583767,m,1208,+,206583766,206583767,25500,1208,88.33,1067,141,0,97,77,307,193,206583766
chr1,206583767,206583768,m,1111,-,206583767,206583768,25500,1111,93.88,1043,68,0,70,118,96,100,206583767
chr1,206584104,206584105,m,1362,+,206584104,206584105,25500,1362,94.93,1293,69,0,78,22,340,89,206584104
chr1,206584105,206584106,m,1126,-,206584105,206584106,25500,1126,95.29,1073,53,0,117,27,87,149,206584105
chr1,206584137,206584138,m,1792,+,206584137,206584138,25500,1792,96.26,1725,67,0,3,82,11,11,206584137
chr1,206584138,206584139,m,1030,-,206584138,206584139,25500,1030,89.22,919,111,0,1,447,21,12,206584138


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
10,chr1,206583387,206583388,m,1309,+,206583387,206583388,25500,1309,...,0,52,190,195,127,206583387,206583387:+,1309,63.865546,36.134454
11,chr1,206583388,206583389,m,938,-,206583388,206583389,25500,938,...,0,153,81,78,229,206583388,206583388:-,938,81.982942,18.017058
12,chr1,206583707,206583708,m,1536,+,206583707,206583708,25500,1536,...,0,81,44,101,119,206583707,206583707:+,1536,94.270833,5.729167
13,chr1,206583708,206583709,m,1067,-,206583708,206583709,25500,1067,...,0,85,85,137,120,206583708,206583708:-,1067,94.751640,5.248360
14,chr1,206583766,206583767,m,1208,+,206583766,206583767,25500,1208,...,0,97,77,307,193,206583766,206583766:+,1208,88.327815,11.672185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,chr1,206589213,206589214,m,1213,-,206589213,206589214,25500,1213,...,0,85,63,49,88,206589213,206589213:-,1213,96.537510,3.462490
280,chr1,206589436,206589437,m,1584,+,206589436,206589437,25500,1584,...,0,69,36,97,73,206589436,206589436:+,1584,95.265152,4.734848
281,chr1,206589437,206589438,m,1198,-,206589437,206589438,25500,1198,...,0,49,24,104,115,206589437,206589437:-,1198,94.323873,5.676127
282,chr1,206589745,206589746,m,1341,+,206589745,206589746,25500,1341,...,0,181,22,117,188,206589745,206589745:+,1341,98.135720,1.864280


# Unedited T cells Day 6

In [27]:
! ls /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output

CG_137_padded_reads_day6_unedited_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10_mCthresh0.7_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD490_numRVS644.npy
CG_137_padded_reads_day6_unedited_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10_mCthresh0.995_t2t_v2_0_chr1:206583354-206589854_2025-09-29_units_combined_numFWD489_numRVS638.npy
extracted_reads
filtered_reads_overlap_MORE_than_0.9_day6_unedited_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam
filtered_reads_overlap_MORE_than_0.9_day6_unedited_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam.bai
filtered_reads_overlap_MORE_than_0.9_day6_unedited_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam
filtered_reads_overlap_MORE_than_0.9_day6_unedited_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam.bai
filtered_reads_overlap_MORE_than_0.9_day6_unedited_Tcells_Thr0

In [28]:
%%bash

date_today="20251110"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
mkdir ${pileup_data_folder_path}

filtered_Unedit_day6_bam=${data_folder_path}"pre_filtered_ROI_reads_day6_unedited_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam"
pileup_Unedit_day6_bed=${pileup_data_folder_path}${date_today}"_noFilter_mC07""_pileup_NT_Day6_Tcells.bed"
cat "$pileup_Unedit_day6_bed"

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref="/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"
threads=32

modkit pileup ${filtered_Unedit_day6_bam} ${pileup_Unedit_day6_bed} \
  --cpg \
  --ref ${ref_genome_fa} \
  --threads ${threads} \
  --log-filepath log.txt

bgzip -k ${pileup_Unedit_day6_bed}
tabix -p bed ${pileup_Unedit_day6_bed}.gz

printf '%s\n' "filtered_Unedit_day6_bam: $filtered_Unedit_day6_bam"
printf '%s\n' "pileup_Unedit_day6_bed: $pileup_Unedit_day6_bed"
cat "$pileup_Unedit_day6_bed"


mkdir: cannot create directory ‘/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output/new_pileup/’: File exists


chr1	206583089	206583090	m	7	+	206583089	206583090	255,0,0	7	57.14	4	3	0	0	0	1	1
chr1	206583090	206583091	m	9	-	206583090	206583091	255,0,0	9	77.78	7	2	0	1	1	1	0
chr1	206583173	206583174	m	390	+	206583173	206583174	255,0,0	390	91.79	358	32	0	26	76	62	24
chr1	206583174	206583175	m	473	-	206583174	206583175	255,0,0	473	98.31	465	8	0	8	12	10	19
chr1	206583387	206583388	m	459	+	206583387	206583388	255,0,0	459	68.63	315	144	0	7	63	33	32
chr1	206583388	206583389	m	382	-	206583388	206583389	255,0,0	382	81.15	310	72	0	44	23	18	65
chr1	206583707	206583708	m	529	+	206583707	206583708	255,0,0	529	92.63	490	39	0	13	18	17	17
chr1	206583708	206583709	m	426	-	206583708	206583709	255,0,0	426	92.25	393	33	0	18	38	34	17
chr1	206583766	206583767	m	440	+	206583766	206583767	255,0,0	440	92.27	406	34	0	22	27	67	38
chr1	206583767	206583768	m	430	-	206583767	206583768	255,0,0	430	96.74	416	14	0	22	41	21	19
chr1	206584104	206584105	m	463	+	206584104	206584105	255,0,0	463	96.33	446	17	0	18	11	84	18
chr1	2065841

[0;32m>[0m calculated chunk size: 48, interval size 100000, processing 4800000 positions concurrently
[0;32m>[0m filtering to only CpG motifs
[0;32m>[0m attempting to sample 10042 reads
[0;32m>[0m Using filter threshold 0.7480469 for C.
[0;32m>[0m Done, processed 286 rows. Processed ~1594 reads and skipped zero reads.


filtered_Unedit_day6_bam: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output/pre_filtered_ROI_reads_day6_unedited_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam
pileup_Unedit_day6_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_NT_Day6_Tcells.bed
chr1	206583089	206583090	m	12	+	206583089	206583090	255,0,0	12	75.00	9	3	0	0	0	1	1
chr1	206583090	206583091	m	12	-	206583090	206583091	255,0,0	12	83.33	10	2	0	1	1	1	0
chr1	206583173	206583174	m	516	+	206583173	206583174	255,0,0	516	91.09	470	46	0	41	88	117	42
chr1	206583174	206583175	m	655	-	206583174	206583175	255,0,0	655	98.32	644	11	0	15	16	18	45
chr1	206583387	206583388	m	595	+	206583387	206583388	255,0,0	595	66.05	393	202	0	17	86	62	63
chr1	206583388	206583389	m	497	-	206583388	206583389	255,0,0	

In [29]:
date_today="20251110"
data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=data_folder_path+"new_pileup/"
pileup_Unedit_day6_bed=pileup_data_folder_path+date_today+"_noFilter_mC07_pileup_NT_Day6_Tcells.bed"
# cat "$pileup_Unedit_day6_bed"


pileup_Unedit_day6_bed

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_NT_Day6_Tcells.bed'

In [30]:
pileup_Unedit_day6_df = load_pileup_bed(pileup_Unedit_day6_bed)
pileup_Unedit_day6_df

Reading bedMethyl file: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_NT_Day6_Tcells.bed
Loaded DataFrame shape: (286, 18)



errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead



Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,12,+,206583089,206583090,25500,12,75.0,9,3,0,0,0,1,1
1,chr1,206583090,206583091,m,12,-,206583090,206583091,25500,12,83.33,10,2,0,1,1,1,0
2,chr1,206583173,206583174,m,516,+,206583173,206583174,25500,516,91.09,470,46,0,41,88,117,42
3,chr1,206583174,206583175,m,655,-,206583174,206583175,25500,655,98.32,644,11,0,15,16,18,45
4,chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,66.05,393,202,0,17,86,62,63


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall
0,chr1,206583089,206583090,m,12,+,206583089,206583090,25500,12,75.00,9,3,0,0,0,1,1
1,chr1,206583090,206583091,m,12,-,206583090,206583091,25500,12,83.33,10,2,0,1,1,1,0
2,chr1,206583173,206583174,m,516,+,206583173,206583174,25500,516,91.09,470,46,0,41,88,117,42
3,chr1,206583174,206583175,m,655,-,206583174,206583175,25500,655,98.32,644,11,0,15,16,18,45
4,chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,66.05,393,202,0,17,86,62,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,chr1,206589931,206589932,m,36,-,206589931,206589932,25500,36,94.44,34,2,0,2,4,36,4
282,chr1,206589955,206589956,m,74,+,206589955,206589956,25500,74,91.89,68,6,0,2,4,7,1
283,chr1,206589956,206589957,m,31,-,206589956,206589957,25500,31,96.77,30,1,0,2,3,6,10
284,chr1,206590032,206590033,m,54,+,206590032,206590033,25500,54,88.89,48,6,0,12,2,8,9


In [31]:
pileup_Unedit_day6_df_stats = plot_pileup_roi_df(df_roi=pileup_Unedit_day6_df, out_dir=pileup_data_folder_path)
pileup_Unedit_day6_df_stats

ROI rows: 286
Percent modified: median=6.05, mean=28.22
Coverage (Nvalid_cov): min=12, median=636.5, max=795


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583089,206583090,m,12,+,206583089,206583090,25500,12,75.0,9,3,0,0,0,1,1,206583089
chr1,206583090,206583091,m,12,-,206583090,206583091,25500,12,83.33,10,2,0,1,1,1,0,206583090
chr1,206583173,206583174,m,516,+,206583173,206583174,25500,516,91.09,470,46,0,41,88,117,42,206583173
chr1,206583174,206583175,m,655,-,206583174,206583175,25500,655,98.32,644,11,0,15,16,18,45,206583174
chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,66.05,393,202,0,17,86,62,63,206583387
chr1,206583388,206583389,m,497,-,206583388,206583389,25500,497,81.69,406,91,0,86,31,37,117,206583388
chr1,206583707,206583708,m,685,+,206583707,206583708,25500,685,92.55,634,51,0,32,27,38,42,206583707
chr1,206583708,206583709,m,568,-,206583708,206583709,25500,568,92.96,528,40,0,40,56,65,40,206583708
chr1,206583766,206583767,m,554,+,206583766,206583767,25500,554,91.88,509,45,0,37,35,138,60,206583766
chr1,206583767,206583768,m,581,-,206583767,206583768,25500,581,95.87,557,24,0,39,60,45,44,206583767


ROI rows: 286
Percent modified: median=6.05, mean=28.22
Coverage (Nvalid_cov): min=12, median=636.5, max=795


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583089,206583090,m,12,+,206583089,206583090,25500,12,75.0,9,3,0,0,0,1,1,206583089
chr1,206583090,206583091,m,12,-,206583090,206583091,25500,12,83.33,10,2,0,1,1,1,0,206583090
chr1,206583173,206583174,m,516,+,206583173,206583174,25500,516,91.09,470,46,0,41,88,117,42,206583173
chr1,206583174,206583175,m,655,-,206583174,206583175,25500,655,98.32,644,11,0,15,16,18,45,206583174
chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,66.05,393,202,0,17,86,62,63,206583387
chr1,206583388,206583389,m,497,-,206583388,206583389,25500,497,81.69,406,91,0,86,31,37,117,206583388
chr1,206583707,206583708,m,685,+,206583707,206583708,25500,685,92.55,634,51,0,32,27,38,42,206583707
chr1,206583708,206583709,m,568,-,206583708,206583709,25500,568,92.96,528,40,0,40,56,65,40,206583708
chr1,206583766,206583767,m,554,+,206583766,206583767,25500,554,91.88,509,45,0,37,35,138,60,206583766
chr1,206583767,206583768,m,581,-,206583767,206583768,25500,581,95.87,557,24,0,39,60,45,44,206583767


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
0,chr1,206583089,206583090,m,12,+,206583089,206583090,25500,12,...,0,0,0,1,1,206583089,206583089:+,12,75.000000,25.000000
1,chr1,206583090,206583091,m,12,-,206583090,206583091,25500,12,...,0,1,1,1,0,206583090,206583090:-,12,83.333333,16.666667
2,chr1,206583173,206583174,m,516,+,206583173,206583174,25500,516,...,0,41,88,117,42,206583173,206583173:+,516,91.085271,8.914729
3,chr1,206583174,206583175,m,655,-,206583174,206583175,25500,655,...,0,15,16,18,45,206583174,206583174:-,655,98.320611,1.679389
4,chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,...,0,17,86,62,63,206583387,206583387:+,595,66.050420,33.949580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,chr1,206589931,206589932,m,36,-,206589931,206589932,25500,36,...,0,2,4,36,4,206589931,206589931:-,36,94.444444,5.555556
282,chr1,206589955,206589956,m,74,+,206589955,206589956,25500,74,...,0,2,4,7,1,206589955,206589955:+,74,91.891892,8.108108
283,chr1,206589956,206589957,m,31,-,206589956,206589957,25500,31,...,0,2,3,6,10,206589956,206589956:-,31,96.774194,3.225806
284,chr1,206590032,206590033,m,54,+,206590032,206590033,25500,54,...,0,12,2,8,9,206590032,206590032:+,54,88.888889,11.111111


# focus on ROI

In [32]:
pileup_Unedit_day6_df[pileup_Unedit_day6_df['start'] == 206583388-1]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
4,chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,66.05,393,202,0,17,86,62,63,206583387


In [33]:
pileup_Unedit_day6_df[pileup_Unedit_day6_df['start'] == 206583388]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
5,chr1,206583388,206583389,m,497,-,206583388,206583389,25500,497,81.69,406,91,0,86,31,37,117,206583388


In [34]:
pileup_Unedit_day6_df[pileup_Unedit_day6_df['start'] == 206589746-1]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
276,chr1,206589745,206589746,m,617,+,206589745,206589746,25500,617,98.87,610,7,0,89,17,41,60,206589745


In [35]:
pileup_Unedit_day6_df[pileup_Unedit_day6_df['start'] == 206589746]

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
277,chr1,206589746,206589747,m,459,-,206589746,206589747,25500,459,94.77,435,24,0,110,46,53,99,206589746


In [36]:
pileup_Unedit_day6_df_roi = pileup_Unedit_day6_df.iloc[4:278, :]  # Display target region rows
print(pileup_Unedit_day6_df_roi.shape,pileup_Unedit_day6_df_roi.shape[0]/2)
pileup_Unedit_day6_df_roi

(274, 19) 137.0


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
4,chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,66.05,393,202,0,17,86,62,63,206583387
5,chr1,206583388,206583389,m,497,-,206583388,206583389,25500,497,81.69,406,91,0,86,31,37,117,206583388
6,chr1,206583707,206583708,m,685,+,206583707,206583708,25500,685,92.55,634,51,0,32,27,38,42,206583707
7,chr1,206583708,206583709,m,568,-,206583708,206583709,25500,568,92.96,528,40,0,40,56,65,40,206583708
8,chr1,206583766,206583767,m,554,+,206583766,206583767,25500,554,91.88,509,45,0,37,35,138,60,206583766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,631,-,206589213,206589214,25500,631,92.23,582,49,0,34,37,21,46,206589213
274,chr1,206589436,206589437,m,716,+,206589436,206589437,25500,716,96.23,689,27,0,24,16,34,35,206589436
275,chr1,206589437,206589438,m,648,-,206589437,206589438,25500,648,93.83,608,40,0,25,11,47,38,206589437
276,chr1,206589745,206589746,m,617,+,206589745,206589746,25500,617,98.87,610,7,0,89,17,41,60,206589745


In [37]:
pileup_Unedit_day6_df

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
0,chr1,206583089,206583090,m,12,+,206583089,206583090,25500,12,75.00,9,3,0,0,0,1,1,206583089
1,chr1,206583090,206583091,m,12,-,206583090,206583091,25500,12,83.33,10,2,0,1,1,1,0,206583090
2,chr1,206583173,206583174,m,516,+,206583173,206583174,25500,516,91.09,470,46,0,41,88,117,42,206583173
3,chr1,206583174,206583175,m,655,-,206583174,206583175,25500,655,98.32,644,11,0,15,16,18,45,206583174
4,chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,66.05,393,202,0,17,86,62,63,206583387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,chr1,206589931,206589932,m,36,-,206589931,206589932,25500,36,94.44,34,2,0,2,4,36,4,206589931
282,chr1,206589955,206589956,m,74,+,206589955,206589956,25500,74,91.89,68,6,0,2,4,7,1,206589955
283,chr1,206589956,206589957,m,31,-,206589956,206589957,25500,31,96.77,30,1,0,2,3,6,10,206589956
284,chr1,206590032,206590033,m,54,+,206590032,206590033,25500,54,88.89,48,6,0,12,2,8,9,206590032


In [38]:
pileup_Unedit_day6_df_roi

Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
4,chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,66.05,393,202,0,17,86,62,63,206583387
5,chr1,206583388,206583389,m,497,-,206583388,206583389,25500,497,81.69,406,91,0,86,31,37,117,206583388
6,chr1,206583707,206583708,m,685,+,206583707,206583708,25500,685,92.55,634,51,0,32,27,38,42,206583707
7,chr1,206583708,206583709,m,568,-,206583708,206583709,25500,568,92.96,528,40,0,40,56,65,40,206583708
8,chr1,206583766,206583767,m,554,+,206583766,206583767,25500,554,91.88,509,45,0,37,35,138,60,206583766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,631,-,206589213,206589214,25500,631,92.23,582,49,0,34,37,21,46,206589213
274,chr1,206589436,206589437,m,716,+,206589436,206589437,25500,716,96.23,689,27,0,24,16,34,35,206589436
275,chr1,206589437,206589438,m,648,-,206589437,206589438,25500,648,93.83,608,40,0,25,11,47,38,206589437
276,chr1,206589745,206589746,m,617,+,206589745,206589746,25500,617,98.87,610,7,0,89,17,41,60,206589745


In [39]:
pileup_data_folder_path

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output/new_pileup/'

In [40]:
df_roi_stats = plot_pileup_roi_df(df_roi=pileup_Unedit_day6_df_roi, out_dir=pileup_data_folder_path)
df_roi_stats




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

ROI rows: 274
Percent modified: median=5.38, mean=25.47
Coverage (Nvalid_cov): min=225, median=642.0, max=795


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,66.05,393,202,0,17,86,62,63,206583387
chr1,206583388,206583389,m,497,-,206583388,206583389,25500,497,81.69,406,91,0,86,31,37,117,206583388
chr1,206583707,206583708,m,685,+,206583707,206583708,25500,685,92.55,634,51,0,32,27,38,42,206583707
chr1,206583708,206583709,m,568,-,206583708,206583709,25500,568,92.96,528,40,0,40,56,65,40,206583708
chr1,206583766,206583767,m,554,+,206583766,206583767,25500,554,91.88,509,45,0,37,35,138,60,206583766
chr1,206583767,206583768,m,581,-,206583767,206583768,25500,581,95.87,557,24,0,39,60,45,44,206583767
chr1,206584104,206584105,m,595,+,206584104,206584105,25500,595,96.3,573,22,0,28,20,148,34,206584104
chr1,206584105,206584106,m,612,-,206584105,206584106,25500,612,95.75,586,26,0,42,7,36,72,206584105
chr1,206584137,206584138,m,795,+,206584137,206584138,25500,795,96.86,770,25,0,1,26,1,2,206584137
chr1,206584138,206584139,m,520,-,206584138,206584139,25500,520,89.62,466,54,0,0,234,9,6,206584138


ROI rows: 274
Percent modified: median=5.38, mean=25.47
Coverage (Nvalid_cov): min=225, median=642.0, max=795


chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,percent_modified,Nmod,Ncanonical,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos
chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,66.05,393,202,0,17,86,62,63,206583387
chr1,206583388,206583389,m,497,-,206583388,206583389,25500,497,81.69,406,91,0,86,31,37,117,206583388
chr1,206583707,206583708,m,685,+,206583707,206583708,25500,685,92.55,634,51,0,32,27,38,42,206583707
chr1,206583708,206583709,m,568,-,206583708,206583709,25500,568,92.96,528,40,0,40,56,65,40,206583708
chr1,206583766,206583767,m,554,+,206583766,206583767,25500,554,91.88,509,45,0,37,35,138,60,206583766
chr1,206583767,206583768,m,581,-,206583767,206583768,25500,581,95.87,557,24,0,39,60,45,44,206583767
chr1,206584104,206584105,m,595,+,206584104,206584105,25500,595,96.3,573,22,0,28,20,148,34,206584104
chr1,206584105,206584106,m,612,-,206584105,206584106,25500,612,95.75,586,26,0,42,7,36,72,206584105
chr1,206584137,206584138,m,795,+,206584137,206584138,25500,795,96.86,770,25,0,1,26,1,2,206584137
chr1,206584138,206584139,m,520,-,206584138,206584139,25500,520,89.62,466,54,0,0,234,9,6,206584138


Unnamed: 0,chrom,start,end,mod_code,score,strand,start2,end2,color,Nvalid_cov,...,Nother_mod,Ndelete,Nfail,Ndiff,Nnocall,pos,label,Ntotal,Nmod_perc,Ncanonical_perc
4,chr1,206583387,206583388,m,595,+,206583387,206583388,25500,595,...,0,17,86,62,63,206583387,206583387:+,595,66.050420,33.949580
5,chr1,206583388,206583389,m,497,-,206583388,206583389,25500,497,...,0,86,31,37,117,206583388,206583388:-,497,81.690141,18.309859
6,chr1,206583707,206583708,m,685,+,206583707,206583708,25500,685,...,0,32,27,38,42,206583707,206583707:+,685,92.554745,7.445255
7,chr1,206583708,206583709,m,568,-,206583708,206583709,25500,568,...,0,40,56,65,40,206583708,206583708:-,568,92.957746,7.042254
8,chr1,206583766,206583767,m,554,+,206583766,206583767,25500,554,...,0,37,35,138,60,206583766,206583766:+,554,91.877256,8.122744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,chr1,206589213,206589214,m,631,-,206589213,206589214,25500,631,...,0,34,37,21,46,206589213,206589213:-,631,92.234548,7.765452
274,chr1,206589436,206589437,m,716,+,206589436,206589437,25500,716,...,0,24,16,34,35,206589436,206589436:+,716,96.229050,3.770950
275,chr1,206589437,206589438,m,648,-,206589437,206589438,25500,648,...,0,25,11,47,38,206589437,206589437:-,648,93.827160,6.172840
276,chr1,206589745,206589746,m,617,+,206589745,206589746,25500,617,...,0,89,17,41,60,206589745,206589745:+,617,98.865478,1.134522


# #TODO:
- try MANUAL mC threshold selection for pileup??? set to 0.995 instead of 0.76 that was automatically selected?

# dmr modkit CRoff vs Unedited (NT)

3. Detecting differential modification at single base positions
The modkit dmr pair command has the ability to score individual bases (e.g. differentially methylated CpGs). To run single-base analysis on one or more paired samples, simply omit the --regions (-r) option when running modkit dmr pair. When performing single-base analysis the likelihood ratio score and a MAP-based p-value are available. For details on the likelihood ratio score and the MAP-based p-value, see the scoring details section. For example the above command becomes:

dmr_result=single_base_haplotype_dmr.bed

modkit dmr pair \
  -a ${hp1_pileup}.gz \
  -b ${hp2_pileup}.gz \
  -o ${dmr_result} \
  --ref ${ref} \
  --base C \
  --threads ${threads} \
  --log-filepath dmr.log

In [41]:
# pileup_data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/merged_2libraries/dimelo_v2_output/new_pileup/"


In [42]:
%%bash

date_today="20251110"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
# mkdir ${pileup_data_folder_path}

filtered_Unedit_day6_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_day6_unedited_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam"
pileup_Unedit_day6_bed=${pileup_data_folder_path}${date_today}"_noFilter_mC07""_pileup_NT_Day6_Tcells.bed"
cat "$pileup_Unedit_day6_bed"

# "/home/michalula/data/cas9_nanopore/data/20250721_nCATs_Tcells_UNEDITED_Day28/merged_outputs/5mCG/to_t2t_v2_0/pileup/pileup_Unedit_Day28_Tcells_20250721.bam"

echo "pileup_Unedit_day6_bed: ${pileup_Unedit_day6_bed}.gz"

ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_Unedit_day6_bed}.gz"
# ls -l "${pileup_Unedit_day28_bam}.gz"

chmod u+rwx "${pileup_data_folder_path}"
chmod u+rwx "${pileup_Unedit_day6_bed}.gz"

# ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_Unedit_day6_bed}.gz"


chr1	206583089	206583090	m	12	+	206583089	206583090	255,0,0	12	75.00	9	3	0	0	0	1	1
chr1	206583090	206583091	m	12	-	206583090	206583091	255,0,0	12	83.33	10	2	0	1	1	1	0
chr1	206583173	206583174	m	516	+	206583173	206583174	255,0,0	516	91.09	470	46	0	41	88	117	42
chr1	206583174	206583175	m	655	-	206583174	206583175	255,0,0	655	98.32	644	11	0	15	16	18	45
chr1	206583387	206583388	m	595	+	206583387	206583388	255,0,0	595	66.05	393	202	0	17	86	62	63
chr1	206583388	206583389	m	497	-	206583388	206583389	255,0,0	497	81.69	406	91	0	86	31	37	117
chr1	206583707	206583708	m	685	+	206583707	206583708	255,0,0	685	92.55	634	51	0	32	27	38	42
chr1	206583708	206583709	m	568	-	206583708	206583709	255,0,0	568	92.96	528	40	0	40	56	65	40
chr1	206583766	206583767	m	554	+	206583766	206583767	255,0,0	554	91.88	509	45	0	37	35	138	60
chr1	206583767	206583768	m	581	-	206583767	206583768	255,0,0	581	95.87	557	24	0	39	60	45	44
chr1	206584104	206584105	m	595	+	206584104	206584105	255,0,0	595	96.30	573	22	0	28	20	148	34


In [43]:
%%bash
date_today="20251110"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
mkdir ${pileup_data_folder_path}
# Pre-filtered mC > 70%::
CROFF_day6_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_day6_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam"
pileup_CROFF_day6_bed=${pileup_data_folder_path}${date_today}"_noFilter_mC07""_pileup_CROFF_Day6_Tcells.bed"

# data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/replica_1/analyze_single_reads/dimelo_v2_output/"
# pileup_data_folder_path=${data_folder_path}"new_pileup/"
# # mkdir ${pileup_data_folder_path}

# # Pre-filtered mC > 70%::
# CROFF_day6_bam=${data_folder_path}"pre_filtered_ROI_reads_Tcells_CRISPRoff_Day6_postEP_R9minion_threshold_mC0.7_T2Tv2_0_filterMode10_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60.bam"
# pileup_CROFF_day6_bed=${pileup_data_folder_path}${date_today}"_noFilter_mC07""_pileup_CROFF_Day6_Tcells.bed"


echo "pileup_CROFF_day6_bed: ${pileup_CROFF_day6_bed}.gz"

ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_CROFF_day6_bed}.gz"
# ls -l "${pileup_Unedit_day28_bam}.gz"

chmod u+rwx "${pileup_data_folder_path}"
chmod u+rwx "${pileup_CROFF_day6_bed}.gz"


# ls "${pileup_data_folder_path}"
ls -ld "${pileup_data_folder_path}"
ls -ld "${pileup_CROFF_day6_bed}.gz"


mkdir: cannot create directory ‘/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/new_pileup/’: File exists


pileup_CROFF_day6_bed: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_CROFF_Day6_Tcells.bed.gz
20251110_filtered_mC07_pileup_CROFF_Day6_Tcells.bed
20251110_filtered_mC07_pileup_CROFF_Day6_Tcells.bed.gz
20251110_filtered_mC07_pileup_CROFF_Day6_Tcells.bed.gz.tbi
20251110_noFilter_mC07_pileup_CROFF_Day6_Tcells.bed
20251110_noFilter_mC07_pileup_CROFF_Day6_Tcells.bed.gz
20251110_noFilter_mC07_pileup_CROFF_Day6_Tcells.bed.gz.tbi
drwxrwxr-x 2 michalula michalula 4096 Nov 10 20:26 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/new_pileup/
-rw-rw-r-- 1 michalula michalula 10233 Nov 10 20:26 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/new_pileup/20251110_noFilter_mC07_pileup_CROFF_Day6_Tcells.bed.gz

In [44]:
%%bash
 
# 3. Detecting differential modification at single base positions
# The modkit dmr pair command has the ability to score individual bases (e.g. differentially methylated CpGs). To run single-base analysis on one or more paired samples, simply omit the --regions (-r) option when running modkit dmr pair. When performing single-base analysis the likelihood ratio score and a MAP-based p-value are available. For details on the likelihood ratio score and the MAP-based p-value, see the scoring details section. For example the above command becomes:

date_today="20251110"

experiment_codition="noFilter_day6_CRoff_vs_Unedit"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/"
dmr_result=${dmr_output_path}${date_today}"_single_base_noFiltered_mC07_"${experiment_codition}".bed"


data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/unedited/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
# mkdir ${pileup_data_folder_path}

filtered_Unedit_day6_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_day6_unedited_Tcells_mC0.995_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam"
pileup_Unedit_day6_bed=${pileup_data_folder_path}${date_today}"_noFilter_mC07""_pileup_NT_Day6_Tcells.bed"
cat "$pileup_Unedit_day6_bed"



echo "pileup_Unedit_day6_bed: ${pileup_Unedit_day6_bed}.gz"
ls -l "${pileup_Unedit_day6_bed}.gz"

data_folder_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/"
pileup_data_folder_path=${data_folder_path}"new_pileup/"
mkdir ${pileup_data_folder_path}
# Pre-filtered mC > 70%::
CROFF_day6_bam=${data_folder_path}"filtered_reads_overlap_MORE_than_0.9_day6_CRoff_Tcells_mC0.7_T2Tv2_NoFullyUnmeth_ovrlap0.9_mismat0.7_mapQ60_modBaseQ10.bam"
pileup_CROFF_day6_bed=${pileup_data_folder_path}${date_today}"_noFilter_mC07""_pileup_CROFF_Day6_Tcells.bed"

echo "pileup_CROFF_day6_bed: ${pileup_CROFF_day6_bed}.gz"
ls -l "${pileup_CROFF_day6_bed}.gz"

ref_genome_fa="/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
# ref="/home/michalula/data/ref_genomes/to_t2t_v1_1/chm13.draft_v1.1.fasta"
threads=32
 
cd ${dmr_output_path}
# '/home/michalula/code/epiCausality/epiCode/differential_methyl'

modkit dmr pair \
  -a ${pileup_CROFF_day6_bed}.gz \
  -b ${pileup_Unedit_day6_bed}.gz \
  -o ${dmr_result} \
  --ref ${ref_genome_fa} \
  --base C \
  --threads ${threads} \
  --log-filepath dmr.log


echo "dmr_result: $dmr_result"
ls -lah $dmr_result

chr1	206583089	206583090	m	12	+	206583089	206583090	255,0,0	12	75.00	9	3	0	0	0	1	1
chr1	206583090	206583091	m	12	-	206583090	206583091	255,0,0	12	83.33	10	2	0	1	1	1	0
chr1	206583173	206583174	m	516	+	206583173	206583174	255,0,0	516	91.09	470	46	0	41	88	117	42
chr1	206583174	206583175	m	655	-	2065831ERROR! Session/line number was not unique in database. History logging moved to new session 1354
74	206583175	255,0,0	655	98.32	644	11	0	15	16	18	45
chr1	206583387	206583388	m	595	+	206583387	206583388	255,0,0	595	66.05	393	202	0	17	86	62	63
chr1	206583388	206583389	m	497	-	206583388	206583389	255,0,0	497	81.69	406	91	0	86	31	37	117
chr1	206583707	206583708	m	685	+	206583707	206583708	255,0,0	685	92.55	634	51	0	32	27	38	42
chr1	206583708	206583709	m	568	-	206583708	206583709	255,0,0	568	92.96	528	40	0	40	56	65	40
chr1	206583766	206583767	m	554	+	206583766	206583767	255,0,0	554	91.88	509	45	0	37	35	138	60
chr1	206583767	206583768	m	581	-	206583767	206583768	255,0,0	581	95.87	557	24	0	39	60	45

mkdir: cannot create directory ‘/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/croff/analyze_single_reads/dimelo_v2_output/new_pileup/’: File exists
bash: line 39: cd: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/: No such file or directory
[0;32m>[0m creating directory at "/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output"
[0;32m>[0m reading reference FASTA at "/home/michalula/data/ref_genomes/t2t_v2_0/chm13v2.0.fa"
[0;32m>[0m 1 common sequence(s) between FASTA and both samples
[0;32m>[0m running single-site analysis
[0;32m>[0m using default prior, Beta(α: 0.55, β: 0.55)
[0;32m>[0m estimating max coverages from data
[0;32m>[0m sampled 439 a records and 286 b records, calculating max coverages for 95th percentile
[0;32m>[0m calculated max coverage for a: 1567 a

dmr_result: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day6_CRoff_vs_Unedit.bed
-rw-rw-r-- 1 michalula michalula 57K Nov 10 20:39 /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day6_CRoff_vs_Unedit.bed


In [45]:
%%bash
date_today="20251110"

experiment_codition="noFilter_day6_CRoff_vs_Unedit" 
dmr_output_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/"
#  20251110_single_base_noFiltered_mC07_noFilter_day6_CRoff_vs_Unedit.bed
"/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07_filtered/new_dmr_output/"
dmr_result=${dmr_output_path}${date_today}"_single_base_noFiltered_mC07_"${experiment_codition}".bed"

# date_today="20251109"
# experiment_codition="day6_CRoff_vs_Unedit"
# dmr_output_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07_filtered/new_dmr_output/"
# dmr_result=${dmr_output_path}"_"${date_today}"single_base_noFilter_mC07_"${experiment_codition}".bed"

echo "dmr_result: $dmr_result"
# ls -lah $dmr_result
ls -lah $dmr_output_path
# cat $dmr_result

bash: line 6: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07_filtered/new_dmr_output/: Is a directory


dmr_result: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day6_CRoff_vs_Unedit.bed
total 68K
drwxrwxr-x 2 michalula michalula 4.0K Nov 10 20:39 .
drwxrwxr-x 3 michalula michalula 4.0K Nov 10 20:39 ..
-rw-rw-r-- 1 michalula michalula  57K Nov 10 20:39 20251110_single_base_noFiltered_mC07_noFilter_day6_CRoff_vs_Unedit.bed


In [46]:
pwd

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered'

## modkit dmr explore output

The full table when performing single-site analysis with equal numbers of samples in groups, when running modkit dmr pair, will have the following schema:

column	name	description	type
1	chrom	name of reference sequence from bedMethyl input samples	str
2	start position	0-based start position, from --regions argument	int
3	end position	0-based exclusive end position, from --regions argument	int
4	name	name column from --regions BED, or chr:start-stop if absent, "." for single sites	str
5	score	difference score, more positive values have increased difference	float
6	strand	strand for the region or single-base position	str
7	samplea counts	counts of each base modification in the region, comma-separated, for sample A	str
8	samplea total	total number of base modification calls in the region, including unmodified, for sample A	int
9	sampleb counts	counts of each base modification in the region, comma-separated, for sample B	str
10	sampleb total	total number of base modification calls in the region, including unmodified, for sample B	int
11	samplea percents	percent of calls for each base modification in the region, comma-separated, for sample A	str
12	sampleb percents	percent of calls for each base modification in the region, comma-separated, for sample B	str
13	samplea fraction modified	fraction modification (of any kind) in sample A	float
14	sampleb fraction modified	fraction modification (of any kind) in sample B	float
15	MAP-based p-value	ratio of the posterior probability of observing the effect size over zero effect size	float
16	effect size	percent modified in sample A (col 12) minus percent modified in sample B (col 13)	float
<!-- 17	balanced MAP-based p-value	MAP-based p-value when all replicates are balanced	float
18	balanced effect size	effect size when all replicates are balanced	float -->


17	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
18	cohen_h_low	95% confidence interval lower bound	float
19	cohen_h_high	95% confidence interval upper bound	float

<!-- Differential methylation output format
The output from modkit dmr pair (and for each pairwise comparison with modkit dmr multi) is (roughly) a BED file with the following schema: -->
<!-- 
column	name	description	type
        1	chrom	name of reference sequence from bedMethyl input samples	str
        2	start position	0-based start position, from --regions argument	int
        3	end position	0-based exclusive end position, from --regions argument	int
        4	name	name column from --regions BED, or chr:start-stop if absent, "." for single sites	str
        5	score	difference score, more positive values have increased difference	float
        6	strand	strand for the region or single-base position	str
        7	samplea counts	counts of each base modification in the region, comma-separated, for sample A	str
        8	samplea total	total number of base modification calls in the region, including unmodified, for sample A	int
        9	sampleb counts	counts of each base modification in the region, comma-separated, for sample B	str
        10	sampleb total	total number of base modification calls in the region, including unmodified, for sample B	int
        11	samplea percents	percent of calls for each base modification in the region, comma-separated, for sample A	str
        12	sampleb percents	percent of calls for each base modification in the region, comma-separated, for sample B	str
        13	samplea fraction modified	fraction modification (of any kind) in sample A	float
        14	sampleb fraction modified	fraction modification (of any kind) in sample B	float
        15	MAP-based p-value	ratio of the posterior probability of observing the effect size over zero effect size	float
        16	effect size	percent modified in sample A (col 12) minus percent modified in sample B (col 13)	float
        17	balanced MAP-based p-value	MAP-based p-value when all replicates are balanced	float
        18	balanced effect size	effect size when all replicates are balanced	float
        19	pct_a_samples	percent of 'a' samples used in statistical test	float
        20	pct_b_samples	percent of 'b' samples used in statistical test	float
        21	per-replicate p-values	MAP-based p-values for matched replicate pairs	float
        22	per-replicate effect sizes	effect sizes matched replicate pairs	float
        23	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
        24	cohen_h_low	95% confidence interval lower bound	float
        25	cohen_h_high	95% confidence interval upper bound	float
        Columns 16-19 are only produced when multiple samples are provided, columns 20 and 21 are only produced when there is an equal number of 'a' and 'b' samples. When using multiple samples, it is possible that not every sample will have a modification fraction at a position. When this happens, the statistical test is still performed and the values of pct_a_samples and pct_b_samples reflect the percent of samples from each condition used in the test. 


     (15)	cohen_h	Cohen's h statistic (useful with regions and high-depth runs)	float
    (16)	cohen_h_low	95% confidence interval lower bound	float
    (17)	cohen_h_high	95% confidence interval upper bound	float
    
    n.b. Columns 15, 16, and 17 are present when the --regions option is passed, but these columns are on the right side of the table when performing single-site analysis (below). It is generally recommended to use the --header flag and standard CSV parsing to make sure the schema's between experiments are maintained.

When performing single-site analysis, the following additional columns are added:

column	name	description	type
Columns 20 and 21 have the replicate pairwise MAP-based p-values and effect sizes which are calculated based on their order provided on the command line. For example in the abbreviated command below:

In [52]:
date_today="20251110"

experiment_codition="noFilter_day6_CRoff_vs_Unedit"
dmr_output_path="/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/"
dmr_result=dmr_output_path+date_today+"_single_base_noFiltered_mC07_"+experiment_codition+".bed"


dmr_path=dmr_result
dmr_path


'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day6_CRoff_vs_Unedit.bed'

In [53]:
dmr_path

'/home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day6_CRoff_vs_Unedit.bed'

In [54]:
out_dir = dmr_output_path
print("out_dir:", out_dir)

# Read DMR BED (robust to header/no-header) and assign canonical column names (uses existing vars: dmr_path, out_dir, date_today, pd, os)
canonical_cols = [
    "chrom", "start", "end", "name", "score", "strand",
    "samplea_counts", "samplea_total", "sampleb_counts", "sampleb_total",
    "samplea_percents", "sampleb_percents",
    "samplea_fraction_modified", "sampleb_fraction_modified",
    "map_pvalue", "effect_size",
    "cohen_h", "cohen_h_low", "cohen_h_high",
]
    # "balanced_map_pvalue", "balanced_effect_size"

# read file with header and fallback to header=None when headers look numeric or columns are unexpected
try:
    dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")

    # dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", engine="python") # , header=0
    # # heuristic: if too many numeric-looking column names, re-read as headerless
    # numeric_headers = sum(1 for c in dmr_df.columns if str(c).strip().isdigit())
    # if numeric_headers >= (len(dmr_df.columns) / 2) or dmr_df.shape[1] < 3:
    #     dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")
except Exception:
    dmr_df = pd.read_csv(dmr_path, sep="\t", comment="#", header=None, engine="python")

# assign canonical names up to number of columns present, add generic names for extras
ncols = dmr_df.shape[1]
if ncols <= len(canonical_cols):
    dmr_df.columns = canonical_cols[:ncols]
else:
    extras = [f"col_{i}" for i in range(ncols - len(canonical_cols))]
    dmr_df.columns = canonical_cols + extras

# coerce obvious numeric columns to numeric where present
num_cols_to_try = [
    "start", "end", "score",
    "samplea_total", "sampleb_total",
    "samplea_fraction_modified", "sampleb_fraction_modified",
    "map_pvalue", "effect_size",
    "balanced_map_pvalue", "balanced_effect_size"
]
for c in num_cols_to_try:
    if c in dmr_df.columns:
        dmr_df[c] = pd.to_numeric(dmr_df[c], errors="coerce")

# ensure output directory exists and save parsed table (parquet preferred)
os.makedirs(out_dir, exist_ok=True)
parsed_path = os.path.join(out_dir, f"{date_today}_dmr_parsed.parquet")
try:
    dmr_df.to_parquet(parsed_path, index=False)
    print("Saved parquet:", parsed_path)
except Exception:
    csv_path = os.path.join(out_dir, f"{date_today}_dmr_parsed.csv")
    dmr_df.to_csv(csv_path, index=False)
    print("Parquet not available, saved csv:", csv_path)

print("Loaded DMR:", dmr_path)
print("Assigned columns:", dmr_df.columns.tolist())
print("Shape:", dmr_df.shape)
dmr_df.head()

out_dir: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/
Parquet not available, saved csv: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/20251110_dmr_parsed.csv
Loaded DMR: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/20251110_single_base_noFiltered_mC07_noFilter_day6_CRoff_vs_Unedit.bed
Assigned columns: ['chrom', 'start', 'end', 'name', 'score', 'strand', 'samplea_counts', 'samplea_total', 'sampleb_counts', 'sampleb_total', 'samplea_percents', 'sampleb_percents', 'samplea_fraction_modified', 'sampleb_fraction_modified', 'map_pvalue', 'effect_size', 'cohen_h', 'cohen_h_low', 'cohen_h_high']
Shape: (286, 19)


Unnamed: 0,chrom,start,end,name,score,strand,samplea_counts,samplea_total,sampleb_counts,sampleb_total,samplea_percents,sampleb_percents,samplea_fraction_modified,sampleb_fraction_modified,map_pvalue,effect_size,cohen_h,cohen_h_low,cohen_h_high
0,chr1,206583089,206583090,.,-0.314127,+,m:21,27,m:9,12,m:77.78,m:75.00,0.777778,0.75,1.0,0.027778,0.065432,-0.614566,0.745431
1,chr1,206583090,206583091,.,1.6692,-,m:19,19,m:10,12,m:100.00,m:83.33,1.0,0.833333,0.276264,0.166667,0.841069,0.118363,1.563774
2,chr1,206583173,206583174,.,0.063542,+,m:997,1112,m:470,516,m:89.66,m:91.09,0.896583,0.910853,1.0,-0.01,-0.048408,-0.055992,0.152807
3,chr1,206583174,206583175,.,0.813658,-,m:1154,1187,m:644,655,m:97.22,m:98.32,0.972199,0.983206,1.0,-0.01,-0.075125,-0.020275,0.170524
4,chr1,206583387,206583388,.,0.081907,+,m:836,1309,m:393,595,m:63.87,m:66.05,0.638656,0.660504,1.0,-0.02,-0.0458,-0.051107,0.142706


In [55]:
import os
from IPython.display import display, HTML

# Visualize all columns from dmr_df and save interactive HTMLs to out_dir
import plotly.express as px
import plotly.graph_objects as go

os.makedirs(out_dir, exist_ok=True)

# Save a table summary
summary = dmr_df.describe(include='all').transpose()
summary_path = os.path.join(out_dir, f"{date_today}_dmr_column_summary.csv")
summary.to_csv(summary_path)

numcols = dmr_df.select_dtypes(include=['number']).columns.tolist()

def _safe_name(name):
    return str(name).replace(os.sep, "_").replace(" ", "_").replace("\t", "_")

# Per-column visualizations
for col in dmr_df.columns:
    safe = _safe_name(col)
    try:
        if col in numcols:
            # Histogram
            fig_h = px.histogram(dmr_df, x=col, nbins=80, title=f"Histogram: {col}")
            # fig_h.write_html(os.path.join(out_dir, f"{date_today}_dmr_hist_{safe}.html"), include_plotlyjs='cdn')
            fig_h.show()

            # Boxplot
            fig_b = px.box(dmr_df, y=col, points="outliers", title=f"Boxplot: {col}")
            # fig_b.write_html(os.path.join(out_dir, f"{date_today}_dmr_box_{safe}.html"), include_plotlyjs='cdn')
            fig_b.show()
        else:
            # Categorical / text: show top value counts (up to 50)
            vc = dmr_df[col].fillna("NA").astype(str).value_counts().head(50)
            if len(vc):
                fig_c = px.bar(x=vc.values[::-1], y=vc.index.astype(str)[::-1], orientation='h',
                               title=f"Top value counts: {col}", labels={'x':'count','y':col})
                fig_c.update_layout(yaxis={'categoryorder':'array','categoryarray':vc.index[::-1].astype(str).tolist()})
                # fig_c.write_html(os.path.join(out_dir, f"{date_today}_dmr_valcounts_{safe}.html"), include_plotlyjs='cdn')
                fig_c.show()
            else:
                # fallback: display empty info
                display(HTML(f"<b>{col}</b>: no values to plot"))
    except Exception as e:
        print(f"Skipped plotting column {col!r} due to error: {e}")

# Correlation heatmap for numeric columns
if len(numcols) >= 2:
    try:
        corr = dmr_df[numcols].corr()
        fig_corr = px.imshow(corr, text_auto=True, aspect="auto", title="Correlation matrix (numeric columns)")
        # fig_corr.write_html(os.path.join(out_dir, f"{date_today}_dmr_correlation_numeric.html"), include_plotlyjs='cdn')
        fig_corr.show()
    except Exception as e:
        print("Failed to create correlation heatmap:", e)

print("Saved summary:", summary_path)
print("Plots saved to:", out_dir)

Saved summary: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/20251110_dmr_column_summary.csv
Plots saved to: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/


In [56]:
# Select significant CG pairs from DMR results and plot them (new cell at index 69).
# Uses existing notebook variables: dmr_df (parsed modkit dmr), df_roi_stats (pileup ROI stats),
# plotly (px) and out_dir/dmr_folder_path for saving. Does not re-import modules.

# Parameters
pvalue_thresh = 0.05

# ensure numeric columns
dmr_df['map_pvalue'] = pd.to_numeric(dmr_df['map_pvalue'], errors='coerce')
dmr_df['effect_size'] = pd.to_numeric(dmr_df['effect_size'], errors='coerce')
dmr_df['samplea_fraction_modified'] = pd.to_numeric(dmr_df['samplea_fraction_modified'], errors='coerce')
dmr_df['sampleb_fraction_modified'] = pd.to_numeric(dmr_df['sampleb_fraction_modified'], errors='coerce')

# filter significant by MAP-based p-value
sig = dmr_df[dmr_df['map_pvalue'] <= pvalue_thresh].copy()

dmr_df['map_pval_less005'] = dmr_df['map_pvalue'] <= 0.05


# # restrict to ROI positions if df_roi_stats exists
# if 'df_roi_stats' in globals():
#     roi_positions = set(df_roi_stats['start'].astype(int).tolist())
#     sig = sig[sig['start'].isin(roi_positions)].copy()

# quick exit if none
if sig.shape[0] == 0:
    print(f"No significant CG pairs found in ROI at map_pvalue <= {pvalue_thresh}")
else:
    # add convenience cols
    sig['pos'] = sig['start'].astype(str)
    sig['a_perc'] = sig['samplea_fraction_modified'] * 100
    sig['b_perc'] = sig['sampleb_fraction_modified'] * 100
    sig['total_reads'] = sig.get('samplea_total', 0).fillna(0).astype(int) + sig.get('sampleb_total', 0).fillna(0).astype(int)

    # save a table of significant sites
    os.makedirs(out_dir, exist_ok=True)
    sig_table_path = os.path.join(out_dir, f"dmr_significant_p{pvalue_thresh:.3f}_roi.tsv")
    sig.to_csv(sig_table_path, sep='\t', index=False)
    print("Saved significant sites table:", sig_table_path)
    display(sig[['chrom','start','end','strand','map_pvalue','effect_size','a_perc','b_perc','total_reads']].reset_index(drop=True))

    # plot the map_pval_less005 distribution which corresponds to significant sites
    fig_mappval_hist = px.histogram(
        dmr_df,
        x='map_pval_less005',
        nbins=80,
        title=f"MAP-based p-value distribution (highlighting p <= {pvalue_thresh}) <br>{experiment_codition}",
        labels={'map_pval_less005':'MAP-based p-value'}
    )
    fig_mappval_hist.update_layout(height=520)
    mappval_hist_path = os.path.join(out_dir, f"dmr_map_pval_distribution.html")
    # fig_mappval_hist.write_html(mappval_hist_path, include_plotlyjs='cdn')
    fig_mappval_hist.show()
    # print("Saved MAP-based p-value distribution histogram:", mappval_hist_path)

    # plot the percent of significant sites where map_pvalue <= pvalue_thresh is colored red, others blue (color not working)
    # Check https://plotly.com/python/pie-charts/ for coloring instructions
    percent_significant = (sig.shape[0] / dmr_df.shape[0]) * 100
    fig_mappval_pie = px.pie(
        dmr_df,
        names=['Not Significant (p > {})'.format(pvalue_thresh), 'Significant (p <= {})'.format(pvalue_thresh)],
        values=[dmr_df.shape[0] - sig.shape[0],sig.shape[0]],
        title=f"Percentage of significant CGs (map_pvalue <= {pvalue_thresh}): {percent_significant:.2f}% <br>{experiment_codition}",
        # color_discrete_map={'Not Significant (p > {})'.format(pvalue_thresh): 'blue',
        #                     'Significant (p <= {})'.format(pvalue_thresh): 'red'},
        # colors=['blue','red']
    )
    fig_mappval_pie.update_layout(height=520)
    mappval_pie_path = os.path.join(out_dir, f"dmr_map_pval_percentage.html")
    # fig_mappval_pie.write_html(mappval_pie_path, include_plotlyjs='cdn')
    fig_mappval_pie.show()
    # print("Saved MAP-based p-value percentage pie chart:", mappval_pie_path)


    # plot effect size distribution of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh
    fig_effectsize_hist = px.histogram(
        dmr_df,
        x='effect_size',
        nbins=80,
        # add a line break and write experiment_codition into the title
        title=f"Effect size distribution (highlighting significant sites with map_pvalue <= {pvalue_thresh})<br>{experiment_codition}",
        color=(dmr_df['map_pvalue'] <= pvalue_thresh), 
        color_discrete_map={True: 'red', False: 'blue'},
    )
            # labels={'effect_size':'Effect size (A - B)'}

    fig_effectsize_hist.update_layout(height=520)
    effectsize_hist_path = os.path.join(out_dir, f"dmr_effect_size_distribution.html")
    # fig_effectsize_hist.write_html(effectsize_hist_path, include_plotlyjs='cdn')
    fig_effectsize_hist.show()
    # print("Saved effect size distribution histogram:", effectsize_hist_path)        

    # plot effect sizes of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh 
    # add color legend names as 'Significant: map_pvalue <= pvalue_thresh' and 'Not Significant: map_pvalue > pvalue_thresh'  
    fig_effectsize_scatter = px.scatter(
        dmr_df,
        x=dmr_df.index,
        y='effect_size',
        color_discrete_map={True: 'red', False: 'blue'},
        color=(dmr_df['map_pvalue'] <= pvalue_thresh), 
        labels={'effect_size':'Effect size (A - B)','index':'Index',
                'color':f'Significant: map_pvalue <= {pvalue_thresh}'},
        title=f"Effect sizes for all CGs (highlighting significant sites with map_pvalue <= {pvalue_thresh}) <br>{experiment_codition}",
    )
    fig_effectsize_scatter.update_layout(height=520)
    effectsize_scatter_path = os.path.join(out_dir, f"dmr_effect_size_scatter.html")
    # fig_effectsize_scatter.write_html(effectsize_scatter_path, include_plotlyjs='cdn')
    fig_effectsize_scatter.show()
    # print("Saved effect size scatter plot:", effectsize_scatter_path)       

    # bar plot effect sizes of all sites without sorting and highlight significant which have map_pvalue <= pvalue_thresh 
    fig_effectsize_bar = px.bar(
        dmr_df,
        x=dmr_df.index,         
        y='effect_size',
        color=(dmr_df['map_pvalue'] <= pvalue_thresh),
        labels={'effect_size':'Effect size (A - B)','index':'Index',
                'color':f'Significant: map_pvalue <= {pvalue_thresh}'},
        color_discrete_map={True: 'red', False: 'blue'},
        title=f"Effect sizes for all CGs (n={len(dmr_df)}) (highlighting significant sites with map_pvalue <= {pvalue_thresh}) <br>{experiment_codition}",
    )
    fig_effectsize_bar.update_layout(height=520)
    effectsize_bar_path = os.path.join(out_dir, f"dmr_effect_size_bar.html")
    # fig_effectsize_bar.write_html(effectsize_bar_path, include_plotlyjs='cdn')
    fig_effectsize_bar.show()
    # print("Saved effect size bar plot:", effectsize_bar_path) 


    # Bar: effect size per position (without sorting) with effect size colors 
    # dmr_df['label'] = dmr_df['pos'] + ":" + dmr_df['strand'].astype(str)
    fig_bar_unsorted = px.bar(
        dmr_df,
        x=dmr_df.index, 
        y='effect_size',        
        color='effect_size',
        title=f"Effect size for all CGs (n={len(dmr_df)}) <br>{experiment_codition}",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
            # hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
            # x='label',

    fig_bar_unsorted.update_layout(xaxis_tickangle=45, height=520)
    bar_unsorted_path = os.path.join(out_dir, f"dmr_sig_effectsize_unsorted_p{pvalue_thresh:.3f}.html")
    # fig_bar_unsorted.write_html(bar_unsorted_path, include_plotlyjs='cdn')
    fig_bar_unsorted.show()
    # print("Saved unsorted effect-size bar plot:", bar_unsorted_path)    
        
    # Bar: effect size per position (without sorting)
    sig['label'] = sig['pos'] + ":" + sig['strand'].astype(str)
    fig_bar_unsorted = px.bar(
        sig,
        x='label',
        y='effect_size',        
        color='effect_size',
        hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
        title=f"Effect size for significant CGs (n={len(sig)}) with map_pvalue <= {pvalue_thresh} <br>{experiment_codition}",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
    fig_bar_unsorted.update_layout(xaxis_tickangle=45, height=520)
    bar_unsorted_path = os.path.join(out_dir, f"dmr_sig_effectsize_unsorted_p{pvalue_thresh:.3f}.html")
    # fig_bar_unsorted.write_html(bar_unsorted_path, include_plotlyjs='cdn')
    fig_bar_unsorted.show()
    # print("Saved unsorted effect-size bar plot:", bar_unsorted_path)    
        

    # Bar: effect size per position (sorted)
    sig_sorted = sig.sort_values('effect_size', ascending=False).copy()
    sig_sorted['label'] = sig_sorted['pos'] + ":" + sig_sorted['strand'].astype(str)
    fig_bar = px.bar(
        sig_sorted,
        x='label',
        y='effect_size',
        color='effect_size',
        hover_data=['map_pvalue','a_perc','b_perc','total_reads'],
        title=f"Effect size for significant CGs (n={len(sig_sorted)}) <br>{experiment_codition}",
        labels={'effect_size':'Effect size (A - B)','label':'position:strand'}
    )
    fig_bar.update_layout(xaxis_tickangle=45, height=520)
    bar_path = os.path.join(out_dir, f"dmr_sig_effectsize_p{pvalue_thresh:.3f}.html")
    # fig_bar.write_html(bar_path, include_plotlyjs='cdn')
    fig_bar.show()
    # print("Saved effect-size bar plot:", bar_path)



    # Scatter: sample A vs sample B percent modified (size = total reads, color = effect size)
    fig_scatter = px.scatter(
        sig,
        x='a_perc',
        y='b_perc',
        color='effect_size',
        size='total_reads',
        hover_data=['pos','start','map_pvalue','effect_size','cohen_h'],
        title=f"Significant CGs (map_pvalue <= {pvalue_thresh}) — sample A vs B percent modified <br>{experiment_codition}",
        labels={'a_perc':'Sample A % modified','b_perc':'Sample B % modified'}
    )
    fig_scatter.update_layout(height=520)
    scatter_path = os.path.join(out_dir, f"dmr_sig_scatter_p{pvalue_thresh:.3f}.html")
    # fig_scatter.write_html(scatter_path, include_plotlyjs='cdn')
    fig_scatter.show()
    # print("Saved scatter plot:", scatter_path)


Saved significant sites table: /home/michalula/code/epiCausality/epiCode/analyze_ont_data/T2T_v2.0_mapped/T_cells/day_6/modkit_dmr/mc_07/notFiltered/new_dmr_output/dmr_significant_p0.050_roi.tsv


Unnamed: 0,chrom,start,end,strand,map_pvalue,effect_size,a_perc,b_perc,total_reads
0,chr1,206586163,206586164,+,1.427041e-07,0.39,72.953450,33.631486,1805
1,chr1,206586164,206586165,-,6.146904e-10,0.45,76.122254,30.656934,1595
2,chr1,206586168,206586169,+,2.685956e-08,0.41,68.416370,26.690390,1686
3,chr1,206586169,206586170,-,8.040981e-07,0.36,57.301295,20.915033,847
4,chr1,206586173,206586174,+,3.541000e-13,0.52,78.795810,27.335640,1724
...,...,...,...,...,...,...,...,...,...
206,chr1,206587963,206587964,-,2.311355e-02,0.20,59.156280,38.543516,1606
207,chr1,206587995,206587996,+,2.755856e-05,0.30,82.496310,52.127660,1918
208,chr1,206587996,206587997,-,2.263020e-11,0.44,90.637946,47.297296,1799
209,chr1,206588089,206588090,-,2.605385e-02,0.17,39.665273,23.464566,1830


# Todo:
- plot the raw pileups together too - next too each other (have the separat individual pileup values plots above from the funcs on top)

-- chech how to look at VARIANCE per CG unit position

# TODO: check
- are there really NO diffs between the reads selected with the mC > 70 and mC > 99.5% filtering ??

could be as the mC calles are automatically selected

and in the CRoff the auto threshold 
* in mC > 70 was to 0.79
> Using filter threshold 0.7910156 for C.
* in mC > 99.5 was to  0.79
> Using filter threshold 0.7910156 for C.


and in the Unediter the auto threshold 
* in mC > 70 was to 0.8496
> Using filter threshold 0.8496094 for C.
* in mC > 99.5 was to 0.849
> Using filter threshold 0.8496094 for C.


SAME per condition AUTOMATIC modkit filtering threshold per mC run were set

(NOT 0.995 and not 0.7)