In [None]:
import random

def find_cpgs(dna: str) -> list[int]:
    """
    Locate every CpG dinucleotide in a DNA string.

    The sequence is upper-cased before scanning, so lower-case input matches too.

    Returns the 0-based index of the "C" of each "CG" pair, in ascending order.
    """
    seq = dna.upper()
    hits = []
    idx = seq.find("CG")
    while idx != -1:
        hits.append(idx)
        # "CG" cannot overlap itself, so resuming one base later misses nothing.
        idx = seq.find("CG", idx + 1)
    return hits


def count_surrounding_cpgs(dna: str, n: int) -> dict[int, int]:
    """
    For each CpG in dna, count how many *other* CpGs have their start position
    within N bases (i.e. |pos_other - pos_self| <= N, excluding self).

    Args:
        dna: DNA sequence to scan; CpG start positions come from find_cpgs().
        n: Window half-width in bases. A negative n gives a count of 0 for
           every CpG, since no distance can be <= a negative number.

    Returns a dict mapping each CpG start position -> surrounding CpG count.
    """
    # Local import keeps this definition usable from whichever cell defines it.
    from bisect import bisect_left, bisect_right

    # find_cpgs() returns positions in ascending order with no duplicates, so
    # the CpGs inside [pos - n, pos + n] form one contiguous slice. Binary
    # search finds its bounds without comparing every CpG against every other.
    cpg_positions = find_cpgs(dna)
    result = {}
    for pos in cpg_positions:
        first = bisect_left(cpg_positions, pos - n)
        past_last = bisect_right(cpg_positions, pos + n)
        # Subtract 1 to leave out the CpG itself; clamp at 0 because for a
        # negative n the slice is empty and the subtraction would go below 0.
        result[pos] = max(0, past_last - first - 1)
    return result


def generate_dna(length: int, cpg_rate: float = 0.05, seed: int = 42) -> str:
    """
    Build a reproducible pseudo-random DNA string of `length` bases.

    While at least two positions remain, each step emits a "CG" pair with
    probability `cpg_rate`; otherwise it emits one base drawn from A/C/G/T.
    Single-base draws can also happen to spell "CG", so `cpg_rate` is only an
    approximate control on the CpG density.

    The generator is seeded with `seed`, so equal arguments give equal output.
    """
    rng = random.Random(seed)
    alphabet = list("ACGT")
    pieces = []
    filled = 0
    while filled < length:
        room_for_pair = filled < length - 1
        # The random draw is only made when a pair still fits, which keeps the
        # sequence of RNG calls (and therefore the output) stable.
        if room_for_pair and rng.random() < cpg_rate:
            pieces.extend(("C", "G"))
            filled += 2
            continue
        pieces.append(rng.choice(alphabet))
        filled += 1
    return "".join(pieces[:length])


if __name__ == "__main__":
    # Demo: build a synthetic sequence, list its CpGs, and tabulate how many
    # neighbouring CpGs each one has.
    SEQ_LEN = 100  # length of the synthetic sequence, in bp
    N = 10  # window size (bases on each side)

    dna = generate_dna(SEQ_LEN, cpg_rate=0.08, seed=42)
    print(f"DNA sequence ({SEQ_LEN} bp):", dna, "", sep="\n")

    cpg_positions = find_cpgs(dna)
    print(f"CpG positions (0-based): {cpg_positions}")
    print(f"Total CpGs found: {len(cpg_positions)}\n")

    surrounding = count_surrounding_cpgs(dna, n=N)
    print(f"Surrounding CpG counts within +/-{N} bases:")
    print(f"{'CpG pos':>10}  {'context':>20}  {'surrounding CpGs':>16}")
    print("-" * 52)
    for pos, count in surrounding.items():
        # Context is up to 3 bases before the CpG through 5 bases from its
        # start, clipped to the sequence bounds.
        context = dna[max(0, pos - 3):min(SEQ_LEN, pos + 5)]
        print(f"{pos:>10}  {context:>20}  {count:>16}")


In [None]:

# --- Generate a 100 bp test sequence ---
SEQ_LEN = 100
N = 10  # window size (bases on each side)

dna = generate_dna(SEQ_LEN, cpg_rate=0.08, seed=42)
print(f"DNA sequence ({SEQ_LEN} bp):\n{dna}\n")

# --- Locate the CpGs ---
cpg_positions = find_cpgs(dna)
print(f"CpG positions (0-based): {cpg_positions}")
print(f"Total CpGs found: {len(cpg_positions)}")
print()

# --- Tabulate neighbouring CpGs per CpG ---
surrounding = count_surrounding_cpgs(dna, n=N)
print(
    f"Surrounding CpG counts within +/-{N} bases:",
    f"{'CpG pos':>10}  {'context':>20}  {'surrounding CpGs':>16}",
    "-" * 52,
    sep="\n",
)
for pos, count in surrounding.items():
    # Small context window around the CpG, clipped to the sequence bounds.
    start, end = max(0, pos - 3), min(SEQ_LEN, pos + 5)
    context = dna[start:end]
    print(f"{pos:>10}  {context:>20}  {count:>16}")


In [None]:
import sys
from pathlib import Path
from datetime import datetime

# Add the folder containing 'funcs_extract_mC_profiles_from_BAMs.py' to sys.path
sys.path.append("/home/michalula/code/epiCausality/epiCode/utils/") # str(Path(__file__).parent / 'utils'))

# Import the module or specific functions
# from /home/michalula/code/epiCausality/epiCode/utils/funcs_extract_mC_profiles_from_BAMs.py
# from funcs_extract_mC_profiles_from_BAMs import system_info, extract_from_bam
from funcs_extract_mC_profiles_from_BAMs import (
    system_info,
    get_reference_sequence,
    create_output_directory,
    extract_from_bam,
    process_extracted_reads,
    visualize_data,
    create_padded_reads,
    plot_padded_reads,
    save_padded_reads,
    process_extracted_reads_no_fully_unmethylated,
    create_padded_reads_no_fully_unmethylated
    # main,
)

In [None]:
# Project helper imported from funcs_extract_mC_profiles_from_BAMs;
# presumably reports the runtime environment - confirm against that module.
system_info()
date_today = datetime.today().strftime('%Y-%m-%d')
ref_genome_path = Path('/home/michalula/data/ref_genomes/t2t_v2_0/up_chm13v2.0.fasta')
reg_genome_version = "t2t_v2_0"
region_chr = 'chr1'

# Cutting region: chr1:206,583,334-206,589,873.
# 20 bp are trimmed from each end below, which leaves
# 206589854 - 206583354 = 6500 bp.
region_start = 206583334 + 20
region_end = 206589874 - 20
# CD55 TSS start: chr1:206586828-206606065 (+)
# (before TSS: 3474, after TSS: 3026 bps)

# Region in "chr:start-end" form, e.g. 'chr1:206586162-206586192'.
region_str = f"{region_chr}:{region_start}-{region_end}"
region_length = region_end - region_start
print("region_length", region_length)

motifs = ['CG,0']
ref_seq_list = get_reference_sequence(ref_genome_path, region_chr, region_start, region_end)

print('region_chr', region_chr)
print('region_start', region_start)
print('region_end', region_end)
print(region_str)

In [None]:
# Display the value returned by get_reference_sequence() for the region.
ref_seq_list

In [None]:
# Join the elements of ref_seq_list into a single string and display it.
# Assumes ref_seq_list is an iterable of strings (required by str.join) - TODO confirm it is one base per element.
dna_6500_roi_str = "".join(ref_seq_list)         # e.g. ["a", "b", "c"] -> "abc"
dna_6500_roi_str

In [None]:
# Length of the joined region string, for comparison with region_length printed earlier.
len(dna_6500_roi_str)

In [None]:
# 0-based start positions of every CpG in the region string (offsets within the string, not genome coordinates).
find_cpgs(dna_6500_roi_str) 

In [None]:

def count_surrounding_cpgs(dna: str, n: int) -> dict[int, int]:
    """
    For each CpG in dna, count how many *other* CpGs have their start position
    within N bases (i.e. |pos_other - pos_self| <= N, excluding self).

    Args:
        dna: DNA sequence to scan; CpG start positions come from find_cpgs().
        n: Window half-width in bases. A negative n gives a count of 0 for
           every CpG, since no distance can be <= a negative number.

    Returns a dict mapping each CpG start position -> surrounding CpG count.
    """
    # Local import keeps this definition usable from whichever cell defines it.
    from bisect import bisect_left, bisect_right

    # find_cpgs() returns positions in ascending order with no duplicates, so
    # the CpGs inside [pos - n, pos + n] form one contiguous slice. Binary
    # search finds its bounds without comparing every CpG against every other.
    cpg_positions = find_cpgs(dna)
    result = {}
    for pos in cpg_positions:
        first = bisect_left(cpg_positions, pos - n)
        past_last = bisect_right(cpg_positions, pos + n)
        # Subtract 1 to leave out the CpG itself; clamp at 0 because for a
        # negative n the slice is empty and the subtraction would go below 0.
        result[pos] = max(0, past_last - first - 1)
    return result

In [None]:
# Display the neighbouring-CpG count for every CpG in the region, using a +/-100 base window.
count_surrounding_cpgs(dna_6500_roi_str, 100)

In [None]:
# Neighbouring-CpG table for the reference region, using a +/-100 base window.
dna = dna_6500_roi_str
N = 100
# Take the length from the sequence actually being analysed. SEQ_LEN was last
# set to 100 for the synthetic demo, which would clip every context window at
# base 100 and print an empty context for CpGs beyond it.
SEQ_LEN = len(dna)

surrounding = count_surrounding_cpgs(dna, n=N)
print(f"Surrounding CpG counts within +/-{N} bases:")
print(f"{'CpG pos':>10}  {'context':>20}  {'surrounding CpGs':>16}")
print("-" * 52)
for pos, count in surrounding.items():
    # Show a small context window around the CpG, clipped to the sequence bounds
    start = max(0, pos - 3)
    end = min(SEQ_LEN, pos + 5)
    context = dna[start:end]
    print(f"{pos:>10}  {context:>20}  {count:>16}")

In [None]:
import matplotlib.pyplot as plt

# One bar per CpG: x is the CpG's 0-based start position in the sequence,
# y is its neighbouring-CpG count from `surrounding`.
positions = list(surrounding.keys())
counts = list(surrounding.values())

# Create a bar plot
plt.figure(figsize=(14, 6))
plt.bar(positions, counts, width=1.0, edgecolor='none')
plt.xlabel('CpG Position')
plt.ylabel('Surrounding CpG Count')
plt.title(f'Surrounding CpG Counts within +/-{N} bases')
# A stray trailing character on this call used to make the cell a SyntaxError.
plt.tight_layout()
plt.show()

In [None]:
# Number of CpGs that received a count (one dict entry per CpG start position).
len(surrounding)

In [None]:
# Ordinal index of each CpG (0 for the first CpG, 1 for the second, ...).
CG_indexes = list(range(len(surrounding)))
CG_indexes

In [None]:
# Number of count values; this equals len(surrounding), since a dict has one value per key.
len(list(surrounding.values()))

In [None]:
import matplotlib.pyplot as plt

# One bar per CpG, evenly spaced: x is the CpG's ordinal index (0 for the
# first CpG, 1 for the second, ...), not its base position in the sequence.
positions = list(range(len(surrounding)))
counts = list(surrounding.values())

# Create a bar plot
plt.figure(figsize=(14, 6))
plt.bar(positions, counts, width=1.0, edgecolor='none')
# Label the axis as an index so it is not read as a base coordinate.
plt.xlabel('CpG index (order along sequence)')
plt.ylabel('Surrounding CpG Count')
plt.title(f'Surrounding CpG Counts within +/-{N} bases')
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# x: 0-based CpG start positions; y: neighbouring-CpG count for each.
positions = list(surrounding.keys())
counts = list(surrounding.values())

# Draw the bar chart through an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(positions, counts, width=1.0, edgecolor='none')
ax.set_xlabel('CpG Position')
ax.set_ylabel('Surrounding CpG Count')
ax.set_title(f'Surrounding CpG Counts within +/-{N} bases')
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Neighbouring-CpG table and plot for the reference region (+/-100 base window).
dna = dna_6500_roi_str
N = 100
# Take the length from the sequence actually being analysed. SEQ_LEN was last
# set to 100 for the synthetic demo, which would clip every context window at
# base 100 and print an empty context for CpGs beyond it.
SEQ_LEN = len(dna)

surrounding = count_surrounding_cpgs(dna, n=N)
print(f"Surrounding CpG counts within +/-{N} bases:")
print(f"{'CpG pos':>10}  {'context':>20}  {'surrounding CpGs':>16}")
print("-" * 52)
for pos, count in surrounding.items():
    # Show a small context window around the CpG, clipped to the sequence bounds
    start = max(0, pos - 3)
    end = min(SEQ_LEN, pos + 5)
    context = dna[start:end]
    print(f"{pos:>10}  {context:>20}  {count:>16}")

# One bar per CpG, evenly spaced: x is the CpG's ordinal index (0 for the
# first CpG, 1 for the second, ...), not its base position in the sequence.
positions = list(range(len(surrounding)))
counts = list(surrounding.values())

# Create a bar plot
plt.figure(figsize=(14, 6))
plt.bar(positions, counts, width=1.0, edgecolor='none')
# Label the axis as an index so it is not read as a base coordinate.
plt.xlabel('CpG index (order along sequence)')
plt.ylabel('Surrounding CpG Count')
plt.title(f'Surrounding CpG Counts within +/-{N} bases')
plt.tight_layout()
plt.show()

In [None]:
import sys
from pathlib import Path
from datetime import datetime

# Add the folder containing 'funcs_extract_mC_profiles_from_BAMs.py' to sys.path
sys.path.append("/home/michalula/code/epiCausality/epiCode/utils/") # str(Path(__file__).parent / 'utils'))

# Import the module or specific functions
# from /home/michalula/code/epiCausality/epiCode/utils/funcs_extract_mC_profiles_from_BAMs.py
# from funcs_extract_mC_profiles_from_BAMs import system_info, extract_from_bam
from funcs_extract_mC_profiles_from_BAMs import (
    system_info,
    get_reference_sequence,
    create_output_directory,
    extract_from_bam,
    process_extracted_reads,
    visualize_data,
    create_padded_reads,
    plot_padded_reads,
    save_padded_reads,
    process_extracted_reads_no_fully_unmethylated,
    create_padded_reads_no_fully_unmethylated
    # main,
)


def find_cpgs(dna: str) -> list[int]:
    """
    Find the start positions of all CpG dinucleotides in a DNA string.

    Matching is done on the upper-cased sequence. Each returned value is the
    0-based index of a "C" that is immediately followed by a "G"; positions
    are in ascending order.
    """
    seq = dna.upper()
    starts = []
    # Pair every base with its right-hand neighbour and keep the C-then-G pairs.
    for idx, (base, following) in enumerate(zip(seq, seq[1:])):
        if base == "C" and following == "G":
            starts.append(idx)
    return starts


def count_surrounding_cpgs(dna: str, n: int) -> dict[int, int]:
    """
    For each CpG in dna, count how many *other* CpGs have their start position
    within N bases (i.e. |pos_other - pos_self| <= N, excluding self).

    Args:
        dna: DNA sequence to scan; CpG start positions come from find_cpgs().
        n: Window half-width in bases. A negative n gives a count of 0 for
           every CpG, since no distance can be <= a negative number.

    Returns a dict mapping each CpG start position -> surrounding CpG count.
    """
    # Local import keeps this definition usable from whichever cell defines it.
    from bisect import bisect_left, bisect_right

    # find_cpgs() returns positions in ascending order with no duplicates, so
    # the CpGs inside [pos - n, pos + n] form one contiguous slice. Binary
    # search finds its bounds without comparing every CpG against every other.
    cpg_positions = find_cpgs(dna)
    result = {}
    for pos in cpg_positions:
        first = bisect_left(cpg_positions, pos - n)
        past_last = bisect_right(cpg_positions, pos + n)
        # Subtract 1 to leave out the CpG itself; clamp at 0 because for a
        # negative n the slice is empty and the subtraction would go below 0.
        result[pos] = max(0, past_last - first - 1)
    return result



# Project helper imported from funcs_extract_mC_profiles_from_BAMs;
# presumably reports the runtime environment - confirm against that module.
system_info()
date_today = datetime.today().strftime('%Y-%m-%d')
ref_genome_path = Path('/home/michalula/data/ref_genomes/t2t_v2_0/up_chm13v2.0.fasta')
reg_genome_version = "t2t_v2_0"
region_chr = 'chr1'

# Cutting region: chr1:206,583,334-206,589,873.
# 20 bp are trimmed from each end below, which leaves
# 206589854 - 206583354 = 6500 bp.
region_start = 206583334 + 20
region_end = 206589874 - 20
# CD55 TSS start: chr1:206586828-206606065 (+)
# (before TSS: 3474, after TSS: 3026 bps)

# Region in "chr:start-end" form, e.g. 'chr1:206586162-206586192'.
region_str = f"{region_chr}:{region_start}-{region_end}"
region_length = region_end - region_start
print("region_length", region_length)

motifs = ['CG,0']
ref_seq_list = get_reference_sequence(ref_genome_path, region_chr, region_start, region_end)

print('region_chr', region_chr)
print('region_start', region_start)
print('region_end', region_end)
print(region_str)

# Join the elements of ref_seq_list into a single string, e.g. ["a", "b", "c"] -> "abc".
dna_6500_roi_str = "".join(ref_seq_list)

# Neighbouring-CpG table for the reference region, using a +/-100 base window.
dna = dna_6500_roi_str
N = 100
SEQ_LEN = len(dna)  # context windows are clipped to this sequence's length

surrounding = count_surrounding_cpgs(dna, n=N)
print(
    f"Surrounding CpG counts within +/-{N} bases:",
    f"{'CpG pos':>10}  {'context':>20}  {'surrounding CpGs':>16}",
    "-" * 52,
    sep="\n",
)
for pos, count in surrounding.items():
    # Up to 3 bases before the CpG through 5 bases from its start.
    context = dna[max(0, pos - 3):min(SEQ_LEN, pos + 5)]
    print(f"{pos:>10}  {context:>20}  {count:>16}")

import matplotlib.pyplot as plt

# One bar per CpG, evenly spaced: x is the CpG's ordinal index (0 for the
# first CpG, 1 for the second, ...), not its base position in the sequence.
positions = list(range(len(surrounding)))
counts = list(surrounding.values())

# Create a bar plot
plt.figure(figsize=(14, 6))
plt.bar(positions, counts, width=1.0, edgecolor='none')
# Label the axis as an index so it is not read as a base coordinate.
plt.xlabel('CpG index (order along sequence)')
plt.ylabel('Surrounding CpG Count')
plt.title(f'Surrounding CpG Counts within +/-{N} bases')
plt.tight_layout()
plt.show()