# 1. Install Modules

In [None]:
# Bioinformatics Tools (Ubuntu)
!sudo apt-get update
!sudo apt-get install -y fastp flash bwa samtools

# Python Library
!pip3 install biopython cutadapt pysam --break-system-packages

# 2 Trimming and Discard trimmed sample

In [None]:
import subprocess
import glob
import os

# Specify the folder containing your input files
# Specify the folder where you want to save the untrimmed sequences (adapter-free sequences)

input_folder = "fastq"
untrimmed_output_folder = "fastq/A_Untrimmed_output"

# Define the adapter sequences for R1 and R2
adapter_sequence_r1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
adapter_sequence_r2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"

# Use glob to get a list of all input file pairs (R1 and R2) in the folder
input_file_pairs = []
for input_r1 in glob.glob(os.path.join(input_folder, "*_R1.fastq.gz")):
    # Assuming R2 files have the same naming format as R1 files
    input_r2 = input_r1.replace("_R1.fastq.gz", "_R2.fastq.gz")
    if os.path.exists(input_r2):  # Ensure R2 file exists
        input_file_pairs.append({"r1": input_r1, "r2": input_r2})

# Create the output folder if it doesn't exist
os.makedirs(untrimmed_output_folder, exist_ok=True)

for input_files in input_file_pairs:
    input_r1 = input_files["r1"]
    input_r2 = input_files["r2"]

    # Define output file paths for untrimmed (clean, adapter-free) sequences
    untrimmed_r1 = os.path.join(untrimmed_output_folder, os.path.basename(input_r1).replace(".fastq.gz", "_untrimmed.fastq.gz"))
    untrimmed_r2 = os.path.join(untrimmed_output_folder, os.path.basename(input_r2).replace(".fastq.gz", "_untrimmed.fastq.gz"))

    # Use cutadapt to keep only untrimmed sequences (completely adapter-free)
    result = subprocess.run([
        "cutadapt",
        "-a", adapter_sequence_r1,  # Adapter for R1
        "-A", adapter_sequence_r2,  # Adapter for R2
        "-O", "15",                 # Minimum overlap for adapter trimming
        "--discard-trimmed",        # Discard sequences where trimming occurred
        "-o", untrimmed_r1,         # Save only untrimmed R1 reads
        "-p", untrimmed_r2,         # Save only untrimmed R2 reads
        input_r1, input_r2
    ], capture_output=True, text=True)

    # Log result
    if result.returncode == 0:
        print(f"Untrimmed sequences saved: {untrimmed_r1}, {untrimmed_r2}")
    else:
        print(f"Error processing {input_r1} and {input_r2}:\n{result.stderr}")

# 3. Q filtering

In [None]:
import os
import subprocess

# Quality threshold (Phred score)
quality_threshold = 30

# Set input and output folders
input_folder = "fastq/A_Untrimmed_output"
output_folder = "fastq/B_Qfiltered"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Iterate through files in the input folder, processing only those ending with "_untrimmed.fastq.gz"
for filename in os.listdir(input_folder):
    if filename.endswith("_untrimmed.fastq.gz"):
        # Input file path
        input_file = os.path.join(input_folder, filename)
        
        # Output filename (e.g., sample_untrimmed.fastq.gz -> sample_Qfiltered.fastq.gz)
        output_file = os.path.join(
            output_folder, 
            filename.replace("_untrimmed.fastq.gz", "_Qfiltered.fastq.gz")
        )
        
        # Execute fastp in single-end mode for each file
        subprocess.call([
            "fastp",
            "-i", input_file,                      # Input file
            "-o", output_file,                     # Output file
            "-q", str(quality_threshold),          # Quality threshold for a base to be qualified
            "-u", "15",                            # Discard reads if the percentage of unqualified bases is >= 15%
            "-l", "151",                           # Minimum read length to keep
            "--cut_mean_quality", "30",            # Discard reads if mean quality is less than 30
            "--html", f"{output_file}.html",       # HTML report file path
            "--json", f"{output_file}.json"        # JSON report file path
        ])
        
        print(f"Filtering for {filename} is complete.\n"
              f"Output FASTQ : {output_file}\n"
              f"Reports      : {output_file}.html / {output_file}.json\n")

print("All filtering processes are done.")

# 4. Match Paired-End Read IDs

In [None]:
import gzip
import glob
import os

def extract_matching_reads(r1_path, r2_path, out_r1_path, out_r2_path):
    def get_read_id(header):
        # Extract ID from the FASTQ header
        return header.split()[0].replace('/1', '').replace('/2', '')

    r1_ids = set()
    r2_ids = set()

    # Extract all read IDs from the R1 file
    with gzip.open(r1_path, 'rt') as r1_file:
        while True:
            header = r1_file.readline()
            if not header:
                break
            r1_ids.add(get_read_id(header.strip()))
            # Skip the other 3 lines of the read (sequence, +, quality)
            [r1_file.readline() for _ in range(3)]

    with gzip.open(r2_path, 'rt') as r2_file:
        while True:
            header = r2_file.readline()
            if not header:
                break
            r2_ids.add(get_read_id(header.strip()))
            [r2_file.readline() for _ in range(3)]

    # Find common and unique IDs
    matching_ids = r1_ids & r2_ids
    r1_only = r1_ids - r2_ids
    r2_only = r2_ids - r1_ids

    print(f"Processing {os.path.basename(r1_path)} and {os.path.basename(r2_path)}")
    print(f"Total R1 IDs: {len(r1_ids)}, Total R2 IDs: {len(r2_ids)}, Matching IDs: {len(matching_ids)}")
    print(f"IDs only in R1: {len(r1_only)}, IDs only in R2: {len(r2_only)}\n")

    # Create the output directory if it doesn't exist
    for out_path in [out_r1_path, out_r2_path]:
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Function to write only the reads with matching IDs to a new file
    def write_matching_reads(input_path, output_path, matching_ids):
        with gzip.open(input_path, 'rt') as infile, gzip.open(output_path, 'wt') as outfile:
            while True:
                lines = [infile.readline() for _ in range(4)]
                if not lines[0]:
                    break
                read_id = get_read_id(lines[0].strip())
                if read_id in matching_ids:
                    outfile.writelines(lines)

    # Write the filtered R1 and R2 files
    write_matching_reads(r1_path, out_r1_path, matching_ids)
    write_matching_reads(r2_path, out_r2_path, matching_ids)

# --------------------------
# Apply to all file pairs
# --------------------------

input_folder = "fastq/B_Qfiltered"
output_folder = "fastq/C_ID_matched"

# Find all R1 files
r1_files = glob.glob(os.path.join(input_folder, "*_R1_Qfiltered.fastq.gz"))

# For each R1, find the corresponding R2 file and run the process
for r1_file in r1_files:
    r2_file = r1_file.replace("_R1_Qfiltered.fastq.gz", "_R2_Qfiltered.fastq.gz")
    
    if os.path.exists(r2_file):
        # Set the output file paths
        base_name = os.path.basename(r1_file).replace("_R1_Qfiltered.fastq.gz", "")
        out_r1 = os.path.join(output_folder, f"{base_name}_ID_match_R1.fastq.gz")
        out_r2 = os.path.join(output_folder, f"{base_name}_ID_match_R2.fastq.gz")
        
        # Execute the function
        extract_matching_reads(r1_file, r2_file, out_r1, out_r2)
    else:
        print(f"Warning: {r2_file} not found. Skipping.")

# 5 Merge W/ Flash

## 5.1 [R1_back]-[R2_back] merge (FLASH)

In [None]:
import os
import glob
import subprocess

# === Folder Setup ===
input_folder = "fastq/C_id_matched"
output_folder = "fastq/D_merged_output"
os.makedirs(output_folder, exist_ok=True)

# === Set N-values (Overlap Length) per Sample Prefix ===
sample_n_mapping = {
    "Temp": 122,
}

# === Find List of all R1_B Files ===
r1_files = glob.glob(os.path.join(input_folder, "*_R1.fastq.gz"))

print(f"🔎 Found {len(r1_files)} R1 files.")

# === Process Each R1_B File ===
for r1_path in r1_files:
    sample_base = os.path.basename(r1_path).replace("_R1.fastq.gz", "")
    r2_path = os.path.join(input_folder, f"{sample_base}_R2.fastq.gz")

    if not os.path.exists(r2_path):
        print(f"⚠️ Matching R2 file not found for {sample_base} → Skipping.")
        continue

    # Find the corresponding N value for the filename
    matched_n = None
    for prefix, n_value in sample_n_mapping.items():
        if prefix in sample_base:
            matched_n = n_value
            break

    if matched_n is None:
        print(f"⚠️ No N value matched for {sample_base} → Skipping.")
        continue

    output_name = f"{sample_base}_FLASH"

    print(f"🔵 Running FLASH for sample: {sample_base} (N={matched_n})")

    try:
        # Execute the FLASH command
        subprocess.check_call([
            "flash",
            "-m", str(matched_n),   # minimum overlap
            "-M", str(matched_n),   # Maximum overlap
            "-o", output_name,      # Output file prefix
            "-d", output_folder,    # Output directory
            r1_path,
            r2_path
        ])
        print(f"✅ FLASH merging complete → {os.path.join(output_folder, output_name)}.fastq")
    except subprocess.CalledProcessError as e:
        print(f"❌ FLASH merging failed for {sample_base}: {e}")

# 6. fastq -> fasta

In [None]:
import os
import gzip
from Bio import SeqIO

# Input and output folder paths
input_folder = "fastq/D_merged_output"
output_folder = "fastq/E_fastq_to_fasta"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    # Process only files with .fastq or .fastq.gz extensions
    if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
        input_file = os.path.join(input_folder, filename)
        
        # Set output filename (.fasta extension)
        output_file = os.path.join(
            output_folder,
            filename.replace(".fastq.gz", ".fasta").replace(".fastq", ".fasta")
        )

        # Choose open mode based on gzip
        open_func = gzip.open if filename.endswith(".gz") else open

        # Read FASTQ and convert to FASTA
        with open_func(input_file, "rt") as fastq_file:
            # open in text mode
            records = list(SeqIO.parse(fastq_file, "fastq"))

        # Save as FASTA
        with open(output_file, "w") as fasta_file:
            SeqIO.write(records, fasta_file, "fasta")

        print(f"Converted: {filename} → {os.path.basename(output_file)}")

print("All conversions are done.")

# 7. Binary data reference seqeunce data generate

In [None]:
import os
import csv
from itertools import product

# Sequence definitions
LOW = "ACTCAATTAC"
HIGH = "ACACATTATC"
CONNECTOR = "ACTCATATAC"

# Set output directory
output_dir = "fastq/Logger_reference"
os.makedirs(output_dir, exist_ok=True)

# Generate sequences
sequences_cl = []
prefix = CONNECTOR + LOW  # Fixed prefix

for bits in product([0, 1], repeat=8):  # 2^8 combinations
    label = []
    parts = [prefix]

    for b in bits:
        parts.append(CONNECTOR)
        x = HIGH if b else LOW
        label.append("H" if b else "L")
        parts.append(x)

    seq_id = f"seq_CLH_{''.join(label)}"
    sequence = ''.join(parts)
    sequences_cl.append((seq_id, sequence))

# Save FASTA file
fasta_path = os.path.join(output_dir, "Logger_reference.fasta")
with open(fasta_path, "w") as f:
    for seq_id, seq in sequences_cl:
        f.write(f">{seq_id}\n{seq}\n")

# Save CSV file
csv_path = os.path.join(output_dir, "Logger_reference.csv")
with open(csv_path, "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Sequence_ID", "Sequence"])
    for seq_id, seq in sequences_cl:
        writer.writerow([seq_id, seq])

print("✅ Logger_reference.fasta and Logger_reference.csv have been created successfully.")
print(f" - Number of generated sequences: {len(sequences_cl)}")

# 8. Reference sequence - Sample Matching

In [None]:
# Index reference
!bwa index "fastq/Logger_reference/Logger_reference.fasta"

In [None]:
%%bash
# Set the path to the reference sequence file
reference_file="fastq/Logger_reference/Logger_reference.fasta"

# Set the directory containing your filtered FASTA files
fasta_directory="fastq/E_fastq_to_fasta"
# Set the output directory for aligned SAM files
output_dir="fastq/1_align_sam"

# Make sure the output directory exists or create it if necessary
mkdir -p "$output_dir"

# Iterate through filtered FASTA files in the specified directory
for fasta_file in "$fasta_directory"/*extendedFrags.fasta; do
    # Generate an output file name based on the input filename
    output_file="$output_dir/$(basename "$fasta_file" .fasta).sam"

    # Perform the BWA alignment 
    bwa mem -M -t 4 "$reference_file" "$fasta_file" > "$output_file"

    echo "Alignment completed for $fasta_file. Result saved as $output_file"
done

## 8.1 sam to bam

In [None]:
%%bash
# Set the path to the directory containing SAM files
sam_dir="fastq/1_align_sam"
# Set the output directory for BAM files
bam_dir="fastq/2_align_bam"

# Make sure the output directory exists or create it if necessary
mkdir -p "$bam_dir"

# Convert SAM files to BAM
for sam_file in "$sam_dir"/*.sam; do
    bam_file="$bam_dir/$(basename "$sam_file" .sam).bam"
    samtools view -bS "$sam_file" -o "$bam_file"
    echo "Conversion from $sam_file to $bam_file is complete."
done

## 8.2  Convert BAM to CSV

In [65]:
import os
import pysam
import pandas as pd

# Input folder (path where BAM files are located)
input_folder = "fastq/2_align_bam"
# Output folder (path to save CSV files)
output_folder = "fastq/3_align_csv"

# Create the output folder if it does not exist
os.makedirs(output_folder, exist_ok=True)

# Function to convert a BAM file to CSV, including optional fields
def bam_to_csv(bam_file, output_folder):
    output_csv = os.path.join(output_folder, os.path.basename(bam_file).replace(".bam", ".csv"))
    
    # Read the BAM file
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        records = []
        
        for read in bam:
            # Standard BAM fields
            record = {
                "QNAME": read.query_name,
                "FLAG": read.flag,
                "RNAME": bam.get_reference_name(read.reference_id) if read.reference_id >= 0 else "*",
                "POS": read.reference_start + 1,
                "MAPQ": read.mapping_quality,
                "CIGAR": read.cigarstring if read.cigarstring else "*",
                "RNEXT": bam.get_reference_name(read.next_reference_id) if read.next_reference_id >= 0 else "*",
                "PNEXT": read.next_reference_start + 1 if read.next_reference_start >= 0 else 0,
                "TLEN": read.template_length,
                "SEQ": read.query_sequence if read.query_sequence else "*",
                "QUAL": read.qual if read.qual else "*",
            }
            
            # Add optional fields (tags)
            for tag, value in read.tags:
                record[tag] = value

            records.append(record)
    
    # Create a DataFrame from the list of records
    df = pd.DataFrame(records)

    # Fill any missing optional fields with "*" instead of NaN for consistency
    df = df.fillna("*")

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)
    return output_csv

# Find all BAM files in the folder
bam_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".bam")]

# Convert all BAM files to CSV
csv_files = []
for bam_file in bam_files:
    csv_file = bam_to_csv(bam_file, output_folder)
    csv_files.append(csv_file)

# Print the list of converted CSV files
csv_files

['fastq/3_align_csv/Temp_30_12h_ID_match_FLASH.extendedFrags.csv',
 'fastq/3_align_csv/Temp_20_12h_ID_match_FLASH.extendedFrags.csv',
 'fastq/3_align_csv/Temp_60_12h_ID_match_FLASH.extendedFrags.csv',
 'fastq/3_align_csv/Temp_40_12h_ID_match_FLASH.extendedFrags.csv',
 'fastq/3_align_csv/Temp_50_12h_ID_match_FLASH.extendedFrags.csv']

## 8.3 Filter Alignments by MAPQ Score

In [66]:
import os
import pandas as pd
from pathlib import Path

# ===== Settings =====
input_dir = Path("fastq/3_align_csv") # Input folder containing CSV files
output_dir = input_dir / "MAPQ_removed"  # Output folder for filtered CSV files
output_dir.mkdir(parents=True, exist_ok=True)

MAPQ_THRESHOLD = 10     # Keep rows where MAPQ > this value
KEEP_NAN = True         # Keep rows with NaN MAPQ values (e.g., unaligned reads)
# ====================

def process_one_csv(in_path: Path, out_dir: Path, mapq_threshold: int, keep_nan: bool = True):
    out_path = out_dir / in_path.name

    # Remove existing output file to avoid duplicates
    if out_path.exists():
        out_path.unlink()

    # Read input CSV
    try:
        df = pd.read_csv(in_path)
    except Exception as e:
        print(f"⚠️  Read fail: {in_path.name} -> {e}")
        return

    # Skip if MAPQ column does not exist
    if "MAPQ" not in df.columns:
        print(f"⚠️  Skip (no MAPQ column): {in_path.name}")
        return

    # Convert MAPQ column to numeric (invalid entries become NaN)
    m = pd.to_numeric(df["MAPQ"], errors="coerce")

    # Filtering mask: keep MAPQ > threshold, optionally keep NaN
    keep_mask = (m > mapq_threshold) | (m.isna() if keep_nan else False)

    kept = int(keep_mask.sum())
    removed = int((~keep_mask).sum())

    # Save filtered CSV
    df.loc[keep_mask].to_csv(out_path, index=False)
    print(
        f"✅ {in_path.name} → {out_path.name} | kept={kept}, removed={removed} "
        f"| threshold={mapq_threshold}, keep_nan={keep_nan}"
    )

def main():
    csv_files = sorted(input_dir.glob("*.csv"))
    if not csv_files:
        print(f"⚠️  No CSV files in {input_dir}")
        return

    for p in csv_files:
        process_one_csv(p, output_dir, MAPQ_THRESHOLD, KEEP_NAN)

if __name__ == "__main__":
    main()

✅ Temp_20_12h_ID_match_FLASH.extendedFrags.csv → Temp_20_12h_ID_match_FLASH.extendedFrags.csv | kept=170, removed=315 | threshold=10, keep_nan=True
✅ Temp_30_12h_ID_match_FLASH.extendedFrags.csv → Temp_30_12h_ID_match_FLASH.extendedFrags.csv | kept=79, removed=672 | threshold=10, keep_nan=True
✅ Temp_40_12h_ID_match_FLASH.extendedFrags.csv → Temp_40_12h_ID_match_FLASH.extendedFrags.csv | kept=190, removed=842 | threshold=10, keep_nan=True
✅ Temp_50_12h_ID_match_FLASH.extendedFrags.csv → Temp_50_12h_ID_match_FLASH.extendedFrags.csv | kept=170, removed=727 | threshold=10, keep_nan=True
✅ Temp_60_12h_ID_match_FLASH.extendedFrags.csv → Temp_60_12h_ID_match_FLASH.extendedFrags.csv | kept=59, removed=730 | threshold=10, keep_nan=True


# Histogram Data Analysis

## A.1. point Histogram

In [72]:
import os
import pandas as pd

def simplify_rname_pattern(input_folder, output_folder=None):
    """
    From each CSV in `input_folder`, keep only QNAME/RNAME, strip 'seq_CLH_' from RNAME,
    split the remaining pattern into per-character columns, and save results to `output_folder`.
    If `output_folder` is None, save to <input_folder>/processed.
    """
    if output_folder is None:
        output_folder = os.path.join(input_folder, "processed")
    os.makedirs(output_folder, exist_ok=True)

    # Collect only source CSVs (avoid re-processing already produced files)
    csv_files = [
        f for f in os.listdir(input_folder)
        if f.endswith(".csv") and not f.endswith("_pattern_processed.csv")
    ]

    for file in csv_files:
        file_path = os.path.join(input_folder, file)
        try:
            df = pd.read_csv(file_path)

            if 'RNAME' not in df.columns or 'QNAME' not in df.columns:
                print(f"\u26a0\ufe0f Skipping {file} (missing 'QNAME' or 'RNAME' column)")
                continue

            # Keep only QNAME and RNAME
            df_simple = df[['QNAME', 'RNAME']].copy()

            # Remove 'seq_CLH_' prefix
            df_simple['Pattern'] = df_simple['RNAME'].str.replace(r'^seq_CLH_', '', regex=True)

            # Split pattern string into individual characters
            pattern_split = df_simple['Pattern'].apply(lambda x: pd.Series(list(x)))
            pattern_split.columns = [f'Pattern{i}' for i in range(pattern_split.shape[1])]

            # Concatenate
            df_final = pd.concat([df_simple, pattern_split], axis=1)

            # Save
            output_name = file.replace(".csv", "_pattern_processed.csv")
            output_path = os.path.join(output_folder, output_name)
            df_final.to_csv(output_path, index=False)

            print(f"\u2705 Processed: {file} \u2192 {output_path}")

        except Exception as e:
            print(f"\u26a0\ufe0f Failed to process {file}: {e}")

# Example usage (custom output folder)
simplify_rname_pattern(
    input_folder="fastq/3_align_csv/MAPQ_removed",
    output_folder="fastq/4_point_histogram"
)

✅ Processed: Temp_60_12h_ID_match_FLASH.extendedFrags.csv → fastq/4_point_histogram/Temp_60_12h_ID_match_FLASH.extendedFrags_pattern_processed.csv
✅ Processed: Temp_50_12h_ID_match_FLASH.extendedFrags.csv → fastq/4_point_histogram/Temp_50_12h_ID_match_FLASH.extendedFrags_pattern_processed.csv
✅ Processed: Temp_40_12h_ID_match_FLASH.extendedFrags.csv → fastq/4_point_histogram/Temp_40_12h_ID_match_FLASH.extendedFrags_pattern_processed.csv
✅ Processed: Temp_20_12h_ID_match_FLASH.extendedFrags.csv → fastq/4_point_histogram/Temp_20_12h_ID_match_FLASH.extendedFrags_pattern_processed.csv
✅ Processed: Temp_30_12h_ID_match_FLASH.extendedFrags.csv → fastq/4_point_histogram/Temp_30_12h_ID_match_FLASH.extendedFrags_pattern_processed.csv


## A.2. point count PNG, Normalized PNG, summary

In [73]:
import os
import pandas as pd
import matplotlib.pyplot as plt

def summarize_and_plot_all_processed_files(
    input_folder,
    output_folder=None,
    csv_dir=None,
    png_dir=None,
    png_norm_dir=None,
    pattern_suffix="_pattern_processed.csv"
):
    # ====== Resolve output directories ======
    if output_folder is not None:
        # If output_folder is given, use its subdirs unless explicit dirs are provided
        if csv_dir is None:
            csv_dir = os.path.join(output_folder, "count_csv")
        if png_dir is None:
            png_dir = os.path.join(output_folder, "count_png")
        if png_norm_dir is None:
            png_norm_dir = os.path.join(output_folder, "count_png_normalized")
    else:
        # Backward-compatible defaults under input_folder
        if csv_dir is None:
            csv_dir = os.path.join(input_folder, "count_csv")
        if png_dir is None:
            png_dir = os.path.join(input_folder, "count_png")
        if png_norm_dir is None:
            png_norm_dir = os.path.join(input_folder, "count_png_normalized")

    # Create folders if they do not exist
    os.makedirs(csv_dir, exist_ok=True)
    os.makedirs(png_dir, exist_ok=True)
    os.makedirs(png_norm_dir, exist_ok=True)

    # Select only files matching *_pattern_processed.csv
    processed_files = [f for f in os.listdir(input_folder) if f.endswith(pattern_suffix)]
    if not processed_files:
        print(f"⚠️ No files matching *{pattern_suffix} in {input_folder}")
        return

    for file in processed_files:
        file_path = os.path.join(input_folder, file)
        try:
            df = pd.read_csv(file_path)

            # Check for columns Pattern0 ~ Pattern7 (8 columns)
            pattern_cols = [f'Pattern{i}' for i in range(8)]
            if not all(col in df.columns for col in pattern_cols):
                print(f"⚠️ Skipping {file} (missing Pattern columns)")
                continue

            # Summarize counts for H/L per position
            summary = {
                val: [df[col].value_counts().get(val, 0) for col in pattern_cols]
                for val in ['H', 'L']
            }

            df_summary = pd.DataFrame(summary, index=pattern_cols).T
            df_summary.index.name = 'Value'

            base_name = file.replace(pattern_suffix, "")

            # 1) Save CSV (absolute counts)
            csv_out = os.path.join(csv_dir, f"{base_name}_pattern_count.csv")
            df_summary.to_csv(csv_out, index=True)

            # 2) Plot absolute counts (bar chart)
            ax = df_summary.T.plot(kind="bar", figsize=(10, 6),
                                   title=file, ylabel="Count")
            plt.xticks(rotation=0)
            plt.tight_layout()
            png_out = os.path.join(png_dir, f"{base_name}_pattern_count.png")
            plt.savefig(png_out)
            plt.close()

            # 3) Plot normalized proportions (stacked bar)
            df_norm = df_summary.T.div(df_summary.T.sum(axis=1), axis=0).fillna(0)
            ax = df_norm.plot(kind="bar", stacked=True, figsize=(10, 6),
                              title=f"{file} (Normalized)", ylabel="Proportion", ylim=(0, 1))
            plt.xticks(rotation=0)
            plt.tight_layout()
            png_norm_out = os.path.join(png_norm_dir, f"{base_name}_pattern_count_normalized.png")
            plt.savefig(png_norm_out)
            plt.close()

            print(f"✅ {file} -> CSV:{os.path.basename(csv_out)}, PNG:{os.path.basename(png_out)}, PNG(Norm):{os.path.basename(png_norm_out)}")

        except Exception as e:
            print(f"⚠️ Failed to process {file}: {e}")

# ===== Example usage =====
summarize_and_plot_all_processed_files(
    input_folder="fastq/4_point_histogram",
    output_folder="fastq/4_point_histogram/point_summary",
    pattern_suffix="_pattern_processed.csv"
)

✅ Temp_20_12h_ID_match_FLASH.extendedFrags_pattern_processed.csv -> CSV:Temp_20_12h_ID_match_FLASH.extendedFrags_pattern_count.csv, PNG:Temp_20_12h_ID_match_FLASH.extendedFrags_pattern_count.png, PNG(Norm):Temp_20_12h_ID_match_FLASH.extendedFrags_pattern_count_normalized.png
✅ Temp_60_12h_ID_match_FLASH.extendedFrags_pattern_processed.csv -> CSV:Temp_60_12h_ID_match_FLASH.extendedFrags_pattern_count.csv, PNG:Temp_60_12h_ID_match_FLASH.extendedFrags_pattern_count.png, PNG(Norm):Temp_60_12h_ID_match_FLASH.extendedFrags_pattern_count_normalized.png
✅ Temp_40_12h_ID_match_FLASH.extendedFrags_pattern_processed.csv -> CSV:Temp_40_12h_ID_match_FLASH.extendedFrags_pattern_count.csv, PNG:Temp_40_12h_ID_match_FLASH.extendedFrags_pattern_count.png, PNG(Norm):Temp_40_12h_ID_match_FLASH.extendedFrags_pattern_count_normalized.png
✅ Temp_30_12h_ID_match_FLASH.extendedFrags_pattern_processed.csv -> CSV:Temp_30_12h_ID_match_FLASH.extendedFrags_pattern_count.csv, PNG:Temp_30_12h_ID_match_FLASH.extendedF

## B.1. total

In [74]:
import os
import pandas as pd

# 📁 Folder paths
input_folder = "fastq/3_align_csv/MAPQ_removed"
histogram_folder = "fastq/4_align_histogram"
os.makedirs(histogram_folder, exist_ok=True)

# 📄 Process all CSV files in the folder
files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

for file_name in files:
    file_path = os.path.join(input_folder, file_name)

    # 🔧 Clean file name (remove unnecessary parts)
    clean_name = file_name
    clean_name = clean_name.replace("assemble", "")
    clean_name = clean_name.replace("ID_match_FLASH.extendedFrags", "")
    clean_name = clean_name.replace("__", "_").strip("_")  # remove duplicate/ending underscores
    output_csv = os.path.join(histogram_folder, f"histogram_{clean_name}")

    try:
        df = pd.read_csv(file_path, dtype=str)
        if 'RNAME' not in df.columns:
            print(f"⚠️ Skipping file: {file_name} (no 'RNAME' column found)")
            continue

        # Count RNAME occurrences and normalize
        rname_counts = df['RNAME'].value_counts().reset_index()
        rname_counts.columns = ['RNAME', 'Count']
        rname_counts.insert(0, 'File_Name', clean_name)
        rname_counts['Count'] = rname_counts['Count'].astype(int)
        total_count = rname_counts['Count'].sum()
        rname_counts['Normalized_Count'] = rname_counts['Count'] / total_count

        rname_counts.to_csv(output_csv, index=False)
        print(f"✅ Saved cleaned RNAME histogram: {output_csv}")

    except Exception as e:
        print(f"❌ Error processing file '{file_name}': {e}")

✅ Saved cleaned RNAME histogram: fastq/4_align_histogram/histogram_Temp_60_12h_.csv
✅ Saved cleaned RNAME histogram: fastq/4_align_histogram/histogram_Temp_50_12h_.csv
✅ Saved cleaned RNAME histogram: fastq/4_align_histogram/histogram_Temp_40_12h_.csv
✅ Saved cleaned RNAME histogram: fastq/4_align_histogram/histogram_Temp_20_12h_.csv
✅ Saved cleaned RNAME histogram: fastq/4_align_histogram/histogram_Temp_30_12h_.csv


## B.2. total L H count

In [76]:
import pandas as pd
import os
import re

input_folder = "fastq/4_align_histogram"
output_path = os.path.join(input_folder, "summary_LH_counts.csv")

summary = []

# Select only target files for analysis
files = [f for f in os.listdir(input_folder) if f.endswith('.csv') and f.startswith("histogram_")]

for file_name in sorted(files):
    file_path = os.path.join(input_folder, file_name)
    try:
        df = pd.read_csv(file_path)
        total_L = 0
        total_H = 0

        for _, row in df.iterrows():
            seq_id = row.get('RNAME') or row.get('Sequence_ID')
            count = int(row['Count'])

            # Extract L/H label (e.g., seq_XC_LHLLLLL → LHLLLLL)
            match = re.search(r'seq_[A-Z]+_([LH]+)', seq_id)
            if not match:
                continue
            label = match.group(1)

            l_count = label.count('L') * count
            h_count = label.count('H') * count

            total_L += l_count
            total_H += h_count

        summary.append({
            "File_Name": file_name,
            "Total_L": total_L,
            "Total_H": total_H
        })
        print(f"✅ Processed {file_name} → L: {total_L}, H: {total_H}")

    except Exception as e:
        print(f"❌ Error processing {file_name}: {e}")

# Save results
summary_df = pd.DataFrame(summary)
summary_df.to_csv(output_path, index=False)
print(f"\n📄 Summary saved to: {output_path}")

✅ Processed histogram_Temp_20_12h_.csv → L: 1294, H: 66
✅ Processed histogram_Temp_30_12h_.csv → L: 416, H: 216
✅ Processed histogram_Temp_40_12h_.csv → L: 555, H: 965
✅ Processed histogram_Temp_50_12h_.csv → L: 130, H: 1230
✅ Processed histogram_Temp_60_12h_.csv → L: 19, H: 453

📄 Summary saved to: fastq/4_align_histogram/summary_LH_counts.csv


## B.3. total L H count plot

In [77]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# File paths
input_path = "fastq/4_align_histogram/summary_LH_counts.csv"
output_dir = "fastq/4_align_histogram"
output_grouped = os.path.join(output_dir, "summary_LH_grouped_barplot.png")
output_stacked = os.path.join(output_dir, "summary_LH_stacked_barplot.png")
output_normalized = os.path.join(output_dir, "summary_LH_normalized_stacked_barplot.png")

# Load data
df = pd.read_csv(input_path)

# Clean label
df["Label"] = df["File_Name"].str.replace("histogram_", "").str.replace(".csv", "")
df = df.sort_values("Label").reset_index(drop=True)  # Sort by label

# X-axis settings
x = range(len(df))
labels = df["Label"]
bar_width = 0.4

# ----------------------------------
# 1. Grouped Bar Chart
# ----------------------------------
plt.figure(figsize=(14, 6))
plt.bar([i - bar_width/2 for i in x], df["Total_L"], width=bar_width, label="Total L", color='skyblue')
plt.bar([i + bar_width/2 for i in x], df["Total_H"], width=bar_width, label="Total H", color='salmon')
plt.xticks(ticks=x, labels=labels, rotation=45, ha='right')
plt.xlabel("File")
plt.ylabel("Base Count")
plt.title("Total L and H Base Counts per File (Grouped Bar Chart)")
plt.legend()
plt.tight_layout()
plt.savefig(output_grouped)
plt.close()
print(f"📊 Grouped bar plot saved to: {output_grouped}")

# ----------------------------------
# 2. Stacked Bar Chart (Raw Counts)
# ----------------------------------
plt.figure(figsize=(14, 6))
plt.bar(x, df["Total_L"], label="Total L", color='skyblue')
plt.bar(x, df["Total_H"], bottom=df["Total_L"], label="Total H", color='salmon')
plt.xticks(ticks=x, labels=labels, rotation=45, ha='right')
plt.xlabel("File")
plt.ylabel("Base Count")
plt.title("Total L and H Base Counts per File (Stacked Bar Chart)")
plt.legend()
plt.tight_layout()
plt.savefig(output_stacked)
plt.close()
print(f"📊 Stacked bar plot saved to: {output_stacked}")

# ----------------------------------
# 3. Normalized Stacked Bar Chart (Ratio 0~1)
# ----------------------------------
# Compute ratios
df["Total"] = df["Total_L"] + df["Total_H"]
df["L_ratio"] = df["Total_L"] / df["Total"]
df["H_ratio"] = df["Total_H"] / df["Total"]

plt.figure(figsize=(14, 6))
plt.bar(x, df["L_ratio"], label="L (Ratio)", color='skyblue')
plt.bar(x, df["H_ratio"], bottom=df["L_ratio"], label="H (Ratio)", color='salmon')
plt.xticks(ticks=x, labels=labels, rotation=45, ha='right')
plt.xlabel("File")
plt.ylabel("Ratio (0–1)")
plt.title("Normalized L and H Base Ratio per File (Stacked Bar Chart)")
plt.ylim(0, 1)
plt.legend()
plt.tight_layout()
plt.savefig(output_normalized)
plt.close()
print(f"📊 Normalized stacked bar plot saved to: {output_normalized}")

📊 Grouped bar plot saved to: fastq/4_align_histogram/summary_LH_grouped_barplot.png
📊 Stacked bar plot saved to: fastq/4_align_histogram/summary_LH_stacked_barplot.png
📊 Normalized stacked bar plot saved to: fastq/4_align_histogram/summary_LH_normalized_stacked_barplot.png
