# 1. Install Modules

In [1]:
# Command-line bioinformatics tools, installed system-wide through apt (Ubuntu).
# fastp is called in section 3; section 5 is titled as a FLASH merge step.
# NOTE(review): versions are not pinned here; the recorded run reported
# fastp 0.23.4, flash 1.2.11, bwa 0.7.17 and samtools 1.19.2.
!sudo apt-get update
!sudo apt-get install -y fastp flash bwa samtools

# Python packages. cutadapt (called in section 2) is installed through pip.
# --break-system-packages lets pip install into an externally managed
# (distribution-owned) Python environment instead of refusing.
# NOTE(review): package versions are unpinned, so a re-run may resolve newer releases.
!pip3 install biopython cutadapt pysam --break-system-packages

Hit:1 https://packages.microsoft.com/repos/code stable InRelease
Hit:2 http://ports.ubuntu.com/ubuntu-ports noble InRelease                     
Hit:3 http://ports.ubuntu.com/ubuntu-ports noble-updates InRelease             
Hit:4 http://ports.ubuntu.com/ubuntu-ports noble-backports InRelease
Hit:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu noble InRelease
Hit:6 http://ports.ubuntu.com/ubuntu-ports noble-security InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fastp is already the newest version (0.23.4+dfsg-1).
flash is already the newest version (1.2.11-2).
bwa is already the newest version (0.7.17-7).
samtools is already the newest version (1.19.2-1build2).
The following packages were automatically installed and are no longer required:
  pigz python3-xopen
Use 'sudo apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and 275 not upgraded.
[0m

# 2. Adapter Screening: Discard Adapter-Containing Read Pairs

In [8]:
import subprocess
import glob
import os

# Folder containing the raw paired-end input files (*_R1.fastq.gz / *_R2.fastq.gz).
input_folder = "fastq"
# Folder that receives the read pairs in which cutadapt found no adapter.
untrimmed_output_folder = "sequence_merge_method/A_untrimmed_output"

# Adapter sequences searched for in R1 (cutadapt -a) and R2 (cutadapt -A).
adapter_sequence_r1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
adapter_sequence_r2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"

# File-name suffixes that identify the two mates of a pair.
r1_suffix = "_R1.fastq.gz"
r2_suffix = "_R2.fastq.gz"

# Collect every R1 file together with its R2 mate.
# glob returns paths in arbitrary order, so sort them to make the processing
# (and log) order reproducible between runs and machines.
input_file_pairs = []
for input_r1 in sorted(glob.glob(os.path.join(input_folder, f"*{r1_suffix}"))):
    # Swap only the trailing suffix; str.replace() on the whole path would
    # also rewrite a directory name that happens to contain the suffix.
    input_r2 = input_r1[:-len(r1_suffix)] + r2_suffix
    if os.path.exists(input_r2):
        input_file_pairs.append({"r1": input_r1, "r2": input_r2})
    else:
        # Report the sample instead of dropping it silently.
        print(f"Warning: Corresponding R2 file not found for {input_r1}. Skipping.")

if not input_file_pairs:
    print(f"Warning: No R1/R2 FASTQ pairs found in {input_folder}.")

# Create the output folder if it doesn't exist.
os.makedirs(untrimmed_output_folder, exist_ok=True)

for input_files in input_file_pairs:
    input_r1 = input_files["r1"]
    input_r2 = input_files["r2"]

    # Output paths: <sample>_R1_untrimmed.fastq.gz / <sample>_R2_untrimmed.fastq.gz
    untrimmed_r1 = os.path.join(
        untrimmed_output_folder,
        os.path.basename(input_r1).replace(".fastq.gz", "_untrimmed.fastq.gz"),
    )
    untrimmed_r2 = os.path.join(
        untrimmed_output_folder,
        os.path.basename(input_r2).replace(".fastq.gz", "_untrimmed.fastq.gz"),
    )

    # Run cutadapt as a filter: nothing is actually trimmed in the output,
    # because every pair in which an adapter was found is thrown away.
    # With cutadapt's default pair filter ("any"), a hit in either mate
    # discards the whole pair, so R1 and R2 outputs stay synchronized.
    result = subprocess.run([
        "cutadapt",
        "-a", adapter_sequence_r1,  # 3' adapter to look for in R1
        "-A", adapter_sequence_r2,  # 3' adapter to look for in R2
        "-O", "15",                  # Require >= 15 bases of overlap to call an adapter match
        "--discard-trimmed",         # Discard pairs in which an adapter was found
        "-o", untrimmed_r1,          # Adapter-free R1 reads
        "-p", untrimmed_r2,          # Adapter-free R2 reads
        input_r1, input_r2
    ], capture_output=True, text=True)

    # Log the result; a failed sample is reported and the loop continues.
    if result.returncode == 0:
        print(f"Untrimmed sequences saved: {untrimmed_r1}, {untrimmed_r2}")
    else:
        print(f"Error processing {input_r1} and {input_r2}:\n{result.stderr}")

Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/DNA_Data_3SP26_R1_untrimmed.fastq.gz, sequence_merge_method/A_untrimmed_output/DNA_Data_3SP26_R2_untrimmed.fastq.gz
Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/DNA_Data_1D_R1_untrimmed.fastq.gz, sequence_merge_method/A_untrimmed_output/DNA_Data_1D_R2_untrimmed.fastq.gz
Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/DNA_Data_0N_R1_untrimmed.fastq.gz, sequence_merge_method/A_untrimmed_output/DNA_Data_0N_R2_untrimmed.fastq.gz
Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/DNA_Data_5I_R1_untrimmed.fastq.gz, sequence_merge_method/A_untrimmed_output/DNA_Data_5I_R2_untrimmed.fastq.gz
Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/DNA_Data_2S_R1_untrimmed.fastq.gz, sequence_merge_method/A_untrimmed_output/DNA_Data_2S_R2_untrimmed.fastq.gz
Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/DNA_Data_7T_R1_untrimmed.fastq.gz, s

# 3. Quality Filtering

In [9]:
import os
import subprocess

# Phred score a base must reach to count as "qualified" (fastp -q).
quality_threshold = 30

# Input: adapter-free reads from section 2. Output: quality-filtered reads.
input_folder = "sequence_merge_method/A_untrimmed_output"
output_folder = "sequence_merge_method/B_Qfiltered"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Files for which fastp returned a non-zero exit status.
failed_files = []

# Process every "*_untrimmed.fastq.gz" file. os.listdir returns names in
# arbitrary order, so sort them to keep the run order reproducible.
# Each file (R1 and R2 alike) is filtered independently in single-end mode,
# so mates can be dropped on one side only; pairing has to be restored afterwards.
for filename in sorted(os.listdir(input_folder)):
    if filename.endswith("_untrimmed.fastq.gz"):
        # Input file path
        input_file = os.path.join(input_folder, filename)

        # Output filename (e.g., sample_untrimmed.fastq.gz -> sample_Qfiltered.fastq.gz)
        output_file = os.path.join(
            output_folder,
            filename.replace("_untrimmed.fastq.gz", "_Qfiltered.fastq.gz")
        )

        # Execute fastp in single-end mode and keep its exit status.
        exit_code = subprocess.call([
            "fastp",
            "-i", input_file,                      # Input file
            "-o", output_file,                     # Output file
            "-q", str(quality_threshold),          # Phred score a base needs to be "qualified"
            "-u", "15",                            # Allow at most 15% unqualified bases per read
            "-l", "151",                           # Discard reads shorter than 151 bases
            # NOTE(review): per the fastp documentation, --cut_mean_quality is the
            # threshold used by the sliding-window cut_front/cut_tail/cut_right
            # operations. None of them is enabled here, so this flag appears to have
            # no effect. Discarding reads by their mean quality is done with
            # -e/--average_qual. Kept unchanged so results stay identical.
            "--cut_mean_quality", "30",
            "--html", f"{output_file}.html",       # HTML report file path
            "--json", f"{output_file}.json"        # JSON report file path
        ])

        # Do not announce success when fastp failed: its output may be
        # missing or truncated and must not be trusted downstream.
        if exit_code != 0:
            failed_files.append(filename)
            print(f"Error: fastp exited with status {exit_code} for {filename}.\n")
            continue

        print(f"Filtering for {filename} is complete.\n"
              f"Output FASTQ : {output_file}\n"
              f"Reports      : {output_file}.html / {output_file}.json\n")

if failed_files:
    print(f"Filtering failed for {len(failed_files)} file(s): {', '.join(failed_files)}")

print("All filtering processes are done.")

Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 867
total bases: 130917
Q20 bases: 110998(84.785%)
Q30 bases: 96576(73.7689%)

Read1 after filtering:
total reads: 213
total bases: 32163
Q20 bases: 30934(96.1788%)
Q30 bases: 28893(89.833%)

Filtering result:
reads passed filter: 213
reads failed due to low quality: 654
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 12.9181%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_4G_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_4G_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_4G_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_4G_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Da

Filtering for DNA_Data_4G_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_4G_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_4G_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_4G_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 774
total bases: 116874
Q20 bases: 99962(85.5297%)
Q30 bases: 87120(74.5418%)

Read1 after filtering:
total reads: 212
total bases: 32012
Q20 bases: 31048(96.9886%)
Q30 bases: 29026(90.6722%)

Filtering result:
reads passed filter: 212
reads failed due to low quality: 562
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 14.9871%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_5I_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_5I_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_5I_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_5I_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_5I_R1_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilte

Filtering for DNA_Data_5I_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_5I_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_5I_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_5I_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 771
total bases: 116421
Q20 bases: 99711(85.6469%)
Q30 bases: 86468(74.2718%)

Read1 after filtering:
total reads: 214
total bases: 32314
Q20 bases: 31248(96.7011%)
Q30 bases: 29065(89.9455%)

Filtering result:
reads passed filter: 214
reads failed due to low quality: 557
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 14.5266%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_7T_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_7T_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_7T_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_7T_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_7T_R1_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilte

Filtering for DNA_Data_7T_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_7T_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_7T_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_7T_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 774
total bases: 116874
Q20 bases: 100577(86.0559%)
Q30 bases: 86450(73.9685%)

Read1 after filtering:
total reads: 216
total bases: 32616
Q20 bases: 31631(96.98%)
Q30 bases: 29247(89.6707%)

Filtering result:
reads passed filter: 216
reads failed due to low quality: 558
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 14.4703%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_5I_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_5I_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_5I_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_5I_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_5I_R2_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilter

Filtering for DNA_Data_5I_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_5I_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_5I_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_5I_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 771
total bases: 116421
Q20 bases: 99857(85.7723%)
Q30 bases: 85933(73.8123%)

Read1 after filtering:
total reads: 230
total bases: 34730
Q20 bases: 33795(97.3078%)
Q30 bases: 31170(89.7495%)

Filtering result:
reads passed filter: 230
reads failed due to low quality: 541
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 18.9364%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_7T_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_7T_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_7T_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_7T_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_7T_R2_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilte

Filtering for DNA_Data_7T_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_7T_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_7T_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_7T_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 867
total bases: 130917
Q20 bases: 113222(86.4838%)
Q30 bases: 97238(74.2745%)

Read1 after filtering:
total reads: 250
total bases: 37750
Q20 bases: 36731(97.3007%)
Q30 bases: 33973(89.9947%)

Filtering result:
reads passed filter: 250
reads failed due to low quality: 617
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 20.7612%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_4G_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_4G_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_4G_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_4G_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_4G_R2_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilt

Filtering for DNA_Data_4G_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_4G_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_4G_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_4G_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 4144
total bases: 625744
Q20 bases: 567365(90.6705%)
Q30 bases: 523150(83.6045%)

Read1 after filtering:
total reads: 2527
total bases: 381577
Q20 bases: 374028(98.0216%)
Q30 bases: 359194(94.1341%)

Filtering result:
reads passed filter: 2527
reads failed due to low quality: 1617
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 41.4817%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_3SP26_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R2_Qfiltered.fastq.gz.html --json sequen

Filtering for DNA_Data_3SP26_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 804
total bases: 121404
Q20 bases: 105164(86.6232%)
Q30 bases: 90694(74.7043%)

Read1 after filtering:
total reads: 231
total bases: 34881
Q20 bases: 33915(97.2306%)
Q30 bases: 31283(89.6849%)

Filtering result:
reads passed filter: 231
reads failed due to low quality: 573
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 21.7662%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_1D_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_1D_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_1D_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_1D_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_1D_R2_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilt

Filtering for DNA_Data_1D_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_1D_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_1D_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_1D_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 804
total bases: 121404
Q20 bases: 104389(85.9848%)
Q30 bases: 91086(75.0272%)

Read1 after filtering:
total reads: 218
total bases: 32918
Q20 bases: 31587(95.9566%)
Q30 bases: 29409(89.3402%)

Filtering result:
reads passed filter: 218
reads failed due to low quality: 586
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 13.3085%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_1D_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_1D_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_1D_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_1D_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_1D_R1_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilt

Filtering for DNA_Data_1D_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_1D_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_1D_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_1D_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 4144
total bases: 625744
Q20 bases: 569626(91.0318%)
Q30 bases: 520618(83.1998%)

Read1 after filtering:
total reads: 2438
total bases: 368138
Q20 bases: 362244(98.399%)
Q30 bases: 348230(94.5922%)

Filtering result:
reads passed filter: 2438
reads failed due to low quality: 1706
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 45.6081%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_3SP26_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R1_Qfiltered.fastq.gz.html --json sequenc

Filtering for DNA_Data_3SP26_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_3SP26_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 825
total bases: 124575
Q20 bases: 106281(85.3149%)
Q30 bases: 92297(74.0895%)

Read1 after filtering:
total reads: 227
total bases: 34277
Q20 bases: 33136(96.6712%)
Q30 bases: 30917(90.1975%)

Filtering result:
reads passed filter: 227
reads failed due to low quality: 598
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 14.5455%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_6S_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_6S_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_6S_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_6S_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_6S_R1_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilt

Filtering for DNA_Data_6S_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_6S_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_6S_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_6S_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 808
total bases: 122008
Q20 bases: 104834(85.9239%)
Q30 bases: 89977(73.7468%)

Read1 after filtering:
total reads: 200
total bases: 30200
Q20 bases: 29240(96.8212%)
Q30 bases: 26938(89.1987%)

Filtering result:
reads passed filter: 200
reads failed due to low quality: 608
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 16.2129%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_2S_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_2S_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_2S_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_2S_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_2S_R2_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilt

Filtering for DNA_Data_2S_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_2S_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_2S_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_2S_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 721
total bases: 108871
Q20 bases: 94451(86.755%)
Q30 bases: 81732(75.0723%)

Read1 after filtering:
total reads: 236
total bases: 35636
Q20 bases: 34651(97.2359%)
Q30 bases: 32101(90.0803%)

Filtering result:
reads passed filter: 236
reads failed due to low quality: 485
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 23.8558%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_0N_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_0N_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_0N_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_0N_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_0N_R2_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilter

Filtering for DNA_Data_0N_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_0N_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_0N_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_0N_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 808
total bases: 122008
Q20 bases: 104048(85.2797%)
Q30 bases: 90596(74.2541%)

Read1 after filtering:
total reads: 200
total bases: 30200
Q20 bases: 29104(96.3709%)
Q30 bases: 27065(89.6192%)

Filtering result:
reads passed filter: 200
reads failed due to low quality: 608
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 9.77723%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_2S_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_2S_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_2S_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_2S_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_2S_R1_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilt

Filtering for DNA_Data_2S_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_2S_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_2S_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_2S_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 721
total bases: 108871
Q20 bases: 92957(85.3827%)
Q30 bases: 81028(74.4257%)

Read1 after filtering:
total reads: 194
total bases: 29294
Q20 bases: 28171(96.1665%)
Q30 bases: 26218(89.4996%)

Filtering result:
reads passed filter: 194
reads failed due to low quality: 527
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 14.9792%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_0N_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_0N_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_0N_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_0N_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_0N_R1_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilte

Filtering for DNA_Data_0N_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_0N_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_0N_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_0N_R1_Qfiltered.fastq.gz.json

Filtering for DNA_Data_6S_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/DNA_Data_6S_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/DNA_Data_6S_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/DNA_Data_6S_R2_Qfiltered.fastq.gz.json

All filtering processes are done.


Read1 before filtering:
total reads: 825
total bases: 124575
Q20 bases: 107887(86.6041%)
Q30 bases: 92988(74.6442%)

Read1 after filtering:
total reads: 228
total bases: 34428
Q20 bases: 33388(96.9792%)
Q30 bases: 30766(89.3633%)

Filtering result:
reads passed filter: 228
reads failed due to low quality: 597
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 15.3939%

JSON report: sequence_merge_method/B_Qfiltered/DNA_Data_6S_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/DNA_Data_6S_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/DNA_Data_6S_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/DNA_Data_6S_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/DNA_Data_6S_R2_Qfiltered.fastq.gz.html --json sequence_merge_method/B_Qfilt

# 4. Match Paired-End Read IDs

In [10]:
import gzip
import glob
import os

def extract_matching_reads(r1_path, r2_path, out_r1_path, out_r2_path):
    """Keep only the reads whose ID occurs in both gzipped FASTQ files.

    Both inputs are scanned once to collect their read IDs; the IDs present
    in R1 and R2 are then used to copy the matching 4-line records of each
    input into its own output file, preserving each file's record order.
    A short summary (totals, matches, IDs unique to either side) is printed.

    Args:
        r1_path: Path to the gzipped R1 FASTQ file.
        r2_path: Path to the gzipped R2 FASTQ file.
        out_r1_path: Destination for the matching R1 records (gzipped).
        out_r2_path: Destination for the matching R2 records (gzipped).

    Returns:
        None. Results are written to ``out_r1_path`` and ``out_r2_path``.
    """
    def get_read_id(header):
        # The ID is the first whitespace-separated token of the header line.
        fields = header.split()
        read_id = fields[0] if fields else ""
        # Drop the mate marker only when it is a trailing "/1" or "/2";
        # removing it anywhere in the ID could make unrelated IDs collide.
        if read_id.endswith(("/1", "/2")):
            read_id = read_id[:-2]
        return read_id

    def iter_records(handle):
        # Yield each FASTQ record as its four raw lines (header, sequence, +, quality).
        while True:
            header = handle.readline()
            if not header:
                return
            if not header.strip():
                # Tolerate stray blank lines, e.g. a trailing newline at end of file.
                continue
            yield [header] + [handle.readline() for _ in range(3)]

    def collect_ids(path):
        # Read IDs of every record in one gzipped FASTQ file.
        with gzip.open(path, 'rt') as handle:
            return {get_read_id(record[0]) for record in iter_records(handle)}

    r1_ids = collect_ids(r1_path)
    r2_ids = collect_ids(r2_path)

    # Find common and unique IDs
    matching_ids = r1_ids & r2_ids
    r1_only = r1_ids - r2_ids
    r2_only = r2_ids - r1_ids

    print(f"Processing {os.path.basename(r1_path)} and {os.path.basename(r2_path)}")
    print(f"Total R1 IDs: {len(r1_ids)}, Total R2 IDs: {len(r2_ids)}, Matching IDs: {len(matching_ids)}")
    print(f"IDs only in R1: {len(r1_only)}, IDs only in R2: {len(r2_only)}\n")

    # Create the directory of BOTH outputs. A bare file name has an empty
    # dirname, which os.makedirs rejects, so only create non-empty ones.
    for out_path in (out_r1_path, out_r2_path):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    def write_matching_reads(input_path, output_path, matching_ids):
        # Copy the records whose ID is in matching_ids, unchanged and in input order.
        with gzip.open(input_path, 'rt') as infile, gzip.open(output_path, 'wt') as outfile:
            for record in iter_records(infile):
                if get_read_id(record[0]) in matching_ids:
                    outfile.writelines(record)

    # Write the filtered R1 and R2 files
    write_matching_reads(r1_path, out_r1_path, matching_ids)
    write_matching_reads(r2_path, out_r2_path, matching_ids)

# --------------------------
# Run the ID matching on every R1/R2 pair
# --------------------------

input_folder = "sequence_merge_method/B_Qfiltered"
output_folder = "sequence_merge_method/C_id_matched"

# Every quality-filtered R1 file is the anchor of one candidate pair.
r1_files = glob.glob(os.path.join(input_folder, "*_R1_Qfiltered.fastq.gz"))

for r1_file in r1_files:
    # The mate is expected next to R1, under the same name with R2 in place of R1.
    r2_file = r1_file.replace("_R1_Qfiltered.fastq.gz", "_R2_Qfiltered.fastq.gz")

    # Guard clause: without a mate there is nothing to match against.
    if not os.path.exists(r2_file):
        print(f"Warning: Corresponding R2 file not found for {r1_file}. Skipping.")
        continue

    # Sample name = R1 file name without the "_R1_Qfiltered.fastq.gz" suffix.
    base_name = os.path.basename(r1_file).replace("_R1_Qfiltered.fastq.gz", "")
    out_r1, out_r2 = (
        os.path.join(output_folder, f"{base_name}_ID_match_{mate}.fastq.gz")
        for mate in ("R1", "R2")
    )

    extract_matching_reads(r1_file, r2_file, out_r1, out_r2)

Processing DNA_Data_1D_R1_Qfiltered.fastq.gz and DNA_Data_1D_R2_Qfiltered.fastq.gz
Total R1 IDs: 218, Total R2 IDs: 231, Matching IDs: 146
IDs only in R1: 72, IDs only in R2: 85

Processing DNA_Data_3SP26_R1_Qfiltered.fastq.gz and DNA_Data_3SP26_R2_Qfiltered.fastq.gz
Total R1 IDs: 2438, Total R2 IDs: 2527, Matching IDs: 2148
IDs only in R1: 290, IDs only in R2: 379

Processing DNA_Data_0N_R1_Qfiltered.fastq.gz and DNA_Data_0N_R2_Qfiltered.fastq.gz
Total R1 IDs: 194, Total R2 IDs: 236, Matching IDs: 136
IDs only in R1: 58, IDs only in R2: 100

Processing DNA_Data_2S_R1_Qfiltered.fastq.gz and DNA_Data_2S_R2_Qfiltered.fastq.gz
Total R1 IDs: 200, Total R2 IDs: 200, Matching IDs: 119
IDs only in R1: 81, IDs only in R2: 81

Processing DNA_Data_6S_R1_Qfiltered.fastq.gz and DNA_Data_6S_R2_Qfiltered.fastq.gz
Total R1 IDs: 227, Total R2 IDs: 228, Matching IDs: 140
IDs only in R1: 87, IDs only in R2: 88

Processing DNA_Data_4G_R1_Qfiltered.fastq.gz and DNA_Data_4G_R2_Qfiltered.fastq.gz
Total R1 I

# 5. Merge with FLASH

## 5.1 Split R1 and R2 Reads into Front and Back Fragments

In [11]:
import gzip
import glob
import os

def split_fastq_by_position(r1_path, r2_path, n, output_dir, read_length=151):
    """Split every read of a paired FASTQ into a front and a back fragment.

    For each record the front fragment is the first ``read_length - n``
    characters of the sequence/quality lines and the back fragment is the last
    ``n`` characters. Four gzip-compressed FASTQ files are written to
    ``output_dir``: ``<sample>_R1_F``, ``<sample>_R1_B``, ``<sample>_R2_F`` and
    ``<sample>_R2_B`` (``<sample>`` is the R1 filename without its
    ``_ID_match_R1.fastq.gz`` suffix).

    Args:
        r1_path: gzip-compressed R1 FASTQ file.
        r2_path: gzip-compressed R2 FASTQ file, read in lockstep with R1.
        n: Length of the back fragment; must satisfy ``1 <= n <= read_length``.
        output_dir: Destination folder (created if missing).
        read_length: Nominal read length used to size the front fragment.
            Defaults to 151, the value that was previously hard-coded.

    Raises:
        ValueError: If ``n`` is outside ``1..read_length``. Such values used to
            be accepted silently (``seq[-0:]`` is the whole read, and a
            negative front length trims from the wrong end).
    """
    if not 0 < n <= read_length:
        raise ValueError(
            f"n must be between 1 and read_length ({read_length}), got {n}"
        )

    os.makedirs(output_dir, exist_ok=True)
    front_len = read_length - n

    sample_base = os.path.basename(r1_path).replace("_ID_match_R1.fastq.gz", "")
    r1_f_path = os.path.join(output_dir, f"{sample_base}_R1_F.fastq.gz")
    r1_b_path = os.path.join(output_dir, f"{sample_base}_R1_B.fastq.gz")
    r2_f_path = os.path.join(output_dir, f"{sample_base}_R2_F.fastq.gz")
    r2_b_path = os.path.join(output_dir, f"{sample_base}_R2_B.fastq.gz")

    with gzip.open(r1_path, 'rt') as r1_file, \
         gzip.open(r2_path, 'rt') as r2_file, \
         gzip.open(r1_f_path, 'wt') as r1_f_out, \
         gzip.open(r1_b_path, 'wt') as r1_b_out, \
         gzip.open(r2_f_path, 'wt') as r2_f_out, \
         gzip.open(r2_b_path, 'wt') as r2_b_out:

        while True:
            # A FASTQ record is four lines: header, sequence, '+', quality.
            r1_lines = [r1_file.readline() for _ in range(4)]
            r2_lines = [r2_file.readline() for _ in range(4)]

            # Stop as soon as either file runs out of records.
            if not r1_lines[0] or not r2_lines[0]:
                break

            # R1 and R2 are split the same way, each into its own F/B pair.
            for lines, front_out, back_out in (
                (r1_lines, r1_f_out, r1_b_out),
                (r2_lines, r2_f_out, r2_b_out),
            ):
                header, seq, plus, qual = (line.strip() for line in lines)
                front_out.write(
                    f"{header}\n{seq[:front_len]}\n{plus}\n{qual[:front_len]}\n"
                )
                back_out.write(f"{header}\n{seq[-n:]}\n{plus}\n{qual[-n:]}\n")

    print(f"✅ Split complete for: {sample_base} → {output_dir} (N={n})")

# -----------------------------------
# Apply the split function to all files
# -----------------------------------

input_folder = "sequence_merge_method/C_id_matched"
output_folder = "sequence_merge_method/D_split_reads"
os.makedirs(output_folder, exist_ok=True)

# N-value (length of the back part) for each sample key.
# A sample key is one "_"-separated token of the filename,
# e.g. "3SP26" in "DNA_Data_3SP26_ID_match_R1.fastq.gz".
sample_n_mapping = {
    "0N": 126,
    "1D": 126,
    "2S": 126,
    "3G": 124,
    "4I": 128,
    "5S": 124,
    "6T": 122,
    "5K": 124,
    "1X8": 116,
    "0X8": 132,
    "4G": 126,
    "5I": 126,
    "6S": 124,
    "7T": 120,
    "3SP26": 122,
    "3SP31": 118
}


# Find all R1 files (sorted so the processing order is reproducible)
r1_files = sorted(glob.glob(os.path.join(input_folder, "*_ID_match_R1.fastq.gz")))

for r1_file in r1_files:
    r2_file = r1_file.replace("_R1.fastq.gz", "_R2.fastq.gz")

    if not os.path.exists(r2_file):
        print(f"⚠️ Matching R2 file not found: {r2_file}")
        continue

    # Look the sample key up by exact token match. A substring test would let
    # a key such as "0N" match an unrelated name like "..._10N_...".
    name_tokens = os.path.basename(r1_file).split("_")
    matched_n = next(
        (sample_n_mapping[token] for token in name_tokens if token in sample_n_mapping),
        None,
    )

    if matched_n is None:
        print(f"⚠️ Could not find N value for: {r1_file} → Skipping")
        continue

    # Execute the split function
    split_fastq_by_position(r1_file, r2_file, matched_n, output_folder)

✅ Split complete for: DNA_Data_2S → sequence_merge_method/D_split_reads (N=126)
✅ Split complete for: DNA_Data_4G → sequence_merge_method/D_split_reads (N=126)
✅ Split complete for: DNA_Data_6S → sequence_merge_method/D_split_reads (N=124)
✅ Split complete for: DNA_Data_1D → sequence_merge_method/D_split_reads (N=126)
✅ Split complete for: DNA_Data_7T → sequence_merge_method/D_split_reads (N=120)
✅ Split complete for: DNA_Data_3SP26 → sequence_merge_method/D_split_reads (N=122)
✅ Split complete for: DNA_Data_5I → sequence_merge_method/D_split_reads (N=126)
✅ Split complete for: DNA_Data_0N → sequence_merge_method/D_split_reads (N=126)


## 5.2 Reverse-complement the R2 fragments

In [12]:
import gzip
import glob
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def reverse_complement_fastq(input_fastq_path, output_fastq_path):
    """Write the reverse complement of every record of a gzipped FASTQ file.

    Each record keeps its ID and description. The result is written,
    gzip-compressed, to ``output_fastq_path``.
    """
    with gzip.open(input_fastq_path, "rt") as source, \
         gzip.open(output_fastq_path, "wt") as sink:
        # Lazily flip each record and hand the whole stream to the writer.
        flipped_records = (
            entry.reverse_complement(id=True, description=True)
            for entry in SeqIO.parse(source, "fastq")
        )
        SeqIO.write(flipped_records, sink, "fastq")

    print(f"✅ Reverse complemented: {os.path.basename(output_fastq_path)}")

# --------------------------------------------------
# Reverse-complement every R2 fragment file
# --------------------------------------------------

input_folder = "sequence_merge_method/D_split_reads"
os.makedirs(input_folder, exist_ok=True)

# The [BF] character class selects only the R2 back (B) and front (F) fragments.
input_files = glob.glob(os.path.join(input_folder, "*_R2_[BF].fastq.gz"))

for input_path in input_files:
    # "<name>.fastq.gz" becomes "<name>_revcomp.fastq.gz", next to the input file.
    name_without_ext = os.path.basename(input_path).replace(".fastq.gz", "")
    output_path = os.path.join(input_folder, name_without_ext + "_revcomp.fastq.gz")

    reverse_complement_fastq(input_path, output_path)

✅ Reverse complemented: DNA_Data_6S_R2_F_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_7T_R2_F_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_1D_R2_B_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_5I_R2_B_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_3SP26_R2_F_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_0N_R2_B_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_4G_R2_B_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_2S_R2_F_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_4G_R2_F_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_2S_R2_B_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_0N_R2_F_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_3SP26_R2_B_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_5I_R2_F_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_7T_R2_B_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_6S_R2_B_revcomp.fastq.gz
✅ Reverse complemented: DNA_Data_1D_R2_F_revcomp.fastq.gz


## 5.3 [R1_back]-[R2_back] merge (FLASH)

In [13]:
import os
import glob
import subprocess

# === Folder Setup ===
input_folder = "sequence_merge_method/D_split_reads"
output_folder = "sequence_merge_method/E_merged_output"
os.makedirs(output_folder, exist_ok=True)

# === Set N-values (Overlap Length) per Sample Key ===
# A sample key is one "_"-separated token of the sample name,
# e.g. "3SP26" in "DNA_Data_3SP26".
sample_n_mapping = {
    "0N": 126,
    "1D": 126,
    "2S": 126,
    "3G": 124,
    "4I": 128,
    "5S": 124,
    "6T": 122,
    "5K": 124,
    "1X8": 116,
    "0X8": 132,
    "4G": 126,
    "5I": 126,
    "6S": 124,
    "7T": 120,
    "3SP26": 122,
    "3SP31": 118
}

# === Find List of all R1_B Files (sorted for a reproducible run order) ===
r1_files = sorted(glob.glob(os.path.join(input_folder, "*_R1_B.fastq.gz")))

print(f"🔎 Found {len(r1_files)} R1_B files.")

# === Process Each R1_B File ===
for r1_path in r1_files:
    sample_base = os.path.basename(r1_path).replace("_R1_B.fastq.gz", "")
    r2_path = os.path.join(input_folder, f"{sample_base}_R2_B.fastq.gz")

    if not os.path.exists(r2_path):
        print(f"⚠️ Matching R2_B file not found for {sample_base} → Skipping.")
        continue

    # Look the sample key up by exact token match. A substring test would let
    # a key such as "0N" match an unrelated name like "..._10N".
    matched_n = next(
        (
            sample_n_mapping[token]
            for token in sample_base.split("_")
            if token in sample_n_mapping
        ),
        None,
    )

    if matched_n is None:
        print(f"⚠️ No N value matched for {sample_base} → Skipping.")
        continue

    output_name = f"{sample_base}_FLASH"

    print(f"🔵 Running FLASH for sample: {sample_base} (N={matched_n})")

    try:
        # Min and max overlap are both N, so only pairs that overlap by
        # exactly N bases are accepted. The argument list avoids shell quoting.
        subprocess.check_call([
            "flash",
            "-m", str(matched_n),   # minimum overlap
            "-M", str(matched_n),   # Maximum overlap
            "-o", output_name,      # Output file prefix
            "-d", output_folder,    # Output directory
            r1_path,
            r2_path
        ])
        print(f"✅ FLASH merging complete → {os.path.join(output_folder, output_name)}.extendedFrags.fastq")
    except subprocess.CalledProcessError as e:
        # Report the failing sample and carry on with the remaining ones.
        print(f"❌ FLASH merging failed for {sample_base}: {e}")

🔎 Found 8 R1_B files.
🔵 Running FLASH for sample: DNA_Data_3SP26 (N=122)
[FLASH] Starting FLASH v1.2.11
[FLASH] Fast Length Adjustment of SHort reads
[FLASH]  
[FLASH] Input files:
[FLASH]     sequence_merge_method/D_split_reads/DNA_Data_3SP26_R1_B.fastq.gz
[FLASH]     sequence_merge_method/D_split_reads/DNA_Data_3SP26_R2_B.fastq.gz
[FLASH]  
[FLASH] Output files:
[FLASH]     sequence_merge_method/E_merged_output/DNA_Data_3SP26_FLASH.extendedFrags.fastq
[FLASH]     sequence_merge_method/E_merged_output/DNA_Data_3SP26_FLASH.notCombined_1.fastq
[FLASH]     sequence_merge_method/E_merged_output/DNA_Data_3SP26_FLASH.notCombined_2.fastq
[FLASH]     sequence_merge_method/E_merged_output/DNA_Data_3SP26_FLASH.hist
[FLASH]     sequence_merge_method/E_merged_output/DNA_Data_3SP26_FLASH.histogram
[FLASH]  
[FLASH] Parameters:
[FLASH]     Min overlap:           122
[FLASH]     Max overlap:           122
[FLASH]     Max mismatch density:  0.250000
[FLASH]     Allow "outie" pairs:   false
[FLASH]   

[FLASH]  
[FLASH] Read combination statistics:
[FLASH]     Total pairs:      135
[FLASH]     Combined pairs:   47
[FLASH]     Uncombined pairs: 88
[FLASH]     Percent combined: 34.81%
[FLASH]  
[FLASH] Writing histogram files.
[FLASH]  
[FLASH] FLASH v1.2.11 complete!
[FLASH] 0.016 seconds elapsed
✅ FLASH merging complete → sequence_merge_method/E_merged_output/DNA_Data_5I_FLASH.extendedFrags.fastq


## 5.4 Assemble 
## R1_Front - [R1_Back]-[R2_Back]_merged (FLASH) - R2_Front_ReverseComplement

In [14]:
import os
import gzip
import glob
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def load_fastq_to_dict(file_path):
    """Read a FASTQ file (plain or ``.gz``) into ``{read_id: (sequence, quality)}``.

    ``sequence`` is the read as a string and ``quality`` is the record's
    ``phred_quality`` annotation. If an ID occurs more than once, the last
    record wins.
    """
    opener = gzip.open if file_path.endswith(".gz") else open

    with opener(file_path, "rt") as handle:
        return {
            entry.id: (str(entry.seq), entry.letter_annotations["phred_quality"])
            for entry in SeqIO.parse(handle, "fastq")
        }

def assemble_fastq(r1_path, merged_path, r2_path, output_path):
    """Concatenate R1_F + merged fragment + R2_F_revcomp for every merged read.

    ``merged_path`` is read as an uncompressed FASTQ. ``r1_path`` and
    ``r2_path`` are loaded with ``load_fastq_to_dict``. Merged reads whose ID
    is missing from either fragment file are dropped. The result is written,
    gzip-compressed, to ``output_path``.
    """
    print(f"🔄 Assembling for sample: {os.path.basename(output_path)}")
    front_by_id = load_fastq_to_dict(r1_path)
    tail_by_id = load_fastq_to_dict(r2_path)

    with open(merged_path, "r") as merged_file, gzip.open(output_path, "wt") as output_file:
        for middle in SeqIO.parse(merged_file, "fastq"):
            front = front_by_id.get(middle.id)
            tail = tail_by_id.get(middle.id)

            # Both flanking fragments are required; otherwise skip the read.
            if front is None or tail is None:
                continue

            front_seq, front_qual = front
            tail_seq, tail_qual = tail

            # Order: R1_F → merged fragment → R2_F_revcomp (bases and qualities alike).
            assembled = SeqRecord(
                Seq(front_seq + str(middle.seq) + tail_seq),
                id=middle.id,
                description="",
                letter_annotations={
                    "phred_quality": front_qual
                    + middle.letter_annotations["phred_quality"]
                    + tail_qual
                },
            )

            SeqIO.write(assembled, output_file, "fastq")

    print(f"✅ Assembled FASTQ saved: {output_path}")

# ===== Batch processing =====

# Where the FLASH output and the split fragments live, and where assemblies go.
input_merged_folder = "sequence_merge_method/E_merged_output"
input_split_folder = "sequence_merge_method/D_split_reads"
output_folder = "sequence_merge_method/1_assemble"
os.makedirs(output_folder, exist_ok=True)

# One "*.extendedFrags.fastq" file per sample that FLASH merged.
merged_files = glob.glob(os.path.join(input_merged_folder, "*_FLASH.extendedFrags.fastq"))

print(f"🔍 Found {len(merged_files)} merged samples to assemble.")

for merged_file in merged_files:
    sample_base = os.path.basename(merged_file).replace("_FLASH.extendedFrags.fastq", "")

    r1_path = os.path.join(input_split_folder, sample_base + "_R1_F.fastq.gz")
    r2_path = os.path.join(input_split_folder, sample_base + "_R2_F_revcomp.fastq.gz")
    output_path = os.path.join(output_folder, sample_base + "_assemble.fastq.gz")

    # Both flanking fragment files must exist before a sample can be assembled.
    if not (os.path.exists(r1_path) and os.path.exists(r2_path)):
        print(f"⚠️ Missing split files for {sample_base}, skipping.")
        continue

    assemble_fastq(r1_path, merged_file, r2_path, output_path)

🔍 Found 8 merged samples to assemble.
🔄 Assembling for sample: DNA_Data_7T_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/DNA_Data_7T_assemble.fastq.gz
🔄 Assembling for sample: DNA_Data_5I_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/DNA_Data_5I_assemble.fastq.gz
🔄 Assembling for sample: DNA_Data_6S_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/DNA_Data_6S_assemble.fastq.gz
🔄 Assembling for sample: DNA_Data_4G_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/DNA_Data_4G_assemble.fastq.gz
🔄 Assembling for sample: DNA_Data_1D_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/DNA_Data_1D_assemble.fastq.gz
🔄 Assembling for sample: DNA_Data_3SP26_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/DNA_Data_3SP26_assemble.fastq.gz
🔄 Assembling for sample: DNA_Data_2S_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_met

# 6. fastq -> fasta

In [15]:
import os
import gzip
from Bio import SeqIO

# Input and output folder paths
input_folder = "sequence_merge_method/1_assemble"
output_folder = "sequence_merge_method/2_fastq_to_fasta"

# Create the output folder if it doesn't exist.
os.makedirs(output_folder, exist_ok=True)

# Sorted so the conversion order (and the log below) is reproducible.
for filename in sorted(os.listdir(input_folder)):
    # Process only files with .fastq or .fastq.gz extensions
    if not filename.endswith((".fastq", ".fastq.gz")):
        continue

    input_file = os.path.join(input_folder, filename)

    # Strip only the trailing extension. A chained str.replace would also
    # rewrite any ".fastq" that appears earlier in the name.
    suffix = ".fastq.gz" if filename.endswith(".fastq.gz") else ".fastq"
    output_file = os.path.join(output_folder, filename[: -len(suffix)] + ".fasta")

    # Choose the opener based on gzip compression
    open_func = gzip.open if filename.endswith(".gz") else open

    # Stream records straight from the parser to the writer, so the whole
    # file never has to be held in memory.
    with open_func(input_file, "rt") as fastq_file, open(output_file, "w") as fasta_file:
        SeqIO.write(SeqIO.parse(fastq_file, "fastq"), fasta_file, "fasta")

    print(f"Converted: {filename} → {os.path.basename(output_file)}")

print("All conversions are done.")

Converted: DNA_Data_1D_assemble.fastq.gz → DNA_Data_1D_assemble.fasta
Converted: DNA_Data_4G_assemble.fastq.gz → DNA_Data_4G_assemble.fasta
Converted: DNA_Data_0N_assemble.fastq.gz → DNA_Data_0N_assemble.fasta
Converted: DNA_Data_5I_assemble.fastq.gz → DNA_Data_5I_assemble.fasta
Converted: DNA_Data_6S_assemble.fastq.gz → DNA_Data_6S_assemble.fasta
Converted: DNA_Data_7T_assemble.fastq.gz → DNA_Data_7T_assemble.fasta
Converted: DNA_Data_2S_assemble.fastq.gz → DNA_Data_2S_assemble.fasta
Converted: DNA_Data_3SP26_assemble.fastq.gz → DNA_Data_3SP26_assemble.fasta
All conversions are done.


# 7. Generate reference sequences for the binary data

In [16]:
from pathlib import Path

def generate_sequences_for_bit(bit_length: int):
    """
    Build one DNA sequence per binary word of ``bit_length`` bits.

    Each word is encoded bit by bit (most significant bit first) with a fixed
    block for '0' and a fixed block for '1', placed after a constant leader.
    Returns a dict keyed ``seq_<index, 4 digits>_<binary word>`` in ascending
    index order (bit_length=8 -> 256 entries).
    """
    leader = "ACACTTAATC"
    block_for_bit = {
        "0": "ACTCATATACACACTTAATC",
        "1": "ACTCATATACATACACTTAATC",
    }

    sequences = {}
    for value in range(2 ** bit_length):
        word = format(value, f'0{bit_length}b')
        encoded = ''.join(block_for_bit[bit] for bit in word)
        sequences[f"seq_{value:04d}_{word}"] = leader + encoded

    return sequences

def write_fasta(sequences: dict, output_path: str):
    """Save ``sequences`` (id -> sequence) as FASTA, two lines per entry."""
    with open(output_path, "w") as handle:
        handle.writelines(
            f">{name}\n{bases}\n" for name, bases in sequences.items()
        )

# ----- Configuration: 8-bit words, i.e. 2**8 = 256 reference barcodes -----
BIT_LENGTH = 8
output_dir = Path("reference_sequence")
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir.joinpath("8bit_reference.fasta")
# ---------------------------------------------------------------------------

# Generate every barcode and store the set as a single FASTA file.
seqs = generate_sequences_for_bit(BIT_LENGTH)
write_fasta(seqs, output_path)
print(f"✅ 8-bit (256) barcodes FASTA saved: {output_path}")

✅ 8-bit (256) barcodes FASTA saved: reference_sequence/8bit_reference.fasta


# 8. Reference sequence - Sample Matching

In [17]:
# Build the BWA index for the 8-bit reference FASTA written in section 7.
# The `bwa mem` cell below aligns against this same reference file.
!bwa index "reference_sequence/8bit_reference.fasta"

[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index reference_sequence/8bit_reference.fasta
[main] Real time: 0.049 sec; CPU: 0.012 sec


In [18]:
%%bash
# Fail on unset variables, and let an unmatched glob expand to nothing
# instead of the literal pattern.
set -u
shopt -s nullglob

# Set the path to the reference sequence file
reference_file="reference_sequence/8bit_reference.fasta"

# Set the directory containing your filtered FASTA files
fasta_directory="sequence_merge_method/2_fastq_to_fasta"
# Set the output directory for aligned SAM files
output_dir="sequence_merge_method/3_align_sam"

# Make sure the output directory exists or create it if necessary
mkdir -p "$output_dir"

processed=0
failures=0

# Iterate through filtered FASTA files in the specified directory
for fasta_file in "$fasta_directory"/*_assemble.fasta; do
    # Generate an output file name based on the input filename
    output_file="$output_dir/$(basename "$fasta_file" .fasta).sam"
    processed=$((processed + 1))

    # Perform the BWA alignment; report success only if bwa exited cleanly
    if bwa mem -M -t 4 "$reference_file" "$fasta_file" > "$output_file"; then
        echo "Alignment completed for $fasta_file. Result saved as $output_file"
    else
        echo "Alignment FAILED for $fasta_file" >&2
        # Remove the partial SAM so later steps cannot pick it up
        rm -f "$output_file"
        failures=$((failures + 1))
    fi
done

if [ "$processed" -eq 0 ]; then
    echo "No *_assemble.fasta files found in $fasta_directory" >&2
fi

# A non-zero exit makes the notebook cell fail visibly if any alignment failed
if [ "$failures" -gt 0 ]; then
    exit 1
fi

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 50 sequences (8800 bp)...
[M::mem_process_seqs] Processed 50 reads in 0.024 CPU sec, 0.008 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 reference_sequence/8bit_reference.fasta sequence_merge_method/2_fastq_to_fasta/DNA_Data_0N_assemble.fasta
[main] Real time: 0.033 sec; CPU: 0.027 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/DNA_Data_0N_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/DNA_Data_0N_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 63 sequences (11088 bp)...
[M::mem_process_seqs] Processed 63 reads in 0.024 CPU sec, 0.011 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 reference_sequence/8bit_reference.fasta sequence_merge_method/2_fastq_to_fasta/DNA_Data_1D_assemble.fasta
[main] Real time: 0.041 sec; CPU: 0.028 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/DNA_Data_1D_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/DNA_Data_1D_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 47 sequences (8272 bp)...
[M::mem_process_seqs] Processed 47 reads in 0.027 CPU sec, 0.014 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 reference_sequence/8bit_reference.fasta sequence_merge_method/2_fastq_to_fasta/DNA_Data_2S_assemble.fasta
[main] Real time: 0.040 sec; CPU: 0.031 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/DNA_Data_2S_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/DNA_Data_2S_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 420 sequences (75600 bp)...
[M::mem_process_seqs] Processed 420 reads in 0.356 CPU sec, 0.100 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 reference_sequence/8bit_reference.fasta sequence_merge_method/2_fastq_to_fasta/DNA_Data_3SP26_assemble.fasta
[main] Real time: 0.136 sec; CPU: 0.360 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/DNA_Data_3SP26_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/DNA_Data_3SP26_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 61 sequences (10736 bp)...
[M::mem_process_seqs] Processed 61 reads in 0.042 CPU sec, 0.015 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 reference_sequence/8bit_reference.fasta sequence_merge_method/2_fastq_to_fasta/DNA_Data_4G_assemble.fasta
[main] Real time: 0.038 sec; CPU: 0.045 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/DNA_Data_4G_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/DNA_Data_4G_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 47 sequences (8272 bp)...
[M::mem_process_seqs] Processed 47 reads in 0.019 CPU sec, 0.012 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 reference_sequence/8bit_reference.fasta sequence_merge_method/2_fastq_to_fasta/DNA_Data_5I_assemble.fasta
[main] Real time: 0.041 sec; CPU: 0.023 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/DNA_Data_5I_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/DNA_Data_5I_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 47 sequences (8366 bp)...
[M::mem_process_seqs] Processed 47 reads in 0.025 CPU sec, 0.008 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 reference_sequence/8bit_reference.fasta sequence_merge_method/2_fastq_to_fasta/DNA_Data_6S_assemble.fasta
[main] Real time: 0.031 sec; CPU: 0.028 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/DNA_Data_6S_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/DNA_Data_6S_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 50 sequences (9100 bp)...
[M::mem_process_seqs] Processed 50 reads in 0.037 CPU sec, 0.011 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 reference_sequence/8bit_reference.fasta sequence_merge_method/2_fastq_to_fasta/DNA_Data_7T_assemble.fasta
[main] Real time: 0.037 sec; CPU: 0.041 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/DNA_Data_7T_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/DNA_Data_7T_assemble.sam


## 8.1 sam to bam

In [19]:
%%bash

# Fail on unset variables, and let an unmatched glob expand to nothing
# instead of the literal pattern.
set -u
shopt -s nullglob

# Set the path to the directory containing SAM files
sam_dir="sequence_merge_method/3_align_sam"
# Set the output directory for BAM files
bam_dir="sequence_merge_method/4_align_bam"

# Make sure the output directory exists or create it if necessary
mkdir -p "$bam_dir"

processed=0
failures=0

# Convert SAM files to BAM
for sam_file in "$sam_dir"/*.sam; do
    bam_file="$bam_dir/$(basename "$sam_file" .sam).bam"
    processed=$((processed + 1))

    # -b selects BAM output. The old -S flag is omitted because current
    # samtools detects SAM input automatically and ignores it.
    if samtools view -b -o "$bam_file" "$sam_file"; then
        echo "Conversion from $sam_file to $bam_file is complete."
    else
        echo "Conversion FAILED for $sam_file" >&2
        # Remove the partial BAM so later steps cannot pick it up
        rm -f "$bam_file"
        failures=$((failures + 1))
    fi
done

if [ "$processed" -eq 0 ]; then
    echo "No .sam files found in $sam_dir" >&2
fi

# A non-zero exit makes the notebook cell fail visibly if any conversion failed
if [ "$failures" -gt 0 ]; then
    exit 1
fi

Conversion from sequence_merge_method/3_align_sam/DNA_Data_0N_assemble.sam to sequence_merge_method/4_align_bam/DNA_Data_0N_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/DNA_Data_1D_assemble.sam to sequence_merge_method/4_align_bam/DNA_Data_1D_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/DNA_Data_2S_assemble.sam to sequence_merge_method/4_align_bam/DNA_Data_2S_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/DNA_Data_3SP26_assemble.sam to sequence_merge_method/4_align_bam/DNA_Data_3SP26_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/DNA_Data_4G_assemble.sam to sequence_merge_method/4_align_bam/DNA_Data_4G_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/DNA_Data_5I_assemble.sam to sequence_merge_method/4_align_bam/DNA_Data_5I_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/DNA_Data_6S_assemble.sam to sequence_merge_m

## 8.2  Convert BAM to CSV

In [20]:
import os
import pysam
import pandas as pd

# Input folder (path where BAM files are located)
input_folder = "sequence_merge_method/4_align_bam"
# Output folder (path to save CSV files).
# It sits inside the input folder; the BAM discovery below only picks up
# names ending in ".bam", so this "csv" subfolder is not treated as input.
output_folder = "sequence_merge_method/4_align_bam/csv"

# Create the output folder if it does not exist
os.makedirs(output_folder, exist_ok=True)

# Function to convert a BAM file to CSV, including optional fields
def bam_to_csv(bam_file, output_folder):
    """Dump every alignment record of a BAM file into a CSV file.

    Each row holds the 11 standard SAM columns (QNAME ... QUAL) followed by
    one column per optional tag found in the file. Cells that are missing
    (for example a tag absent from a given record) are written as "*".

    Args:
        bam_file: Path of the BAM file to read.
        output_folder: Existing folder that receives ``<bam name>.csv``.

    Returns:
        The path of the CSV file that was written.
    """
    standard_columns = [
        "QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR",
        "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL",
    ]

    # Swap only the final extension. str.replace would rewrite every ".bam"
    # occurring anywhere in the name.
    stem = os.path.splitext(os.path.basename(bam_file))[0]
    output_csv = os.path.join(output_folder, f"{stem}.csv")

    records = []

    # Read the BAM file.
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        for read in bam:
            # Standard BAM fields. pysam positions are 0-based, so +1 gives
            # the 1-based SAM coordinate.
            record = {
                "QNAME": read.query_name,
                "FLAG": read.flag,
                "RNAME": bam.get_reference_name(read.reference_id) if read.reference_id >= 0 else "*",
                "POS": read.reference_start + 1,
                "MAPQ": read.mapping_quality,
                "CIGAR": read.cigarstring if read.cigarstring else "*",
                "RNEXT": bam.get_reference_name(read.next_reference_id) if read.next_reference_id >= 0 else "*",
                "PNEXT": read.next_reference_start + 1 if read.next_reference_start >= 0 else 0,
                "TLEN": read.template_length,
                "SEQ": read.query_sequence if read.query_sequence else "*",
                # NOTE(review): `qual` is a legacy pysam accessor; kept because
                # its string output is what this CSV has always contained.
                "QUAL": read.qual if read.qual else "*",
            }

            # Add optional fields (tags). get_tags() is the supported
            # replacement for the deprecated `tags` property.
            record.update(read.get_tags())

            records.append(record)

    # Create a DataFrame from the list of records. With no records at all,
    # still emit the standard header so the CSV can be read back later.
    if records:
        df = pd.DataFrame(records)
    else:
        df = pd.DataFrame(columns=standard_columns)

    # Fill any missing optional fields with "*" instead of NaN for consistency.
    df = df.fillna("*")

    # Save the DataFrame to a CSV file.
    df.to_csv(output_csv, index=False)
    print(f"Converted: {os.path.basename(bam_file)} -> {os.path.basename(output_csv)}")
    return output_csv

# Collect the path of every *.bam file sitting directly in the input folder.
bam_files = [
    os.path.join(input_folder, entry)
    for entry in os.listdir(input_folder)
    if entry.endswith(".bam")
]

# Convert each BAM file and remember where its CSV was written.
csv_files = [bam_to_csv(bam_path, output_folder) for bam_path in bam_files]

# Show the list of newly created CSV files as the cell output.
csv_files

Converted: DNA_Data_6S_assemble.bam -> DNA_Data_6S_assemble.csv
Converted: DNA_Data_2S_assemble.bam -> DNA_Data_2S_assemble.csv
Converted: DNA_Data_0N_assemble.bam -> DNA_Data_0N_assemble.csv
Converted: DNA_Data_4G_assemble.bam -> DNA_Data_4G_assemble.csv
Converted: DNA_Data_1D_assemble.bam -> DNA_Data_1D_assemble.csv
Converted: DNA_Data_3SP26_assemble.bam -> DNA_Data_3SP26_assemble.csv
Converted: DNA_Data_5I_assemble.bam -> DNA_Data_5I_assemble.csv
Converted: DNA_Data_7T_assemble.bam -> DNA_Data_7T_assemble.csv


['sequence_merge_method/4_align_bam/csv/DNA_Data_6S_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/DNA_Data_2S_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/DNA_Data_0N_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/DNA_Data_4G_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/DNA_Data_1D_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/DNA_Data_3SP26_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/DNA_Data_5I_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/DNA_Data_7T_assemble.csv']

## 8.3 Filter Alignments by MAPQ Score

In [21]:
import os
import pandas as pd
from pathlib import Path

# ===== Settings =====
# The output folder is a subfolder of the input folder. main() lists the input
# with a non-recursive "*.csv" glob, so filtered files are not picked up again.
input_dir = Path("sequence_merge_method/4_align_bam/csv") # Input folder containing CSV files
output_dir = input_dir / "MAPQ_removed"  # Output folder for filtered CSV files
output_dir.mkdir(parents=True, exist_ok=True)

MAPQ_THRESHOLD = 10     # Keep rows where MAPQ > this value (strictly greater)
KEEP_NAN = True         # Also keep rows whose MAPQ is missing or non-numeric (parsed as NaN)
# NOTE(review): bam_to_csv writes an integer MAPQ for every record, so KEEP_NAN
# presumably has no effect on CSVs produced by this notebook — confirm.
# ====================

def process_one_csv(in_path: Path, out_dir: Path, mapq_threshold: int, keep_nan: bool = True):
    """Copy one alignment CSV to ``out_dir``, keeping only well-mapped rows.

    A row is kept when its MAPQ is strictly greater than ``mapq_threshold``.
    Rows whose MAPQ cannot be parsed as a number are kept only when
    ``keep_nan`` is true. A stale output file of the same name is deleted
    first. Nothing is written if the input cannot be read or has no MAPQ
    column. Returns ``None``.
    """
    target = out_dir / in_path.name

    # Start from a clean slate so an old result never survives a failed run.
    if target.exists():
        target.unlink()

    try:
        table = pd.read_csv(in_path)
    except Exception as e:
        print(f"⚠️  Read fail: {in_path.name} -> {e}")
        return

    if "MAPQ" not in table.columns:
        print(f"⚠️  Skip (no MAPQ column): {in_path.name}")
        return

    # Unparseable MAPQ entries become NaN and therefore fail the ">" test.
    mapq = pd.to_numeric(table["MAPQ"], errors="coerce")
    selected = mapq.gt(mapq_threshold)
    if keep_nan:
        selected = selected | mapq.isna()

    kept = int(selected.sum())
    removed = len(table) - kept

    table[selected].to_csv(target, index=False)
    print(
        f"✅ {in_path.name} → {target.name} | kept={kept}, removed={removed} "
        f"| threshold={mapq_threshold}, keep_nan={keep_nan}"
    )

def main():
    """Run the MAPQ filter over every CSV directly inside ``input_dir``."""
    csv_paths = sorted(input_dir.glob("*.csv"))

    if len(csv_paths) == 0:
        print(f"⚠️  No CSV files in {input_dir}")
        return

    for csv_path in csv_paths:
        process_one_csv(csv_path, output_dir, MAPQ_THRESHOLD, KEEP_NAN)

if __name__ == "__main__":
    main()

✅ DNA_Data_0N_assemble.csv → DNA_Data_0N_assemble.csv | kept=48, removed=2 | threshold=10, keep_nan=True
✅ DNA_Data_1D_assemble.csv → DNA_Data_1D_assemble.csv | kept=59, removed=4 | threshold=10, keep_nan=True
✅ DNA_Data_2S_assemble.csv → DNA_Data_2S_assemble.csv | kept=43, removed=4 | threshold=10, keep_nan=True
✅ DNA_Data_3SP26_assemble.csv → DNA_Data_3SP26_assemble.csv | kept=366, removed=54 | threshold=10, keep_nan=True
✅ DNA_Data_4G_assemble.csv → DNA_Data_4G_assemble.csv | kept=48, removed=13 | threshold=10, keep_nan=True
✅ DNA_Data_5I_assemble.csv → DNA_Data_5I_assemble.csv | kept=46, removed=1 | threshold=10, keep_nan=True
✅ DNA_Data_6S_assemble.csv → DNA_Data_6S_assemble.csv | kept=43, removed=4 | threshold=10, keep_nan=True
✅ DNA_Data_7T_assemble.csv → DNA_Data_7T_assemble.csv | kept=42, removed=8 | threshold=10, keep_nan=True


# Histogram Data Analysis

## A. Generate Histogram Data from Aligned Reads (MAPQ-filtered)

In [22]:
import os
import pandas as pd

# MAPQ-filtered alignment tables in, per-sample RNAME histograms out.
input_folder = "sequence_merge_method/4_align_bam/csv/MAPQ_removed"
histogram_folder = "sequence_merge_method/5_histogram"
os.makedirs(histogram_folder, exist_ok=True)

# Every CSV in the input folder is one sample.
files = [entry for entry in os.listdir(input_folder) if entry.endswith('.csv')]

for file_name in files:
    file_path = os.path.join(input_folder, file_name)
    output_csv = os.path.join(histogram_folder, "histogram_" + file_name)

    try:
        df = pd.read_csv(file_path, dtype=str)
        if 'RNAME' not in df.columns:
            print(f"Skipping file: {file_name} (no 'RNAME' column found)")
            continue

        # One row per reference name with the number of reads assigned to it.
        rname_counts = (
            df['RNAME']
            .value_counts()
            .rename_axis('RNAME')
            .reset_index(name='Count')
        )

        # Tag each row with its source file, then express counts as fractions.
        rname_counts.insert(0, 'File_Name', file_name)
        rname_counts['Count'] = rname_counts['Count'].astype(int)
        total_count = rname_counts['Count'].sum()
        rname_counts['Normalized_Count'] = rname_counts['Count'] / total_count

        # Write the histogram table next to the other samples.
        rname_counts.to_csv(output_csv, index=False)
        print(f"✅ Saved full RNAME histogram: {output_csv}")

    except Exception as e:
        print(f"❌ Error processing file '{file_name}': {e}")

✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_DNA_Data_7T_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_DNA_Data_5I_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_DNA_Data_3SP26_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_DNA_Data_1D_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_DNA_Data_4G_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_DNA_Data_0N_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_DNA_Data_6S_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_DNA_Data_2S_assemble.csv


## B. Create Top 5 Histogram Plots for Each Sample

In [23]:
import os, re
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt

# ---------------- Path setup ----------------
# Plots and the top-5 summary are written to a sub-folder of the histogram directory.
histogram_folder = "sequence_merge_method/5_histogram"
summary_folder = os.path.join(histogram_folder, "graph_top5")
os.makedirs(summary_folder, exist_ok=True)

# ---------------- Expected RNAME highlight mapping ----------------
# sample key (as embedded in the file name) -> RNAME that is highlighted in the plot
highlight_mapping = dict([
    ("0N", "seq_013_00001101"),
    ("1D", "seq_035_00100011"),
    ("2S", "seq_082_01010010"),
    ("3G", "seq_102_01100110"),
    ("4I", "seq_136_10001000"),
    ("5S", "seq_178_10110010"),
    ("6T", "seq_211_11010011"),
    ("5K", "seq_170_10101010"),
    ("1X8", "seq_255_11111111"),
    ("0X8", "seq_000_00000000"),
    ("4G", "seq_134_10000110"),
    ("5I", "seq_168_10101000"),
    ("6S", "seq_210_11010010"),
    ("7T", "seq_243_11110011"),
    ("3SP26", "seq_122_01111010"),
    ("3SP31", "seq_127_01111111"),
])

# ---------------- Collect matching CSV files (support both uppercase/lowercase) ----------------
patterns = [
    os.path.join(histogram_folder, file_glob)
    for file_glob in (
        "histogram_DNA_Data_*_assemble.csv",
        "Histogram_DNA_Data_*_assemble.csv",
    )
]

# A set drops any path matched by both patterns; sorting fixes the processing order.
matched_paths = set()
for pattern in patterns:
    matched_paths.update(glob(pattern))
csv_files = sorted(matched_paths)

print(f"[INFO] Matched files: {len(csv_files)}")
if csv_files:
    for matched in csv_files:
        print("  -", os.path.basename(matched))
else:
    print("[WARN] No files matched. Check folder path or filename pattern.")

# One dict per plotted bar; written out as a single summary CSV after the loop.
summary_rows = []

for file_path in csv_files:
    file_name = os.path.basename(file_path)

    # Extract sample key from filename (histogram_DNA_Data_<SAMPLE>_assemble.csv)
    m = re.match(r"[Hh]istogram_DNA_Data_([A-Za-z0-9]+)_assemble\.csv", file_name)
    sample_key = m.group(1) if m else None
    if not sample_key:
        print(f"[WARN] Failed to extract sample key from {file_name}; highlight may not apply.")

    # Label for the plot title and PNG name. When no sample key could be parsed,
    # fall back to the file stem so that such files do not all overwrite
    # "None_top5.png".
    plot_label = sample_key or os.path.splitext(file_name)[0]

    fig = None
    try:
        df = pd.read_csv(file_path)
        cols = df.columns.tolist()

        # Determine sorting column (Normalized_Count first, fallback to Count)
        sort_col = None
        if "Normalized_Count" in df.columns:
            sort_col = "Normalized_Count"
        elif "Count" in df.columns:
            sort_col = "Count"

        if "RNAME" not in df.columns or sort_col is None:
            print(f"[SKIP] {file_name}: Missing RNAME or Count/Normalized_Count. Columns: {cols}")
            continue

        # Top 5 entries, re-indexed 0..4 so the index doubles as rank - 1
        top_df = df.sort_values(by=sort_col, ascending=False).head(5).reset_index(drop=True)

        # Lookup expected RNAME for highlight (None when the sample key is unknown)
        expected_rname = highlight_mapping.get(sample_key)

        # -------- Plotting (keeps original visual style) --------
        fig, ax = plt.subplots(figsize=(7, 4))
        bar_labels = top_df["RNAME"].tolist()
        values = top_df[sort_col].tolist()

        # Expected RNAME is drawn in orange, everything else in blue
        colors = [
            "tab:orange" if (expected_rname and r == expected_rname) else "tab:blue"
            for r in bar_labels
        ]

        ax.bar(range(len(bar_labels)), values, tick_label=bar_labels, color=colors)
        ax.set_ylabel(sort_col)
        ax.set_title(f"Top5 RNAMEs — {plot_label}")

        # Highlight marker (*) just above the expected bar, if it made the top 5
        if expected_rname and expected_rname in bar_labels:
            idx = bar_labels.index(expected_rname)
            ax.annotate("*",
                        xy=(idx, values[idx]),
                        xytext=(0, 5),
                        textcoords="offset points",
                        ha="center", va="bottom", fontsize=14, fontweight="bold")

        plt.xticks(rotation=20, ha="right")
        fig.tight_layout()

        out_png = os.path.join(summary_folder, f"{plot_label}_top5.png")
        fig.savefig(out_png, dpi=200)
        print(f"[OK] Saved plot: {out_png}")

        # Append summary data (sample_key stays None when it could not be parsed)
        for rank, row in top_df.iterrows():
            summary_rows.append({
                "file": file_name,
                "sample_key": sample_key,
                "rank": rank + 1,
                "RNAME": row["RNAME"],
                sort_col: row[sort_col],
                "is_expected": (expected_rname == row["RNAME"]) if expected_rname else False
            })

    except Exception as e:
        print(f"[ERROR] {file_name}: {e}")
    finally:
        # Always release the figure, including when plotting or saving failed,
        # so a failing file does not leave an open figure behind.
        if fig is not None:
            plt.close(fig)

# Save summary CSV
if summary_rows:
    summary_df = pd.DataFrame(summary_rows)
    out_csv = os.path.join(summary_folder, "top5_summary_all_files.csv")
    summary_df.to_csv(out_csv, index=False)
    print(f"[OK] Wrote summary CSV: {out_csv}")
else:
    print("[WARN] No data available to write.")

[INFO] Matched files: 8
  - histogram_DNA_Data_0N_assemble.csv
  - histogram_DNA_Data_1D_assemble.csv
  - histogram_DNA_Data_2S_assemble.csv
  - histogram_DNA_Data_3SP26_assemble.csv
  - histogram_DNA_Data_4G_assemble.csv
  - histogram_DNA_Data_5I_assemble.csv
  - histogram_DNA_Data_6S_assemble.csv
  - histogram_DNA_Data_7T_assemble.csv
[OK] Saved plot: sequence_merge_method/5_histogram/graph_top5/0N_top5.png
[OK] Saved plot: sequence_merge_method/5_histogram/graph_top5/1D_top5.png
[OK] Saved plot: sequence_merge_method/5_histogram/graph_top5/2S_top5.png
[OK] Saved plot: sequence_merge_method/5_histogram/graph_top5/3SP26_top5.png
[OK] Saved plot: sequence_merge_method/5_histogram/graph_top5/4G_top5.png
[OK] Saved plot: sequence_merge_method/5_histogram/graph_top5/5I_top5.png
[OK] Saved plot: sequence_merge_method/5_histogram/graph_top5/6S_top5.png
[OK] Saved plot: sequence_merge_method/5_histogram/graph_top5/7T_top5.png
[OK] Wrote summary CSV: sequence_merge_method/5_histogram/graph_to

## C. Summarize Highlighted Read Counts into a CSV File

In [10]:
import os
import re
import pandas as pd

# === Highlight Mapping (associates sample prefixes with their expected RNAME) ===
# Listed as (prefix, RNAME) pairs; the prefix is the token found in the file name.
highlight_mapping = dict([
    ("0N", "seq_013_00001101"),
    ("1D", "seq_035_00100011"),
    ("2S", "seq_082_01010010"),
    ("3G", "seq_102_01100110"),
    ("4I", "seq_136_10001000"),
    ("5S", "seq_178_10110010"),
    ("6T", "seq_211_11010011"),
    ("5K", "seq_170_10101010"),
    ("1X8", "seq_255_11111111"),
    ("0X8", "seq_000_00000000"),
    ("4G", "seq_134_10000110"),
    ("5I", "seq_168_10101000"),
    ("6S", "seq_210_11010010"),
    ("7T", "seq_243_11110011"),
    ("3SP26", "seq_122_01111010"),
    ("3SP31", "seq_127_01111111"),
])

# === Folders ===
# Histograms are read from the first folder; the summary CSV goes into the second.
histogram_folder = "sequence_merge_method/5_histogram"
summary_folder = "sequence_merge_method/6_summary"
highlight_result_csv = os.path.join(summary_folder, "highlight_result.csv")
os.makedirs(summary_folder, exist_ok=True)

# === Helpers ===
def canonicalize_rname(x: str) -> str:
    """
    Return the RNAME with leading zeros removed from its numeric index.

    The input is converted with ``str()`` and stripped of surrounding
    whitespace first. Names of the form ``seq_<digits>_<0/1 bits>`` are
    rewritten so the digit part has no zero padding:

      'seq_0013_00001101' -> 'seq_13_00001101'
      'seq_013_00001101'  -> 'seq_13_00001101'
      'seq_13_00001101'   -> 'seq_13_00001101'

    Anything that does not match that form is returned as the stripped string.
    """
    text = str(x).strip()
    parsed = re.fullmatch(r"seq_(\d+)_([01]+)", text)
    if parsed is None:
        return text
    index_part, bit_part = parsed.groups()
    # int() is what discards the zero padding of the index.
    return "seq_{}_{}".format(int(index_part), bit_part)

# Canonicalize the expected RNAMEs once, so they compare equal to the
# canonicalized RNAME values read from the CSVs.
normalized_mapping = dict(
    (sample_prefix, canonicalize_rname(expected_rname))
    for sample_prefix, expected_rname in highlight_mapping.items()
)

def extract_prefix_from_filename(file: str) -> str:
    """
    Pull the sample prefix out of a histogram file name.

      'histogram_DNA_Data_0N_assemble.csv'    -> '0N'
      'histogram_DNA_Data_3SP26_assemble.csv' -> '3SP26'

    The directory part, a leading 'histogram_' and a trailing '.csv' are
    removed, the rest is split on '_', and the token immediately before
    'assemble' is returned. If there is no such token, the third token is
    used when present, otherwise the last token.
    """
    stem = os.path.basename(file)
    if stem.startswith("histogram_"):
        stem = stem[len("histogram_"):]
    if stem.endswith(".csv"):
        stem = stem[:-4]
    parts = stem.split("_")

    # Preferred: the token directly in front of 'assemble'.
    if "assemble" in parts:
        pos = parts.index("assemble")
        if pos >= 1:
            return parts[pos - 1]

    # Fallback heuristic for names without a usable 'assemble' token.
    if len(parts) > 2:
        return parts[2]
    return parts[-1] if parts else ""

# === Collect Highlight Summary Information ===
# One list per histogram file, in the column order used for the output CSV.
highlight_data = []
csv_files = [
    entry for entry in os.listdir(histogram_folder)
    if entry.startswith("histogram_") and entry.endswith(".csv")
]

for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(histogram_folder, file))

        # Both columns are needed for everything below.
        if not {"RNAME", "Count"}.issubset(df.columns):
            raise ValueError(f"Required columns 'RNAME' and 'Count' not found in {file}")

        # Compare canonical RNAMEs against the canonical expected RNAME.
        df["RNAME"] = df["RNAME"].map(canonicalize_rname)
        prefix = extract_prefix_from_filename(file)
        highlight_rname = normalized_mapping.get(prefix, "")

        # Non-numeric counts become 0 before casting to int.
        df["Count"] = pd.to_numeric(df["Count"], errors="coerce").fillna(0).astype(int)
        total_count = int(df["Count"].sum())

        # Reads assigned to the expected RNAME, absolute and as a percentage.
        if highlight_rname:
            is_highlight = df["RNAME"] == highlight_rname
            highlight_count = int(df.loc[is_highlight, "Count"].sum())
        else:
            highlight_count = 0

        if total_count > 0:
            highlight_percentage = highlight_count / total_count * 100.0
        else:
            highlight_percentage = 0.0

        # Ratio of the highlight count to the second-largest row count.
        # NOTE(review): when the highlight is not the top row, the second-largest
        # row may be the highlight itself - confirm this is the intended baseline.
        ranked_counts = sorted(df["Count"].tolist(), reverse=True)
        second_max_count = ranked_counts[1] if len(ranked_counts) >= 2 else 0
        if second_max_count > 0:
            highlight_vs_second_ratio = highlight_count / second_max_count
        else:
            highlight_vs_second_ratio = 0.0

        # The 'File' field is the file name without the 'histogram_' prefix.
        file_name = file.replace("histogram_", "")

        highlight_data.append([
            file_name,
            highlight_count,
            total_count,
            highlight_percentage,
            highlight_rname,
            highlight_vs_second_ratio,
        ])

    except Exception as e:
        print(f"❌ Error processing file '{file}': {e}")

# === Save the Summary to a CSV File ===
summary_columns = [
    'File',
    'Highlight_Count',
    'Total_Count',
    'Highlight_Percentage',
    'Highlight_RNAME',
    'Highlight_vs_SecondTop_Ratio',
]
highlight_df = pd.DataFrame(highlight_data, columns=summary_columns).sort_values(by='File')
highlight_df.to_csv(highlight_result_csv, index=False)

print(f"📌 Highlight summary saved to: {highlight_result_csv}")

📌 Highlight summary saved to: sequence_merge_method/6_summary/highlight_result.csv


## D. Plot Stacked Bar Graph (Top 4 Colored by Rank, Remaining RNAMEs Merged into a White Box)

In [25]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# ---------------- Paths ----------------
# Input histograms and the folder that receives the rendered figure.
histogram_folder = "sequence_merge_method/5_histogram"
summary_folder = "sequence_merge_method/6_summary"
os.makedirs(summary_folder, exist_ok=True)

# ---------------- Expected RNAME map (by sample prefix) ----------------
highlight_mapping = dict([
    ("0N", "seq_013_00001101"),
    ("1D", "seq_035_00100011"),
    ("2S", "seq_082_01010010"),
    ("3G", "seq_102_01100110"),
    ("4I", "seq_136_10001000"),
    ("5S", "seq_178_10110010"),
    ("6T", "seq_211_11010011"),
    ("5K", "seq_170_10101010"),
    ("1X8", "seq_255_11111111"),
    ("0X8", "seq_000_00000000"),
    ("4G", "seq_134_10000110"),
    ("5I", "seq_168_10101000"),
    ("6S", "seq_210_11010010"),
    ("7T", "seq_243_11110011"),
    ("3SP26", "seq_122_01111010"),
    ("3SP31", "seq_127_01111111"),
])

# ---------------- Visual options ----------------
# True  -> x-axis shows only the sample prefix (e.g., 0N, 1D)
# False -> x-axis shows the full sample name
USE_PREFIX_LABELS = True

# Colors by rank: index 0 is the 1st bar (red), indices 1-3 are the 2nd-4th
# bars as gray RGB triples in the 0-1 range. Everything from rank 5 on is
# merged into one white box by the plotting code.
RANK_COLORS = ["red"] + [(gray, gray, gray) for gray in (0.30, 0.5, 0.7)]

# ---------------- Load per-sample data ----------------
# Maps sample name -> (prefix, DataFrame sorted by Count descending with a
# fresh 0..n-1 index and a Normalized_Count column).
sample_rname_dfs = {}
for file_name in sorted(os.listdir(histogram_folder)):
    if not (file_name.lower().startswith("histogram_") and file_name.endswith(".csv")):
        continue

    file_path = os.path.join(histogram_folder, file_name)

    # Strip the prefix/suffix by length rather than with str.replace: the
    # check above accepts any capitalisation of "histogram_", so a
    # case-sensitive replace would leave e.g. "Histogram_" in the sample name.
    sample_name = file_name[len("histogram_"):-len(".csv")]  # e.g., DNA_Data_0N_assemble

    # robust prefix extraction: token right before "_assemble"
    m = re.search(r"([A-Za-z0-9]+)(?=_assemble$)", sample_name)
    prefix = m.group(1) if m else None

    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"[ERROR] Failed to read {file_name}: {e}")
        continue

    if "RNAME" not in df.columns or "Count" not in df.columns:
        print(f"[SKIP] {file_name}: missing RNAME/Count columns.")
        continue

    # Coerce counts the same way the highlight-summary cell does: blank or
    # non-numeric entries become 0 instead of raising and aborting the whole cell.
    df["Count"] = pd.to_numeric(df["Count"], errors="coerce").fillna(0).astype(int)

    # Normalize safely (avoid zero division)
    total = df["Count"].sum()
    if total == 0:
        print(f"[SKIP] {file_name}: total Count is zero.")
        continue

    df["Normalized_Count"] = df["Count"] / total
    df = df.sort_values(by="Count", ascending=False).reset_index(drop=True)

    sample_rname_dfs[sample_name] = (prefix, df)

if not sample_rname_dfs:
    print("[WARN] No valid histogram CSV files found. Check folder and filenames.")
else:
    print(f"[INFO] Loaded {len(sample_rname_dfs)} samples.")

# ---------------- Plot ----------------
# One stacked column per sample: the four largest RNAME shares from bottom to
# top, then a single white box holding the sum of everything else.
fig, ax = plt.subplots(figsize=(24, 12))

for sample_name, (prefix, sample_df) in sample_rname_dfs.items():
    # Expected RNAME for this sample; None when the prefix is not in the map.
    expected = highlight_mapping.get(prefix)

    leaders = sample_df.head(4)
    if len(sample_df) > 4:
        remainder = sample_df["Normalized_Count"].iloc[4:].sum()
    else:
        remainder = 0.0

    # Column label: short prefix when requested and available, else full name.
    if USE_PREFIX_LABELS and prefix:
        x_label = prefix
    else:
        x_label = sample_name

    stack_base = 0.0
    for slot, raw_name, share in zip(leaders.index, leaders["RNAME"], leaders["Normalized_Count"]):
        segment = float(share)
        is_expected = bool(expected) and str(raw_name).strip() == expected

        # The expected RNAME always takes the first rank color; any other
        # segment takes the color of its slot, capped at the last entry.
        if is_expected:
            segment_color = RANK_COLORS[0]
        else:
            segment_color = RANK_COLORS[slot if slot < len(RANK_COLORS) else -1]

        ax.bar(
            x_label,
            segment,
            bottom=stack_base,
            color=segment_color,
            edgecolor="black",
            linewidth=0.2,
        )
        stack_base += segment

    # Everything beyond the top four is drawn as one white box on top.
    if remainder > 0:
        ax.bar(
            x_label,
            remainder,
            bottom=stack_base,
            color="white",
            edgecolor="black",
            linewidth=0.2,
        )

# reference line and styling
ax.axhline(y=0.5, color="gray", linestyle="--", linewidth=1)
ax.set_xlabel("Sample", fontsize=20)
ax.set_ylabel("Normalized Count", fontsize=20)
ax.set_title(
    "Stacked Bar Chart (1st=Red, 2nd-4th=Gray Shades, Other=White)",
    fontsize=18,
)
for axis_name in ("x", "y"):
    ax.tick_params(axis=axis_name, labelsize=18)
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

# ---------------- Save ----------------
# The same figure is written as PNG (300 dpi) and as SVG.
png_path = os.path.join(summary_folder, "stacked_bar_rank_color_RGB.png")
svg_path = os.path.join(summary_folder, "stacked_bar_rank_color_RGB.svg")
plt.savefig(png_path, dpi=300)
plt.savefig(svg_path)
plt.close()

print("✅ Saved outputs:")
for saved_path in (png_path, svg_path):
    print(" -", saved_path)

[INFO] Loaded 8 samples.
✅ Saved outputs:
 - sequence_merge_method/6_summary/stacked_bar_rank_color_RGB.png
 - sequence_merge_method/6_summary/stacked_bar_rank_color_RGB.svg


# Error Analysis

In [26]:
import pandas as pd
import os
import re
import numpy as np

# --- 1. Set Answer Key ---
# sample id -> expected reference name; the trailing 8 characters are the bits.
answer_data = {
    "0N": "seq_013_00001101",
    "1D": "seq_035_00100011",
    "2S": "seq_082_01010010",
    "3G": "seq_102_01100110",
    "4I": "seq_136_10001000",
    "5S": "seq_178_10110010",
    "6T": "seq_211_11010011",
    "5K": "seq_170_10101010",
    "1X8": "seq_255_11111111",
    "0X8": "seq_000_00000000",
    "4G": "seq_134_10000110",
    "5I": "seq_168_10101000",
    "6S": "seq_210_11010010",
    "7T": "seq_243_11110011",
    "3SP26": "seq_122_01111010",
    "3SP31": "seq_127_01111111",
}

# Keep only the trailing 8-bit string of every reference name.
# Result example: {'0N': '00001101', '1D': '00100011', ...}
answer_key_map = {}
for sample_id, reference_name in answer_data.items():
    trailing_bits = re.search(r'([01]{8})$', reference_name)
    answer_key_map[sample_id] = trailing_bits.group(1)


# --- 2. File Processing and Combined Calculation ---
input_folder = "sequence_merge_method/5_histogram"
output_folder = "sequence_merge_method/6_summary"
output_path = os.path.join(output_folder, "summary_combined.csv")


def find_answer_key(file_name, key_map):
    """
    Return the answer bit string for the sample named in ``file_name``.

    The file name is split on '_' and '.', and a sample id only matches when
    it equals one of the resulting tokens. A plain substring test would, for
    example, let the id '0N' match a file for sample '10N'.

    Parameters:
        file_name: histogram file name, e.g. 'histogram_DNA_Data_0N_assemble.csv'.
        key_map: dict mapping sample id -> answer bit string.

    Returns:
        The mapped bit string of the first id (in ``key_map`` order) that is a
        token of the file name, or None when no id matches.
    """
    tokens = set(re.split(r"[_.]", file_name))
    for sample_id, bits in key_map.items():
        if sample_id in tokens:
            return bits
    return None


def tally_bits_by_position(df):
    """
    Sum read counts per bit value for each of the 8 bit positions.

    Rows whose RNAME is not a string (e.g. NaN) or does not contain
    'seq_<index>_<8 bits>' contribute nothing. Blank or non-numeric Count
    values are treated as 0.

    Parameters:
        df: DataFrame with a 'Count' column and, optionally, an 'RNAME' column.

    Returns:
        A list of 8 dicts ``{'0': n, '1': n}``, one per bit position
        (left to right).

    Raises:
        KeyError: if ``df`` has no 'Count' column.
    """
    position_counts = [{'0': 0, '1': 0} for _ in range(8)]
    counts = pd.to_numeric(df['Count'], errors='coerce').fillna(0).astype(int)
    if 'RNAME' not in df.columns:
        return position_counts

    for rname, count in zip(df['RNAME'], counts):
        if not isinstance(rname, str):
            continue  # missing RNAME: nothing to decode
        match = re.search(r'seq_[^_]+_([01]{8})', rname)
        if match:
            for i, digit in enumerate(match.group(1)):
                position_counts[i][digit] += int(count)
    return position_counts


try:
    files = [f for f in os.listdir(input_folder) if f.endswith('.csv') and f.startswith("histogram_")]
except FileNotFoundError:
    print(f"❌ Error: Folder '{input_folder}' not found.")
    files = []

# One dict per analysed file; turned into a DataFrame by the save step.
combined_summary_list = []

if not files:
    print("⚠️ No files found for analysis.")
else:
    print(f"📁 Analyzing a total of {len(files)} files.")
    for file_name in sorted(files):
        try:
            df = pd.read_csv(os.path.join(input_folder, file_name))
            total_count = pd.to_numeric(df['Count'], errors='coerce').fillna(0).astype(int).sum()

            position_counts = tally_bits_by_position(df)
            answer_key = find_answer_key(file_name, answer_key_map)

            combined_row = {
                "File_Name": file_name,
                "Total_Count": total_count
            }

            accuracies_for_avg = []
            for i in range(8):
                zeros_count = position_counts[i]['0']
                ones_count = position_counts[i]['1']

                combined_row[f"Pos{i+1}_Zeros_Count"] = zeros_count
                combined_row[f"Pos{i+1}_Ones_Count"] = ones_count

                # Accuracy = share of counted reads whose bit at this position
                # equals the answer bit; NaN when the sample has no answer key.
                accuracy = np.nan
                if answer_key:
                    correct_digit = answer_key[i]
                    current_total = zeros_count + ones_count
                    correct_count = position_counts[i][correct_digit]
                    accuracy = correct_count / current_total if current_total > 0 else 0

                combined_row[f"Pos{i+1}_Accuracy"] = accuracy
                accuracies_for_avg.append(accuracy)

            # Without an answer key every entry is NaN; skip nanmean in that
            # case to avoid its "Mean of empty slice" RuntimeWarning.
            avg_accuracy = np.nanmean(accuracies_for_avg) if answer_key else np.nan
            combined_row["Avg_Accuracy"] = avg_accuracy

            combined_summary_list.append(combined_row)
            if not answer_key:
                print(f"⚠️ Warning: Answer key not found for {file_name}.")
            print(f"✅ {file_name} processing complete.")

        except Exception as e:
            print(f"❌ Error processing {file_name}: {e}")

# --- 3. Save Combined Results and Add Summary Row ---
if combined_summary_list:
    combined_df = pd.DataFrame(combined_summary_list)

    # Fixed column layout: identifiers first, then zeros/ones/accuracy per position.
    column_order = ['File_Name', 'Total_Count', 'Avg_Accuracy']
    for i in range(1, 9):
        column_order.extend([
            f'Pos{i}_Zeros_Count',
            f'Pos{i}_Ones_Count',
            f'Pos{i}_Accuracy',
        ])

    final_df = combined_df[[col for col in column_order if col in combined_df.columns]]

    # Summary row: mean of every column whose name contains 'Accuracy'
    # (this includes 'Avg_Accuracy'); all other columns stay empty in that row.
    summary_row = {'File_Name': 'Avg. Accuracy'}
    summary_row.update({
        col_name: final_df[col_name].mean()
        for col_name in final_df.columns
        if 'Accuracy' in col_name
    })

    # Append the summary row below the per-file rows.
    summary_row_df = pd.DataFrame([summary_row])
    final_df = pd.concat([final_df, summary_row_df], ignore_index=True)

    # Make sure the destination folder exists; this cell must not depend on
    # another cell having created it, otherwise to_csv raises.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    final_df.to_csv(output_path, index=False)

    print("\n--- Final Combined Analysis Results (including summary row) ---")
    print(final_df[['File_Name', 'Total_Count', 'Avg_Accuracy']].tail())
    print(f"\n📄 All analysis results and the summary row have been saved to '{output_path}'.")


📁 Analyzing a total of 8 files.
✅ histogram_DNA_Data_0N_assemble.csv processing complete.
✅ histogram_DNA_Data_1D_assemble.csv processing complete.
✅ histogram_DNA_Data_2S_assemble.csv processing complete.
✅ histogram_DNA_Data_3SP26_assemble.csv processing complete.
✅ histogram_DNA_Data_4G_assemble.csv processing complete.
✅ histogram_DNA_Data_5I_assemble.csv processing complete.
✅ histogram_DNA_Data_6S_assemble.csv processing complete.
✅ histogram_DNA_Data_7T_assemble.csv processing complete.

--- Final Combined Analysis Results (including summary row) ---
                            File_Name  Total_Count  Avg_Accuracy
4  histogram_DNA_Data_4G_assemble.csv         48.0      0.932292
5  histogram_DNA_Data_5I_assemble.csv         46.0      0.904891
6  histogram_DNA_Data_6S_assemble.csv         43.0      0.930233
7  histogram_DNA_Data_7T_assemble.csv         42.0      0.904762
8                       Avg. Accuracy          NaN      0.921023

📄 All analysis results and the summary row ha