# 1. Install Modules

In [1]:
# Bioinformatics Tools (Ubuntu)
!sudo apt-get update
!sudo apt-get install -y fastp flash bwa samtools

# Python Library
!pip3 install biopython cutadapt pysam --break-system-packages

Get:1 https://packages.microsoft.com/repos/code stable InRelease [3,590 B]
Get:2 https://packages.microsoft.com/repos/code stable/main amd64 Packages [20.5 kB]
Get:3 https://packages.microsoft.com/repos/code stable/main arm64 Packages [20.5 kB]
Get:4 https://packages.microsoft.com/repos/code stable/main armhf Packages [20.6 kB]
Hit:5 http://ports.ubuntu.com/ubuntu-ports noble InRelease                     
Get:6 http://ports.ubuntu.com/ubuntu-ports noble-updates InRelease [126 kB]    
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu noble InRelease   
Get:8 http://ports.ubuntu.com/ubuntu-ports noble-backports InRelease [126 kB]
Get:9 http://ports.ubuntu.com/ubuntu-ports noble-security InRelease [126 kB]
Get:10 http://ports.ubuntu.com/ubuntu-ports noble-updates/main arm64 Packages [1,617 kB]
Get:11 http://ports.ubuntu.com/ubuntu-ports noble-updates/main Translation-en [292 kB]
Get:12 http://ports.ubuntu.com/ubuntu-ports noble-updates/main arm64 Components [172 kB]
Get:13 htt

# 2. Trimming and Discard trimmed sample

In [72]:
import subprocess
import glob
import os

# Specify the folder containing your input files
# Specify the folder where you want to save the untrimmed sequences (adapter-free sequences)
input_folder = "fastq_1_2_3_4_5_6"
untrimmed_output_folder = "fastq_1_2_3_4_5_6/A_Untrimmed_output"

# Define the adapter sequences for R1 and R2
adapter_sequence_r1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
adapter_sequence_r2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"

# Use glob to get a list of all input file pairs (R1 and R2) in the folder
input_file_pairs = []
for input_r1 in glob.glob(os.path.join(input_folder, "*_R1.fastq.gz")):
    # Assuming R2 files have the same naming format as R1 files
    input_r2 = input_r1.replace("_R1.fastq.gz", "_R2.fastq.gz")
    if os.path.exists(input_r2):  # Ensure R2 file exists
        input_file_pairs.append({"r1": input_r1, "r2": input_r2})

# Create the output folder if it doesn't exist
os.makedirs(untrimmed_output_folder, exist_ok=True)

for input_files in input_file_pairs:
    input_r1 = input_files["r1"]
    input_r2 = input_files["r2"]

    # Define output file paths for untrimmed (clean, adapter-free) sequences
    untrimmed_r1 = os.path.join(untrimmed_output_folder, os.path.basename(input_r1).replace(".fastq.gz", "_untrimmed.fastq.gz"))
    untrimmed_r2 = os.path.join(untrimmed_output_folder, os.path.basename(input_r2).replace(".fastq.gz", "_untrimmed.fastq.gz"))

    # Use cutadapt to keep only untrimmed sequences (completely adapter-free)
    result = subprocess.run([
        "cutadapt",
        "-a", adapter_sequence_r1,  # Adapter for R1
        "-A", adapter_sequence_r2,  # Adapter for R2
        "-O", "15",  # Minimum overlap for adapter trimming
        #"--discard-trimmed",  # Discard sequences where trimming occurred
        "-o", untrimmed_r1,  # Save only untrimmed R1 reads
        "-p", untrimmed_r2,  # Save only untrimmed R2 reads
        input_r1, input_r2
    ], capture_output=True, text=True)

    # Log result
    if result.returncode == 0:
        print(f"Untrimmed sequences saved: {untrimmed_r1}, {untrimmed_r2}")
    else:
        print(f"Error processing {input_r1} and {input_r2}:\n{result.stderr}")

Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_01step_R1_untrimmed.fastq.gz, fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_01step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_06step_R1_untrimmed.fastq.gz, fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_06step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_03step_R1_untrimmed.fastq.gz, fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_03step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_04step_R1_untrimmed.fastq.gz, fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_04step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_05step_R1_untrimmed.fastq.gz, fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_05step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_02step_R1_untrimmed.fastq.gz, fastq_1

# 3. Q filtering

In [73]:
import os
import subprocess

# Quality threshold (Phred score)
quality_threshold = 30

# Set input and output folders
input_folder = "fastq_1_2_3_4_5_6/A_Untrimmed_output"
output_folder = "fastq_1_2_3_4_5_6/B_Qfiltered"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True) 

# Iterate through files in the input folder and process those ending with "_untrimmed.fastq.gz"
for filename in os.listdir(input_folder):
    if filename.endswith("_untrimmed.fastq.gz"):
        # Input file path
        input_file = os.path.join(input_folder, filename)
        
        # Output file name (e.g., sample_untrimmed.fastq.gz -> sample_Qfiltered.fastq.gz)
        output_file = os.path.join(
            output_folder, 
            filename.replace("_untrimmed.fastq.gz", "_Qfiltered.fastq.gz")
        )
        
        # Run fastp (single-end mode)
        subprocess.call([
            "fastp",
            "-i", input_file,                 # Input file
            "-o", output_file,                # Output file
            "-q", str(quality_threshold),     # Quality cutoff (Q30)
            "-u", "15",                       # Discard reads with >15% low-quality bases
            # "-l", "151",                    # Minimum read length (optional)
            "--cut_mean_quality", "30",       # Discard reads with mean quality < 30
            "--html", f"{output_file}.html",  # HTML report
            "--json", f"{output_file}.json"   # JSON report
        ])
        
        print(f"Filtering for {filename} is complete.\n"
              f"Output FASTQ : {output_file}\n"
              f"Reports      : {output_file}.html / {output_file}.json\n")

print("All filtering processes are done.")

Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 2468
total bases: 271323
Q20 bases: 262966(96.9199%)
Q30 bases: 252892(93.207%)

Read1 after filtering:
total reads: 2144
total bases: 232413
Q20 bases: 230924(99.3593%)
Q30 bases: 226129(97.2962%)

Filtering result:
reads passed filter: 2144
reads failed due to low quality: 323
reads failed due to too many N: 0
reads failed due to too short: 1
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 78.6872%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_05step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_

Filtering for Stepwise_05step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 2632
total bases: 163402
Q20 bases: 151694(92.8348%)
Q30 bases: 145564(89.0834%)

Read1 after filtering:
total reads: 2254
total bases: 118174
Q20 bases: 117972(99.8291%)
Q30 bases: 117090(99.0827%)

Filtering result:
reads passed filter: 2254
reads failed due to low quality: 355
reads failed due to too many N: 0
reads failed due to too short: 23
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 85.9043%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_02step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltere

Filtering for Stepwise_02step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 2468
total bases: 282991
Q20 bases: 267052(94.3677%)
Q30 bases: 252250(89.1371%)

Read1 after filtering:
total reads: 1954
total bases: 214165
Q20 bases: 210956(98.5016%)
Q30 bases: 203983(95.2457%)

Filtering result:
reads passed filter: 1954
reads failed due to low quality: 513
reads failed due to too many N: 0
reads failed due to too short: 1
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 68.8817%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_05step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered

Filtering for Stepwise_05step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_05step_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 2632
total bases: 133821
Q20 bases: 132480(98.9979%)
Q30 bases: 130150(97.2568%)

Read1 after filtering:
total reads: 2457
total bases: 125363
Q20 bases: 125152(99.8317%)
Q30 bases: 124202(99.0739%)

Filtering result:
reads passed filter: 2457
reads failed due to low quality: 150
reads failed due to too many N: 0
reads failed due to too short: 25
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 94.5289%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_02step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltere

Filtering for Stepwise_02step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_02step_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 2387
total bases: 318681
Q20 bases: 295857(92.838%)
Q30 bases: 274463(86.1247%)

Read1 after filtering:
total reads: 1654
total bases: 214977
Q20 bases: 210076(97.7202%)
Q30 bases: 200235(93.1425%)

Filtering result:
reads passed filter: 1654
reads failed due to low quality: 732
reads failed due to too many N: 0
reads failed due to too short: 1
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 51.9062%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_06step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/

Filtering for Stepwise_06step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 3219
total bases: 101569
Q20 bases: 100677(99.1218%)
Q30 bases: 98934(97.4057%)

Read1 after filtering:
total reads: 2960
total bases: 94847
Q20 bases: 94645(99.787%)
Q30 bases: 93971(99.0764%)

Filtering result:
reads passed filter: 2960
reads failed due to low quality: 190
reads failed due to too many N: 0
reads failed due to too short: 69
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 97.2041%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_01step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/Ste

Filtering for Stepwise_01step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 2387
total bases: 313755
Q20 bases: 296656(94.5502%)
Q30 bases: 279793(89.1756%)

Read1 after filtering:
total reads: 1802
total bases: 232772
Q20 bases: 230093(98.8491%)
Q30 bases: 223168(95.8741%)

Filtering result:
reads passed filter: 1802
reads failed due to low quality: 584
reads failed due to too many N: 0
reads failed due to too short: 1
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 59.2375%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_06step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered

Filtering for Stepwise_06step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_06step_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 3219
total bases: 147116
Q20 bases: 132119(89.806%)
Q30 bases: 125160(85.0757%)

Read1 after filtering:
total reads: 2724
total bases: 95886
Q20 bases: 95762(99.8707%)
Q30 bases: 95123(99.2043%)

Filtering result:
reads passed filter: 2724
reads failed due to low quality: 430
reads failed due to too many N: 0
reads failed due to too short: 65
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 88.5057%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_01step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/St

Filtering for Stepwise_01step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_01step_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 2741
total bases: 267462
Q20 bases: 255423(95.4988%)
Q30 bases: 247215(92.43%)

Read1 after filtering:
total reads: 2374
total bases: 219665
Q20 bases: 218813(99.6121%)
Q30 bases: 215981(98.3229%)

Filtering result:
reads passed filter: 2374
reads failed due to low quality: 364
reads failed due to too many N: 0
reads failed due to too short: 3
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 80.4816%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_04step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/S

Filtering for Stepwise_04step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 2706
total bases: 194062
Q20 bases: 191626(98.7447%)
Q30 bases: 187949(96.85%)

Read1 after filtering:
total reads: 2530
total bases: 181841
Q20 bases: 181428(99.7729%)
Q30 bases: 179808(98.882%)

Filtering result:
reads passed filter: 2530
reads failed due to low quality: 159
reads failed due to too many N: 0
reads failed due to too short: 17
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 91.4265%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_03step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/S

Filtering for Stepwise_03step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 2741
total bases: 252271
Q20 bases: 248474(98.4949%)
Q30 bases: 243090(96.3607%)

Read1 after filtering:
total reads: 2543
total bases: 233407
Q20 bases: 232739(99.7138%)
Q30 bases: 230134(98.5977%)

Filtering result:
reads passed filter: 2543
reads failed due to low quality: 195
reads failed due to too many N: 0
reads failed due to too short: 3
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 87.158%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_04step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/

Filtering for Stepwise_04step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_04step_R1_Qfiltered.fastq.gz.json

Filtering for Stepwise_03step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R2_Qfiltered.fastq.gz.json

All filtering processes are done.


Read1 before filtering:
total reads: 2706
total bases: 219279
Q20 bases: 201488(91.8866%)
Q30 bases: 193152(88.085%)

Read1 after filtering:
total reads: 2289
total bases: 167242
Q20 bases: 166915(99.8045%)
Q30 bases: 165479(98.9458%)

Filtering result:
reads passed filter: 2289
reads failed due to low quality: 402
reads failed due to too many N: 0
reads failed due to too short: 15
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 82.3725%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/Stepwise_03step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/Stepwise_03step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered

# 4. Match Paired-End Read IDs

In [74]:
import gzip
import glob
import os

def extract_matching_reads(r1_path, r2_path, out_r1_path, out_r2_path):
    def get_read_id(header):
        # Extract read ID from FASTQ header
        return header.split()[0].replace('/1', '').replace('/2', '')

    r1_ids = set()
    r2_ids = set()

    # Extract all read IDs from the R1 file
    with gzip.open(r1_path, 'rt') as r1_file:
        while True:
            header = r1_file.readline()
            if not header:
                break
            r1_ids.add(get_read_id(header.strip()))
            # Skip the other 3 lines of the read (sequence, +, quality)
            [r1_file.readline() for _ in range(3)]  
    
    # Extract all read IDs from the R2 file
    with gzip.open(r2_path, 'rt') as r2_file:
        while True:
            header = r2_file.readline()
            if not header:
                break
            r2_ids.add(get_read_id(header.strip()))
            [r2_file.readline() for _ in range(3)]

    # Find common and unique IDs
    matching_ids = r1_ids & r2_ids
    r1_only = r1_ids - r2_ids
    r2_only = r2_ids - r1_ids

    print(f"Processing {os.path.basename(r1_path)} and {os.path.basename(r2_path)}")
    print(f"Total R1 IDs: {len(r1_ids)}, Total R2 IDs: {len(r2_ids)}, Matching IDs: {len(matching_ids)}")
    print(f"IDs only in R1: {len(r1_only)}, IDs only in R2: {len(r2_only)}\n")

    # Create output folders if necessary
    for out_path in [out_r1_path, out_r2_path]:
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Function to write only the reads with matching IDs to a new file
    def write_matching_reads(input_path, output_path, matching_ids):
        with gzip.open(input_path, 'rt') as infile, gzip.open(output_path, 'wt') as outfile:
            while True:
                lines = [infile.readline() for _ in range(4)]
                if not lines[0]:
                    break
                read_id = get_read_id(lines[0].strip())
                if read_id in matching_ids:
                    outfile.writelines(lines)

    # Write the filtered R1 and R2 files
    write_matching_reads(r1_path, out_r1_path, matching_ids)
    write_matching_reads(r2_path, out_r2_path, matching_ids)

# ----------------------
# Apply to all files
# ----------------------

input_folder = "fastq_1_2_3_4_5_6/B_Qfiltered"
output_folder = "fastq_1_2_3_4_5_6/C_ID_matched"

# Find all R1 files
r1_files = glob.glob(os.path.join(input_folder, "*_R1_Qfiltered.fastq.gz"))

# For each R1, find the matching R2 and process
for r1_file in r1_files:
    r2_file = r1_file.replace("_R1_Qfiltered.fastq.gz", "_R2_Qfiltered.fastq.gz")
    
    if os.path.exists(r2_file):
        # Set output paths
        base_name = os.path.basename(r1_file).replace("_R1_Qfiltered.fastq.gz", "")
        out_r1 = os.path.join(output_folder, f"{base_name}_ID_match_R1.fastq.gz")
        out_r2 = os.path.join(output_folder, f"{base_name}_ID_match_R2.fastq.gz")
        
        # Execute the function
        extract_matching_reads(r1_file, r2_file, out_r1, out_r2)
    else:
        print(f"Warning: {r2_file} not found. Skipping.")

Processing Stepwise_06step_R1_Qfiltered.fastq.gz and Stepwise_06step_R2_Qfiltered.fastq.gz
Total R1 IDs: 1802, Total R2 IDs: 1654, Matching IDs: 1551
IDs only in R1: 251, IDs only in R2: 103

Processing Stepwise_01step_R1_Qfiltered.fastq.gz and Stepwise_01step_R2_Qfiltered.fastq.gz
Total R1 IDs: 2960, Total R2 IDs: 2724, Matching IDs: 2660
IDs only in R1: 300, IDs only in R2: 64

Processing Stepwise_04step_R1_Qfiltered.fastq.gz and Stepwise_04step_R2_Qfiltered.fastq.gz
Total R1 IDs: 2543, Total R2 IDs: 2374, Matching IDs: 2328
IDs only in R1: 215, IDs only in R2: 46

Processing Stepwise_03step_R1_Qfiltered.fastq.gz and Stepwise_03step_R2_Qfiltered.fastq.gz
Total R1 IDs: 2530, Total R2 IDs: 2289, Matching IDs: 2239
IDs only in R1: 291, IDs only in R2: 50

Processing Stepwise_02step_R1_Qfiltered.fastq.gz and Stepwise_02step_R2_Qfiltered.fastq.gz
Total R1 IDs: 2457, Total R2 IDs: 2254, Matching IDs: 2192
IDs only in R1: 265, IDs only in R2: 62

Processing Stepwise_05step_R1_Qfiltered.fast

# 5. Merge W/ Flash

In [75]:
import os
import glob
import subprocess

# === Folder setup ===
input_folder = "fastq_1_2_3_4_5_6/C_ID_matched"
output_folder = "fastq_1_2_3_4_5_6/D_merged_output"
os.makedirs(output_folder, exist_ok=True)

# === Set N-values (Overlap Length) per Sample Prefix ===
sample_n_mapping = {
    "01step": 32,  # 20 + 12
    "02step": 52,  # 32 + 20
    "03step": 74,  # 52 + 22
    "04step": 94,  # 74 + 20
    "05step": 116, # 94 + 22
    "06step": 136, # 116 + 20
}

# === Find R1 files ===
r1_files = glob.glob(os.path.join(input_folder, "*_R1.fastq.gz"))
print(f"🔎 Found {len(r1_files)} R1 files.")

# === Process each R1 file ===
for r1_path in r1_files:
    sample_base = os.path.basename(r1_path).replace("_R1.fastq.gz", "")
    r2_path = os.path.join(input_folder, f"{sample_base}_R2.fastq.gz")

    if not os.path.exists(r2_path):
        print(f"⚠️ Matching R2 file not found for {sample_base} → Skipping.")
        continue

    # Find the corresponding N value from the filename
    matched_n = None
    for prefix, n_value in sample_n_mapping.items():
        if prefix in sample_base:
            matched_n = n_value
            break

    if matched_n is None:
        print(f"⚠️ No N value matched for {sample_base} → Skipping.")
        continue

    output_name = f"{sample_base}_FLASH"
    print(f"🔵 Running FLASH for sample: {sample_base} (N={matched_n})")

    try:
        # Execute the FLASH command
        subprocess.check_call([
            "flash",
            "-m", str(matched_n),   # minimum overlap
            "-M", str(matched_n),   # Maximum overlap
            "-o", output_name,      # Output file prefix
            "-d", output_folder,    # Output directory
            r1_path,
            r2_path
        ])
        print(f"✅ FLASH merging complete → {os.path.join(output_folder, output_name)}.fastq")
    except subprocess.CalledProcessError as e:
        print(f"❌ FLASH merging failed for {sample_base}: {e}")

🔎 Found 6 R1 files.
🔵 Running FLASH for sample: Stepwise_06step_ID_match (N=136)
[FLASH] Starting FLASH v1.2.11
[FLASH] Fast Length Adjustment of SHort reads
[FLASH]  
[FLASH] Input files:
[FLASH]     fastq_1_2_3_4_5_6/C_ID_matched/Stepwise_06step_ID_match_R1.fastq.gz
[FLASH]     fastq_1_2_3_4_5_6/C_ID_matched/Stepwise_06step_ID_match_R2.fastq.gz
[FLASH]  
[FLASH] Output files:
[FLASH]     fastq_1_2_3_4_5_6/D_merged_output/Stepwise_06step_ID_match_FLASH.extendedFrags.fastq
[FLASH]     fastq_1_2_3_4_5_6/D_merged_output/Stepwise_06step_ID_match_FLASH.notCombined_1.fastq
[FLASH]     fastq_1_2_3_4_5_6/D_merged_output/Stepwise_06step_ID_match_FLASH.notCombined_2.fastq
[FLASH]     fastq_1_2_3_4_5_6/D_merged_output/Stepwise_06step_ID_match_FLASH.hist
[FLASH]     fastq_1_2_3_4_5_6/D_merged_output/Stepwise_06step_ID_match_FLASH.histogram
[FLASH]  
[FLASH] Parameters:
[FLASH]     Min overlap:           136
[FLASH]     Max overlap:           136
[FLASH]     Max mismatch density:  0.250000
[FLASH]

overlapped by more than 116 bp, the --max-overlap (-M) parameter.  Consider
increasing this parameter.  (As-is, FLASH is penalizing overlaps longer than
116 bp when considering them for possible combining!)


# 6. fastq -> fasta

In [76]:
import os
import gzip
from Bio import SeqIO

# Input and output folder paths
input_folder = "fastq_1_2_3_4_5_6/D_merged_output"
output_folder = "fastq_1_2_3_4_5_6/E_fastq_to_fasta"

# Create the output folder if it doesn't exist.
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    # Process only files with .fastq or .fastq.gz extensions
    if filename.endswith(".extendedFrags.fastq") or filename.endswith(".extendedFrags.fastq.gz"):
        input_file = os.path.join(input_folder, filename)
        
        # Set output filename (.fasta extension)
        output_file = os.path.join(
            output_folder,
            filename.replace(".fastq.gz", ".fasta").replace(".fastq", ".fasta")
        )

        # Choose open mode based on gzip
        open_func = gzip.open if filename.endswith(".gz") else open

        # Read FASTQ and convert to FASTA
        with open_func(input_file, "rt") as fastq_file:  # open in text mode
            records = list(SeqIO.parse(fastq_file, "fastq"))

        # Save as FASTA
        with open(output_file, "w") as fasta_file:
            SeqIO.write(records, fasta_file, "fasta")

        print(f"Converted: {filename} → {os.path.basename(output_file)}")

print("All conversions are done.")

Converted: Stepwise_03step_ID_match_FLASH.extendedFrags.fastq → Stepwise_03step_ID_match_FLASH.extendedFrags.fasta
Converted: Stepwise_06step_ID_match_FLASH.extendedFrags.fastq → Stepwise_06step_ID_match_FLASH.extendedFrags.fasta
Converted: Stepwise_05step_ID_match_FLASH.extendedFrags.fastq → Stepwise_05step_ID_match_FLASH.extendedFrags.fasta
Converted: Stepwise_04step_ID_match_FLASH.extendedFrags.fastq → Stepwise_04step_ID_match_FLASH.extendedFrags.fasta
Converted: Stepwise_01step_ID_match_FLASH.extendedFrags.fastq → Stepwise_01step_ID_match_FLASH.extendedFrags.fasta
Converted: Stepwise_02step_ID_match_FLASH.extendedFrags.fastq → Stepwise_02step_ID_match_FLASH.extendedFrags.fasta
All conversions are done.


# 7. Each step reference data generate

In [77]:
import os
from pathlib import Path

def generate_sequences_for_bit(bit_length: int):
    """
    Generate DNA sequences for all binary combinations of the given bit_length.
    """
    sequences = {}

    seq_0 = "ACTCATATACACACTTAATC"
    seq_1 = "ACTCATATACATACACTTAATC"
    prefix = "ACACTTAATC"

    for i in range(2 ** bit_length):
        binary_str = format(i, f'0{bit_length}b')
        sequence = ''.join(seq_1 if bit == '1' else seq_0 for bit in binary_str)
        full_sequence = prefix + sequence
        seq_id = f"seq_{i:04d}_{binary_str}"
        sequences[seq_id] = full_sequence

    return sequences

def write_fasta(sequences: dict, output_path: str):
    """Write sequences to a FASTA file."""
    with open(output_path, "w") as f:
        for seq_id, sequence in sequences.items():
            f.write(f">{seq_id}\n{sequence}\n")

# ===== Settings =====
output_dir = Path("step_reference")
output_dir.mkdir(parents=True, exist_ok=True)

MAX_STEP = 10              # Generate 1 ~ 10 steps
PAD = len(str(MAX_STEP))   # Zero-padding width (12 → 2 digits)
# =====================

for step in range(1, MAX_STEP + 1):
    seqs = generate_sequences_for_bit(step)
    out_name = output_dir / f"{step:0{PAD}d}step_reference.fasta"  # → 01step_reference.fasta
    write_fasta(seqs, out_name)
    print(f"✅ {step:0{PAD}d}step FASTA saved: {out_name}")

✅ 01step FASTA saved: step_reference/01step_reference.fasta
✅ 02step FASTA saved: step_reference/02step_reference.fasta
✅ 03step FASTA saved: step_reference/03step_reference.fasta
✅ 04step FASTA saved: step_reference/04step_reference.fasta
✅ 05step FASTA saved: step_reference/05step_reference.fasta
✅ 06step FASTA saved: step_reference/06step_reference.fasta
✅ 07step FASTA saved: step_reference/07step_reference.fasta
✅ 08step FASTA saved: step_reference/08step_reference.fasta
✅ 09step FASTA saved: step_reference/09step_reference.fasta
✅ 10step FASTA saved: step_reference/10step_reference.fasta


# 8. Reference sequence - Sample Matching

In [78]:
%%bash
set -euo pipefail
shopt -s nullglob

# ========== Configuration ==========
ref_dir="step_reference"                       # Reference FASTA directory
query_dir="fastq_1_2_3_4_5_6/E_fastq_to_fasta"  # Input FASTA directory
output_dir="fastq_1_2_3_4_5_6/1_align_sam"      # Output SAM directory
threads=4

# Optional: filter steps (e.g., "07"–"12")
# Leave empty to process all steps automatically.
step_min="01"   # e.g., "07"
step_max="06"   # e.g., "12"
# ====================================

mkdir -p "$output_dir"

# Avoid duplicate indexing during the run
declare -A indexed
# Avoid duplicate processing: prefer "*assemble.fasta"
declare -A seen

# 1) Collect "*assemble.fasta" first
for f in "$query_dir"/*step*assemble.fasta; do
  seen["$f"]=1
done

# 2) Collect other *step*.fasta only if not already included
for f in "$query_dir"/*step*.fasta; do
  [[ -n "${seen[$f]:-}" ]] && continue
  seen["$f"]=1
done

# Alignment loop
for query_file in "${!seen[@]}"; do
  filename="$(basename "$query_file")"

  # Extract step digit(s): matches both _07step_ and _7step
  if [[ "$filename" =~ _([0-9]+)step(_|$) ]]; then
    step_raw="${BASH_REMATCH[1]}"
  else
    echo "⚠️  Step number not found in filename: $filename"
    continue
  fi

  # Zero-padding to 2 digits (e.g., 7 → 07)
  step_pad=$(printf "%02d" $((10#$step_raw)))

  # Optional step range filtering
  if [[ -n "$step_min" && -n "$step_max" ]]; then
    s_val=$((10#$step_pad))
    s_min=$((10#$step_min))
    s_max=$((10#$step_max))
    if (( s_val < s_min || s_val > s_max )); then
      echo "⏭️  Skip (step not in ${step_min}–${step_max}): $filename"
      continue
    fi
  fi

  reference_file="${ref_dir}/${step_pad}step_reference.fasta"
  if [[ ! -f "$reference_file" ]]; then
    echo "⚠️  Missing reference: $reference_file"
    continue
  fi

  out_file="${output_dir}/${filename%.fasta}.sam"
  echo "🔄 Aligning: $filename → $(basename "$reference_file") (step=$step_pad)"

  # Index only once per reference in this run, and reuse existing index files
  if [[ -z "${indexed[$reference_file]:-}" ]]; then
    if [[ -f "${reference_file}.bwt" ]]; then
      echo "⏭️  Index exists, skipping indexing."
    else
      echo "🔧 Indexing reference..."
      bwa index "$reference_file"
    fi
    indexed[$reference_file]=1
  fi

  bwa mem -M -t "$threads" "$reference_file" "$query_file" > "$out_file"
  echo "✅ Done: $out_file"
done

🔄 Aligning: Stepwise_02step_ID_match_FLASH.extendedFrags.fasta → 02step_reference.fasta (step=02)
🔧 Indexing reference...


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/02step_reference.fasta
[main] Real time: 0.027 sec; CPU: 0.003 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1921 sequences (100150 bp)...
[M::mem_process_seqs] Processed 1921 reads in 0.048 CPU sec, 0.013 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/02step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/Stepwise_02step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.060 sec; CPU: 0.054 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/Stepwise_02step_ID_match_FLASH.extendedFrags.sam
🔄 Aligning: Stepwise_06step_ID_match_FLASH.extendedFrags.fasta → 06step_reference.fasta (step=06)
🔧 Indexing reference...


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/06step_reference.fasta
[main] Real time: 0.033 sec; CPU: 0.005 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 690 sequences (94440 bp)...
[M::mem_process_seqs] Processed 690 reads in 0.118 CPU sec, 0.031 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/06step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/Stepwise_06step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.062 sec; CPU: 0.122 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/Stepwise_06step_ID_match_FLASH.extendedFrags.sam
🔄 Aligning: Stepwise_05step_ID_match_FLASH.extendedFrags.fasta → 05step_reference.fasta (step=05)
🔧 Indexing reference...


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/05step_reference.fasta
[main] Real time: 0.028 sec; CPU: 0.004 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1178 sequences (137298 bp)...
[M::mem_process_seqs] Processed 1178 reads in 0.121 CPU sec, 0.031 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/05step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/Stepwise_05step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.067 sec; CPU: 0.126 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/Stepwise_05step_ID_match_FLASH.extendedFrags.sam
🔄 Aligning: Stepwise_03step_ID_match_FLASH.extendedFrags.fasta → 03step_reference.fasta (step=03)
🔧 Indexing reference...


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/03step_reference.fasta
[main] Real time: 0.031 sec; CPU: 0.003 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1790 sequences (132667 bp)...
[M::mem_process_seqs] Processed 1790 reads in 0.098 CPU sec, 0.033 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/03step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/Stepwise_03step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.127 sec; CPU: 0.107 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/Stepwise_03step_ID_match_FLASH.extendedFrags.sam
🔄 Aligning: Stepwise_01step_ID_match_FLASH.extendedFrags.fasta → 01step_reference.fasta (step=01)
🔧 Indexing reference...


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/01step_reference.fasta
[main] Real time: 0.051 sec; CPU: 0.005 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 2528 sequences (80989 bp)...
[M::mem_process_seqs] Processed 2528 reads in 0.019 CPU sec, 0.006 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/01step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/Stepwise_01step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.051 sec; CPU: 0.026 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/Stepwise_01step_ID_match_FLASH.extendedFrags.sam
🔄 Aligning: Stepwise_04step_ID_match_FLASH.extendedFrags.fasta → 04step_reference.fasta (step=04)
🔧 Indexing reference...


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/04step_reference.fasta
[main] Real time: 0.028 sec; CPU: 0.004 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1786 sequences (168310 bp)...
[M::mem_process_seqs] Processed 1786 reads in 0.129 CPU sec, 0.035 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/04step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/Stepwise_04step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.084 sec; CPU: 0.136 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/Stepwise_04step_ID_match_FLASH.extendedFrags.sam


## 8.1 sam to bam

In [79]:
%%bash

# Set the path to the directory containing SAM files
sam_dir="fastq_1_2_3_4_5_6/1_align_sam"
# Set the output directory for BAM files
bam_dir="fastq_1_2_3_4_5_6/2_align_bam"

# Make sure the output directory exists or create it if necessary
mkdir -p "$bam_dir"

# Convert SAM files to BAM
for sam_file in "$sam_dir"/*.sam; do
    bam_file="$bam_dir/$(basename "$sam_file" .sam).bam"
    samtools view -bS "$sam_file" -o "$bam_file"
    echo "Conversion from $sam_file to $bam_file is complete."
done

Conversion from fastq_1_2_3_4_5_6/1_align_sam/Stepwise_01step_ID_match_FLASH.extendedFrags.sam to fastq_1_2_3_4_5_6/2_align_bam/Stepwise_01step_ID_match_FLASH.extendedFrags.bam is complete.
Conversion from fastq_1_2_3_4_5_6/1_align_sam/Stepwise_02step_ID_match_FLASH.extendedFrags.sam to fastq_1_2_3_4_5_6/2_align_bam/Stepwise_02step_ID_match_FLASH.extendedFrags.bam is complete.
Conversion from fastq_1_2_3_4_5_6/1_align_sam/Stepwise_03step_ID_match_FLASH.extendedFrags.sam to fastq_1_2_3_4_5_6/2_align_bam/Stepwise_03step_ID_match_FLASH.extendedFrags.bam is complete.
Conversion from fastq_1_2_3_4_5_6/1_align_sam/Stepwise_04step_ID_match_FLASH.extendedFrags.sam to fastq_1_2_3_4_5_6/2_align_bam/Stepwise_04step_ID_match_FLASH.extendedFrags.bam is complete.
Conversion from fastq_1_2_3_4_5_6/1_align_sam/Stepwise_05step_ID_match_FLASH.extendedFrags.sam to fastq_1_2_3_4_5_6/2_align_bam/Stepwise_05step_ID_match_FLASH.extendedFrags.bam is complete.
Conversion from fastq_1_2_3_4_5_6/1_align_sam/Step

## 8.2 Convert BAM to CSV

In [80]:
import os
import pysam
import pandas as pd

# Input folder (path containing BAM files)
input_folder = "fastq_1_2_3_4_5_6/2_align_bam"
# Output folder (path to save CSV files; change if needed)
output_folder = "fastq_1_2_3_4_5_6/3_align_csv"

# Create output folder if it does not exist
os.makedirs(output_folder, exist_ok=True)

# BAM -> CSV conversion function (including optional SAM tags)
def bam_to_csv(bam_file, output_folder):
    output_csv = os.path.join(output_folder, os.path.basename(bam_file).replace(".bam", ".csv"))
    
    # Read BAM file
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        records = []
        all_tags = set()  # set to collect optional tag names
        
        for read in bam:
            # Core fields
            record = {
                "QNAME": read.query_name,
                "FLAG": read.flag,
                "RNAME": bam.get_reference_name(read.reference_id) if read.reference_id >= 0 else "*",
                "POS": read.reference_start + 1,
                "MAPQ": read.mapping_quality,
                "CIGAR": read.cigarstring if read.cigarstring else "*",
                "RNEXT": bam.get_reference_name(read.next_reference_id) if read.next_reference_id >= 0 else "*",
                "PNEXT": read.next_reference_start + 1 if read.next_reference_start >= 0 else 0,
                "TLEN": read.template_length,
                "SEQ": read.query_sequence if read.query_sequence else "*",
                "QUAL": read.qual if read.qual else "*",
            }
            
            # Optional SAM tags (aux fields)
            for tag, value in read.tags:
                record[tag] = value
                all_tags.add(tag)

            records.append(record)
    
    # Build DataFrame
    df = pd.DataFrame(records)

    # Fill missing optional fields with "*"
    df = df.fillna("*")

    # Save CSV
    df.to_csv(output_csv, index=False)
    print(f"✅ Converted: {os.path.basename(bam_file)} -> {os.path.basename(output_csv)}")

    return output_csv

# Find all BAM files in the folder
bam_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".bam")]

# Convert all BAM files to CSV
csv_files = []
for bam_file in bam_files:
    csv_file = bam_to_csv(bam_file, output_folder)
    csv_files.append(csv_file)

# Print the list of newly created CSV files.
csv_files

✅ Converted: Stepwise_05step_ID_match_FLASH.extendedFrags.bam -> Stepwise_05step_ID_match_FLASH.extendedFrags.csv
✅ Converted: Stepwise_02step_ID_match_FLASH.extendedFrags.bam -> Stepwise_02step_ID_match_FLASH.extendedFrags.csv
✅ Converted: Stepwise_04step_ID_match_FLASH.extendedFrags.bam -> Stepwise_04step_ID_match_FLASH.extendedFrags.csv
✅ Converted: Stepwise_03step_ID_match_FLASH.extendedFrags.bam -> Stepwise_03step_ID_match_FLASH.extendedFrags.csv
✅ Converted: Stepwise_01step_ID_match_FLASH.extendedFrags.bam -> Stepwise_01step_ID_match_FLASH.extendedFrags.csv
✅ Converted: Stepwise_06step_ID_match_FLASH.extendedFrags.bam -> Stepwise_06step_ID_match_FLASH.extendedFrags.csv


['fastq_1_2_3_4_5_6/3_align_csv/Stepwise_05step_ID_match_FLASH.extendedFrags.csv',
 'fastq_1_2_3_4_5_6/3_align_csv/Stepwise_02step_ID_match_FLASH.extendedFrags.csv',
 'fastq_1_2_3_4_5_6/3_align_csv/Stepwise_04step_ID_match_FLASH.extendedFrags.csv',
 'fastq_1_2_3_4_5_6/3_align_csv/Stepwise_03step_ID_match_FLASH.extendedFrags.csv',
 'fastq_1_2_3_4_5_6/3_align_csv/Stepwise_01step_ID_match_FLASH.extendedFrags.csv',
 'fastq_1_2_3_4_5_6/3_align_csv/Stepwise_06step_ID_match_FLASH.extendedFrags.csv']

## 8.3 Filter Alignments by MAPQ Score

In [81]:
import os
import pandas as pd
from pathlib import Path

# ===== Settings =====
input_dir = Path("fastq_1_2_3_4_5_6/3_align_csv")  # Input folder containing CSV files
output_dir = input_dir / "MAPQ_removed"            # Output folder for filtered CSV files
output_dir.mkdir(parents=True, exist_ok=True)

MAPQ_THRESHOLD = 10     # Keep rows where MAPQ > this value
KEEP_NAN = True         # Keep rows with NaN MAPQ values (e.g., unaligned reads)
# ====================

def process_one_csv(in_path: Path, out_dir: Path, mapq_threshold: int, keep_nan: bool = True):
    out_path = out_dir / in_path.name

    # Remove existing output file to avoid duplicates
    if out_path.exists():
        out_path.unlink()

    # Read input CSV
    try:
        df = pd.read_csv(in_path)
    except Exception as e:
        print(f"⚠️  Read fail: {in_path.name} -> {e}")
        return

    # Skip if MAPQ column does not exist
    if "MAPQ" not in df.columns:
        print(f"⚠️  Skip (no MAPQ column): {in_path.name}")
        return

    # Convert MAPQ column to numeric (invalid entries become NaN)
    m = pd.to_numeric(df["MAPQ"], errors="coerce")

    # Filtering mask: keep MAPQ > threshold, optionally keep NaN
    keep_mask = (m > mapq_threshold) | (m.isna() if keep_nan else False)

    kept = int(keep_mask.sum())
    removed = int((~keep_mask).sum())

    # Save filtered CSV
    df.loc[keep_mask].to_csv(out_path, index=False)
    print(
        f"✅ {in_path.name} → {out_path.name} | kept={kept}, removed={removed} "
        f"| threshold={mapq_threshold}, keep_nan={keep_nan}"
    )

def main():
    csv_files = sorted(input_dir.glob("*.csv"))
    if not csv_files:
        print(f"⚠️  No CSV files in {input_dir}")
        return

    for p in csv_files:
        process_one_csv(p, output_dir, MAPQ_THRESHOLD, KEEP_NAN)

if __name__ == "__main__":
    main()

✅ Stepwise_01step_ID_match_FLASH.extendedFrags.csv → Stepwise_01step_ID_match_FLASH.extendedFrags.csv | kept=2467, removed=61 | threshold=10, keep_nan=True
✅ Stepwise_02step_ID_match_FLASH.extendedFrags.csv → Stepwise_02step_ID_match_FLASH.extendedFrags.csv | kept=1881, removed=43 | threshold=10, keep_nan=True
✅ Stepwise_03step_ID_match_FLASH.extendedFrags.csv → Stepwise_03step_ID_match_FLASH.extendedFrags.csv | kept=1751, removed=39 | threshold=10, keep_nan=True
✅ Stepwise_04step_ID_match_FLASH.extendedFrags.csv → Stepwise_04step_ID_match_FLASH.extendedFrags.csv | kept=1721, removed=65 | threshold=10, keep_nan=True
✅ Stepwise_05step_ID_match_FLASH.extendedFrags.csv → Stepwise_05step_ID_match_FLASH.extendedFrags.csv | kept=1128, removed=53 | threshold=10, keep_nan=True
✅ Stepwise_06step_ID_match_FLASH.extendedFrags.csv → Stepwise_06step_ID_match_FLASH.extendedFrags.csv | kept=629, removed=61 | threshold=10, keep_nan=True


# Histogram Data Analysis

## A. Generate Histogram Data from Aligned Reads(MAPQ filtered)

In [82]:
import os
import pandas as pd

# Folder setup
input_folder = "fastq_1_2_3_4_5_6/3_align_csv/MAPQ_removed"
histogram_folder = "fastq_1_2_3_4_5_6/4_align_histogram"
os.makedirs(histogram_folder, exist_ok=True)

# Process all CSV files in the input folder
files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

for file_name in files:
    file_path = os.path.join(input_folder, file_name)

    # Clean filename (remove specific substrings)
    clean_name = file_name
    clean_name = clean_name.replace("assemble", "")
    clean_name = clean_name.replace("ID_match_FLASH.extendedFrags", "")
    # remove duplicate/trailing underscores
    clean_name = clean_name.replace("__", "_").strip("_")  
    output_csv = os.path.join(histogram_folder, f"histogram_{clean_name}")

    try:
        df = pd.read_csv(file_path, dtype=str)
        if 'RNAME' not in df.columns:
            print(f"⚠️ Skipping file: {file_name} (no 'RNAME' column found)")
            continue

        # Aggregate and normalize RNAME counts
        rname_counts = df['RNAME'].value_counts().reset_index()
        rname_counts.columns = ['RNAME', 'Count']
        rname_counts.insert(0, 'File_Name', clean_name)
        rname_counts['Count'] = rname_counts['Count'].astype(int)
        total_count = rname_counts['Count'].sum()
        rname_counts['Normalized_Count'] = rname_counts['Count'] / total_count

        rname_counts.to_csv(output_csv, index=False)
        print(f"✅ Saved cleaned RNAME histogram: {output_csv}")

    except Exception as e:
        print(f"❌ Error processing file '{file_name}': {e}")

✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_Stepwise_06step_.csv
✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_Stepwise_01step_.csv
✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_Stepwise_03step_.csv
✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_Stepwise_04step_.csv
✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_Stepwise_02step_.csv
✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_Stepwise_05step_.csv


## B. Create Top 5 Histogram Plots for Each Sample

In [83]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# 📁 Folder setup
histogram_folder = "fastq_1_2_3_4_5_6/4_align_histogram"
summary_folder = "fastq_1_2_3_4_5_6/4_align_histogram/graph_top5"
os.makedirs(summary_folder, exist_ok=True)

# 🔴 Highlight mapping (based on filename suffix)
highlight_mapping = {
    "_01step": "seq_0001_1",
    "_02step": "seq_0002_10",
    "_03step": "seq_0005_101",
    "_04step": "seq_0010_1010",
    "_05step": "seq_0021_10101",
    "_06step": "seq_0042_101010",
    "_07step": "seq_0085_1010101",
    "_08step": "seq_0170_10101010",
}

# 📄 List CSV files
csv_files = [f for f in os.listdir(histogram_folder) if f.startswith("histogram_") and f.endswith(".csv")]

# 🔁 Iterate over files
for file_name in csv_files:
    file_path = os.path.join(histogram_folder, file_name)
    try:
        df = pd.read_csv(file_path)
        if 'RNAME' not in df.columns or 'Normalized_Count' not in df.columns:
            print(f"⚠️ Skipping file: {file_name} (missing column)")
            continue

        # Extract Top 5 RNAMEs
        top_df = df.sort_values(by="Count", ascending=False).head(5).reset_index(drop=True)
        sample_name = file_name.replace("histogram_", "").replace(".csv", "")

        # 🔍 Find highlight RNAME using the suffix
        highlight_rname = None
        for suffix, rname in highlight_mapping.items():
            if suffix in file_name:
                highlight_rname = rname
                break

        # 📊 Create plot
        plt.figure(figsize=(10, 6))
        bars = plt.bar(top_df["RNAME"], top_df["Normalized_Count"], color='blue')

        # 🔴 Color the matched RNAME in red
        for bar, rname in zip(bars, top_df["RNAME"]):
            if rname == highlight_rname:
                bar.set_color('red')

        plt.title(f"Top 5 RNAME Histogram - {sample_name}")
        plt.xlabel("RNAME")
        plt.ylabel("Normalized Count")
        plt.xticks(rotation=45)
        plt.ylim(0, 1)
        plt.tight_layout()

        # 💾 Save
        output_png = os.path.join(summary_folder, file_name.replace(".csv", ".png"))
        output_svg = os.path.join(summary_folder, file_name.replace(".csv", ".svg"))
        plt.savefig(output_png)
        plt.savefig(output_svg)
        plt.close()

        print(f"✅ Saved plot: {output_png}, {output_svg}")

    except Exception as e:
        print(f"❌ Error processing {file_name}: {e}")

✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_01step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_01step_.svg
✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_06step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_06step_.svg
✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_03step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_03step_.svg
✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_02step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_02step_.svg
✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_04step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_04step_.svg
✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_Stepwise_05step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histo

## C. Summarize Highlighted Read Counts into a CSV File

In [84]:
import os
import pandas as pd
import re

# === Highlight mapping (suffix -> RNAME) ===
highlight_mapping = {
    "_01step": "seq_0001_1",
    "_02step": "seq_0002_10",
    "_03step": "seq_0005_101",
    "_04step": "seq_0010_1010",
    "_05step": "seq_0021_10101",
    "_06step": "seq_0042_101010",
    "_07step": "seq_0085_1010101",
    "_08step": "seq_0170_10101010",
}

# === Folder setup ===
histogram_folder = "fastq_1_2_3_4_5_6/4_align_histogram"
summary_folder = "fastq_1_2_3_4_5_6/5_align_summary"
os.makedirs(summary_folder, exist_ok=True)
highlight_result_csv = os.path.join(summary_folder, "highlight_result.csv")

# === Function to extract step number ===
def extract_step_number(filename):
    match = re.search(r"_(\d+)step", filename)
    return int(match.group(1)) if match else float("inf")

# === Collect highlight summary info ===
highlight_data = []
csv_files = [f for f in os.listdir(histogram_folder) if f.startswith("histogram_") and f.endswith(".csv")]

for file in csv_files:
    file_path = os.path.join(histogram_folder, file)
    try:
        df = pd.read_csv(file_path)
        file_name = file.replace("histogram_", "")

        # Get highlight_rname by suffix
        highlight_rname = ""
        for suffix, rname in highlight_mapping.items():
            if suffix in file_name:
                highlight_rname = rname
                break

        df['Count'] = df['Count'].astype(int)
        total_count = df['Count'].sum()

        highlight_count = df[df['RNAME'] == highlight_rname]['Count'].sum() if highlight_rname else 0
        highlight_percentage = (highlight_count / total_count) * 100 if total_count > 0 else 0

        sorted_counts = df['Count'].sort_values(ascending=False).values
        second_max_count = sorted_counts[1] if len(sorted_counts) >= 2 else (sorted_counts[0] if len(sorted_counts) == 1 else 0)
        highlight_vs_second_ratio = (highlight_count / second_max_count) if second_max_count > 0 else 0

        highlight_data.append([
            file_name,
            highlight_count,
            total_count,
            round(highlight_percentage, 2),
            highlight_rname,
            round(highlight_vs_second_ratio, 3),
            extract_step_number(file_name)
        ])

    except Exception as e:
        print(f"❌ Error processing file '{file}': {e}")

# === Create DataFrame, sort by step, and save ===
highlight_df = pd.DataFrame(highlight_data, columns=[
    'File',
    'Highlight_Count',
    'Total_Count',
    'Highlight_Percentage',
    'Highlight_RNAMEs',
    'Highlight_vs_SecondTop_Ratio',
    'Step_Number'
])

highlight_df = highlight_df.sort_values(by='Step_Number').drop(columns='Step_Number')
highlight_df.to_csv(highlight_result_csv, index=False)

print(f"📌 Highlight summary saved to: {highlight_result_csv}")

📌 Highlight summary saved to: fastq_1_2_3_4_5_6/5_align_summary/highlight_result.csv


## D. Plot Stacked Bar Graph top5_gray_rest_white_box

In [85]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

# Folder setup
histogram_folder = "fastq_1_2_3_4_5_6/4_align_histogram"
summary_folder = "fastq_1_2_3_4_5_6/5_align_summary"
os.makedirs(summary_folder, exist_ok=True)

# Highlight mapping (based on filename suffix)
highlight_mapping = {
    "_01step": "seq_0001_1",
    "_02step": "seq_0002_10",
    "_03step": "seq_0005_101",
    "_04step": "seq_0010_1010",
    "_05step": "seq_0021_10101",
    "_06step": "seq_0042_101010",
    "_07step": "seq_0085_1010101",
    "_08step": "seq_0170_10101010",
}

# Gray → white gradient color function
def blend_color(base_rgb, t):
    white = np.array([255, 255, 255])
    base = np.array(base_rgb)
    blended = (1 - t) * base + t * white
    return tuple(blended / 255)

base_rgb = (137, 137, 138)

# Extract step number for ascending sort
def extract_step_number(name):
    match = re.search(r'_(\d+)step', name)
    return int(match.group(1)) if match else float('inf')

# Load per-sample data
sample_rname_dfs = {}
for file_name in os.listdir(histogram_folder):
    if file_name.startswith("histogram_") and file_name.endswith(".csv"):
        sample_name = file_name.replace("histogram_", "").replace(".csv", "")
        df = pd.read_csv(os.path.join(histogram_folder, file_name))
        if 'RNAME' not in df.columns or 'Count' not in df.columns:
            continue
        df['Sample'] = sample_name
        df['Count'] = df['Count'].astype(int)
        df['Normalized_Count'] = df['Count'] / df['Count'].sum()
        df = df.sort_values(by='Count', ascending=False).reset_index(drop=True)
        sample_rname_dfs[sample_name] = df

# Sort sample_name by step
sorted_samples = sorted(sample_rname_dfs.items(), key=lambda x: extract_step_number(x[0]))

# Visualization
fig, ax = plt.subplots(figsize=(24, 12))

for sample_idx, (sample_name, df) in enumerate(sorted_samples):
    # Find highlight RNAME
    highlight_rname = None
    for suffix, rname in highlight_mapping.items():
        if suffix in sample_name:  # check inclusion (not only end-of-string)
            highlight_rname = rname
            break

    bottom = 0
    top_n = 5
    rest_sum = 0

    for rank, row in df.iterrows():
        rname = row['RNAME']
        height = row['Normalized_Count']

        if rname == highlight_rname:
            ax.bar(sample_name, height, bottom=bottom, color='red', edgecolor='black', linewidth=0.2)
            bottom += height
        elif rank < top_n:
            t = rank / (top_n - 1) if top_n > 1 else 0
            color = blend_color(base_rgb, t)
            ax.bar(sample_name, height, bottom=bottom, color=color, edgecolor='black', linewidth=0.2)
            bottom += height
        else:
            rest_sum += height

    if rest_sum > 0:
        ax.bar(sample_name, rest_sum, bottom=bottom, color='white', edgecolor='black', linewidth=0.2)

# Reference line & styling
ax.axhline(y=0.5, color='gray', linestyle='--', linewidth=1, label='y = 0.5')
ax.set_ylabel("Normalized Count", fontsize=20)
ax.set_xlabel("Sample", fontsize=20)
ax.set_title("Stacked Bar Chart (Red = Highlight, Gray→White = Top 5, Rest = One White Box)", fontsize=16)
ax.tick_params(axis='x', labelsize=20)
ax.tick_params(axis='y', labelsize=20)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save
png_path = os.path.join(summary_folder, "stacked_bar_top5_gray_rest_white_box.png")
svg_path = os.path.join(summary_folder, "stacked_bar_top5_gray_rest_white_box.svg")
plt.savefig(png_path)
plt.savefig(svg_path)
plt.close()

print(f"✅ Saved:\n - PNG: {png_path}\n - SVG: {svg_path}")

✅ Saved:
 - PNG: fastq_1_2_3_4_5_6/5_align_summary/stacked_bar_top5_gray_rest_white_box.png
 - SVG: fastq_1_2_3_4_5_6/5_align_summary/stacked_bar_top5_gray_rest_white_box.svg
