# 1. Install Modules

In [None]:
# Bioinformatics Tools (Ubuntu)
# Refresh the apt package index, then install the command-line tools.
# fastp is invoked by the quality-filtering step of this notebook.
# NOTE(review): flash, bwa and samtools are presumably used by later steps — confirm.
!sudo apt-get update
!sudo apt-get install -y fastp flash bwa samtools

# Python Library
# cutadapt is invoked (as a command-line program) by the trimming step of this notebook.
# NOTE(review): versions are not pinned, so a re-run may install different releases;
# consider pinning them for reproducibility.
# --break-system-packages allows pip to install into an externally managed system Python.
!pip3 install biopython cutadapt pysam --break-system-packages

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fastp is already the newest version (0.23.4+dfsg-1).
The following packages were automatically installed and are no longer required:
  pigz python3-xopen
Use 'sudo apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and 242 not upgraded.
Get:1 https://packages.microsoft.com/repos/code stable InRelease [3,590 B]     
Get:2 https://packages.microsoft.com/repos/code stable/main armhf Packages [20.3 kB]
Hit:3 http://ports.ubuntu.com/ubuntu-ports noble InRelease                     
Get:4 https://packages.microsoft.com/repos/code stable/main amd64 Packages [20.2 kB]
Get:5 http://ports.ubuntu.com/ubuntu-ports noble-updates InRelease [126 kB]    
Get:6 https://packages.microsoft.com/repos/code stable/main arm64 Packages [20.3 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu noble InRelease   
Get:8 http://ports.ubuntu.com/ubuntu-ports noble-backports InReleas

# 2. Adapter Trimming: Discard Reads Containing Adapters

In [None]:
import subprocess
import glob
import os

# Specify the folder containing your input files.
input_folder = "fastq"
# Specify the folder where you want to save the untrimmed (adapter-free) sequences.
untrimmed_output_folder = "sequence_merge_method/A_untrimmed_output"

# Define the adapter sequences for R1 and R2.
adapter_sequence_r1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
adapter_sequence_r2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"

# Collect all input file pairs (R1 and R2) in the folder.
# The list is sorted so that the processing order is the same on every run
# (glob returns files in arbitrary order).
r1_suffix = "_R1.fastq.gz"
r2_suffix = "_R2.fastq.gz"
input_file_pairs = []
for input_r1 in sorted(glob.glob(os.path.join(input_folder, "*" + r1_suffix))):
    # R2 files are assumed to have the same name as R1 apart from the suffix.
    # Only the trailing suffix is swapped, so a directory name that happens to
    # contain "_R1.fastq.gz" is left untouched.
    input_r2 = input_r1[:-len(r1_suffix)] + r2_suffix
    if os.path.exists(input_r2):
        input_file_pairs.append({"r1": input_r1, "r2": input_r2})
    else:
        # Report unpaired samples instead of dropping them silently.
        print(f"Warning: Corresponding R2 file not found for {input_r1}. Skipping.")

if not input_file_pairs:
    print(f"Warning: No R1/R2 FASTQ pairs found in '{input_folder}'.")

# Create the output folder if it doesn't exist.
os.makedirs(untrimmed_output_folder, exist_ok=True)

for input_files in input_file_pairs:
    input_r1 = input_files["r1"]
    input_r2 = input_files["r2"]

    # Define output file paths for untrimmed (clean, adapter-free) sequences.
    untrimmed_r1 = os.path.join(
        untrimmed_output_folder,
        os.path.basename(input_r1).replace(".fastq.gz", "_untrimmed.fastq.gz"),
    )
    untrimmed_r2 = os.path.join(
        untrimmed_output_folder,
        os.path.basename(input_r2).replace(".fastq.gz", "_untrimmed.fastq.gz"),
    )

    # Use cutadapt to keep only read pairs in which no adapter was found.
    # No --pair-filter is given, so cutadapt's default pair filter applies
    # (see the cutadapt guide for how --discard-trimmed treats read pairs).
    result = subprocess.run([
        "cutadapt",
        "-a", adapter_sequence_r1,  # 3' adapter searched in R1
        "-A", adapter_sequence_r2,  # 3' adapter searched in R2
        "-O", "15",                  # Minimum overlap for an adapter match
        "--discard-trimmed",         # Discard reads in which an adapter was found
        "-o", untrimmed_r1,          # Save only untrimmed R1 reads
        "-p", untrimmed_r2,          # Save only untrimmed R2 reads
        input_r1, input_r2
    ], capture_output=True, text=True)

    # Log the result; cutadapt's own report is captured and shown only on failure.
    if result.returncode == 0:
        print(f"Untrimmed sequences saved: {untrimmed_r1}, {untrimmed_r2}")
    else:
        print(f"Error processing {input_r1} and {input_r2}:\n{result.stderr}")

Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/6S_1_250821_batch17_07_R1_untrimmed.fastq.gz, sequence_merge_method/A_untrimmed_output/6S_1_250821_batch17_07_R2_untrimmed.fastq.gz
Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/5I_1_250821_batch17_06_R1_untrimmed.fastq.gz, sequence_merge_method/A_untrimmed_output/5I_1_250821_batch17_06_R2_untrimmed.fastq.gz
Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/7T_1_250821_batch17_08_R1_untrimmed.fastq.gz, sequence_merge_method/A_untrimmed_output/7T_1_250821_batch17_08_R2_untrimmed.fastq.gz
Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/3SP26_250828_batch18_04_R1_untrimmed.fastq.gz, sequence_merge_method/A_untrimmed_output/3SP26_250828_batch18_04_R2_untrimmed.fastq.gz
Untrimmed sequences saved: sequence_merge_method/A_untrimmed_output/0N_1_250821_batch17_02_R1_untrimmed.fastq.gz, sequence_merge_method/A_untrimmed_output/0N_1_250821_batch17_02_R2_untrimmed.fastq.g

# 3. Quality (Q-score) Filtering

In [None]:
import os
import subprocess

# Quality threshold (Phred score)
quality_threshold = 30

# Set input and output folders
input_folder = "sequence_merge_method/A_untrimmed_output"
output_folder = "sequence_merge_method/B_Qfiltered"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Files for which fastp returned a non-zero exit code.
failed_files = []

# Iterate through files in the input folder (sorted for a reproducible order),
# processing only those ending with "_untrimmed.fastq.gz".
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith("_untrimmed.fastq.gz"):
        continue

    # Input file path
    input_file = os.path.join(input_folder, filename)

    # Output filename (e.g., sample_untrimmed.fastq.gz -> sample_Qfiltered.fastq.gz)
    output_file = os.path.join(
        output_folder,
        filename.replace("_untrimmed.fastq.gz", "_Qfiltered.fastq.gz")
    )

    # Execute fastp in single-end mode for each file. R1 and R2 are filtered
    # independently here, so this step does not keep mates in sync.
    return_code = subprocess.call([
        "fastp",
        "-i", input_file,                      # Input file
        "-o", output_file,                     # Output file
        "-q", str(quality_threshold),          # Phred quality a base needs to count as qualified
        "-u", "15",                            # Up to 15% unqualified bases allowed; reads above that are discarded
        "-l", "151",                           # Minimum read length to keep
        # NOTE(review): --cut_mean_quality only sets the threshold used by fastp's
        # sliding-window cutting options (--cut_front/--cut_tail/--cut_right), none
        # of which is enabled here, so it does not discard reads by mean quality.
        # If that is the intent, use "-e/--average_qual" instead. The flag is kept
        # unchanged so that existing results are reproduced.
        "--cut_mean_quality", "30",
        "--html", f"{output_file}.html",       # HTML report file path
        "--json", f"{output_file}.json"        # JSON report file path
    ])

    # Do not report success when fastp failed.
    if return_code != 0:
        failed_files.append(filename)
        print(f"Error: fastp failed for {filename} (exit code {return_code}).\n")
        continue

    print(f"Filtering for {filename} is complete.\n"
          f"Output FASTQ : {output_file}\n"
          f"Reports      : {output_file}.html / {output_file}.json\n")

if failed_files:
    print(f"Filtering finished with {len(failed_files)} failed file(s): {', '.join(failed_files)}")
else:
    print("All filtering processes are done.")

Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 721
total bases: 108871
Q20 bases: 94451(86.755%)
Q30 bases: 81732(75.0723%)

Read1 after filtering:
total reads: 236
total bases: 35636
Q20 bases: 34651(97.2359%)
Q30 bases: 32101(90.0803%)

Filtering result:
reads passed filter: 236
reads failed due to low quality: 485
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 23.8558%

JSON report: sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/0N_1_250821_batch17_02_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --h

Filtering for 0N_1_250821_batch17_02_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 825
total bases: 124575
Q20 bases: 106281(85.3149%)
Q30 bases: 92297(74.0895%)

Read1 after filtering:
total reads: 227
total bases: 34277
Q20 bases: 33136(96.6712%)
Q30 bases: 30917(90.1975%)

Filtering result:
reads passed filter: 227
reads failed due to low quality: 598
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 14.5455%

JSON report: sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/6S_1_250821_batch17_07_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R1_Qfilt

Filtering for 6S_1_250821_batch17_07_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 4144
total bases: 625744
Q20 bases: 567365(90.6705%)
Q30 bases: 523150(83.6045%)

Read1 after filtering:
total reads: 2527
total bases: 381577
Q20 bases: 374028(98.0216%)
Q30 bases: 359194(94.1341%)

Filtering result:
reads passed filter: 2527
reads failed due to low quality: 1617
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 41.4817%

JSON report: sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/3SP26_250828_batch18_04_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/3SP26_250828_batch1

Filtering for 3SP26_250828_batch18_04_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 804
total bases: 121404
Q20 bases: 105164(86.6232%)
Q30 bases: 90694(74.7043%)

Read1 after filtering:
total reads: 231
total bases: 34881
Q20 bases: 33915(97.2306%)
Q30 bases: 31283(89.6849%)

Filtering result:
reads passed filter: 231
reads failed due to low quality: 573
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 21.7662%

JSON report: sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/1D_1_250821_batch17_01_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R2_Qfilt

Filtering for 1D_1_250821_batch17_01_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 804
total bases: 121404
Q20 bases: 104389(85.9848%)
Q30 bases: 91086(75.0272%)

Read1 after filtering:
total reads: 218
total bases: 32918
Q20 bases: 31587(95.9566%)
Q30 bases: 29409(89.3402%)

Filtering result:
reads passed filter: 218
reads failed due to low quality: 586
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 13.3085%

JSON report: sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/1D_1_250821_batch17_01_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R1_Qfilt

Filtering for 1D_1_250821_batch17_01_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/1D_1_250821_batch17_01_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 4144
total bases: 625744
Q20 bases: 569626(91.0318%)
Q30 bases: 520618(83.1998%)

Read1 after filtering:
total reads: 2438
total bases: 368138
Q20 bases: 362244(98.399%)
Q30 bases: 348230(94.5922%)

Filtering result:
reads passed filter: 2438
reads failed due to low quality: 1706
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 45.6081%

JSON report: sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/3SP26_250828_batch18_04_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/3SP26_250828_batch18

Filtering for 3SP26_250828_batch18_04_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/3SP26_250828_batch18_04_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 825
total bases: 124575
Q20 bases: 107887(86.6041%)
Q30 bases: 92988(74.6442%)

Read1 after filtering:
total reads: 228
total bases: 34428
Q20 bases: 33388(96.9792%)
Q30 bases: 30766(89.3633%)

Filtering result:
reads passed filter: 228
reads failed due to low quality: 597
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 15.3939%

JSON report: sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/6S_1_250821_batch17_07_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R2_Qfilt

Filtering for 6S_1_250821_batch17_07_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/6S_1_250821_batch17_07_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 721
total bases: 108871
Q20 bases: 92957(85.3827%)
Q30 bases: 81028(74.4257%)

Read1 after filtering:
total reads: 194
total bases: 29294
Q20 bases: 28171(96.1665%)
Q30 bases: 26218(89.4996%)

Filtering result:
reads passed filter: 194
reads failed due to low quality: 527
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 14.9792%

JSON report: sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/0N_1_250821_batch17_02_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R1_Qfilte

Filtering for 0N_1_250821_batch17_02_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/0N_1_250821_batch17_02_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 867
total bases: 130917
Q20 bases: 113222(86.4838%)
Q30 bases: 97238(74.2745%)

Read1 after filtering:
total reads: 250
total bases: 37750
Q20 bases: 36731(97.3007%)
Q30 bases: 33973(89.9947%)

Filtering result:
reads passed filter: 250
reads failed due to low quality: 617
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 20.7612%

JSON report: sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/4G_1_250821_batch17_05_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R2_Qfilt

Filtering for 4G_1_250821_batch17_05_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 867
total bases: 130917
Q20 bases: 110998(84.785%)
Q30 bases: 96576(73.7689%)

Read1 after filtering:
total reads: 213
total bases: 32163
Q20 bases: 30934(96.1788%)
Q30 bases: 28893(89.833%)

Filtering result:
reads passed filter: 213
reads failed due to low quality: 654
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 12.9181%

JSON report: sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/4G_1_250821_batch17_05_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R1_Qfilter

Filtering for 4G_1_250821_batch17_05_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/4G_1_250821_batch17_05_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 771
total bases: 116421
Q20 bases: 99711(85.6469%)
Q30 bases: 86468(74.2718%)

Read1 after filtering:
total reads: 214
total bases: 32314
Q20 bases: 31248(96.7011%)
Q30 bases: 29065(89.9455%)

Filtering result:
reads passed filter: 214
reads failed due to low quality: 557
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 14.5266%

JSON report: sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/7T_1_250821_batch17_08_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R1_Qfilte

Filtering for 7T_1_250821_batch17_08_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 808
total bases: 122008
Q20 bases: 104048(85.2797%)
Q30 bases: 90596(74.2541%)

Read1 after filtering:
total reads: 200
total bases: 30200
Q20 bases: 29104(96.3709%)
Q30 bases: 27065(89.6192%)

Filtering result:
reads passed filter: 200
reads failed due to low quality: 608
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 9.77723%

JSON report: sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/2S_1_250821_batch17_03_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R1_Qfilt

Filtering for 2S_1_250821_batch17_03_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 808
total bases: 122008
Q20 bases: 104834(85.9239%)
Q30 bases: 89977(73.7468%)

Read1 after filtering:
total reads: 200
total bases: 30200
Q20 bases: 29240(96.8212%)
Q30 bases: 26938(89.1987%)

Filtering result:
reads passed filter: 200
reads failed due to low quality: 608
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 16.2129%

JSON report: sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/2S_1_250821_batch17_03_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R2_Qfilt

Filtering for 2S_1_250821_batch17_03_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/2S_1_250821_batch17_03_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 771
total bases: 116421
Q20 bases: 99857(85.7723%)
Q30 bases: 85933(73.8123%)

Read1 after filtering:
total reads: 230
total bases: 34730
Q20 bases: 33795(97.3078%)
Q30 bases: 31170(89.7495%)

Filtering result:
reads passed filter: 230
reads failed due to low quality: 541
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 18.9364%

JSON report: sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/7T_1_250821_batch17_08_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R2_Qfilte

Filtering for 7T_1_250821_batch17_08_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/7T_1_250821_batch17_08_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 774
total bases: 116874
Q20 bases: 99962(85.5297%)
Q30 bases: 87120(74.5418%)

Read1 after filtering:
total reads: 212
total bases: 32012
Q20 bases: 31048(96.9886%)
Q30 bases: 29026(90.6722%)

Filtering result:
reads passed filter: 212
reads failed due to low quality: 562
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 14.9871%

JSON report: sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R1_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R1_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/5I_1_250821_batch17_06_R1_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R1_Qfilte

Filtering for 5I_1_250821_batch17_06_R1_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R1_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R1_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R1_Qfiltered.fastq.gz.json

Filtering for 5I_1_250821_batch17_06_R2_untrimmed.fastq.gz is complete.
Output FASTQ : sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R2_Qfiltered.fastq.gz
Reports      : sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R2_Qfiltered.fastq.gz.html / sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R2_Qfiltered.fastq.gz.json

All filtering processes are done.


Read1 before filtering:
total reads: 774
total bases: 116874
Q20 bases: 100577(86.0559%)
Q30 bases: 86450(73.9685%)

Read1 after filtering:
total reads: 216
total bases: 32616
Q20 bases: 31631(96.98%)
Q30 bases: 29247(89.6707%)

Filtering result:
reads passed filter: 216
reads failed due to low quality: 558
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 14.4703%

JSON report: sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R2_Qfiltered.fastq.gz.json
HTML report: sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R2_Qfiltered.fastq.gz.html

fastp -i sequence_merge_method/A_untrimmed_output/5I_1_250821_batch17_06_R2_untrimmed.fastq.gz -o sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html sequence_merge_method/B_Qfiltered/5I_1_250821_batch17_06_R2_Qfilter

# 4. Match Paired-End Read IDs

In [None]:
import gzip
import glob
import os

def _fastq_read_id(header):
    """Return the pairing key for a FASTQ header line.

    The key is the first whitespace-delimited token of the header with one
    trailing ``/1`` or ``/2`` mate suffix removed. Only a suffix is removed:
    ``/1`` or ``/2`` occurring inside the ID is part of the read name and is
    kept, so distinct reads cannot collapse onto the same key.
    """
    token = header.split()[0]
    if token.endswith(("/1", "/2")):
        token = token[:-2]
    return token


def _iter_fastq_records(handle):
    """Yield FASTQ records from an open text handle as lists of four raw lines."""
    while True:
        header = handle.readline()
        if not header:
            return
        if not header.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on an empty header.
            continue
        yield [header] + [handle.readline() for _ in range(3)]


def _collect_read_ids(path):
    """Return the set of pairing keys of all reads in a gzipped FASTQ file."""
    with gzip.open(path, 'rt') as handle:
        return {_fastq_read_id(record[0]) for record in _iter_fastq_records(handle)}


def _write_matching_reads(input_path, output_path, matching_ids):
    """Copy the reads whose pairing key is in ``matching_ids`` to ``output_path``."""
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # dirname is "" for a bare filename; os.makedirs("") would raise.
        os.makedirs(output_dir, exist_ok=True)
    with gzip.open(input_path, 'rt') as infile, gzip.open(output_path, 'wt') as outfile:
        for record in _iter_fastq_records(infile):
            if _fastq_read_id(record[0]) in matching_ids:
                outfile.writelines(record)


def extract_matching_reads(r1_path, r2_path, out_r1_path, out_r2_path):
    """Keep only the reads whose ID is present in both the R1 and the R2 file.

    Both gzipped FASTQ inputs are scanned for read IDs, a summary of shared
    and file-specific IDs is printed, and the reads with shared IDs are
    written to the two gzipped output files. Each output keeps the record
    order of its own input file.

    Args:
        r1_path: Path to the gzipped R1 FASTQ file.
        r2_path: Path to the gzipped R2 FASTQ file.
        out_r1_path: Destination for the matched R1 reads (gzipped FASTQ).
        out_r2_path: Destination for the matched R2 reads (gzipped FASTQ).

    Returns:
        None. Results are written to ``out_r1_path`` and ``out_r2_path``;
        their parent directories are created if needed.
    """
    r1_ids = _collect_read_ids(r1_path)
    r2_ids = _collect_read_ids(r2_path)

    # Find common and unique IDs
    matching_ids = r1_ids & r2_ids
    r1_only = r1_ids - r2_ids
    r2_only = r2_ids - r1_ids

    print(f"Processing {os.path.basename(r1_path)} and {os.path.basename(r2_path)}")
    print(f"Total R1 IDs: {len(r1_ids)}, Total R2 IDs: {len(r2_ids)}, Matching IDs: {len(matching_ids)}")
    print(f"IDs only in R1: {len(r1_only)}, IDs only in R2: {len(r2_only)}\n")

    # Write the filtered R1 and R2 files
    _write_matching_reads(r1_path, out_r1_path, matching_ids)
    _write_matching_reads(r2_path, out_r2_path, matching_ids)

# --------------------------
# Run the ID matching on every R1/R2 pair
# --------------------------

input_folder = "sequence_merge_method/B_Qfiltered"
output_folder = "sequence_merge_method/C_id_matched"

# Filename endings that identify the quality-filtered mates.
r1_suffix = "_R1_Qfiltered.fastq.gz"
r2_suffix = "_R2_Qfiltered.fastq.gz"

# Every quality-filtered R1 file is a candidate sample.
r1_files = glob.glob(os.path.join(input_folder, "*" + r1_suffix))

for r1_file in r1_files:
    # The R2 mate has the same path with the R2 suffix.
    r2_file = r1_file.replace(r1_suffix, r2_suffix)

    # Samples without an R2 mate cannot be matched; report and move on.
    if not os.path.exists(r2_file):
        print(f"Warning: Corresponding R2 file not found for {r1_file}. Skipping.")
        continue

    # Derive the sample name and the two output paths from the R1 filename.
    base_name = os.path.basename(r1_file).replace(r1_suffix, "")
    out_r1 = os.path.join(output_folder, base_name + "_ID_match_R1.fastq.gz")
    out_r2 = os.path.join(output_folder, base_name + "_ID_match_R2.fastq.gz")

    extract_matching_reads(r1_file, r2_file, out_r1, out_r2)

Processing 7T_1_250821_batch17_08_R1_Qfiltered.fastq.gz and 7T_1_250821_batch17_08_R2_Qfiltered.fastq.gz
Total R1 IDs: 214, Total R2 IDs: 230, Matching IDs: 139
IDs only in R1: 75, IDs only in R2: 91

Processing 2S_1_250821_batch17_03_R1_Qfiltered.fastq.gz and 2S_1_250821_batch17_03_R2_Qfiltered.fastq.gz
Total R1 IDs: 200, Total R2 IDs: 200, Matching IDs: 119
IDs only in R1: 81, IDs only in R2: 81

Processing 5I_1_250821_batch17_06_R1_Qfiltered.fastq.gz and 5I_1_250821_batch17_06_R2_Qfiltered.fastq.gz
Total R1 IDs: 212, Total R2 IDs: 216, Matching IDs: 135
IDs only in R1: 77, IDs only in R2: 81

Processing 3SP26_250828_batch18_04_R1_Qfiltered.fastq.gz and 3SP26_250828_batch18_04_R2_Qfiltered.fastq.gz
Total R1 IDs: 2438, Total R2 IDs: 2527, Matching IDs: 2148
IDs only in R1: 290, IDs only in R2: 379

Processing 1D_1_250821_batch17_01_R1_Qfiltered.fastq.gz and 1D_1_250821_batch17_01_R2_Qfiltered.fastq.gz
Total R1 IDs: 218, Total R2 IDs: 231, Matching IDs: 146
IDs only in R1: 72, IDs only

# 5. Merge with FLASH

## 5.1 R1(Front, Back), R2(Front, Back) Fragmentation

In [None]:
import gzip
import glob
import os

def split_fastq_by_position(r1_path, r2_path, n, output_dir, read_length=151):
    """Split every read of a paired FASTQ into a front and a back fragment.

    For each record the front fragment is the first ``read_length - n``
    characters and the back fragment is the last ``n`` characters; the quality
    string is sliced exactly like the sequence. Four gzip-compressed files are
    written to ``output_dir``: ``<sample>_R1_F``, ``<sample>_R1_B``,
    ``<sample>_R2_F`` and ``<sample>_R2_B`` (all ``.fastq.gz``), where
    ``<sample>`` is the R1 filename without ``_ID_match_R1.fastq.gz``.

    Records are consumed pairwise and processing stops as soon as either input
    runs out of records.

    Note: front and back tile a read exactly only when the read is
    ``read_length`` long. A shorter read yields overlapping fragments and a
    longer read leaves an uncovered gap between them.

    Args:
        r1_path: Path to the gzip-compressed R1 FASTQ file.
        r2_path: Path to the gzip-compressed R2 FASTQ file.
        n: Length of the back fragment; must satisfy ``0 < n <= read_length``.
        output_dir: Directory for the four output files (created if missing).
        read_length: Expected read length used to derive the front fragment
            length. Defaults to 151, the value previously hard-coded here.

    Raises:
        ValueError: If ``n`` is outside ``1..read_length``. With ``n == 0`` the
            slice ``seq[-0:]`` would return the whole read as "back" fragment,
            and with ``n > read_length`` the front slice bound turns negative.
    """
    if not 0 < n <= read_length:
        raise ValueError(
            f"n must be between 1 and read_length ({read_length}), got {n}"
        )

    os.makedirs(output_dir, exist_ok=True)
    front_len = read_length - n

    sample_base = os.path.basename(r1_path).replace("_ID_match_R1.fastq.gz", "")
    r1_f_path = os.path.join(output_dir, f"{sample_base}_R1_F.fastq.gz")
    r1_b_path = os.path.join(output_dir, f"{sample_base}_R1_B.fastq.gz")
    r2_f_path = os.path.join(output_dir, f"{sample_base}_R2_F.fastq.gz")
    r2_b_path = os.path.join(output_dir, f"{sample_base}_R2_B.fastq.gz")

    def write_fragments(record_lines, front_out, back_out):
        """Write the front and back fragment of one 4-line FASTQ record."""
        header, seq, plus, qual = (line.strip() for line in record_lines)
        front_out.write(f"{header}\n{seq[:front_len]}\n{plus}\n{qual[:front_len]}\n")
        back_out.write(f"{header}\n{seq[-n:]}\n{plus}\n{qual[-n:]}\n")

    with gzip.open(r1_path, 'rt') as r1_file, \
         gzip.open(r2_path, 'rt') as r2_file, \
         gzip.open(r1_f_path, 'wt') as r1_f_out, \
         gzip.open(r1_b_path, 'wt') as r1_b_out, \
         gzip.open(r2_f_path, 'wt') as r2_f_out, \
         gzip.open(r2_b_path, 'wt') as r2_b_out:

        while True:
            # A FASTQ record is four lines: header, sequence, separator, quality.
            r1_lines = [r1_file.readline() for _ in range(4)]
            r2_lines = [r2_file.readline() for _ in range(4)]

            # readline() returns "" at end of file; stop at the shorter input.
            if not r1_lines[0] or not r2_lines[0]:
                break

            write_fragments(r1_lines, r1_f_out, r1_b_out)
            write_fragments(r2_lines, r2_f_out, r2_b_out)

    print(f"✅ Split complete for: {sample_base} → {output_dir} (N={n})")

# -----------------------------------
# Apply the split function to all files
# -----------------------------------

input_folder = "sequence_merge_method/C_id_matched"
output_folder = "sequence_merge_method/D_split_reads"
os.makedirs(output_folder, exist_ok=True)

# N-value (length of the back fragment) for each sample prefix.
# The prefix is the part of the filename before the first underscore.
sample_n_mapping = {
    "0N": 126,
    "1D": 126,
    "2S": 126,
    "3G": 124,
    "4I": 128,
    "5S": 124,
    "6T": 122,
    "5K": 124,
    "1X8": 116,
    "0X8": 132,
    "4G": 126,
    "5I": 126,
    "6S": 124,
    "7T": 120,
    "3SP26": 122,
    "3SP31": 118   
}


# Find all R1 files
r1_files = glob.glob(os.path.join(input_folder, "*_ID_match_R1.fastq.gz"))

for r1_file in r1_files:
    r2_file = r1_file.replace("_R1.fastq.gz", "_R2.fastq.gz")

    if not os.path.exists(r2_file):
        print(f"⚠️ Matching R2 file not found: {r2_file}")
        continue

    # Look up N by the exact sample prefix (first "_"-delimited token), the same
    # rule the plotting cell uses for its highlight mapping. A substring test
    # ("5S" in name) could match the date/batch part of the filename and would
    # depend on the dictionary's insertion order.
    sample_prefix = os.path.basename(r1_file).split("_")[0]
    matched_n = sample_n_mapping.get(sample_prefix)

    if matched_n is None:
        print(f"⚠️ Could not find N value for: {r1_file} → Skipping")
        continue

    # Execute the split function
    split_fastq_by_position(r1_file, r2_file, matched_n, output_folder)

✅ 분리 완료: 2S_1_250821_batch17_03 → sequence_merge_method/D_split_reads (N=126)
✅ 분리 완료: 5I_1_250821_batch17_06 → sequence_merge_method/D_split_reads (N=126)
✅ 분리 완료: 4G_1_250821_batch17_05 → sequence_merge_method/D_split_reads (N=126)
✅ 분리 완료: 0N_1_250821_batch17_02 → sequence_merge_method/D_split_reads (N=126)
✅ 분리 완료: 1D_1_250821_batch17_01 → sequence_merge_method/D_split_reads (N=126)
✅ 분리 완료: 3SP26_250828_batch18_04 → sequence_merge_method/D_split_reads (N=122)
✅ 분리 완료: 7T_1_250821_batch17_08 → sequence_merge_method/D_split_reads (N=120)
✅ 분리 완료: 6S_1_250821_batch17_07 → sequence_merge_method/D_split_reads (N=124)


## 5.2 Reverse-complement the R2 fragments

In [None]:
import gzip
import glob
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def reverse_complement_fastq(input_fastq_path, output_fastq_path):
    """
    Write the reverse complement of every record in a gzipped FASTQ file.

    Each record keeps its ID and description; the results are written, in
    input order, to a new gzip-compressed FASTQ at ``output_fastq_path``.
    """
    with gzip.open(input_fastq_path, "rt") as src, gzip.open(output_fastq_path, "wt") as dst:
        # Lazily flip records so only one is held in memory at a time.
        flipped_records = (
            rec.reverse_complement(id=True, description=True)
            for rec in SeqIO.parse(src, "fastq")
        )
        SeqIO.write(flipped_records, dst, "fastq")

    print(f"✅ Reverse complemented: {os.path.basename(output_fastq_path)}")

# --------------------------------------------------
# Perform reverse complement on all relevant R2 files
# --------------------------------------------------

input_folder = "sequence_merge_method/D_split_reads"
os.makedirs(input_folder, exist_ok=True)

# The [BF] character class selects the R2 back and front fragments only;
# names that already end in _revcomp.fastq.gz do not match this pattern.
input_files = glob.glob(os.path.join(input_folder, "*_R2_[BF].fastq.gz"))

for input_path in input_files:
    # Output sits next to the input: <name>_revcomp.fastq.gz
    stem = os.path.basename(input_path).replace(".fastq.gz", "")
    output_path = os.path.join(input_folder, stem + "_revcomp.fastq.gz")

    reverse_complement_fastq(input_path, output_path)

✅ Reverse complemented: 7T_1_250821_batch17_08_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 4G_1_250821_batch17_05_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 2S_1_250821_batch17_03_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 5I_1_250821_batch17_06_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 6S_1_250821_batch17_07_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 3SP26_250828_batch18_04_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 0N_1_250821_batch17_02_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 1D_1_250821_batch17_01_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 6S_1_250821_batch17_07_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 0N_1_250821_batch17_02_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 1D_1_250821_batch17_01_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 3SP26_250828_batch18_04_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 5I_1_250821_batch17_06_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 4G_1_250821_batch17_05_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 7T_1_250

## 5.3 [R1_back]-[R2_back] merge (FLASH)

In [None]:
import os
import glob
import subprocess

# === Folder Setup ===
input_folder = "sequence_merge_method/D_split_reads"
output_folder = "sequence_merge_method/E_merged_output"
os.makedirs(output_folder, exist_ok=True)

# === Set N-values (Overlap Length) per Sample Prefix ===
# The prefix is the part of the sample name before the first underscore.
sample_n_mapping = {
    "0N": 126,
    "1D": 126,
    "2S": 126,
    "3G": 124,
    "4I": 128,
    "5S": 124,
    "6T": 122,
    "5K": 124,
    "1X8": 116,
    "0X8": 132,
    "4G": 126,
    "5I": 126,
    "6S": 124,
    "7T": 120,
    "3SP26": 122,
    "3SP31": 118   
}

# === Find List of all R1_B Files ===
r1_files = glob.glob(os.path.join(input_folder, "*_R1_B.fastq.gz"))

print(f"🔎 Found {len(r1_files)} R1_B files.")

# === Process Each R1_B File ===
for r1_path in r1_files:
    sample_base = os.path.basename(r1_path).replace("_R1_B.fastq.gz", "")
    r2_path = os.path.join(input_folder, f"{sample_base}_R2_B.fastq.gz")

    if not os.path.exists(r2_path):
        print(f"⚠️ Matching R2_B file not found for {sample_base} → Skipping.")
        continue

    # Look up N by the exact sample prefix (first "_"-delimited token), the same
    # rule the plotting cell uses for its highlight mapping. A substring test
    # could match the date/batch part of the name and would depend on the
    # dictionary's insertion order.
    sample_prefix = sample_base.split("_")[0]
    matched_n = sample_n_mapping.get(sample_prefix)

    if matched_n is None:
        print(f"⚠️ No N value matched for {sample_base} → Skipping.")
        continue

    output_name = f"{sample_base}_FLASH"

    print(f"🔵 Running FLASH for sample: {sample_base} (N={matched_n})")

    try:
        # Min and max overlap are both N, so FLASH only accepts an overlap of
        # exactly N bases between the two back fragments.
        subprocess.check_call([
            "flash",
            "-m", str(matched_n),   # Minimum overlap
            "-M", str(matched_n),   # Maximum overlap
            "-o", output_name,      # Output file prefix
            "-d", output_folder,    # Output directory
            r1_path,
            r2_path
        ])
        print(f"✅ FLASH merging complete → {os.path.join(output_folder, output_name)}.extendedFrags.fastq")
    except subprocess.CalledProcessError as e:
        # A non-zero FLASH exit status is reported; the remaining samples still run.
        print(f"❌ FLASH merging failed for {sample_base}: {e}")

🔎 Found 8 R1_B files.
🔵 Running FLASH for sample: 7T_1_250821_batch17_08 (N=120)
[FLASH] Starting FLASH v1.2.11
[FLASH] Fast Length Adjustment of SHort reads
[FLASH]  
[FLASH] Input files:
[FLASH]     sequence_merge_method/D_split_reads/7T_1_250821_batch17_08_R1_B.fastq.gz
[FLASH]     sequence_merge_method/D_split_reads/7T_1_250821_batch17_08_R2_B.fastq.gz
[FLASH]  
[FLASH] Output files:
[FLASH]     sequence_merge_method/E_merged_output/7T_1_250821_batch17_08_FLASH.extendedFrags.fastq
[FLASH]     sequence_merge_method/E_merged_output/7T_1_250821_batch17_08_FLASH.notCombined_1.fastq
[FLASH]     sequence_merge_method/E_merged_output/7T_1_250821_batch17_08_FLASH.notCombined_2.fastq
[FLASH]     sequence_merge_method/E_merged_output/7T_1_250821_batch17_08_FLASH.hist
[FLASH]     sequence_merge_method/E_merged_output/7T_1_250821_batch17_08_FLASH.histogram
[FLASH]  
[FLASH] Parameters:
[FLASH]     Min overlap:           120
[FLASH]     Max overlap:           120
[FLASH]     Max mismatch densit

## 5.4 Assemble 
## R1_Front - [R1_Back]-[R2_Back]_merged (FLASH) - R2_Front_ReverseComplement

In [None]:
import os
import gzip
import glob
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def load_fastq_to_dict(file_path):
    """Read a FASTQ file into ``{read_id: (sequence_str, phred_quality_list)}``.

    Paths ending in ".gz" are opened with gzip, anything else as plain text.
    If a read ID occurs more than once, the last record wins.
    """
    opener = gzip.open if file_path.endswith(".gz") else open

    with opener(file_path, "rt") as handle:
        return {
            rec.id: (str(rec.seq), rec.letter_annotations["phred_quality"])
            for rec in SeqIO.parse(handle, "fastq")
        }

def assemble_fastq(r1_path, merged_path, r2_path, output_path):
    """Build full-length reads as R1_F + merged fragment + R2_F_revcomp.

    The two flanking fragment files are loaded into memory keyed by read ID.
    The merged FASTQ (plain text) is then streamed, and every merged read that
    has both flanks is written, with concatenated sequence and qualities, to a
    gzip-compressed FASTQ at ``output_path``. Merged reads missing either
    flank are skipped silently.
    """
    print(f"🔄 Assembling for sample: {os.path.basename(output_path)}")
    front_by_id = load_fastq_to_dict(r1_path)
    tail_by_id = load_fastq_to_dict(r2_path)

    with open(merged_path, "r") as merged_file, gzip.open(output_path, "wt") as output_file:
        for merged in SeqIO.parse(merged_file, "fastq"):
            front = front_by_id.get(merged.id)
            tail = tail_by_id.get(merged.id)

            # Both flanking fragments are required to assemble a read.
            if front is None or tail is None:
                continue

            front_seq, front_qual = front
            tail_seq, tail_qual = tail
            middle_qual = merged.letter_annotations["phred_quality"]

            # Order: R1_F → merged fragment → R2_F_revcomp
            assembled = SeqRecord(
                Seq(front_seq + str(merged.seq) + tail_seq),
                id=merged.id,
                description="",
                letter_annotations={"phred_quality": front_qual + middle_qual + tail_qual}
            )
            SeqIO.write(assembled, output_file, "fastq")

    print(f"✅ Assembled FASTQ saved: {output_path}")

# ===== Automate Processing for All Samples =====

# Locations of the FLASH output, the split fragments, and the assembled result
input_merged_folder = "sequence_merge_method/E_merged_output"
input_split_folder = "sequence_merge_method/D_split_reads"
output_folder = "sequence_merge_method/1_assemble"
os.makedirs(output_folder, exist_ok=True)

# Suffix FLASH gives to its successfully merged reads
merged_suffix = "_FLASH.extendedFrags.fastq"
merged_files = glob.glob(os.path.join(input_merged_folder, "*" + merged_suffix))

print(f"🔍 Found {len(merged_files)} merged samples to assemble.")

for merged_file in merged_files:
    sample_base = os.path.basename(merged_file).replace(merged_suffix, "")

    r1_path = os.path.join(input_split_folder, sample_base + "_R1_F.fastq.gz")
    r2_path = os.path.join(input_split_folder, sample_base + "_R2_F_revcomp.fastq.gz")
    output_path = os.path.join(output_folder, sample_base + "_assemble.fastq.gz")

    # Both flanking fragment files must exist before a sample can be assembled.
    if not (os.path.exists(r1_path) and os.path.exists(r2_path)):
        print(f"⚠️ Missing split files for {sample_base}, skipping.")
        continue

    assemble_fastq(r1_path, merged_file, r2_path, output_path)

🔍 Found 8 merged samples to assemble.
🔄 Assembling for sample: 2S_1_250821_batch17_03_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/2S_1_250821_batch17_03_assemble.fastq.gz
🔄 Assembling for sample: 0N_1_250821_batch17_02_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/0N_1_250821_batch17_02_assemble.fastq.gz
🔄 Assembling for sample: 3SP26_250828_batch18_04_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/3SP26_250828_batch18_04_assemble.fastq.gz
🔄 Assembling for sample: 6S_1_250821_batch17_07_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/6S_1_250821_batch17_07_assemble.fastq.gz
🔄 Assembling for sample: 4G_1_250821_batch17_05_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/4G_1_250821_batch17_05_assemble.fastq.gz
🔄 Assembling for sample: 7T_1_250821_batch17_08_assemble.fastq.gz
✅ Assembled FASTQ saved: sequence_merge_method/1_assemble/7T_1_250821

# 6. Convert FASTQ to FASTA

In [None]:
import os
import gzip
from Bio import SeqIO

# Assembled FASTQ.GZ files are read from here ...
input_folder = "sequence_merge_method/1_assemble"
# ... and the converted FASTA files are written here.
output_folder = "sequence_merge_method/2_fastq_to_fasta"
os.makedirs(output_folder, exist_ok=True)

fastq_suffix = "_assemble.fastq.gz"

for filename in os.listdir(input_folder):
    # Anything that is not an assembled FASTQ.GZ is ignored.
    if not filename.endswith(fastq_suffix):
        continue

    input_file = os.path.join(input_folder, filename)
    output_file = os.path.join(
        output_folder,
        filename.replace(fastq_suffix, "_assemble.fasta")
    )

    # SeqIO.convert streams the records, so the whole file is never held in memory.
    with gzip.open(input_file, "rt") as in_handle, open(output_file, "w") as out_handle:
        SeqIO.convert(in_handle, "fastq", out_handle, "fasta")

    print(f"Conversion from {filename} → {os.path.basename(output_file)} is complete.")

print("All conversions are done.")

Conversion from 4G_1_250821_batch17_05_assemble.fastq.gz → 4G_1_250821_batch17_05_assemble.fasta is complete.
Conversion from 0N_1_250821_batch17_02_assemble.fastq.gz → 0N_1_250821_batch17_02_assemble.fasta is complete.
Conversion from 1D_1_250821_batch17_01_assemble.fastq.gz → 1D_1_250821_batch17_01_assemble.fasta is complete.
Conversion from 5I_1_250821_batch17_06_assemble.fastq.gz → 5I_1_250821_batch17_06_assemble.fasta is complete.
Conversion from 6S_1_250821_batch17_07_assemble.fastq.gz → 6S_1_250821_batch17_07_assemble.fasta is complete.
Conversion from 7T_1_250821_batch17_08_assemble.fastq.gz → 7T_1_250821_batch17_08_assemble.fasta is complete.
Conversion from 3SP26_250828_batch18_04_assemble.fastq.gz → 3SP26_250828_batch18_04_assemble.fasta is complete.
Conversion from 2S_1_250821_batch17_03_assemble.fastq.gz → 2S_1_250821_batch17_03_assemble.fasta is complete.
All conversions are done.


# 7. Index the reference sequence (BWA)

In [None]:
# Build the BWA index for the reference FASTA; the `bwa mem` alignment cell below reads this same file.
!bwa index "reference_sequence/full_sequences.fasta"

[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index Answer_sequence/full_sequences.fasta
[main] Real time: 0.039 sec; CPU: 0.011 sec


# 8. Align samples to the reference sequence (BWA-MEM)

In [None]:
%%bash
# Set the path to the reference sequence file
reference_file="reference_sequence/full_sequences.fasta"

# Set the directory containing your filtered FASTA files
fasta_directory="sequence_merge_method/2_fastq_to_fasta"
# Set the output directory for aligned SAM files
output_dir="sequence_merge_method/3_align_sam"

# Make sure the output directory exists or create it if necessary
mkdir -p "$output_dir"

# With nullglob an unmatched pattern expands to nothing. Without it the literal
# string ".../*_assemble.fasta" would be handed to bwa as if it were a file.
shopt -s nullglob
fasta_files=("$fasta_directory"/*_assemble.fasta)

if [ "${#fasta_files[@]}" -eq 0 ]; then
    echo "No *_assemble.fasta files found in $fasta_directory" >&2
fi

# Iterate through filtered FASTA files in the specified directory
for fasta_file in "${fasta_files[@]}"; do
    # Generate an output file name based on the input filename
    output_file="$output_dir/$(basename "$fasta_file" .fasta).sam"

    # Perform the BWA alignment; report success only if bwa actually succeeded.
    if bwa mem -M -t 4 "$reference_file" "$fasta_file" > "$output_file"; then
        echo "Alignment completed for $fasta_file. Result saved as $output_file"
    else
        echo "Alignment FAILED for $fasta_file (bwa mem exit status $?)" >&2
        # Remove the partial SAM so the SAM -> BAM step does not pick it up.
        rm -f "$output_file"
    fi
done

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 50 sequences (8800 bp)...
[M::mem_process_seqs] Processed 50 reads in 0.020 CPU sec, 0.010 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 Answer_sequence/full_sequences.fasta sequence_merge_method/2_fastq_to_fasta/0N_1_250821_batch17_02_assemble.fasta
[main] Real time: 0.030 sec; CPU: 0.023 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/0N_1_250821_batch17_02_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/0N_1_250821_batch17_02_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 63 sequences (11088 bp)...
[M::mem_process_seqs] Processed 63 reads in 0.021 CPU sec, 0.010 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 Answer_sequence/full_sequences.fasta sequence_merge_method/2_fastq_to_fasta/1D_1_250821_batch17_01_assemble.fasta
[main] Real time: 0.035 sec; CPU: 0.024 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/1D_1_250821_batch17_01_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/1D_1_250821_batch17_01_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 47 sequences (8272 bp)...
[M::mem_process_seqs] Processed 47 reads in 0.025 CPU sec, 0.013 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 Answer_sequence/full_sequences.fasta sequence_merge_method/2_fastq_to_fasta/2S_1_250821_batch17_03_assemble.fasta
[main] Real time: 0.036 sec; CPU: 0.028 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/2S_1_250821_batch17_03_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/2S_1_250821_batch17_03_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 420 sequences (75600 bp)...
[M::mem_process_seqs] Processed 420 reads in 0.349 CPU sec, 0.099 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 Answer_sequence/full_sequences.fasta sequence_merge_method/2_fastq_to_fasta/3SP26_250828_batch18_04_assemble.fasta
[main] Real time: 0.132 sec; CPU: 0.354 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/3SP26_250828_batch18_04_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/3SP26_250828_batch18_04_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 61 sequences (10736 bp)...
[M::mem_process_seqs] Processed 61 reads in 0.039 CPU sec, 0.013 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 Answer_sequence/full_sequences.fasta sequence_merge_method/2_fastq_to_fasta/4G_1_250821_batch17_05_assemble.fasta
[main] Real time: 0.038 sec; CPU: 0.042 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/4G_1_250821_batch17_05_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/4G_1_250821_batch17_05_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 47 sequences (8272 bp)...
[M::mem_process_seqs] Processed 47 reads in 0.018 CPU sec, 0.013 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 Answer_sequence/full_sequences.fasta sequence_merge_method/2_fastq_to_fasta/5I_1_250821_batch17_06_assemble.fasta
[main] Real time: 0.033 sec; CPU: 0.021 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/5I_1_250821_batch17_06_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/5I_1_250821_batch17_06_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 47 sequences (8366 bp)...
[M::mem_process_seqs] Processed 47 reads in 0.029 CPU sec, 0.012 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 Answer_sequence/full_sequences.fasta sequence_merge_method/2_fastq_to_fasta/6S_1_250821_batch17_07_assemble.fasta
[main] Real time: 0.033 sec; CPU: 0.032 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/6S_1_250821_batch17_07_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/6S_1_250821_batch17_07_assemble.sam


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 50 sequences (9100 bp)...
[M::mem_process_seqs] Processed 50 reads in 0.031 CPU sec, 0.015 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 Answer_sequence/full_sequences.fasta sequence_merge_method/2_fastq_to_fasta/7T_1_250821_batch17_08_assemble.fasta
[main] Real time: 0.037 sec; CPU: 0.035 sec


Alignment completed for sequence_merge_method/2_fastq_to_fasta/7T_1_250821_batch17_08_assemble.fasta. Result saved as sequence_merge_method/3_align_sam/7T_1_250821_batch17_08_assemble.sam


## 8.1 Convert SAM to BAM

In [16]:
%%bash

# Set the path to the directory containing SAM files
sam_dir="sequence_merge_method/3_align_sam"
# Set the output directory for BAM files
bam_dir="sequence_merge_method/4_align_bam"


# Make sure the output directory exists or create it if necessary
mkdir -p "$bam_dir"

# With nullglob an unmatched pattern expands to nothing. Without it the literal
# string ".../*.sam" would be handed to samtools as if it were a file.
shopt -s nullglob
sam_files=("$sam_dir"/*.sam)

if [ "${#sam_files[@]}" -eq 0 ]; then
    echo "No .sam files found in $sam_dir" >&2
fi

# Convert SAM files to BAM
for sam_file in "${sam_files[@]}"; do
    bam_file="$bam_dir/$(basename "$sam_file" .sam).bam"

    # Report success only if samtools actually succeeded.
    if samtools view -bS "$sam_file" -o "$bam_file"; then
        echo "Conversion from $sam_file to $bam_file is complete."
    else
        echo "Conversion FAILED for $sam_file (samtools exit status $?)" >&2
        # Remove the partial BAM so the BAM -> CSV step does not pick it up.
        rm -f "$bam_file"
    fi
done

Conversion from sequence_merge_method/3_align_sam/0N_1_250821_batch17_02_assemble.sam to sequence_merge_method/4_align_bam/0N_1_250821_batch17_02_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/1D_1_250821_batch17_01_assemble.sam to sequence_merge_method/4_align_bam/1D_1_250821_batch17_01_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/2S_1_250821_batch17_03_assemble.sam to sequence_merge_method/4_align_bam/2S_1_250821_batch17_03_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/3SP26_250828_batch18_04_assemble.sam to sequence_merge_method/4_align_bam/3SP26_250828_batch18_04_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/4G_1_250821_batch17_05_assemble.sam to sequence_merge_method/4_align_bam/4G_1_250821_batch17_05_assemble.bam is complete.
Conversion from sequence_merge_method/3_align_sam/5I_1_250821_batch17_06_assemble.sam to sequence_merge_method/4_align_bam/5I_1_250821_bat

## 8.2  Convert BAM to CSV

In [None]:
import os
import pysam
import pandas as pd

# Input folder: the BAM files written by the SAM -> BAM conversion cell.
input_folder = "sequence_merge_method/4_align_bam"
# Output folder: one CSV per BAM file, in a "csv" subfolder of the BAM folder.
output_folder = "sequence_merge_method/4_align_bam/csv"

# Create the output folder if it does not exist.
os.makedirs(output_folder, exist_ok=True)

# Function to convert a BAM file to CSV, including optional fields.
def bam_to_csv(bam_file, output_folder):
    """Convert one BAM file into a CSV with one row per alignment record.

    Every row holds the eleven standard SAM columns (QNAME ... QUAL) plus one
    column per optional tag found on any record. Cells for tags a record does
    not carry, and any other missing values, are filled with "*".

    Args:
        bam_file: Path to the BAM file to read.
        output_folder: Existing directory in which the CSV is created. The CSV
            is named after the BAM file with its extension replaced by ".csv".

    Returns:
        The path of the CSV file that was written.
    """
    # splitext swaps only the trailing extension; str.replace would rewrite
    # every ".bam" occurring anywhere in the filename.
    stem, _ = os.path.splitext(os.path.basename(bam_file))
    output_csv = os.path.join(output_folder, stem + ".csv")

    standard_columns = [
        "QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR",
        "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL",
    ]

    records = []
    # Read the BAM file.
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        for read in bam:
            # Standard BAM fields. pysam positions are 0-based, hence the "+ 1";
            # absent values are written as "*" (or 0 for PNEXT).
            record = {
                "QNAME": read.query_name,
                "FLAG": read.flag,
                "RNAME": bam.get_reference_name(read.reference_id) if read.reference_id >= 0 else "*",
                "POS": read.reference_start + 1,
                "MAPQ": read.mapping_quality,
                "CIGAR": read.cigarstring if read.cigarstring else "*",
                "RNEXT": bam.get_reference_name(read.next_reference_id) if read.next_reference_id >= 0 else "*",
                "PNEXT": read.next_reference_start + 1 if read.next_reference_start >= 0 else 0,
                "TLEN": read.template_length,
                "SEQ": read.query_sequence if read.query_sequence else "*",
                "QUAL": read.qual if read.qual else "*",
            }

            # Add optional fields (tags); get_tags() replaces the deprecated
            # `tags` property and yields the same (tag, value) pairs.
            for tag, value in read.get_tags():
                record[tag] = value

            records.append(record)

    # Create a DataFrame from the list of records.
    df = pd.DataFrame(records)

    # A BAM without records would give a column-less frame and an empty CSV
    # that pd.read_csv cannot parse; keep at least the standard header.
    if df.empty:
        df = pd.DataFrame(columns=standard_columns)

    # Fill any missing optional fields with "*" instead of NaN for consistency.
    df = df.fillna("*")

    # Save the DataFrame to a CSV file.
    df.to_csv(output_csv, index=False)
    print(f"✅ Converted: {os.path.basename(bam_file)} -> {os.path.basename(output_csv)}")
    return output_csv

# Every *.bam located directly inside the input folder.
bam_files = [
    os.path.join(input_folder, name)
    for name in os.listdir(input_folder)
    if name.endswith(".bam")
]

# Convert each BAM file; bam_to_csv returns the path of the CSV it wrote.
csv_files = [bam_to_csv(bam_path, output_folder) for bam_path in bam_files]

# Show which CSV files were produced.
print("\nList of converted files:")
print(csv_files)

['sequence_merge_method/4_align_bam/csv/1D_1_250821_batch17_01_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/7T_1_250821_batch17_08_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/4G_1_250821_batch17_05_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/3SP26_250828_batch18_04_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/5I_1_250821_batch17_06_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/0N_1_250821_batch17_02_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/6S_1_250821_batch17_07_assemble.csv',
 'sequence_merge_method/4_align_bam/csv/2S_1_250821_batch17_03_assemble.csv']

## 8.3 Filter Alignments by MAPQ Score

In [None]:
import os
import pandas as pd
from pathlib import Path

# Input/Output folders
csv_dir = Path("sequence_merge_method/4_align_bam/csv")
out_dir = csv_dir / "MAPQ_removed"
out_dir.mkdir(parents=True, exist_ok=True)

# Alignments whose MAPQ is less than or equal to this value are discarded.
MAPQ_THRESHOLD = 10

for in_path in sorted(csv_dir.glob("*.csv")):
    try:
        alignments = pd.read_csv(in_path)
    except Exception as e:
        print(f"⚠️  Read fail: {in_path.name} -> {e}")
        continue

    if "MAPQ" not in alignments.columns:
        print(f"⚠️  Skip (no MAPQ column): {in_path.name}")
        continue

    # Non-numeric MAPQ values become NaN.
    mapq = pd.to_numeric(alignments["MAPQ"], errors="coerce")

    # A row is dropped only when its MAPQ is a number at or below the threshold.
    # NaN compares False here, so rows without a numeric MAPQ are kept; to drop
    # those as well, use `(mapq <= MAPQ_THRESHOLD) | mapq.isna()`.
    drop_mask = mapq <= MAPQ_THRESHOLD

    removed = int(drop_mask.sum())
    kept = int((~drop_mask).sum())

    out_path = out_dir / in_path.name
    alignments.loc[~drop_mask].to_csv(out_path, index=False)
    print(f"✅ {in_path.name} -> kept={kept}, removed={removed}, saved: {out_path.name}")

✅ 0N_1_250821_batch17_02_assemble.csv -> kept=48, removed=2, saved: 0N_1_250821_batch17_02_assemble.csv
✅ 1D_1_250821_batch17_01_assemble.csv -> kept=59, removed=4, saved: 1D_1_250821_batch17_01_assemble.csv
✅ 2S_1_250821_batch17_03_assemble.csv -> kept=43, removed=4, saved: 2S_1_250821_batch17_03_assemble.csv
✅ 3SP26_250828_batch18_04_assemble.csv -> kept=366, removed=54, saved: 3SP26_250828_batch18_04_assemble.csv
✅ 4G_1_250821_batch17_05_assemble.csv -> kept=48, removed=13, saved: 4G_1_250821_batch17_05_assemble.csv
✅ 5I_1_250821_batch17_06_assemble.csv -> kept=46, removed=1, saved: 5I_1_250821_batch17_06_assemble.csv
✅ 6S_1_250821_batch17_07_assemble.csv -> kept=43, removed=4, saved: 6S_1_250821_batch17_07_assemble.csv
✅ 7T_1_250821_batch17_08_assemble.csv -> kept=42, removed=8, saved: 7T_1_250821_batch17_08_assemble.csv


# Histogram Data Analysis

## A. Generate Histogram Data from Aligned Reads (MAPQ-filtered)

In [None]:
import os
import pandas as pd

# Folder setup
input_folder = "sequence_merge_method/4_align_bam/csv/MAPQ_removed"
histogram_folder = "sequence_merge_method/5_histogram"
os.makedirs(histogram_folder, exist_ok=True)

# One histogram CSV is produced per MAPQ-filtered alignment CSV.
files = [name for name in os.listdir(input_folder) if name.endswith('.csv')]

for file_name in files:
    file_path = os.path.join(input_folder, file_name)
    output_csv = os.path.join(histogram_folder, f"histogram_{file_name}")

    try:
        alignments = pd.read_csv(file_path, dtype=str)
        if 'RNAME' not in alignments.columns:
            print(f"Skipping file: {file_name} (no 'RNAME' column found)")
            continue

        # Number of alignment rows per reference name, most frequent first.
        rname_counts = (
            alignments['RNAME']
            .value_counts()
            .rename_axis('RNAME')
            .reset_index(name='Count')
        )

        # Tag each row with its source file and add the per-file fraction.
        rname_counts.insert(0, 'File_Name', file_name)
        rname_counts['Count'] = rname_counts['Count'].astype(int)
        rname_counts['Normalized_Count'] = rname_counts['Count'] / rname_counts['Count'].sum()

        rname_counts.to_csv(output_csv, index=False)
        print(f"✅ Saved full RNAME histogram: {output_csv}")

    except Exception as e:
        print(f"❌ Error processing file '{file_name}': {e}")

✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_2S_1_250821_batch17_03_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_6S_1_250821_batch17_07_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_0N_1_250821_batch17_02_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_5I_1_250821_batch17_06_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_4G_1_250821_batch17_05_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_3SP26_250828_batch18_04_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_7T_1_250821_batch17_08_assemble.csv
✅ Saved full RNAME histogram: sequence_merge_method/5_histogram/histogram_1D_1_250821_batch17_01_assemble.csv


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Folder setup
histogram_folder = "sequence_merge_method/5_histogram"
summary_folder = "sequence_merge_method/5_histogram/graph_top5"
os.makedirs(summary_folder, exist_ok=True)

# Highlight mapping (associates sample prefixes with their expected correct RNAME)
highlight_mapping = {
    "0N": "seq_013_00001101",
    "1D": "seq_035_00100011",
    "2S": "seq_082_01010010",
    "3G": "seq_102_01100110",
    "4I": "seq_136_10001000",
    "5S": "seq_178_10110010",
    "6T": "seq_211_11010011",
    "5K": "seq_170_10101010",
    "1X8": "seq_255_11111111",
    "0X8": "seq_000_00000000",
    "4G": "seq_134_10000110",
    "5I": "seq_168_10101000",
    "6S": "seq_210_11010010",
    "7T": "seq_243_11110011",
    "3SP26": "seq_122_01111010",
    "3SP31": "seq_127_01111111"
}

# Iterate through all histogram CSV files
csv_files = [f for f in os.listdir(histogram_folder) if f.startswith("histogram_") and f.endswith(".csv")]

for file_name in csv_files:
    file_path = os.path.join(histogram_folder, file_name)
    try:
        df = pd.read_csv(file_path)
        if 'RNAME' not in df.columns or 'Normalized_Count' not in df.columns:
            print(f"Skipping file: {file_name} (missing column)")
            continue

        # Get the top 5 RNAMEs by count
        top_df = df.sort_values(by="Count", ascending=False).head(5).reset_index(drop=True)
        sample_name = file_name.replace("histogram_", "").replace(".csv", "")

📊 Saved plot: sequence_merge_method/5_histogram/graph_top5/histogram_0N_1_250821_batch17_02_assemble.png, sequence_merge_method/5_histogram/graph_top5/histogram_0N_1_250821_batch17_02_assemble.svg
📊 Saved plot: sequence_merge_method/5_histogram/graph_top5/histogram_5I_1_250821_batch17_06_assemble.png, sequence_merge_method/5_histogram/graph_top5/histogram_5I_1_250821_batch17_06_assemble.svg
📊 Saved plot: sequence_merge_method/5_histogram/graph_top5/histogram_7T_1_250821_batch17_08_assemble.png, sequence_merge_method/5_histogram/graph_top5/histogram_7T_1_250821_batch17_08_assemble.svg
📊 Saved plot: sequence_merge_method/5_histogram/graph_top5/histogram_4G_1_250821_batch17_05_assemble.png, sequence_merge_method/5_histogram/graph_top5/histogram_4G_1_250821_batch17_05_assemble.svg
📊 Saved plot: sequence_merge_method/5_histogram/graph_top5/histogram_1D_1_250821_batch17_01_assemble.png, sequence_merge_method/5_histogram/graph_top5/histogram_1D_1_250821_batch17_01_assemble.svg
📊 Saved plot: s

## B. Create Top 5 Histogram Plots for Each Sample

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Folder setup
histogram_folder = "sequence_merge_method/5_histogram"
summary_folder = "sequence_merge_method/5_histogram/graph_top5"
os.makedirs(summary_folder, exist_ok=True)

# Highlight mapping (associates sample prefixes with their expected correct RNAME)
highlight_mapping = {
    "0N": "seq_013_00001101",
    "1D": "seq_035_00100011",
    "2S": "seq_082_01010010",
    "3G": "seq_102_01100110",
    "4I": "seq_136_10001000",
    "5S": "seq_178_10110010",
    "6T": "seq_211_11010011",
    "5K": "seq_170_10101010",
    "1X8": "seq_255_11111111",
    "0X8": "seq_000_00000000",
    "4G": "seq_134_10000110",
    "5I": "seq_168_10101000",
    "6S": "seq_210_11010010",
    "7T": "seq_243_11110011",
    "3SP26": "seq_122_01111010",
    "3SP31": "seq_127_01111111"
}

# Every column the loop below reads: "Count" drives the ranking, while
# "RNAME" and "Normalized_Count" provide the bar labels and bar heights.
REQUIRED_COLUMNS = ("RNAME", "Count", "Normalized_Count")
# Number of highest-count rows drawn per sample.
TOP_N = 5


def missing_columns(df, required=REQUIRED_COLUMNS):
    """Return the names in `required` that are absent from `df.columns`.

    Parameters:
        df: DataFrame loaded from one histogram CSV.
        required: iterable of column names that must be present.

    Returns:
        A list of the missing names, in the order given by `required`
        (empty when the frame is usable).
    """
    return [column for column in required if column not in df.columns]


# Collect all histogram CSV files; sorted so the log order is reproducible
# (os.listdir returns entries in arbitrary order).
csv_files = sorted(
    f for f in os.listdir(histogram_folder)
    if f.startswith("histogram_") and f.endswith(".csv")
)

for file_name in csv_files:
    file_path = os.path.join(histogram_folder, file_name)
    fig = None  # stays None until a figure exists, so `finally` knows whether to close one
    try:
        df = pd.read_csv(file_path)
        # Validate all columns used below, including "Count" (the sort key),
        # so an incomplete file is skipped instead of failing with a KeyError.
        if missing_columns(df):
            print(f"Skipping file: {file_name} (missing column)")
            continue

        # Get the top RNAMEs by raw count
        top_df = df.sort_values(by="Count", ascending=False).head(TOP_N).reset_index(drop=True)
        sample_name = file_name.replace("histogram_", "").replace(".csv", "")

        # The text before the first underscore selects the expected RNAME
        # (e.g. "histogram_1D_sample_X.csv" -> "1D"); None when the prefix is unknown.
        prefix = sample_name.split("_")[0]
        highlight_rname = highlight_mapping.get(prefix, None)

        # Create the plot on an explicit figure/axes pair
        fig, ax = plt.subplots(figsize=(10, 6))
        bars = ax.bar(top_df["RNAME"], top_df["Normalized_Count"], color='blue')

        # Highlight the expected correct RNAME in red
        for bar, rname in zip(bars, top_df["RNAME"]):
            if rname == highlight_rname:
                bar.set_color('red')

        ax.set_title(f"Top {TOP_N} RNAME Histogram - {sample_name}")
        ax.set_xlabel("RNAME")
        ax.set_ylabel("Normalized Count")
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
        ax.set_ylim(0, 1)
        fig.tight_layout()

        # Save the plot in both PNG and SVG formats
        output_png = os.path.join(summary_folder, file_name.replace(".csv", ".png"))
        output_svg = os.path.join(summary_folder, file_name.replace(".csv", ".svg"))
        fig.savefig(output_png)
        fig.savefig(output_svg)

        print(f"📊 Saved plot: {output_png}, {output_svg}")

    except Exception as e:
        print(f"❌ Error processing {file_name}: {e}")
    finally:
        # Release the figure even when plotting or saving failed, so a run
        # with many bad files does not accumulate open figures.
        if fig is not None:
            plt.close(fig)

✅ 저장 완료:
 - PNG: sequence_merge_method/6_summary/stacked_bar_top5_gray_rest_white_box.png
 - SVG: sequence_merge_method/6_summary/stacked_bar_top5_gray_rest_white_box.svg


## C. Summarize Highlighted Read Counts into a CSV File

In [None]:
import os
import pandas as pd

# === Expected RNAME for each sample prefix ===
highlight_mapping = {
    "0N": "seq_013_00001101", "1D": "seq_035_00100011",
    "2S": "seq_082_01010010", "3G": "seq_102_01100110",
    "4I": "seq_136_10001000", "5S": "seq_178_10110010",
    "6T": "seq_211_11010011", "5K": "seq_170_10101010",
    "1X8": "seq_255_11111111", "0X8": "seq_000_00000000",
    "4G": "seq_134_10000110", "5I": "seq_168_10101000",
    "6S": "seq_210_11010010", "7T": "seq_243_11110011",
    "3SP26": "seq_122_01111010", "3SP31": "seq_127_01111111",
}

# === Input / output locations ===
histogram_folder = "sequence_merge_method/5_histogram"
summary_folder = "sequence_merge_method/6_summary"
os.makedirs(summary_folder, exist_ok=True)
highlight_result_csv = os.path.join(summary_folder, "highlight_result.csv")

# === Build one summary record per histogram CSV ===
highlight_data = []
csv_files = []
for entry in os.listdir(histogram_folder):
    if entry.startswith("histogram_") and entry.endswith(".csv"):
        csv_files.append(entry)

for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(histogram_folder, file))

        # The text before the first underscore (after dropping "histogram_")
        # picks the expected RNAME; unknown prefixes map to "".
        file_name = file.replace("histogram_", "")
        highlight_rname = highlight_mapping.get(file_name.split("_")[0], "")

        df['Count'] = df['Count'].astype(int)
        counts = df['Count']
        total_count = counts.sum()

        # Reads assigned to the expected RNAME, and their share of all reads.
        if highlight_rname:
            highlight_count = counts[df['RNAME'] == highlight_rname].sum()
        else:
            highlight_count = 0

        if total_count > 0:
            highlight_percentage = (highlight_count / total_count) * 100
        else:
            highlight_percentage = 0

        # Expected-RNAME count relative to the second-largest count in the file
        # (0 when the file has fewer than two rows or that count is 0).
        ranked_counts = counts.sort_values(ascending=False).values
        if len(ranked_counts) >= 2:
            second_max_count = ranked_counts[1]
        else:
            second_max_count = 0

        if second_max_count > 0:
            highlight_vs_second_ratio = highlight_count / second_max_count
        else:
            highlight_vs_second_ratio = 0

        record = [
            file_name,
            highlight_count,
            total_count,
            highlight_percentage,
            highlight_rname,
            highlight_vs_second_ratio,
        ]
        highlight_data.append(record)

    except Exception as e:
        print(f"❌ Error processing file '{file}': {e}")

# === Write the summary table, ordered by file name ===
summary_columns = [
    'File',
    'Highlight_Count',
    'Total_Count',
    'Highlight_Percentage',
    'Highlight_RNAME',
    'Highlight_vs_SecondTop_Ratio',
]
highlight_df = pd.DataFrame(highlight_data, columns=summary_columns)
highlight_df = highlight_df.sort_values(by='File')
highlight_df.to_csv(highlight_result_csv, index=False)

print(f"📌 Highlight summary saved to: {highlight_result_csv}")

📌 Highlight summary saved to: sequence_merge_method/6_summary/highlight_result.csv


# Error Analysis

In [None]:
import pandas as pd
import os
import re
import numpy as np

# --- 1. Set Answer Key ---
# Maps the sample prefix found in a histogram file name to the reference
# sequence name (RNAME) that sample is expected to produce.
answer_data = {
    "0N": "seq_013_00001101", "1D": "seq_035_00100011",
    "2S": "seq_082_01010010", "3G": "seq_102_01100110",
    "4I": "seq_136_10001000", "5S": "seq_178_10110010",
    "6T": "seq_211_11010011", "5K": "seq_170_10101010",
    "1X8": "seq_255_11111111", "0X8": "seq_000_00000000",
    "4G": "seq_134_10000110", "5I": "seq_168_10101000",
    "6S": "seq_210_11010010", "7T": "seq_243_11110011",
    "3SP26": "seq_122_01111010", "3SP31": "seq_127_01111111"
}
# Keep only the trailing 8-character 0/1 string of each expected RNAME.
# Result example: {'0N': '00001101', '1D': '00100011', ...}
answer_key_map = {key: re.search(r'([01]{8})$', value).group(1) for key, value in answer_data.items()}

# Number of 0/1 positions encoded in each RNAME.
N_BITS = 8
# Pulls the 8-character 0/1 string out of names shaped like "seq_<id>_<bits>".
BIT_PATTERN = re.compile(r'seq_[^_]+_([01]{8})')


def find_answer_key(file_name, key_map):
    """Return the expected bit string for a histogram file, or None.

    The extension and the leading "histogram_" are removed, the rest is split
    on "_", and the first token that is exactly a key of `key_map` wins.
    Whole-token matching avoids the false positives of a substring test
    (e.g. "14G" must not select the "4G" key).

    Parameters:
        file_name: file name such as "histogram_4G_1_250821_batch17_05_assemble.csv".
        key_map: dict from sample prefix to its 8-character bit string.

    Returns:
        The bit string for the matched prefix, or None when no token matches.
    """
    stem = os.path.splitext(file_name)[0]
    if stem.startswith("histogram_"):
        stem = stem[len("histogram_"):]
    for token in stem.split("_"):
        if token in key_map:
            return key_map[token]
    return None


def count_bits_per_position(df):
    """Sum read counts per bit value ('0'/'1') at each of the N_BITS positions.

    Parameters:
        df: DataFrame with a 'Count' column and, normally, an 'RNAME' column.

    Returns:
        A list of N_BITS dicts {'0': total, '1': total}. Rows whose RNAME is
        not a string or does not match BIT_PATTERN contribute nothing; a frame
        without an 'RNAME' column yields all zeros.

    Raises:
        KeyError: if the 'Count' column is missing.
        ValueError: if the count of a matching row cannot be converted to int.
    """
    position_counts = [{'0': 0, '1': 0} for _ in range(N_BITS)]
    if 'RNAME' not in df.columns:
        return position_counts
    for rname, count in zip(df['RNAME'], df['Count']):
        # Missing names are read by pandas as float NaN; skip them rather than
        # letting the regex raise and discard the whole file.
        if not isinstance(rname, str):
            continue
        match = BIT_PATTERN.search(rname)
        if match is None:
            continue
        weight = int(count)
        for i, digit in enumerate(match.group(1)):
            position_counts[i][digit] += weight
    return position_counts


def position_accuracies(position_counts, answer_key):
    """Return, per position, the fraction of counted reads carrying the expected bit.

    Parameters:
        position_counts: output of `count_bits_per_position`.
        answer_key: expected bit string, or None/"" when unknown.

    Returns:
        A list with one entry per position: NaN everywhere when there is no
        answer key, 0 for a position with no counted reads, otherwise
        correct_count / (zeros + ones).
    """
    if not answer_key:
        return [np.nan] * len(position_counts)
    accuracies = []
    for counts, correct_digit in zip(position_counts, answer_key):
        observed = counts['0'] + counts['1']
        accuracies.append(counts[correct_digit] / observed if observed > 0 else 0)
    return accuracies


# --- 2. File Processing and Combined Calculation ---
# Relative to the notebook's working directory, matching the histogram folder
# used by the other cells (replaces a machine-specific absolute path).
input_folder = "sequence_merge_method/5_histogram"
output_path = os.path.join(input_folder, "summary_combined.csv")

try:
    files = [f for f in os.listdir(input_folder) if f.endswith('.csv') and f.startswith("histogram_")]
except FileNotFoundError:
    print(f"❌ Error: Folder '{input_folder}' not found.")
    files = []

combined_summary_list = []

if not files:
    print("⚠️ No files found for analysis.")
else:
    print(f"📁 Analyzing a total of {len(files)} files.")
    for file_name in sorted(files):
        try:
            df = pd.read_csv(os.path.join(input_folder, file_name))
            total_count = df['Count'].sum()

            position_counts = count_bits_per_position(df)
            answer_key = find_answer_key(file_name, answer_key_map)
            accuracies = position_accuracies(position_counts, answer_key)

            combined_row = {
                "File_Name": file_name,
                "Total_Count": total_count
            }
            for i in range(N_BITS):
                combined_row[f"Pos{i+1}_Zeros_Count"] = position_counts[i]['0']
                combined_row[f"Pos{i+1}_Ones_Count"] = position_counts[i]['1']
                combined_row[f"Pos{i+1}_Accuracy"] = accuracies[i]

            # Without an answer key every accuracy is NaN; skip nanmean there
            # because it warns ("Mean of empty slice") on all-NaN input.
            combined_row["Avg_Accuracy"] = np.nanmean(accuracies) if answer_key else np.nan

            combined_summary_list.append(combined_row)
            if not answer_key:
                print(f"⚠️ Warning: Answer key not found for {file_name}.")
            print(f"✅ {file_name} processing complete.")

        except Exception as e:
            print(f"❌ Error processing {file_name}: {e}")

# --- 3. Save Combined Results and Add Summary Row ---
if combined_summary_list:
    combined_df = pd.DataFrame(combined_summary_list)

    column_order = ['File_Name', 'Total_Count', 'Avg_Accuracy']
    for i in range(1, N_BITS + 1):
        column_order.append(f'Pos{i}_Zeros_Count')
        column_order.append(f'Pos{i}_Ones_Count')
        column_order.append(f'Pos{i}_Accuracy')

    final_df = combined_df[[col for col in column_order if col in combined_df.columns]]

    # Summary row: column-wise mean of every accuracy column
    # (Series.mean skips NaN, so files without an answer key do not count).
    summary_row = {'File_Name': 'Avg. Accuracy'}
    for col_name in final_df.columns:
        if 'Accuracy' in col_name:
            summary_row[col_name] = final_df[col_name].mean()

    # Append the summary row below the per-file rows
    summary_row_df = pd.DataFrame([summary_row])
    final_df = pd.concat([final_df, summary_row_df], ignore_index=True)

    final_df.to_csv(output_path, index=False)

    print("\n--- Final Combined Analysis Results (including summary row) ---")
    print(final_df[['File_Name', 'Total_Count', 'Avg_Accuracy']].tail())
    print(f"\n📄 All analysis results and the summary row have been saved to '{output_path}'.")
