# **Install modules**

In [1]:
# Install modules
!sudo pip3 install biopython --break-system-packages
!sudo apt-get install fastp 
!sudo apt-get update 
!sudo apt-get install flash
!sudo pip3 install cutadapt --break-system-packages
!sudo apt-get install bwa 
!sudo pip3 install pysam --break-system-packages
!sudo apt-get install samtools 

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fastp is already the newest version (0.23.4+dfsg-1).
The following packages were automatically installed and are no longer required:
  pigz python3-xopen
Use 'sudo apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and 265 not upgraded.
Hit:1 https://packages.microsoft.com/repos/code stable InRelease               
Hit:2 http://ports.ubuntu.com/ubuntu-ports noble InRelease                     
Get:3 http://ports.ubuntu.com/ubuntu-ports noble-updates InRelease [126 kB]    
Hit:4 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu noble InRelease   
Get:5 http://ports.ubuntu.com/ubuntu-ports noble-backports InRelease [126 kB]
Get:6 http://ports.ubuntu.com/ubuntu-ports noble-security InRelease [126 kB]
Get:7 http://ports.ubuntu.com/ubuntu-ports noble-updates/main arm64 Components [172 kB]
Get:8 http://ports.ubuntu.com/ubuntu-ports noble-updates/restricted arm64 Compo

# Trimming and Discard

In [49]:
import subprocess
import glob
import os

# Specify the folder containing your input files
# Specify the folder where you want to save the untrimmed sequences (adapter-free sequences)

input_folder = "fastq_1_2_3_4_5_6"
untrimmed_output_folder = "fastq_1_2_3_4_5_6/A_Untrimmed_output"

# Define the adapter sequences for R1 and R2
adapter_sequence_r1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
adapter_sequence_r2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"

# Use glob to get a list of all input file pairs (R1 and R2) in the folder
input_file_pairs = []
for input_r1 in glob.glob(os.path.join(input_folder, "*_R1.fastq.gz")):
    # Assuming R2 files have the same naming format as R1 files
    input_r2 = input_r1.replace("_R1.fastq.gz", "_R2.fastq.gz")
    if os.path.exists(input_r2):  # Ensure R2 file exists
        input_file_pairs.append({"r1": input_r1, "r2": input_r2})

# Create the output folder if it doesn't exist
os.makedirs(untrimmed_output_folder, exist_ok=True)

for input_files in input_file_pairs:
    input_r1 = input_files["r1"]
    input_r2 = input_files["r2"]

    # Define output file paths for untrimmed (clean, adapter-free) sequences
    untrimmed_r1 = os.path.join(untrimmed_output_folder, os.path.basename(input_r1).replace(".fastq.gz", "_untrimmed.fastq.gz"))
    untrimmed_r2 = os.path.join(untrimmed_output_folder, os.path.basename(input_r2).replace(".fastq.gz", "_untrimmed.fastq.gz"))

    # Use cutadapt to keep only untrimmed sequences (completely adapter-free)
    result = subprocess.run([
        "cutadapt",
        "-a", adapter_sequence_r1,  # Adapter for R1
        "-A", adapter_sequence_r2,  # Adapter for R2
        "-O", "15",  # Minimum overlap for adapter trimming
        #"--discard-trimmed",  # Discard sequences where trimming occurred
        "-o", untrimmed_r1,  # Save only untrimmed R1 reads
        "-p", untrimmed_r2,  # Save only untrimmed R2 reads
        input_r1, input_r2
    ], capture_output=True, text=True)

    # Log result
    if result.returncode == 0:
        print(f"Untrimmed sequences saved: {untrimmed_r1}, {untrimmed_r2}")
    else:
        print(f"Error processing {input_r1} and {input_r2}:\n{result.stderr}")

Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_01step_R1_untrimmed.fastq.gz, fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_01step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/250910_batch20_06step_R1_untrimmed.fastq.gz, fastq_1_2_3_4_5_6/A_Untrimmed_output/250910_batch20_06step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/250910_batch20_05step_R1_untrimmed.fastq.gz, fastq_1_2_3_4_5_6/A_Untrimmed_output/250910_batch20_05step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_02step_R1_untrimmed.fastq.gz, fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_02step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_03step_R1_untrimmed.fastq.gz, fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_03step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_1_2_3_4_5_6/A_Unt

# Quality check

In [50]:
# import os
# import gzip
# from Bio import SeqIO
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# # 📁 입력 폴더와 출력 폴더 설정
# input_folder = "fastq_step/1_3_5_7_9_11/A_Untrimmed_output"
# output_csv_folder = "fastq_step/1_3_5_7_9_11/A_Untrimmed_output/quality_stats_csv"
# output_plot_folder = "fastq_step/1_3_5_7_9_11/A_Untrimmed_output/quality_plots"

# os.makedirs(output_csv_folder, exist_ok=True)
# os.makedirs(output_plot_folder, exist_ok=True)

# # 🔁 품질 통계 추출 함수
# def compute_quality_stats(file_path):
#     position_qualities = {}
#     open_func = gzip.open if file_path.endswith(".gz") else open

#     with open_func(file_path, "rt") as handle:
#         for record in SeqIO.parse(handle, "fastq"):
#             for i, q in enumerate(record.letter_annotations["phred_quality"]):
#                 position_qualities.setdefault(i, []).append(q)

#     stats = []
#     for pos in sorted(position_qualities):
#         scores = np.array(position_qualities[pos])
#         stats.append({
#             "position": pos + 1,
#             "mean": np.mean(scores),
#             "q1": np.percentile(scores, 25),
#             "median": np.median(scores),
#             "q3": np.percentile(scores, 75),
#             "min": np.min(scores),
#             "max": np.max(scores)
#         })
#     return pd.DataFrame(stats)

# # 📊 배경 색상 함수 (fastp 스타일)
# def add_quality_background(ax):
#     ax.axhspan(30, 40, facecolor='lightgreen', alpha=0.5)
#     ax.axhspan(25, 30, facecolor='khaki', alpha=0.5)
#     ax.axhspan(20, 25, facecolor='moccasin', alpha=0.5)
#     ax.axhspan(0, 20, facecolor='lightcoral', alpha=0.5)

# # 📂 폴더 내 모든 FASTQ(.gz 포함) 처리
# for filename in os.listdir(input_folder):
#     if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
#         input_path = os.path.join(input_folder, filename)
#         sample_name = os.path.splitext(filename)[0].replace(".fastq", "").replace(".gz", "")

#         print(f"📌 Processing: {sample_name}")
#         df = compute_quality_stats(input_path)

#         # CSV 저장
#         csv_path = os.path.join(output_csv_folder, f"{sample_name}_quality.csv")
#         df.to_csv(csv_path, index=False)

#         # 그래프 저장
#         plt.figure(figsize=(18, 8))
#         ax = plt.gca()
#         add_quality_background(ax)
#         plt.plot(df["position"], df["mean"], color="blue", linewidth=1.5, label="Mean Quality")

#         for i in range(len(df)):
#             x = df.loc[i, "position"]
#             q1 = df.loc[i, "q1"]
#             q3 = df.loc[i, "q3"]
#             plt.fill_between([x - 0.4, x + 0.4], [q1, q1], [q3, q3], color="yellow", edgecolor="black")

#         plt.vlines(df["position"], df["min"], df["max"], color="black", linewidth=0.5)
#         plt.title(f"Quality scores across all bases: {sample_name}", fontsize=14)
#         plt.xlabel("Position in read (bp)", fontsize=12)
#         plt.ylabel("Quality score", fontsize=12)
#         plt.ylim(0, 40)
#         plt.xlim(1, df["position"].max())
#         plt.legend()
#         plt.tight_layout()
#         plot_path = os.path.join(output_plot_folder, f"{sample_name}_quality_plot.png")
#         plt.savefig(plot_path, dpi=300)
#         plt.close()

#         print(f"✅ Saved: {sample_name}_quality.csv and quality_plot.png")

# read count check

In [51]:
# import os
# import gzip
# from Bio import SeqIO

# # 분석 대상 폴더
# input_folder = "fastq_step/1_3_5_7_9_11/A_Untrimmed_output"  # 여기에 대상 폴더 경로를 입력하세요

# # 허용 확장자
# valid_extensions = [".fastq", ".fastq.gz", ".fasta", ".fasta.gz"]

# # 포맷 결정 함수
# def get_format(filename):
#     if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
#         return "fastq"
#     elif filename.endswith(".fasta") or filename.endswith(".fasta.gz"):
#         return "fasta"
#     else:
#         return None

# # 결과 저장 리스트
# read_counts = []

# # 파일 순회 및 read 수 카운트
# for filename in os.listdir(input_folder):
#     if any(filename.endswith(ext) for ext in valid_extensions):
#         file_path = os.path.join(input_folder, filename)
#         file_format = get_format(filename)
#         open_func = gzip.open if filename.endswith(".gz") else open

#         try:
#             with open_func(file_path, "rt") as handle:
#                 count = sum(1 for _ in SeqIO.parse(handle, file_format))
#             read_counts.append((filename, count))
#         except Exception as e:
#             print(f"❌ Error processing {filename}: {e}")

# # 📄 파일명 기준 정렬 후 출력
# read_counts.sort(key=lambda x: x[0].lower())  # 파일명 기준 (대소문자 구분 없이) 정렬
# for fname, count in read_counts:
#     print(f"{fname:40} : {count} reads")

# length check

In [52]:
import os
import gzip
from Bio import SeqIO

# 분석 대상 폴더
input_folder = "fastq_1_2_3_4_5_6/A_Untrimmed_output"  # 여기에 대상 폴더 경로를 입력하세요

# 허용 확장자
valid_extensions = [".fastq", ".fastq.gz"]

# 파일 포맷 결정 함수
def get_format(filename):
    if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
        return "fastq"
    else:
        return None

# 결과 저장 리스트
read_stats = []

# 파일 순회 및 분석
for filename in os.listdir(input_folder):
    if any(filename.endswith(ext) for ext in valid_extensions):
        file_path = os.path.join(input_folder, filename)
        file_format = get_format(filename)
        open_func = gzip.open if filename.endswith(".gz") else open

        try:
            total_len = 0
            read_count = 0
            with open_func(file_path, "rt") as handle:
                for record in SeqIO.parse(handle, file_format):
                    total_len += len(record.seq)
                    read_count += 1
            avg_length = total_len / read_count if read_count > 0 else 0
            read_stats.append((filename, read_count, round(avg_length, 2)))
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

# 정렬 후 출력
read_stats.sort(key=lambda x: x[0].lower())
for fname, count, avg_len in read_stats:
    print(f"{fname:40} : {count:6} reads, Avg Length = {avg_len:6} bp")

250905_batch19_01step_R1_untrimmed.fastq.gz :   3219 reads, Avg Length =  31.55 bp
250905_batch19_01step_R2_untrimmed.fastq.gz :   3219 reads, Avg Length =   45.7 bp
250905_batch19_02step_R1_untrimmed.fastq.gz :   2632 reads, Avg Length =  50.84 bp
250905_batch19_02step_R2_untrimmed.fastq.gz :   2632 reads, Avg Length =  62.08 bp
250905_batch19_03step_R1_untrimmed.fastq.gz :   2706 reads, Avg Length =  71.72 bp
250905_batch19_03step_R2_untrimmed.fastq.gz :   2706 reads, Avg Length =  81.03 bp
250905_batch19_04step_R1_untrimmed.fastq.gz :   2741 reads, Avg Length =  92.04 bp
250905_batch19_04step_R2_untrimmed.fastq.gz :   2741 reads, Avg Length =  97.58 bp
250910_batch20_05step_R1_untrimmed.fastq.gz :   2468 reads, Avg Length = 109.94 bp
250910_batch20_05step_R2_untrimmed.fastq.gz :   2468 reads, Avg Length = 114.66 bp
250910_batch20_06step_R1_untrimmed.fastq.gz :   2387 reads, Avg Length = 131.44 bp
250910_batch20_06step_R2_untrimmed.fastq.gz :   2387 reads, Avg Length = 133.51 bp


# Q filtering

In [53]:
import os
import subprocess

# 품질 기준(Q30)
quality_threshold = 30

# 입력 폴더와 출력 폴더 설정

input_folder = "fastq_1_2_3_4_5_6/A_Untrimmed_output"
output_folder = "fastq_1_2_3_4_5_6/B_Qfiltered"

os.makedirs(output_folder, exist_ok=True)  # 출력 폴더가 없으면 생성

# 입력 폴더 내 파일들을 순회하며, "_trimmed.fastq.gz"로 끝나는 파일만 처리
for filename in os.listdir(input_folder):
    if filename.endswith("_untrimmed.fastq.gz"):
        # 입력 파일 경로
        input_file = os.path.join(input_folder, filename)
        
        # 출력 파일 이름(예: sample_trimmed.fastq.gz -> sample_trimmed_filtered.fastq.gz)
        output_file = os.path.join(
            output_folder, 
            filename.replace("_untrimmed.fastq.gz", "_Qfiltered.fastq.gz")
        )
        
        # fastp 실행 (싱글 엔드 모드)
        subprocess.call([
            "fastp",
            "-i", input_file,               # 입력 파일
            "-o", output_file,              # 출력 파일
            "-q", str(quality_threshold),   # Q30 미만 품질 제거
            "-u", "15",                      # low-quality base 비율 20% 이상이면 read 제거
            #"-l", "151",                      # 최소 read 길이
            "--cut_mean_quality", "30",     # 평균 Q<30이면 read 제거
            "--html", f"{output_file}.html",  # HTML 리포트
            "--json", f"{output_file}.json"   # JSON 리포트
        ])
        
        print(f"Filtering for {filename} is complete.\n"
              f"Output FASTQ : {output_file}\n"
              f"Reports      : {output_file}.html / {output_file}.json\n")

print("All filtering processes are done.")

Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 2387
total bases: 318681
Q20 bases: 295857(92.838%)
Q30 bases: 274463(86.1247%)

Read1 after filtering:
total reads: 1654
total bases: 214977
Q20 bases: 210076(97.7202%)
Q30 bases: 200235(93.1425%)

Filtering result:
reads passed filter: 1654
reads failed due to low quality: 732
reads failed due to too many N: 0
reads failed due to too short: 1
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 51.9062%

JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250910_batch20_06step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5

Filtering for 250910_batch20_06step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R2_Qfiltered.fastq.gz.json




JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_02step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R2_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 0 seconds
Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 2387
total bases: 313755
Q20 bases: 296656(94.5502%)
Q30 bases: 279793(89.1756%)

Read1 after filtering:
total reads: 1802
total bases: 232772
Q20 bases: 230093(98.8491%)
Q30 bases: 223168(95.8741%)

Filtering result:
reads passed filter: 1802
reads failed due to low quality: 584
reads failed du

Filtering for 250905_batch19_02step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R2_Qfiltered.fastq.gz.json




JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250910_batch20_06step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R1_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 0 seconds
Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 2632
total bases: 133821
Q20 bases: 132480(98.9979%)
Q30 bases: 130150(97.2568%)

Read1 after filtering:
total reads: 2457
total bases: 125363
Q20 bases: 125152(99.8317%)
Q30 bases: 124202(99.0739%)

Filtering result:
reads passed filter: 2457
reads failed due to low quality: 150
reads failed du

Filtering for 250910_batch20_06step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_06step_R1_Qfiltered.fastq.gz.json




JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_02step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R1_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 1 seconds
Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 2468
total bases: 271323
Q20 bases: 262966(96.9199%)
Q30 bases: 252892(93.207%)

Read1 after filtering:
total reads: 2144
total bases: 232413
Q20 bases: 230924(99.3593%)
Q30 bases: 226129(97.2962%)

Filtering result:
reads passed filter: 2144
reads failed due to low quality: 323
reads failed due

Filtering for 250905_batch19_02step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_02step_R1_Qfiltered.fastq.gz.json




JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250910_batch20_05step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R1_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 0 seconds
Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 3219
total bases: 101569
Q20 bases: 100677(99.1218%)
Q30 bases: 98934(97.4057%)

Read1 after filtering:
total reads: 2960
total bases: 94847
Q20 bases: 94645(99.787%)
Q30 bases: 93971(99.0764%)

Filtering result:
reads passed filter: 2960
reads failed due to low quality: 190
reads failed due to 

Filtering for 250910_batch20_05step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R1_Qfiltered.fastq.gz.json




JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_01step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R1_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 0 seconds
Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 2468
total bases: 282991
Q20 bases: 267052(94.3677%)
Q30 bases: 252250(89.1371%)

Read1 after filtering:
total reads: 1954
total bases: 214165
Q20 bases: 210956(98.5016%)
Q30 bases: 203983(95.2457%)

Filtering result:
reads passed filter: 1954
reads failed due to low quality: 513
reads failed du

Filtering for 250905_batch19_01step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R1_Qfiltered.fastq.gz.json




JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250910_batch20_05step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R2_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 0 seconds
Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 3219
total bases: 147116
Q20 bases: 132119(89.806%)
Q30 bases: 125160(85.0757%)

Read1 after filtering:
total reads: 2724
total bases: 95886
Q20 bases: 95762(99.8707%)
Q30 bases: 95123(99.2043%)

Filtering result:
reads passed filter: 2724
reads failed due to low quality: 430
reads failed due to

Filtering for 250910_batch20_05step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250910_batch20_05step_R2_Qfiltered.fastq.gz.json




JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_01step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R2_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 1 seconds
Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 2741
total bases: 267462
Q20 bases: 255423(95.4988%)
Q30 bases: 247215(92.43%)

Read1 after filtering:
total reads: 2374
total bases: 219665
Q20 bases: 218813(99.6121%)
Q30 bases: 215981(98.3229%)

Filtering result:
reads passed filter: 2374
reads failed due to low quality: 364
reads failed due 

Filtering for 250905_batch19_01step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_01step_R2_Qfiltered.fastq.gz.json




JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_04step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R2_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 0 seconds
Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 2706
total bases: 194062
Q20 bases: 191626(98.7447%)
Q30 bases: 187949(96.85%)

Read1 after filtering:
total reads: 2530
total bases: 181841
Q20 bases: 181428(99.7729%)
Q30 bases: 179808(98.882%)

Filtering result:
reads passed filter: 2530
reads failed due to low quality: 159
reads failed due t

Filtering for 250905_batch19_04step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R2_Qfiltered.fastq.gz.json




JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_03step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R1_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 0 seconds
Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 2741
total bases: 252271
Q20 bases: 248474(98.4949%)
Q30 bases: 243090(96.3607%)

Read1 after filtering:
total reads: 2543
total bases: 233407
Q20 bases: 232739(99.7138%)
Q30 bases: 230134(98.5977%)

Filtering result:
reads passed filter: 2543
reads failed due to low quality: 195
reads failed du

Filtering for 250905_batch19_03step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R1_Qfiltered.fastq.gz.json




JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_04step_R1_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R1_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R1_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R1_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 1 seconds
Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 2706
total bases: 219279
Q20 bases: 201488(91.8866%)
Q30 bases: 193152(88.085%)

Read1 after filtering:
total reads: 2289
total bases: 167242
Q20 bases: 166915(99.8045%)
Q30 bases: 165479(98.9458%)

Filtering result:
reads passed filter: 2289
reads failed due to low quality: 402
reads failed due

Filtering for 250905_batch19_04step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R1_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R1_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_04step_R1_Qfiltered.fastq.gz.json

Filtering for 250905_batch19_03step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R2_Qfiltered.fastq.gz
Reports      : fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R2_Qfiltered.fastq.gz.html / fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R2_Qfiltered.fastq.gz.json

All filtering processes are done.



JSON report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_1_2_3_4_5_6/A_Untrimmed_output/250905_batch19_03step_R2_untrimmed.fastq.gz -o fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R2_Qfiltered.fastq.gz -q 30 -u 15 --cut_mean_quality 30 --html fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R2_Qfiltered.fastq.gz.html --json fastq_1_2_3_4_5_6/B_Qfiltered/250905_batch19_03step_R2_Qfiltered.fastq.gz.json 
fastp v0.23.4, time used: 0 seconds


# Quality check

In [54]:
# import os
# import gzip
# from Bio import SeqIO
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# # 📁 입력 폴더와 출력 폴더 설정
# input_folder = "fastq_step/1_3_5_7_9_11/B_Qfiltered"
# output_csv_folder = "fastq_step/1_3_5_7_9_11/B_Qfiltered/quality_stats_csv"
# output_plot_folder = "fastq_step/1_3_5_7_9_11/B_Qfiltered/quality_plots"
# os.makedirs(output_csv_folder, exist_ok=True)
# os.makedirs(output_plot_folder, exist_ok=True)

# # 🔁 품질 통계 추출 함수
# def compute_quality_stats(file_path):
#     position_qualities = {}
#     open_func = gzip.open if file_path.endswith(".gz") else open

#     with open_func(file_path, "rt") as handle:
#         for record in SeqIO.parse(handle, "fastq"):
#             for i, q in enumerate(record.letter_annotations["phred_quality"]):
#                 position_qualities.setdefault(i, []).append(q)

#     stats = []
#     for pos in sorted(position_qualities):
#         scores = np.array(position_qualities[pos])
#         stats.append({
#             "position": pos + 1,
#             "mean": np.mean(scores),
#             "q1": np.percentile(scores, 25),
#             "median": np.median(scores),
#             "q3": np.percentile(scores, 75),
#             "min": np.min(scores),
#             "max": np.max(scores)
#         })
#     return pd.DataFrame(stats)

# # 📊 배경 색상 함수 (fastp 스타일)
# def add_quality_background(ax):
#     ax.axhspan(30, 40, facecolor='lightgreen', alpha=0.5)
#     ax.axhspan(25, 30, facecolor='khaki', alpha=0.5)
#     ax.axhspan(20, 25, facecolor='moccasin', alpha=0.5)
#     ax.axhspan(0, 20, facecolor='lightcoral', alpha=0.5)

# # 📂 폴더 내 모든 FASTQ(.gz 포함) 처리
# for filename in os.listdir(input_folder):
#     if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
#         input_path = os.path.join(input_folder, filename)
#         sample_name = os.path.splitext(filename)[0].replace(".fastq", "").replace(".gz", "")

#         print(f"📌 Processing: {sample_name}")
#         df = compute_quality_stats(input_path)

#         # CSV 저장
#         csv_path = os.path.join(output_csv_folder, f"{sample_name}_quality.csv")
#         df.to_csv(csv_path, index=False)

#         # 그래프 저장
#         plt.figure(figsize=(18, 8))
#         ax = plt.gca()
#         add_quality_background(ax)
#         plt.plot(df["position"], df["mean"], color="blue", linewidth=1.5, label="Mean Quality")

#         for i in range(len(df)):
#             x = df.loc[i, "position"]
#             q1 = df.loc[i, "q1"]
#             q3 = df.loc[i, "q3"]
#             plt.fill_between([x - 0.4, x + 0.4], [q1, q1], [q3, q3], color="yellow", edgecolor="black")

#         plt.vlines(df["position"], df["min"], df["max"], color="black", linewidth=0.5)
#         plt.title(f"Quality scores across all bases: {sample_name}", fontsize=14)
#         plt.xlabel("Position in read (bp)", fontsize=12)
#         plt.ylabel("Quality score", fontsize=12)
#         plt.ylim(0, 40)
#         plt.xlim(1, df["position"].max())
#         plt.legend()
#         plt.tight_layout()
#         plot_path = os.path.join(output_plot_folder, f"{sample_name}_quality_plot.png")
#         plt.savefig(plot_path, dpi=300)
#         plt.close()

#         print(f"✅ Saved: {sample_name}_quality.csv and quality_plot.png")

# read count check

In [55]:
# import os
# import gzip
# from Bio import SeqIO

# # 분석 대상 폴더
# input_folder = "fastq_step/1_3_5_7_9_11/B_Qfiltered"  # 여기에 대상 폴더 경로를 입력하세요

# # 허용 확장자
# valid_extensions = [".fastq", ".fastq.gz", ".fasta", ".fasta.gz"]

# # 포맷 결정 함수
# def get_format(filename):
#     if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
#         return "fastq"
#     elif filename.endswith(".fasta") or filename.endswith(".fasta.gz"):
#         return "fasta"
#     else:
#         return None

# # 결과 저장 리스트
# read_counts = []

# # 파일 순회 및 read 수 카운트
# for filename in os.listdir(input_folder):
#     if any(filename.endswith(ext) for ext in valid_extensions):
#         file_path = os.path.join(input_folder, filename)
#         file_format = get_format(filename)
#         open_func = gzip.open if filename.endswith(".gz") else open

#         try:
#             with open_func(file_path, "rt") as handle:
#                 count = sum(1 for _ in SeqIO.parse(handle, file_format))
#             read_counts.append((filename, count))
#         except Exception as e:
#             print(f"❌ Error processing {filename}: {e}")

# # 📄 파일명 기준 정렬 후 출력
# read_counts.sort(key=lambda x: x[0].lower())  # 파일명 기준 (대소문자 구분 없이) 정렬
# for fname, count in read_counts:
#     print(f"{fname:40} : {count} reads")

# ID matching

In [56]:
import gzip
import glob
import os

def extract_matching_reads(r1_path, r2_path, out_r1_path, out_r2_path):
    def get_read_id(header):
        # FASTQ header에서 ID 추출
        return header.split()[0].replace('/1', '').replace('/2', '')

    r1_ids = set()
    r2_ids = set()

    with gzip.open(r1_path, 'rt') as r1_file:
        while True:
            header = r1_file.readline()
            if not header:
                break
            r1_ids.add(get_read_id(header.strip()))
            [r1_file.readline() for _ in range(3)]  # read 나머지 3줄 skip

    with gzip.open(r2_path, 'rt') as r2_file:
        while True:
            header = r2_file.readline()
            if not header:
                break
            r2_ids.add(get_read_id(header.strip()))
            [r2_file.readline() for _ in range(3)]

    matching_ids = r1_ids & r2_ids
    r1_only = r1_ids - r2_ids
    r2_only = r2_ids - r1_ids

    print(f"Processing {os.path.basename(r1_path)} and {os.path.basename(r2_path)}")
    print(f"Total R1 IDs: {len(r1_ids)}, Total R2 IDs: {len(r2_ids)}, Matching IDs: {len(matching_ids)}")
    print(f"IDs only in R1: {len(r1_only)}, IDs only in R2: {len(r2_only)}\n")

    # 결과 폴더 생성
    for out_path in [out_r1_path, out_r2_path]:
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    def write_matching_reads(input_path, output_path, matching_ids):
        with gzip.open(input_path, 'rt') as infile, gzip.open(output_path, 'wt') as outfile:
            while True:
                lines = [infile.readline() for _ in range(4)]
                if not lines[0]:
                    break
                read_id = get_read_id(lines[0].strip())
                if read_id in matching_ids:
                    outfile.writelines(lines)

    write_matching_reads(r1_path, out_r1_path, matching_ids)
    write_matching_reads(r2_path, out_r2_path, matching_ids)

# ----------------------
# 전체 파일에 대해 적용
# ----------------------

input_folder = "fastq_1_2_3_4_5_6/B_Qfiltered"
output_folder = "fastq_1_2_3_4_5_6/C_ID_matched"

# 모든 R1 파일 찾기
r1_files = glob.glob(os.path.join(input_folder, "*_R1_Qfiltered.fastq.gz"))

# 각 R1에 대해 짝이 맞는 R2를 찾고 작업 실행
for r1_file in r1_files:
    r2_file = r1_file.replace("_R1_Qfiltered.fastq.gz", "_R2_Qfiltered.fastq.gz")
    
    if os.path.exists(r2_file):
        # 결과 output 경로 설정
        base_name = os.path.basename(r1_file).replace("_R1_Qfiltered.fastq.gz", "")
        out_r1 = os.path.join(output_folder, f"{base_name}_ID_match_R1.fastq.gz")
        out_r2 = os.path.join(output_folder, f"{base_name}_ID_match_R2.fastq.gz")
        
        # 함수 실행
        extract_matching_reads(r1_file, r2_file, out_r1, out_r2)
    else:
        print(f"Warning: {r2_file} not found. Skipping.")

Processing 250905_batch19_01step_R1_Qfiltered.fastq.gz and 250905_batch19_01step_R2_Qfiltered.fastq.gz
Total R1 IDs: 2960, Total R2 IDs: 2724, Matching IDs: 2660
IDs only in R1: 300, IDs only in R2: 64

Processing 250910_batch20_05step_R1_Qfiltered.fastq.gz and 250910_batch20_05step_R2_Qfiltered.fastq.gz
Total R1 IDs: 2144, Total R2 IDs: 1954, Matching IDs: 1898
IDs only in R1: 246, IDs only in R2: 56

Processing 250905_batch19_04step_R1_Qfiltered.fastq.gz and 250905_batch19_04step_R2_Qfiltered.fastq.gz
Total R1 IDs: 2543, Total R2 IDs: 2374, Matching IDs: 2328
IDs only in R1: 215, IDs only in R2: 46

Processing 250905_batch19_03step_R1_Qfiltered.fastq.gz and 250905_batch19_03step_R2_Qfiltered.fastq.gz
Total R1 IDs: 2530, Total R2 IDs: 2289, Matching IDs: 2239
IDs only in R1: 291, IDs only in R2: 50

Processing 250905_batch19_02step_R1_Qfiltered.fastq.gz and 250905_batch19_02step_R2_Qfiltered.fastq.gz
Total R1 IDs: 2457, Total R2 IDs: 2254, Matching IDs: 2192
IDs only in R1: 265, IDs o

# read count check

In [57]:
# import os
# import gzip
# from Bio import SeqIO

# # 분석 대상 폴더
# input_folder = "fastq_step/1_3_5_7_9_11/C_ID_matched"  # 여기에 대상 폴더 경로를 입력하세요

# # 허용 확장자
# valid_extensions = [".fastq", ".fastq.gz", ".fasta", ".fasta.gz"]

# # 포맷 결정 함수
# def get_format(filename):
#     if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
#         return "fastq"
#     elif filename.endswith(".fasta") or filename.endswith(".fasta.gz"):
#         return "fasta"
#     else:
#         return None

# # 결과 저장 리스트
# read_counts = []

# # 파일 순회 및 read 수 카운트
# for filename in os.listdir(input_folder):
#     if any(filename.endswith(ext) for ext in valid_extensions):
#         file_path = os.path.join(input_folder, filename)
#         file_format = get_format(filename)
#         open_func = gzip.open if filename.endswith(".gz") else open

#         try:
#             with open_func(file_path, "rt") as handle:
#                 count = sum(1 for _ in SeqIO.parse(handle, file_format))
#             read_counts.append((filename, count))
#         except Exception as e:
#             print(f"❌ Error processing {filename}: {e}")

# # 📄 파일명 기준 정렬 후 출력
# read_counts.sort(key=lambda x: x[0].lower())  # 파일명 기준 (대소문자 구분 없이) 정렬
# for fname, count in read_counts:
#     print(f"{fname:40} : {count} reads")

# Merge(Flash)

In [58]:
import os
import glob
import subprocess

# === 폴더 설정 ===
input_folder = "fastq_1_2_3_4_5_6/C_ID_matched"
output_folder = "fastq_1_2_3_4_5_6/D_merged_output"
os.makedirs(output_folder, exist_ok=True)

# === Prefix별 N값 설정 ===
sample_n_mapping = {
    "01step": 32,#20+12
    "02step": 52,#32+20
    "03step": 74,#52+22
    "04step": 94,#74+20
    "05step": 116,#94+22
    "06step": 136,#116+20

}

# === R1_B 파일 리스트 찾기 ===
r1_files = glob.glob(os.path.join(input_folder, "*_R1.fastq.gz"))

print(f"🔎 Found {len(r1_files)} R1 files.")

# === 각 R1_B 파일에 대해 ===
for r1_path in r1_files:
    sample_base = os.path.basename(r1_path).replace("_R1.fastq.gz", "")
    r2_path = os.path.join(input_folder, f"{sample_base}_R2.fastq.gz")

    if not os.path.exists(r2_path):
        print(f"⚠️ Matching R2 file not found for {sample_base} → Skipping.")
        continue

    # 파일 이름에 맞는 N값 찾기
    matched_n = None
    for prefix, n_value in sample_n_mapping.items():
        if prefix in sample_base:
            matched_n = n_value
            break

    if matched_n is None:
        print(f"⚠️ No N value matched for {sample_base} → Skipping.")
        continue

    output_name = f"{sample_base}_FLASH"

    print(f"🔵 Running FLASH for sample: {sample_base} (N={matched_n})")

    try:
        subprocess.check_call([
            "flash",
            "-m", str(matched_n),   # 최소 overlap
            "-M", str(matched_n),   # 최대 overlap
            "-o", output_name,      # 결과 파일 prefix
            "-d", output_folder,    # 결과 저장 폴더
            r1_path,
            r2_path
        ])
        print(f"✅ FLASH merging complete → {os.path.join(output_folder, output_name)}.fastq")
    except subprocess.CalledProcessError as e:
        print(f"❌ FLASH merging failed for {sample_base}: {e}")

🔎 Found 6 R1 files.
🔵 Running FLASH for sample: 250910_batch20_05step_ID_match (N=116)
[FLASH] Starting FLASH v1.2.11
[FLASH] Fast Length Adjustment of SHort reads
[FLASH]  
[FLASH] Input files:
[FLASH]     fastq_1_2_3_4_5_6/C_ID_matched/250910_batch20_05step_ID_match_R1.fastq.gz
[FLASH]     fastq_1_2_3_4_5_6/C_ID_matched/250910_batch20_05step_ID_match_R2.fastq.gz
[FLASH]  
[FLASH] Output files:
[FLASH]     fastq_1_2_3_4_5_6/D_merged_output/250910_batch20_05step_ID_match_FLASH.extendedFrags.fastq
[FLASH]     fastq_1_2_3_4_5_6/D_merged_output/250910_batch20_05step_ID_match_FLASH.notCombined_1.fastq
[FLASH]     fastq_1_2_3_4_5_6/D_merged_output/250910_batch20_05step_ID_match_FLASH.notCombined_2.fastq
[FLASH]     fastq_1_2_3_4_5_6/D_merged_output/250910_batch20_05step_ID_match_FLASH.hist
[FLASH]     fastq_1_2_3_4_5_6/D_merged_output/250910_batch20_05step_ID_match_FLASH.histogram
[FLASH]  
[FLASH] Parameters:
[FLASH]     Min overlap:           116
[FLASH]     Max overlap:           116
[FL

overlapped by more than 116 bp, the --max-overlap (-M) parameter.  Consider
increasing this parameter.  (As-is, FLASH is penalizing overlaps longer than
116 bp when considering them for possible combining!)


# Fastq to Fasta

In [59]:
import os
import gzip
from Bio import SeqIO

# 입력 및 출력 폴더 경로 설정
input_folder = "fastq_1_2_3_4_5_6/D_merged_output"
output_folder = "fastq_1_2_3_4_5_6/E_fastq_to_fasta"

os.makedirs(output_folder, exist_ok=True)  # 출력 폴더 생성

for filename in os.listdir(input_folder):
    # .fastq 또는 .fastq.gz 확장자 파일만 처리
    if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
        input_file = os.path.join(input_folder, filename)
        
        # 출력 파일명 설정 (.fasta 확장자)
        output_file = os.path.join(
            output_folder,
            filename.replace(".fastq.gz", ".fasta").replace(".fastq", ".fasta")
        )

        # 파일 열기 모드 결정 (gzip 여부)
        open_func = gzip.open if filename.endswith(".gz") else open

        # FASTQ 파일 읽어서 FASTA로 변환
        with open_func(input_file, "rt") as fastq_file:  # 텍스트 모드로 열기
            records = list(SeqIO.parse(fastq_file, "fastq"))

        # FASTA 파일로 저장
        with open(output_file, "w") as fasta_file:
            SeqIO.write(records, fasta_file, "fasta")

        print(f"Converted: {filename} → {os.path.basename(output_file)}")

print("All conversions are done.")

Converted: 250905_batch19_04step_ID_match_FLASH.notCombined_2.fastq → 250905_batch19_04step_ID_match_FLASH.notCombined_2.fasta
Converted: 250905_batch19_02step_ID_match_FLASH.notCombined_2.fastq → 250905_batch19_02step_ID_match_FLASH.notCombined_2.fasta
Converted: 250905_batch19_01step_ID_match_FLASH.notCombined_2.fastq → 250905_batch19_01step_ID_match_FLASH.notCombined_2.fasta
Converted: 250905_batch19_01step_ID_match_FLASH.extendedFrags.fastq → 250905_batch19_01step_ID_match_FLASH.extendedFrags.fasta
Converted: 250905_batch19_02step_ID_match_FLASH.extendedFrags.fastq → 250905_batch19_02step_ID_match_FLASH.extendedFrags.fasta
Converted: 250905_batch19_04step_ID_match_FLASH.notCombined_1.fastq → 250905_batch19_04step_ID_match_FLASH.notCombined_1.fasta
Converted: 250905_batch19_04step_ID_match_FLASH.extendedFrags.fastq → 250905_batch19_04step_ID_match_FLASH.extendedFrags.fasta
Converted: 250905_batch19_02step_ID_match_FLASH.notCombined_1.fastq → 250905_batch19_02step_ID_match_FLASH.notC

In [60]:
import os
import gzip
from Bio import SeqIO

# 분석 대상 폴더
input_folder = "fastq_1_2_3_4_5_6/E_fastq_to_fasta"

# 허용 확장자: fasta만
valid_extensions = [".fasta", ".fasta.gz"]

# 파일 열기 함수 결정
def get_format(filename):
    if filename.endswith(".fasta") or filename.endswith(".fasta.gz"):
        return "fasta"
    else:
        return None

# 결과 저장 리스트
read_stats = []

# 파일 순회
for filename in os.listdir(input_folder):
    if any(filename.endswith(ext) for ext in valid_extensions):
        file_path = os.path.join(input_folder, filename)
        file_format = get_format(filename)
        open_func = gzip.open if filename.endswith(".gz") else open

        try:
            total_len = 0
            read_count = 0
            with open_func(file_path, "rt") as handle:
                for record in SeqIO.parse(handle, file_format):
                    total_len += len(record.seq)
                    read_count += 1
            avg_length = total_len / read_count if read_count > 0 else 0
            read_stats.append((filename, read_count, round(avg_length, 2)))
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

# 정렬 및 출력
read_stats.sort(key=lambda x: x[0].lower())
for fname, count, avg_len in read_stats:
    print(f"{fname:40} : {count:6} reads, Avg Length = {avg_len:6} bp")

250905_batch19_01step_ID_match_FLASH.extendedFrags.fasta :   2528 reads, Avg Length =  32.04 bp
250905_batch19_01step_ID_match_FLASH.notCombined_1.fasta :    132 reads, Avg Length =   31.3 bp
250905_batch19_01step_ID_match_FLASH.notCombined_2.fasta :    132 reads, Avg Length =  95.53 bp
250905_batch19_02step_ID_match_FLASH.extendedFrags.fasta :   1921 reads, Avg Length =  52.13 bp
250905_batch19_02step_ID_match_FLASH.notCombined_1.fasta :    271 reads, Avg Length =  43.08 bp
250905_batch19_02step_ID_match_FLASH.notCombined_2.fasta :    271 reads, Avg Length =  53.74 bp
250905_batch19_03step_ID_match_FLASH.extendedFrags.fasta :   1790 reads, Avg Length =  74.12 bp
250905_batch19_03step_ID_match_FLASH.notCombined_1.fasta :    449 reads, Avg Length =  63.16 bp
250905_batch19_03step_ID_match_FLASH.notCombined_2.fasta :    449 reads, Avg Length =  68.64 bp
250905_batch19_04step_ID_match_FLASH.extendedFrags.fasta :   1786 reads, Avg Length =  94.24 bp
250905_batch19_04step_ID_match_FLASH.not

# step reference

In [61]:
import os
from pathlib import Path

def generate_sequences_for_bit(bit_length: int):
    """
    특정 비트 수(bit_length)에 해당하는 모든 binary 조합의 DNA 서열 생성
    """
    sequences = {}

    seq_0 = "ACTCATATACACACTTAATC"
    seq_1 = "ACTCATATACATACACTTAATC"
    prefix = "ACACTTAATC"

    for i in range(2 ** bit_length):
        binary_str = format(i, f'0{bit_length}b')
        sequence = ''.join(seq_1 if bit == '1' else seq_0 for bit in binary_str)
        full_sequence = prefix + sequence
        seq_id = f"seq_{i:04d}_{binary_str}"
        sequences[seq_id] = full_sequence

    return sequences

def write_fasta(sequences: dict, output_path: str):
    """FASTA 파일 생성"""
    with open(output_path, "w") as f:
        for seq_id, sequence in sequences.items():
            f.write(f">{seq_id}\n{sequence}\n")

# ===== 설정 =====
output_dir = Path("step_reference")
output_dir.mkdir(parents=True, exist_ok=True)

MAX_STEP = 12        # 1 ~ 12 step 생성
PAD = len(str(MAX_STEP))  # 패딩 폭(12 → 2자리)
# =================

for step in range(1, MAX_STEP + 1):
    seqs = generate_sequences_for_bit(step)
    out_name = output_dir / f"{step:0{PAD}d}step_reference.fasta"  # → 01step_reference.fasta
    write_fasta(seqs, out_name)
    print(f"✅ {step:0{PAD}d}step FASTA saved: {out_name}")

✅ 01step FASTA saved: step_reference/01step_reference.fasta
✅ 02step FASTA saved: step_reference/02step_reference.fasta
✅ 03step FASTA saved: step_reference/03step_reference.fasta
✅ 04step FASTA saved: step_reference/04step_reference.fasta
✅ 05step FASTA saved: step_reference/05step_reference.fasta
✅ 06step FASTA saved: step_reference/06step_reference.fasta
✅ 07step FASTA saved: step_reference/07step_reference.fasta
✅ 08step FASTA saved: step_reference/08step_reference.fasta
✅ 09step FASTA saved: step_reference/09step_reference.fasta
✅ 10step FASTA saved: step_reference/10step_reference.fasta
✅ 11step FASTA saved: step_reference/11step_reference.fasta
✅ 12step FASTA saved: step_reference/12step_reference.fasta


# fasta to csv

In [62]:
import os
import re
import gzip
import pandas as pd
from pathlib import Path
from Bio import SeqIO

STEP_RE = re.compile(r"([0-9]+)step_reference\.fasta(\.gz)?$", re.IGNORECASE)
ID_RE   = re.compile(r"^seq_(\d{4})_([01]+)$")  # seq_0000_000... 형태

def parse_step_from_filename(fname: str):
    """
    '01step_reference.fasta' / '12step_reference.fasta.gz' 에서 step 정수와 0패딩 문자열을 반환
    """
    m = STEP_RE.search(fname)
    if not m:
        return None, None
    step_str = m.group(1)            # e.g. '01'
    step_int = int(step_str, 10)     # 선행 0 안전
    return step_int, step_str

def parse_id_fields(read_id: str):
    """
    'seq_0007_000111' → index=7, binary='000111'
    매칭 안되면 (None, None)
    """
    m = ID_RE.match(read_id)
    if not m:
        return None, None
    idx = int(m.group(1), 10)
    binary = m.group(2)
    return idx, binary

def fasta_to_csv(fasta_path, csv_path, add_extra_cols=True):
    """
    FASTA → CSV 변환
    - gz 자동 인식
    - 파일명에서 step 추출
    - read_id에서 index/binary 추출
    """
    fasta_path = Path(fasta_path)
    csv_path = Path(csv_path)

    step_int, step_str = parse_step_from_filename(fasta_path.name)
    if step_int is None:
        print(f"⚠️  Skip (no step pattern): {fasta_path.name}")
        return

    open_func = gzip.open if str(fasta_path).endswith(".gz") else open

    rows = []
    with open_func(fasta_path, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            read_id = record.id
            sequence = str(record.seq)
            if add_extra_cols:
                idx, binary = parse_id_fields(read_id)
                rows.append([read_id, sequence, len(sequence), step_int, step_str, idx, binary])
            else:
                rows.append([read_id, sequence])

    if add_extra_cols:
        cols = ["Read_ID", "Sequence", "SeqLen", "Step", "StepStr", "Index", "Binary"]
    else:
        cols = ["Read_ID", "Sequence"]

    df = pd.DataFrame(rows, columns=cols)
    df.to_csv(csv_path, index=False)
    print(f"✅ Converted: {fasta_path.name} → {csv_path.name} (rows={len(df)}, step={step_str})")

def convert_all_fasta_in_folder(input_folder, output_folder, add_extra_cols=True):
    """
    폴더 내 *step_reference.fasta(.gz) 전부 변환
    출력 파일명은 입력과 동일 베이스에서 확장자만 .csv로 변경
    (예: 01step_reference.fasta → 01step_reference.csv)
    """
    input_folder = Path(input_folder)
    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    # step_reference만 대상으로 한정
    targets = sorted(list(input_folder.glob("*step_reference.fasta")) +
                     list(input_folder.glob("*step_reference.fasta.gz")))

    if not targets:
        print(f"⚠️  No step_reference fasta found in {input_folder}")
        return

    for fp in targets:
        out_name = fp.name.replace(".fasta.gz", ".csv").replace(".fasta", ".csv")
        fasta_to_csv(fp, output_folder / out_name, add_extra_cols=add_extra_cols)

# 📌 사용 예시
input_folder = "step_reference"
output_folder = "step_reference/csv"
convert_all_fasta_in_folder(input_folder, output_folder, add_extra_cols=True)

✅ Converted: 01step_reference.fasta → 01step_reference.csv (rows=2, step=01)
✅ Converted: 02step_reference.fasta → 02step_reference.csv (rows=4, step=02)
✅ Converted: 03step_reference.fasta → 03step_reference.csv (rows=8, step=03)
✅ Converted: 04step_reference.fasta → 04step_reference.csv (rows=16, step=04)
✅ Converted: 05step_reference.fasta → 05step_reference.csv (rows=32, step=05)
✅ Converted: 06step_reference.fasta → 06step_reference.csv (rows=64, step=06)
✅ Converted: 07step_reference.fasta → 07step_reference.csv (rows=128, step=07)
✅ Converted: 08step_reference.fasta → 08step_reference.csv (rows=256, step=08)
✅ Converted: 09step_reference.fasta → 09step_reference.csv (rows=512, step=09)
✅ Converted: 10step_reference.fasta → 10step_reference.csv (rows=1024, step=10)
✅ Converted: 11step_reference.fasta → 11step_reference.csv (rows=2048, step=11)
✅ Converted: 12step_reference.fasta → 12step_reference.csv (rows=4096, step=12)


# length check

In [63]:
import os
import gzip
from Bio import SeqIO

# 분석 대상 폴더
input_folder = "step_reference"

# 허용 확장자: fasta만
valid_extensions = [".fasta", ".fasta.gz"]

# 파일 열기 함수 결정
def get_format(filename):
    if filename.endswith(".fasta") or filename.endswith(".fasta.gz"):
        return "fasta"
    else:
        return None

# 결과 저장 리스트
read_stats = []

# 파일 순회
for filename in os.listdir(input_folder):
    if any(filename.endswith(ext) for ext in valid_extensions):
        file_path = os.path.join(input_folder, filename)
        file_format = get_format(filename)
        open_func = gzip.open if filename.endswith(".gz") else open

        try:
            total_len = 0
            read_count = 0
            with open_func(file_path, "rt") as handle:
                for record in SeqIO.parse(handle, file_format):
                    total_len += len(record.seq)
                    read_count += 1
            avg_length = total_len / read_count if read_count > 0 else 0
            read_stats.append((filename, read_count, round(avg_length, 2)))
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

# 정렬 및 출력
read_stats.sort(key=lambda x: x[0].lower())
for fname, count, avg_len in read_stats:
    print(f"{fname:40} : {count:6} reads, Avg Length = {avg_len:6} bp")

01step_reference.fasta                   :      2 reads, Avg Length =   31.0 bp
02step_reference.fasta                   :      4 reads, Avg Length =   52.0 bp
03step_reference.fasta                   :      8 reads, Avg Length =   73.0 bp
04step_reference.fasta                   :     16 reads, Avg Length =   94.0 bp
05step_reference.fasta                   :     32 reads, Avg Length =  115.0 bp
06step_reference.fasta                   :     64 reads, Avg Length =  136.0 bp
07step_reference.fasta                   :    128 reads, Avg Length =  157.0 bp
08step_reference.fasta                   :    256 reads, Avg Length =  178.0 bp
09step_reference.fasta                   :    512 reads, Avg Length =  199.0 bp
10step_reference.fasta                   :   1024 reads, Avg Length =  220.0 bp
11step_reference.fasta                   :   2048 reads, Avg Length =  241.0 bp
12step_reference.fasta                   :   4096 reads, Avg Length =  262.0 bp


# bwa mem algorithm
## reference align

In [64]:
import os
import subprocess

ref_dir = "step_reference"

for filename in os.listdir(ref_dir):
    if filename.endswith(".fasta"):
        fasta_path = os.path.join(ref_dir, filename)
        print(f"🔎 Indexing: {fasta_path}")
        subprocess.run(["bwa", "index", fasta_path])

🔎 Indexing: step_reference/10step_reference.fasta
🔎 Indexing: step_reference/09step_reference.fasta
🔎 Indexing: step_reference/11step_reference.fasta


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.01 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.01 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/10step_reference.fasta
[main] Real time: 0.110 sec; CPU: 0.030 sec
[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.01 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/09step_reference.fasta
[main] Real time: 0.065 sec; CPU: 0.015 sec
[bwa_index] Pack FASTA... 0.01 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.03 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Co

🔎 Indexing: step_reference/08step_reference.fasta
🔎 Indexing: step_reference/12step_reference.fasta


0.01 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.06 seconds elapse.
[bwa_index] Update BWT... 0.01 sec
[bwa_index] Pack forward-only FASTA... 0.01 sec
[bwa_index] Construct SA from BWT and Occ... 0.03 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/12step_reference.fasta
[main] Real time: 0.337 sec; CPU: 0.121 sec
[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/02step_reference.fasta
[main] Real time: 0.028 sec; CPU: 0.004 sec
[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Oc

🔎 Indexing: step_reference/02step_reference.fasta
🔎 Indexing: step_reference/03step_reference.fasta
🔎 Indexing: step_reference/01step_reference.fasta
🔎 Indexing: step_reference/07step_reference.fasta
🔎 Indexing: step_reference/06step_reference.fasta
🔎 Indexing: step_reference/05step_reference.fasta
🔎 Indexing: step_reference/04step_reference.fasta


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/05step_reference.fasta
[main] Real time: 0.027 sec; CPU: 0.003 sec
[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/04step_reference.fasta
[main] Real time: 0.024 sec; CPU: 0.003 sec


In [65]:
%%bash
set -euo pipefail
shopt -s nullglob

ref_dir="step_reference"
query_dir="fastq_1_2_3_4_5_6/E_fastq_to_fasta"
output_dir="fastq_1_2_3_4_5_6/1_align_sam"
mkdir -p "$output_dir"

# 같은 참조는 한 번만 index
declare -A indexed

# *_01step_* 와 *_04step_1_* 모두 매칭되도록 글롭 확장
for query_file in "$query_dir"/*step*ID_match_FLASH.extendedFrags.fasta; do
  filename="$(basename "$query_file")"

  # 파일명에서 step 문자열 추출: ..._NNstep( _ 또는 끝 )
  if [[ "$filename" =~ _([0-9]+)step(_|$) ]]; then
    step_str="${BASH_REMATCH[1]}"               # 예: "01" 또는 "4"
  else
    echo "⚠️  step number not found in $filename"
    continue
  fi

  # 0패딩 통일(레퍼런스가 01step_reference.fasta 형식이므로)
  step_pad=$(printf "%02d" $((10#$step_str)))
  reference_file="${ref_dir}/${step_pad}step_reference.fasta"
  output_file="${output_dir}/${filename%.fasta}.sam"

  if [[ ! -f "$reference_file" ]]; then
    echo "⚠️  Missing reference: $reference_file"
    continue
  fi

  echo "🔎 file: $filename | step=${step_pad}"
  if [[ -z "${indexed[$reference_file]:-}" ]]; then
    bwa index "$reference_file"
    indexed[$reference_file]=1
  fi

  echo "🔄 Aligning to $(basename "$reference_file") ..."
  bwa mem -M -t 4 "$reference_file" "$query_file" > "$output_file"
  echo "✅ Done: $output_file"
done

🔎 file: 250905_batch19_01step_ID_match_FLASH.extendedFrags.fasta | step=01


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/01step_reference.fasta
[main] Real time: 0.042 sec; CPU: 0.004 sec


🔄 Aligning to 01step_reference.fasta ...


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 2528 sequences (80989 bp)...
[M::mem_process_seqs] Processed 2528 reads in 0.010 CPU sec, 0.005 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/01step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/250905_batch19_01step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.059 sec; CPU: 0.016 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/250905_batch19_01step_ID_match_FLASH.extendedFrags.sam
🔎 file: 250905_batch19_02step_ID_match_FLASH.extendedFrags.fasta | step=02


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/02step_reference.fasta
[main] Real time: 0.028 sec; CPU: 0.004 sec


🔄 Aligning to 02step_reference.fasta ...


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1921 sequences (100150 bp)...
[M::mem_process_seqs] Processed 1921 reads in 0.049 CPU sec, 0.019 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/02step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/250905_batch19_02step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.095 sec; CPU: 0.056 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/250905_batch19_02step_ID_match_FLASH.extendedFrags.sam
🔎 file: 250905_batch19_03step_ID_match_FLASH.extendedFrags.fasta | step=03


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/03step_reference.fasta
[main] Real time: 0.051 sec; CPU: 0.005 sec


🔄 Aligning to 03step_reference.fasta ...


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1790 sequences (132667 bp)...
[M::mem_process_seqs] Processed 1790 reads in 0.095 CPU sec, 0.030 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/03step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/250905_batch19_03step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.099 sec; CPU: 0.101 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/250905_batch19_03step_ID_match_FLASH.extendedFrags.sam
🔎 file: 250905_batch19_04step_ID_match_FLASH.extendedFrags.fasta | step=04


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/04step_reference.fasta
[main] Real time: 0.050 sec; CPU: 0.005 sec


🔄 Aligning to 04step_reference.fasta ...


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1786 sequences (168310 bp)...
[M::mem_process_seqs] Processed 1786 reads in 0.142 CPU sec, 0.051 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/04step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/250905_batch19_04step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.121 sec; CPU: 0.150 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/250905_batch19_04step_ID_match_FLASH.extendedFrags.sam
🔎 file: 250910_batch20_05step_ID_match_FLASH.extendedFrags.fasta | step=05


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/05step_reference.fasta
[main] Real time: 0.047 sec; CPU: 0.004 sec


🔄 Aligning to 05step_reference.fasta ...


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1178 sequences (137298 bp)...
[M::mem_process_seqs] Processed 1178 reads in 0.143 CPU sec, 0.054 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/05step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/250910_batch20_05step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.110 sec; CPU: 0.150 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/250910_batch20_05step_ID_match_FLASH.extendedFrags.sam
🔎 file: 250910_batch20_06step_ID_match_FLASH.extendedFrags.fasta | step=06


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/06step_reference.fasta
[main] Real time: 0.048 sec; CPU: 0.006 sec


🔄 Aligning to 06step_reference.fasta ...


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 690 sequences (94440 bp)...
[M::mem_process_seqs] Processed 690 reads in 0.116 CPU sec, 0.037 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/06step_reference.fasta fastq_1_2_3_4_5_6/E_fastq_to_fasta/250910_batch20_06step_ID_match_FLASH.extendedFrags.fasta
[main] Real time: 0.075 sec; CPU: 0.121 sec


✅ Done: fastq_1_2_3_4_5_6/1_align_sam/250910_batch20_06step_ID_match_FLASH.extendedFrags.sam


# sam to bam

In [66]:
%%bash

# Set the path to the directory containing SAM files
sam_dir="fastq_1_2_3_4_5_6/1_align_sam"
# Set the output directory for BAM files
bam_dir="fastq_1_2_3_4_5_6/2_align_bam"


# Make sure the output directory exists or create it if necessary
mkdir -p "$bam_dir"

# Convert SAM files to BAM
for sam_file in "$sam_dir"/*.sam; do
    bam_file="$bam_dir/$(basename "$sam_file" .sam).bam"
    samtools view -bS "$sam_file" -o "$bam_file"
    echo "Conversion from $sam_file to $bam_file is complete."
done

Conversion from fastq_1_2_3_4_5_6/1_align_sam/250905_batch19_01step_ID_match_FLASH.extendedFrags.sam to fastq_1_2_3_4_5_6/2_align_bam/250905_batch19_01step_ID_match_FLASH.extendedFrags.bam is complete.
Conversion from fastq_1_2_3_4_5_6/1_align_sam/250905_batch19_02step_ID_match_FLASH.extendedFrags.sam to fastq_1_2_3_4_5_6/2_align_bam/250905_batch19_02step_ID_match_FLASH.extendedFrags.bam is complete.
Conversion from fastq_1_2_3_4_5_6/1_align_sam/250905_batch19_03step_ID_match_FLASH.extendedFrags.sam to fastq_1_2_3_4_5_6/2_align_bam/250905_batch19_03step_ID_match_FLASH.extendedFrags.bam is complete.
Conversion from fastq_1_2_3_4_5_6/1_align_sam/250905_batch19_04step_ID_match_FLASH.extendedFrags.sam to fastq_1_2_3_4_5_6/2_align_bam/250905_batch19_04step_ID_match_FLASH.extendedFrags.bam is complete.
Conversion from fastq_1_2_3_4_5_6/1_align_sam/250910_batch20_05step_ID_match_FLASH.extendedFrags.sam to fastq_1_2_3_4_5_6/2_align_bam/250910_batch20_05step_ID_match_FLASH.extendedFrags.bam is 

# bam file to csv

In [67]:
import os
import pysam
import pandas as pd

# 입력 폴더 (BAM 파일이 위치한 경로)
input_folder = "fastq_1_2_3_4_5_6/2_align_bam"
# 출력 폴더 (CSV 파일을 저장할 경로, 필요하면 변경)
output_folder = "fastq_1_2_3_4_5_6/3_align_csv"


# 출력 폴더가 없으면 생성
os.makedirs(output_folder, exist_ok=True)

# BAM -> CSV 변환 함수 (옵션 필드 포함)
def bam_to_csv(bam_file, output_folder):
    output_csv = os.path.join(output_folder, os.path.basename(bam_file).replace(".bam", ".csv"))
    
    # BAM 파일 읽기
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        records = []
        all_tags = set()  # 옵션 필드를 저장할 집합
        
        for read in bam:
            # 기본 필드
            record = {
                "QNAME": read.query_name,
                "FLAG": read.flag,
                "RNAME": bam.get_reference_name(read.reference_id) if read.reference_id >= 0 else "*",
                "POS": read.reference_start + 1,
                "MAPQ": read.mapping_quality,
                "CIGAR": read.cigarstring if read.cigarstring else "*",
                "RNEXT": bam.get_reference_name(read.next_reference_id) if read.next_reference_id >= 0 else "*",
                "PNEXT": read.next_reference_start + 1 if read.next_reference_start >= 0 else 0,
                "TLEN": read.template_length,
                "SEQ": read.query_sequence if read.query_sequence else "*",
                "QUAL": read.qual if read.qual else "*",
            }
            
            # 옵션 필드 추가
            for tag, value in read.tags:
                record[tag] = value
                all_tags.add(tag)

            records.append(record)
    
    # 데이터프레임 생성
    df = pd.DataFrame(records)

    # 옵션 필드가 없는 경우 NaN으로 처리
    df = df.fillna("*")

    # CSV 저장
    df.to_csv(output_csv, index=False)
    return output_csv

# 폴더에서 모든 BAM 파일 찾기
bam_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".bam")]

# 모든 BAM 파일을 CSV로 변환
csv_files = []
for bam_file in bam_files:
    csv_file = bam_to_csv(bam_file, output_folder)
    csv_files.append(csv_file)

# 변환된 CSV 파일 목록 출력
csv_files

['fastq_1_2_3_4_5_6/3_align_csv/250905_batch19_03step_ID_match_FLASH.extendedFrags.csv',
 'fastq_1_2_3_4_5_6/3_align_csv/250905_batch19_04step_ID_match_FLASH.extendedFrags.csv',
 'fastq_1_2_3_4_5_6/3_align_csv/250905_batch19_02step_ID_match_FLASH.extendedFrags.csv',
 'fastq_1_2_3_4_5_6/3_align_csv/250910_batch20_06step_ID_match_FLASH.extendedFrags.csv',
 'fastq_1_2_3_4_5_6/3_align_csv/250910_batch20_05step_ID_match_FLASH.extendedFrags.csv',
 'fastq_1_2_3_4_5_6/3_align_csv/250905_batch19_01step_ID_match_FLASH.extendedFrags.csv']

# remove MAPQ=0 

In [68]:
import os
import pandas as pd
from pathlib import Path

# 입력/출력 폴더
input_dir = Path("fastq_1_2_3_4_5_6/3_align_csv")
output_dir = input_dir / "MAPQ0_removed"
output_dir.mkdir(parents=True, exist_ok=True)

# 처리 파라미터
CHUNKSIZE = 200_000   # 파일이 크면 값 늘리면 됨

def process_one_csv(in_path: Path, out_path: Path):
    """
    MAPQ=0 행 제거 후 out_path로 저장 (chunked)
    """
    # 기존 파일 있으면 삭제(덮어쓰기)
    if out_path.exists():
        out_path.unlink()

    kept = 0
    removed = 0
    wrote_header = False

    for chunk in pd.read_csv(in_path, chunksize=CHUNKSIZE):
        if "MAPQ" not in chunk.columns:
            print(f"⚠️  Skip (no MAPQ column): {in_path.name}")
            return

        # MAPQ 숫자로 변환(문자/NaN은 보존; NaN은 제거 대상 아님)
        m = pd.to_numeric(chunk["MAPQ"], errors="coerce")
        # keep: MAPQ != 0 또는 NaN
        keep_mask = (m != 0) | m.isna()

        removed += (~keep_mask).sum()
        kept += keep_mask.sum()

        out_chunk = chunk.loc[keep_mask]

        # append 방식으로 저장
        out_chunk.to_csv(out_path, index=False, mode="a", header=not wrote_header)
        wrote_header = True

    print(f"✅ {in_path.name} → {out_path.name} | kept={kept}, removed={removed}")

# 모든 csv 처리
csv_files = sorted(p for p in input_dir.glob("*.csv"))
if not csv_files:
    print(f"⚠️  No CSV files in {input_dir}")
else:
    for p in csv_files:
        process_one_csv(p, output_dir / p.name)

✅ 250905_batch19_01step_ID_match_FLASH.extendedFrags.csv → 250905_batch19_01step_ID_match_FLASH.extendedFrags.csv | kept=2467, removed=61
✅ 250905_batch19_02step_ID_match_FLASH.extendedFrags.csv → 250905_batch19_02step_ID_match_FLASH.extendedFrags.csv | kept=1914, removed=10
✅ 250905_batch19_03step_ID_match_FLASH.extendedFrags.csv → 250905_batch19_03step_ID_match_FLASH.extendedFrags.csv | kept=1783, removed=7
✅ 250905_batch19_04step_ID_match_FLASH.extendedFrags.csv → 250905_batch19_04step_ID_match_FLASH.extendedFrags.csv | kept=1771, removed=15
✅ 250910_batch20_05step_ID_match_FLASH.extendedFrags.csv → 250910_batch20_05step_ID_match_FLASH.extendedFrags.csv | kept=1164, removed=17
✅ 250910_batch20_06step_ID_match_FLASH.extendedFrags.csv → 250910_batch20_06step_ID_match_FLASH.extendedFrags.csv | kept=673, removed=17


# Histogram

In [69]:
import os
import pandas as pd

# 📁 폴더 설정
input_folder = "fastq_1_2_3_4_5_6/3_align_csv/MAPQ0_removed"
histogram_folder = "fastq_1_2_3_4_5_6/4_align_histogram"
os.makedirs(histogram_folder, exist_ok=True)

# 📄 모든 CSV 파일 처리
files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

for file_name in files:
    file_path = os.path.join(input_folder, file_name)

    # 🔧 파일명 클렌징 (특정 문자열 제거)
    clean_name = file_name
    clean_name = clean_name.replace("assemble", "")
    clean_name = clean_name.replace("ID_match_FLASH.extendedFrags", "")
    clean_name = clean_name.replace("__", "_").strip("_")  # 중복/끝 _ 제거
    output_csv = os.path.join(histogram_folder, f"histogram_{clean_name}")

    try:
        df = pd.read_csv(file_path, dtype=str)
        if 'RNAME' not in df.columns:
            print(f"⚠️ Skipping file: {file_name} (no 'RNAME' column found)")
            continue

        # RNAME 집계 및 정규화
        rname_counts = df['RNAME'].value_counts().reset_index()
        rname_counts.columns = ['RNAME', 'Count']
        rname_counts.insert(0, 'File_Name', clean_name)
        rname_counts['Count'] = rname_counts['Count'].astype(int)
        total_count = rname_counts['Count'].sum()
        rname_counts['Normalized_Count'] = rname_counts['Count'] / total_count

        rname_counts.to_csv(output_csv, index=False)
        print(f"✅ Saved cleaned RNAME histogram: {output_csv}")

    except Exception as e:
        print(f"❌ Error processing file '{file_name}': {e}")

✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_250905_batch19_01step_.csv
✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_250910_batch20_05step_.csv
✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_250910_batch20_06step_.csv
✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_250905_batch19_02step_.csv
✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_250905_batch19_04step_.csv
✅ Saved cleaned RNAME histogram: fastq_1_2_3_4_5_6/4_align_histogram/histogram_250905_batch19_03step_.csv


In [70]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# 📁 폴더 설정
histogram_folder = "fastq_1_2_3_4_5_6/4_align_histogram"
summary_folder = "fastq_1_2_3_4_5_6/4_align_histogram/graph_top5"
os.makedirs(summary_folder, exist_ok=True)

# 🔴 하이라이트 매핑 (suffix 기반)
highlight_mapping = {
    "_01step": "seq_0001_1",
    "_02step": "seq_0002_10",
    "_03step": "seq_0005_101",
    "_04step": "seq_0010_1010",
    "_05step": "seq_0021_10101",
    "_06step": "seq_0042_101010",
    "_07step": "seq_0085_1010101",
    "_08step": "seq_0170_10101010",
}

# 📄 CSV 파일 리스트
csv_files = [f for f in os.listdir(histogram_folder) if f.startswith("histogram_") and f.endswith(".csv")]

# 🔁 파일 반복 처리
for file_name in csv_files:
    file_path = os.path.join(histogram_folder, file_name)
    try:
        df = pd.read_csv(file_path)
        if 'RNAME' not in df.columns or 'Normalized_Count' not in df.columns:
            print(f"⚠️ Skipping file: {file_name} (missing column)")
            continue

        # Top 5 RNAME 추출
        top_df = df.sort_values(by="Count", ascending=False).head(5).reset_index(drop=True)
        sample_name = file_name.replace("histogram_", "").replace(".csv", "")

        # 🔍 suffix 기반 하이라이트 RNAME 찾기
        highlight_rname = None
        for suffix, rname in highlight_mapping.items():
            if suffix in file_name:
                highlight_rname = rname
                break

        # 📊 그래프 생성
        plt.figure(figsize=(10, 6))
        bars = plt.bar(top_df["RNAME"], top_df["Normalized_Count"], color='blue')

        # 🔴 매칭되는 RNAME은 빨강색으로
        for bar, rname in zip(bars, top_df["RNAME"]):
            if rname == highlight_rname:
                bar.set_color('red')

        plt.title(f"Top 5 RNAME Histogram - {sample_name}")
        plt.xlabel("RNAME")
        plt.ylabel("Normalized Count")
        plt.xticks(rotation=45)
        plt.ylim(0, 1)
        plt.tight_layout()

        # 💾 저장
        output_png = os.path.join(summary_folder, file_name.replace(".csv", ".png"))
        output_svg = os.path.join(summary_folder, file_name.replace(".csv", ".svg"))
        plt.savefig(output_png)
        plt.savefig(output_svg)
        plt.close()

        print(f"✅ Saved plot: {output_png}, {output_svg}")

    except Exception as e:
        print(f"❌ Error processing {file_name}: {e}")

✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250910_batch20_05step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250910_batch20_05step_.svg
✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250905_batch19_01step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250905_batch19_01step_.svg
✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250905_batch19_04step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250905_batch19_04step_.svg
✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250910_batch20_06step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250910_batch20_06step_.svg
✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250905_batch19_03step_.png, fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250905_batch19_03step_.svg
✅ Saved plot: fastq_1_2_3_4_5_6/4_align_histogram/graph_top5/histogram_250905_batch19

In [71]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

# 폴더 설정
histogram_folder = "fastq_1_2_3_4_5_6/4_align_histogram"
summary_folder = "fastq_1_2_3_4_5_6/5_align_summary"
os.makedirs(summary_folder, exist_ok=True)

# highlight 매핑 (접미사 기준)
highlight_mapping = {
    "_01step": "seq_0001_1",
    "_02step": "seq_0002_10",
    "_03step": "seq_0005_101",
    "_04step": "seq_0010_1010",
    "_05step": "seq_0021_10101",
    "_06step": "seq_0042_101010",
    "_07step": "seq_0085_1010101",
    "_08step": "seq_0170_10101010",
}

# 회색 → 흰색 그라데이션 색상 함수
def blend_color(base_rgb, t):
    white = np.array([255, 255, 255])
    base = np.array(base_rgb)
    blended = (1 - t) * base + t * white
    return tuple(blended / 255)

base_rgb = (137, 137, 138)

# step 번호를 추출하여 오름차순 정렬을 위한 함수
def extract_step_number(name):
    match = re.search(r'_(\d+)step', name)
    return int(match.group(1)) if match else float('inf')

# sample별 데이터 로딩
sample_rname_dfs = {}
for file_name in os.listdir(histogram_folder):
    if file_name.startswith("histogram_") and file_name.endswith(".csv"):
        sample_name = file_name.replace("histogram_", "").replace(".csv", "")
        df = pd.read_csv(os.path.join(histogram_folder, file_name))
        if 'RNAME' not in df.columns or 'Count' not in df.columns:
            continue
        df['Sample'] = sample_name
        df['Count'] = df['Count'].astype(int)
        df['Normalized_Count'] = df['Count'] / df['Count'].sum()
        df = df.sort_values(by='Count', ascending=False).reset_index(drop=True)
        sample_rname_dfs[sample_name] = df

# sample_name을 step 기준으로 정렬
sorted_samples = sorted(sample_rname_dfs.items(), key=lambda x: extract_step_number(x[0]))

# 시각화
fig, ax = plt.subplots(figsize=(24, 12))

for sample_idx, (sample_name, df) in enumerate(sorted_samples):
    # highlight RNAME 찾기
    highlight_rname = None
    for suffix, rname in highlight_mapping.items():
        if suffix in sample_name:  # 정확한 끝이 아니라 포함 여부로 수정
            highlight_rname = rname
            break

    bottom = 0
    top_n = 5
    rest_sum = 0

    for rank, row in df.iterrows():
        rname = row['RNAME']
        height = row['Normalized_Count']

        if rname == highlight_rname:
            ax.bar(sample_name, height, bottom=bottom, color='red', edgecolor='black', linewidth=0.2)
            bottom += height
        elif rank < top_n:
            t = rank / (top_n - 1) if top_n > 1 else 0
            color = blend_color(base_rgb, t)
            ax.bar(sample_name, height, bottom=bottom, color=color, edgecolor='black', linewidth=0.2)
            bottom += height
        else:
            rest_sum += height

    if rest_sum > 0:
        ax.bar(sample_name, rest_sum, bottom=bottom, color='white', edgecolor='black', linewidth=0.2)

# 보조선, 스타일
ax.axhline(y=0.5, color='gray', linestyle='--', linewidth=1, label='y = 0.5')
ax.set_ylabel("Normalized Count", fontsize=20)
ax.set_xlabel("Sample", fontsize=20)
ax.set_title("Stacked Bar Chart (Red = Highlight, Gray→White = Top 5, Rest = One White Box)", fontsize=16)
ax.tick_params(axis='x', labelsize=20)
ax.tick_params(axis='y', labelsize=20)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# 저장
png_path = os.path.join(summary_folder, "stacked_bar_top5_gray_rest_white_box.png")
svg_path = os.path.join(summary_folder, "stacked_bar_top5_gray_rest_white_box.svg")
plt.savefig(png_path)
plt.savefig(svg_path)
plt.close()

print(f"✅ 저장 완료:\n - PNG: {png_path}\n - SVG: {svg_path}")

✅ 저장 완료:
 - PNG: fastq_1_2_3_4_5_6/5_align_summary/stacked_bar_top5_gray_rest_white_box.png
 - SVG: fastq_1_2_3_4_5_6/5_align_summary/stacked_bar_top5_gray_rest_white_box.svg


In [72]:
import os
import pandas as pd
import re

# === Highlight mapping (suffix -> RNAME) ===
highlight_mapping = {
    "_01step": "seq_0001_1",
    "_02step": "seq_0002_10",
    "_03step": "seq_0005_101",
    "_04step": "seq_0010_1010",
    "_05step": "seq_0021_10101",
    "_06step": "seq_0042_101010",
    "_07step": "seq_0085_1010101",
    "_08step": "seq_0170_10101010",
}

# === 폴더 설정 ===
histogram_folder = "fastq_1_2_3_4_5_6/4_align_histogram"
summary_folder = "fastq_1_2_3_4_5_6/5_align_summary"
os.makedirs(summary_folder, exist_ok=True)
highlight_result_csv = os.path.join(summary_folder, "highlight_result.csv")

# === step 번호 추출 함수 ===
def extract_step_number(filename):
    match = re.search(r"_(\d+)step", filename)
    return int(match.group(1)) if match else float("inf")

# === Highlight 요약 정보 수집 ===
highlight_data = []
csv_files = [f for f in os.listdir(histogram_folder) if f.startswith("histogram_") and f.endswith(".csv")]

for file in csv_files:
    file_path = os.path.join(histogram_folder, file)
    try:
        df = pd.read_csv(file_path)
        file_name = file.replace("histogram_", "")

        # suffix 기반 highlight_rname 추출
        highlight_rname = ""
        for suffix, rname in highlight_mapping.items():
            if suffix in file_name:
                highlight_rname = rname
                break

        df['Count'] = df['Count'].astype(int)
        total_count = df['Count'].sum()

        highlight_count = df[df['RNAME'] == highlight_rname]['Count'].sum() if highlight_rname else 0
        highlight_percentage = (highlight_count / total_count) * 100 if total_count > 0 else 0

        sorted_counts = df['Count'].sort_values(ascending=False).values
        second_max_count = sorted_counts[1] if len(sorted_counts) >= 2 else (sorted_counts[0] if len(sorted_counts) == 1 else 0)
        highlight_vs_second_ratio = (highlight_count / second_max_count) if second_max_count > 0 else 0

        highlight_data.append([
            file_name,
            highlight_count,
            total_count,
            round(highlight_percentage, 2),
            highlight_rname,
            round(highlight_vs_second_ratio, 3),
            extract_step_number(file_name)
        ])

    except Exception as e:
        print(f"❌ Error processing file '{file}': {e}")

# === DataFrame 생성 및 step 기준 정렬 후 저장 ===
highlight_df = pd.DataFrame(highlight_data, columns=[
    'File',
    'Highlight_Count',
    'Total_Count',
    'Highlight_Percentage',
    'Highlight_RNAMEs',
    'Highlight_vs_SecondTop_Ratio',
    'Step_Number'
])

highlight_df = highlight_df.sort_values(by='Step_Number').drop(columns='Step_Number')
highlight_df.to_csv(highlight_result_csv, index=False)

print(f"📌 Highlight summary saved to: {highlight_result_csv}")

📌 Highlight summary saved to: fastq_1_2_3_4_5_6/5_align_summary/highlight_result.csv
