# **Install modules**

In [1]:
# Install modules
!sudo pip3 install biopython --break-system-packages
!sudo apt-get install fastp 
!sudo apt-get update 
!sudo apt-get install flash
!sudo pip3 install cutadapt --break-system-packages
!sudo apt-get install bwa 
!sudo pip3 install pysam --break-system-packages
!sudo apt-get install samtools 

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fastp is already the newest version (0.23.4+dfsg-1).
The following packages were automatically installed and are no longer required:
  pigz python3-xopen
Use 'sudo apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and 265 not upgraded.
Hit:1 https://packages.microsoft.com/repos/code stable InRelease               
Hit:2 http://ports.ubuntu.com/ubuntu-ports noble InRelease                     
Hit:3 http://ports.ubuntu.com/ubuntu-ports noble-updates InRelease             
Hit:4 http://ports.ubuntu.com/ubuntu-ports noble-backports InRelease
Hit:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu noble InRelease
Hit:6 http://ports.ubuntu.com/ubuntu-ports noble-security InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
flash is already the newest version (1.2.11-2).
The following

# Trimming and Discard

In [47]:
import subprocess
import glob
import os

# Specify the folder containing your input files
# Specify the folder where you want to save the untrimmed sequences (adapter-free sequences)

input_folder = "fastq_7_8_9_10_11_12"
untrimmed_output_folder = "fastq_7_8_9_10_11_12/A_Untrimmed_output"

# Define the adapter sequences for R1 and R2
adapter_sequence_r1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
adapter_sequence_r2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"

# Use glob to get a list of all input file pairs (R1 and R2) in the folder
input_file_pairs = []
for input_r1 in glob.glob(os.path.join(input_folder, "*_R1.fastq.gz")):
    # Assuming R2 files have the same naming format as R1 files
    input_r2 = input_r1.replace("_R1.fastq.gz", "_R2.fastq.gz")
    if os.path.exists(input_r2):  # Ensure R2 file exists
        input_file_pairs.append({"r1": input_r1, "r2": input_r2})

# Create the output folder if it doesn't exist
os.makedirs(untrimmed_output_folder, exist_ok=True)

for input_files in input_file_pairs:
    input_r1 = input_files["r1"]
    input_r2 = input_files["r2"]

    # Define output file paths for untrimmed (clean, adapter-free) sequences
    untrimmed_r1 = os.path.join(untrimmed_output_folder, os.path.basename(input_r1).replace(".fastq.gz", "_untrimmed.fastq.gz"))
    untrimmed_r2 = os.path.join(untrimmed_output_folder, os.path.basename(input_r2).replace(".fastq.gz", "_untrimmed.fastq.gz"))

    # Use cutadapt to keep only untrimmed sequences (completely adapter-free)
    result = subprocess.run([
        "cutadapt",
        "-a", adapter_sequence_r1,  # Adapter for R1
        "-A", adapter_sequence_r2,  # Adapter for R2
        "-O", "15",  # Minimum overlap for adapter trimming
        "--discard-trimmed",  # Discard sequences where trimming occurred
        "-o", untrimmed_r1,  # Save only untrimmed R1 reads
        "-p", untrimmed_r2,  # Save only untrimmed R2 reads
        input_r1, input_r2
    ], capture_output=True, text=True)

    # Log result
    if result.returncode == 0:
        print(f"Untrimmed sequences saved: {untrimmed_r1}, {untrimmed_r2}")
    else:
        print(f"Error processing {input_r1} and {input_r2}:\n{result.stderr}")

Untrimmed sequences saved: fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_09step_R1_untrimmed.fastq.gz, fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_09step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_10step_R1_untrimmed.fastq.gz, fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_10step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_07step_R1_untrimmed.fastq.gz, fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_07step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_7_8_9_10_11_12/A_Untrimmed_output/250905_batch19_08step_R1_untrimmed.fastq.gz, fastq_7_8_9_10_11_12/A_Untrimmed_output/250905_batch19_08step_R2_untrimmed.fastq.gz
Untrimmed sequences saved: fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_08step_R1_untrimmed.fastq.gz, fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_08step_R2_untrimmed.fastq.gz


# Quality check

In [48]:
# import os
# import gzip
# from Bio import SeqIO
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# # 📁 입력 폴더와 출력 폴더 설정
# input_folder = "fastq_step/1_3_5_7_9_11/A_Untrimmed_output"
# output_csv_folder = "fastq_step/1_3_5_7_9_11/A_Untrimmed_output/quality_stats_csv"
# output_plot_folder = "fastq_step/1_3_5_7_9_11/A_Untrimmed_output/quality_plots"

# os.makedirs(output_csv_folder, exist_ok=True)
# os.makedirs(output_plot_folder, exist_ok=True)

# # 🔁 품질 통계 추출 함수
# def compute_quality_stats(file_path):
#     position_qualities = {}
#     open_func = gzip.open if file_path.endswith(".gz") else open

#     with open_func(file_path, "rt") as handle:
#         for record in SeqIO.parse(handle, "fastq"):
#             for i, q in enumerate(record.letter_annotations["phred_quality"]):
#                 position_qualities.setdefault(i, []).append(q)

#     stats = []
#     for pos in sorted(position_qualities):
#         scores = np.array(position_qualities[pos])
#         stats.append({
#             "position": pos + 1,
#             "mean": np.mean(scores),
#             "q1": np.percentile(scores, 25),
#             "median": np.median(scores),
#             "q3": np.percentile(scores, 75),
#             "min": np.min(scores),
#             "max": np.max(scores)
#         })
#     return pd.DataFrame(stats)

# # 📊 배경 색상 함수 (fastp 스타일)
# def add_quality_background(ax):
#     ax.axhspan(30, 40, facecolor='lightgreen', alpha=0.5)
#     ax.axhspan(25, 30, facecolor='khaki', alpha=0.5)
#     ax.axhspan(20, 25, facecolor='moccasin', alpha=0.5)
#     ax.axhspan(0, 20, facecolor='lightcoral', alpha=0.5)

# # 📂 폴더 내 모든 FASTQ(.gz 포함) 처리
# for filename in os.listdir(input_folder):
#     if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
#         input_path = os.path.join(input_folder, filename)
#         sample_name = os.path.splitext(filename)[0].replace(".fastq", "").replace(".gz", "")

#         print(f"📌 Processing: {sample_name}")
#         df = compute_quality_stats(input_path)

#         # CSV 저장
#         csv_path = os.path.join(output_csv_folder, f"{sample_name}_quality.csv")
#         df.to_csv(csv_path, index=False)

#         # 그래프 저장
#         plt.figure(figsize=(18, 8))
#         ax = plt.gca()
#         add_quality_background(ax)
#         plt.plot(df["position"], df["mean"], color="blue", linewidth=1.5, label="Mean Quality")

#         for i in range(len(df)):
#             x = df.loc[i, "position"]
#             q1 = df.loc[i, "q1"]
#             q3 = df.loc[i, "q3"]
#             plt.fill_between([x - 0.4, x + 0.4], [q1, q1], [q3, q3], color="yellow", edgecolor="black")

#         plt.vlines(df["position"], df["min"], df["max"], color="black", linewidth=0.5)
#         plt.title(f"Quality scores across all bases: {sample_name}", fontsize=14)
#         plt.xlabel("Position in read (bp)", fontsize=12)
#         plt.ylabel("Quality score", fontsize=12)
#         plt.ylim(0, 40)
#         plt.xlim(1, df["position"].max())
#         plt.legend()
#         plt.tight_layout()
#         plot_path = os.path.join(output_plot_folder, f"{sample_name}_quality_plot.png")
#         plt.savefig(plot_path, dpi=300)
#         plt.close()

#         print(f"✅ Saved: {sample_name}_quality.csv and quality_plot.png")

# read count check

In [49]:
# import os
# import gzip
# from Bio import SeqIO

# # 분석 대상 폴더
# input_folder = "fastq_step/1_3_5_7_9_11/A_Untrimmed_output"  # 여기에 대상 폴더 경로를 입력하세요

# # 허용 확장자
# valid_extensions = [".fastq", ".fastq.gz", ".fasta", ".fasta.gz"]

# # 포맷 결정 함수
# def get_format(filename):
#     if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
#         return "fastq"
#     elif filename.endswith(".fasta") or filename.endswith(".fasta.gz"):
#         return "fasta"
#     else:
#         return None

# # 결과 저장 리스트
# read_counts = []

# # 파일 순회 및 read 수 카운트
# for filename in os.listdir(input_folder):
#     if any(filename.endswith(ext) for ext in valid_extensions):
#         file_path = os.path.join(input_folder, filename)
#         file_format = get_format(filename)
#         open_func = gzip.open if filename.endswith(".gz") else open

#         try:
#             with open_func(file_path, "rt") as handle:
#                 count = sum(1 for _ in SeqIO.parse(handle, file_format))
#             read_counts.append((filename, count))
#         except Exception as e:
#             print(f"❌ Error processing {filename}: {e}")

# # 📄 파일명 기준 정렬 후 출력
# read_counts.sort(key=lambda x: x[0].lower())  # 파일명 기준 (대소문자 구분 없이) 정렬
# for fname, count in read_counts:
#     print(f"{fname:40} : {count} reads")

# length check

In [50]:
import os
import gzip
from Bio import SeqIO

# 분석 대상 폴더
input_folder = "fastq_7_8_9_10_11_12/A_Untrimmed_output"  # 여기에 대상 폴더 경로를 입력하세요

# 허용 확장자
valid_extensions = [".fastq", ".fastq.gz"]

# 파일 포맷 결정 함수
def get_format(filename):
    if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
        return "fastq"
    else:
        return None

# 결과 저장 리스트
read_stats = []

# 파일 순회 및 분석
for filename in os.listdir(input_folder):
    if any(filename.endswith(ext) for ext in valid_extensions):
        file_path = os.path.join(input_folder, filename)
        file_format = get_format(filename)
        open_func = gzip.open if filename.endswith(".gz") else open

        try:
            total_len = 0
            read_count = 0
            with open_func(file_path, "rt") as handle:
                for record in SeqIO.parse(handle, file_format):
                    total_len += len(record.seq)
                    read_count += 1
            avg_length = total_len / read_count if read_count > 0 else 0
            read_stats.append((filename, read_count, round(avg_length, 2)))
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

# 정렬 후 출력
read_stats.sort(key=lambda x: x[0].lower())
for fname, count, avg_len in read_stats:
    print(f"{fname:40} : {count:6} reads, Avg Length = {avg_len:6} bp")

250905_batch19_08step_R1_untrimmed.fastq.gz :   1903 reads, Avg Length =  151.0 bp
250905_batch19_08step_R2_untrimmed.fastq.gz :   1903 reads, Avg Length =  151.0 bp
250910_batch20_07step_R1_untrimmed.fastq.gz :   1829 reads, Avg Length =  151.0 bp
250910_batch20_07step_R2_untrimmed.fastq.gz :   1829 reads, Avg Length =  151.0 bp
250910_batch20_08step_R1_untrimmed.fastq.gz :   1829 reads, Avg Length =  151.0 bp
250910_batch20_08step_R2_untrimmed.fastq.gz :   1829 reads, Avg Length =  151.0 bp
250910_batch20_09step_R1_untrimmed.fastq.gz :   1956 reads, Avg Length =  151.0 bp
250910_batch20_09step_R2_untrimmed.fastq.gz :   1956 reads, Avg Length =  151.0 bp
250910_batch20_10step_R1_untrimmed.fastq.gz :   1695 reads, Avg Length =  151.0 bp
250910_batch20_10step_R2_untrimmed.fastq.gz :   1695 reads, Avg Length =  151.0 bp


# Q filtering

In [70]:
import os
import subprocess

# 품질 기준(Q30)
quality_threshold = 30

# 입력 폴더와 출력 폴더 설정

input_folder = "fastq_7_8_9_10_11_12/A_Untrimmed_output"
output_folder = "fastq_7_8_9_10_11_12/B_Qfiltered"

os.makedirs(output_folder, exist_ok=True)  # 출력 폴더가 없으면 생성

# 입력 폴더 내 파일들을 순회하며, "_trimmed.fastq.gz"로 끝나는 파일만 처리
for filename in os.listdir(input_folder):
    if filename.endswith("_untrimmed.fastq.gz"):
        # 입력 파일 경로
        input_file = os.path.join(input_folder, filename)
        
        # 출력 파일 이름(예: sample_trimmed.fastq.gz -> sample_trimmed_filtered.fastq.gz)
        output_file = os.path.join(
            output_folder, 
            filename.replace("_untrimmed.fastq.gz", "_Qfiltered.fastq.gz")
        )
        
        # fastp 실행 (싱글 엔드 모드)
        subprocess.call([
            "fastp",
            "-i", input_file,               # 입력 파일
            "-o", output_file,              # 출력 파일
            "-q", str(quality_threshold),   # Q30 미만 품질 제거
            "-u", "15",                      # low-quality base 비율 20% 이상이면 read 제거
            "-l", "151",                      # 최소 read 길이
            "--cut_mean_quality", "30",     # 평균 Q<30이면 read 제거
            "--html", f"{output_file}.html",  # HTML 리포트
            "--json", f"{output_file}.json"   # JSON 리포트
        ])
        
        print(f"Filtering for {filename} is complete.\n"
              f"Output FASTQ : {output_file}\n"
              f"Reports      : {output_file}.html / {output_file}.json\n")

print("All filtering processes are done.")

Detecting adapter sequence for read1...
No adapter detected for read1

Read1 before filtering:
total reads: 1956
total bases: 295356
Q20 bases: 256879(86.9727%)
Q30 bases: 231776(78.4734%)

Read1 after filtering:
total reads: 852
total bases: 128652
Q20 bases: 125281(97.3798%)
Q30 bases: 119457(92.8528%)

Filtering result:
reads passed filter: 852
reads failed due to low quality: 1104
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 27.2495%

JSON report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_09step_R1_untrimmed.fastq.gz -o fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --h

Filtering for 250910_batch20_09step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R1_Qfiltered.fastq.gz
Reports      : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R1_Qfiltered.fastq.gz.html / fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 1956
total bases: 295356
Q20 bases: 254346(86.1151%)
Q30 bases: 226989(76.8527%)

Read1 after filtering:
total reads: 692
total bases: 104492
Q20 bases: 100693(96.3643%)
Q30 bases: 94792(90.717%)

Filtering result:
reads passed filter: 692
reads failed due to low quality: 1264
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 17.0245%

JSON report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_09step_R2_untrimmed.fastq.gz -o fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R2_Qfiltered.f

Filtering for 250910_batch20_09step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R2_Qfiltered.fastq.gz
Reports      : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R2_Qfiltered.fastq.gz.html / fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_09step_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 1903
total bases: 287353
Q20 bases: 253719(88.2952%)
Q30 bases: 230302(80.146%)

Read1 after filtering:
total reads: 899
total bases: 135749
Q20 bases: 130905(96.4316%)
Q30 bases: 123946(91.3053%)

Filtering result:
reads passed filter: 899
reads failed due to low quality: 1004
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 26.3269%

JSON report: fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_7_8_9_10_11_12/A_Untrimmed_output/250905_batch19_08step_R2_untrimmed.fastq.gz -o fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R2_Qfiltered.

Filtering for 250905_batch19_08step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R2_Qfiltered.fastq.gz
Reports      : fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R2_Qfiltered.fastq.gz.html / fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 1903
total bases: 287353
Q20 bases: 261970(91.1666%)
Q30 bases: 241965(84.2048%)

Read1 after filtering:
total reads: 1167
total bases: 176217
Q20 bases: 172491(97.8856%)
Q30 bases: 165465(93.8984%)

Filtering result:
reads passed filter: 1167
reads failed due to low quality: 736
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 46.6106%

JSON report: fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_7_8_9_10_11_12/A_Untrimmed_output/250905_batch19_08step_R1_untrimmed.fastq.gz -o fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R1_Qfiltere

Filtering for 250905_batch19_08step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R1_Qfiltered.fastq.gz
Reports      : fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R1_Qfiltered.fastq.gz.html / fastq_7_8_9_10_11_12/B_Qfiltered/250905_batch19_08step_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 1695
total bases: 255945
Q20 bases: 216727(84.6772%)
Q30 bases: 192828(75.3396%)

Read1 after filtering:
total reads: 572
total bases: 86372
Q20 bases: 83774(96.9921%)
Q30 bases: 79404(91.9326%)

Filtering result:
reads passed filter: 572
reads failed due to low quality: 1123
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 18.6431%

JSON report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_10step_R1_untrimmed.fastq.gz -o fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R1_Qfiltered.fa

Filtering for 250910_batch20_10step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R1_Qfiltered.fastq.gz
Reports      : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R1_Qfiltered.fastq.gz.html / fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 1695
total bases: 255945
Q20 bases: 216366(84.5361%)
Q30 bases: 190719(74.5156%)

Read1 after filtering:
total reads: 430
total bases: 64930
Q20 bases: 62401(96.105%)
Q30 bases: 58299(89.7875%)

Filtering result:
reads passed filter: 430
reads failed due to low quality: 1265
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 12.6844%

JSON report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_10step_R2_untrimmed.fastq.gz -o fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R2_Qfiltered.fas

Filtering for 250910_batch20_10step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R2_Qfiltered.fastq.gz
Reports      : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R2_Qfiltered.fastq.gz.html / fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_10step_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 1829
total bases: 276179
Q20 bases: 250103(90.5583%)
Q30 bases: 228867(82.8691%)

Read1 after filtering:
total reads: 1042
total bases: 157342
Q20 bases: 154282(98.0552%)
Q30 bases: 147600(93.8084%)

Filtering result:
reads passed filter: 1042
reads failed due to low quality: 787
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 44.5599%

JSON report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_07step_R1_untrimmed.fastq.gz -o fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R1_Qfiltere

Filtering for 250910_batch20_07step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R1_Qfiltered.fastq.gz
Reports      : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R1_Qfiltered.fastq.gz.html / fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R1_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 1829
total bases: 276179
Q20 bases: 240704(87.1551%)
Q30 bases: 215126(77.8937%)

Read1 after filtering:
total reads: 696
total bases: 105096
Q20 bases: 101193(96.2863%)
Q30 bases: 94970(90.365%)

Filtering result:
reads passed filter: 696
reads failed due to low quality: 1133
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 21.3778%

JSON report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_08step_R2_untrimmed.fastq.gz -o fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R2_Qfiltered.f

Filtering for 250910_batch20_08step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R2_Qfiltered.fastq.gz
Reports      : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R2_Qfiltered.fastq.gz.html / fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R2_Qfiltered.fastq.gz.json



Read1 before filtering:
total reads: 1829
total bases: 276179
Q20 bases: 242175(87.6877%)
Q30 bases: 218794(79.2218%)

Read1 after filtering:
total reads: 838
total bases: 126538
Q20 bases: 123192(97.3557%)
Q30 bases: 117361(92.7476%)

Filtering result:
reads passed filter: 838
reads failed due to low quality: 991
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 31.5473%

JSON report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R1_Qfiltered.fastq.gz.json
HTML report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R1_Qfiltered.fastq.gz.html

fastp -i fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_08step_R1_untrimmed.fastq.gz -o fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R1_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R1_Qfiltered.

Filtering for 250910_batch20_08step_R1_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R1_Qfiltered.fastq.gz
Reports      : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R1_Qfiltered.fastq.gz.html / fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_08step_R1_Qfiltered.fastq.gz.json

Filtering for 250910_batch20_07step_R2_untrimmed.fastq.gz is complete.
Output FASTQ : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R2_Qfiltered.fastq.gz
Reports      : fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R2_Qfiltered.fastq.gz.html / fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R2_Qfiltered.fastq.gz.json

All filtering processes are done.


Read1 before filtering:
total reads: 1829
total bases: 276179
Q20 bases: 246099(89.1085%)
Q30 bases: 221783(80.3041%)

Read1 after filtering:
total reads: 909
total bases: 137259
Q20 bases: 132728(96.6989%)
Q30 bases: 124722(90.8662%)

Filtering result:
reads passed filter: 909
reads failed due to low quality: 920
reads failed due to too many N: 0
reads failed due to too short: 0
reads with adapter trimmed: 0
bases trimmed due to adapters: 0

Duplication rate (may be overestimated since this is SE data): 36.9054%

JSON report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R2_Qfiltered.fastq.gz.json
HTML report: fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R2_Qfiltered.fastq.gz.html

fastp -i fastq_7_8_9_10_11_12/A_Untrimmed_output/250910_batch20_07step_R2_untrimmed.fastq.gz -o fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R2_Qfiltered.fastq.gz -q 30 -u 15 -l 151 --cut_mean_quality 30 --html fastq_7_8_9_10_11_12/B_Qfiltered/250910_batch20_07step_R2_Qfiltered.

# Quality check

In [71]:
# import os
# import gzip
# from Bio import SeqIO
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# # 📁 입력 폴더와 출력 폴더 설정
# input_folder = "fastq_step/1_3_5_7_9_11/B_Qfiltered"
# output_csv_folder = "fastq_step/1_3_5_7_9_11/B_Qfiltered/quality_stats_csv"
# output_plot_folder = "fastq_step/1_3_5_7_9_11/B_Qfiltered/quality_plots"
# os.makedirs(output_csv_folder, exist_ok=True)
# os.makedirs(output_plot_folder, exist_ok=True)

# # 🔁 품질 통계 추출 함수
# def compute_quality_stats(file_path):
#     position_qualities = {}
#     open_func = gzip.open if file_path.endswith(".gz") else open

#     with open_func(file_path, "rt") as handle:
#         for record in SeqIO.parse(handle, "fastq"):
#             for i, q in enumerate(record.letter_annotations["phred_quality"]):
#                 position_qualities.setdefault(i, []).append(q)

#     stats = []
#     for pos in sorted(position_qualities):
#         scores = np.array(position_qualities[pos])
#         stats.append({
#             "position": pos + 1,
#             "mean": np.mean(scores),
#             "q1": np.percentile(scores, 25),
#             "median": np.median(scores),
#             "q3": np.percentile(scores, 75),
#             "min": np.min(scores),
#             "max": np.max(scores)
#         })
#     return pd.DataFrame(stats)

# # 📊 배경 색상 함수 (fastp 스타일)
# def add_quality_background(ax):
#     ax.axhspan(30, 40, facecolor='lightgreen', alpha=0.5)
#     ax.axhspan(25, 30, facecolor='khaki', alpha=0.5)
#     ax.axhspan(20, 25, facecolor='moccasin', alpha=0.5)
#     ax.axhspan(0, 20, facecolor='lightcoral', alpha=0.5)

# # 📂 폴더 내 모든 FASTQ(.gz 포함) 처리
# for filename in os.listdir(input_folder):
#     if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
#         input_path = os.path.join(input_folder, filename)
#         sample_name = os.path.splitext(filename)[0].replace(".fastq", "").replace(".gz", "")

#         print(f"📌 Processing: {sample_name}")
#         df = compute_quality_stats(input_path)

#         # CSV 저장
#         csv_path = os.path.join(output_csv_folder, f"{sample_name}_quality.csv")
#         df.to_csv(csv_path, index=False)

#         # 그래프 저장
#         plt.figure(figsize=(18, 8))
#         ax = plt.gca()
#         add_quality_background(ax)
#         plt.plot(df["position"], df["mean"], color="blue", linewidth=1.5, label="Mean Quality")

#         for i in range(len(df)):
#             x = df.loc[i, "position"]
#             q1 = df.loc[i, "q1"]
#             q3 = df.loc[i, "q3"]
#             plt.fill_between([x - 0.4, x + 0.4], [q1, q1], [q3, q3], color="yellow", edgecolor="black")

#         plt.vlines(df["position"], df["min"], df["max"], color="black", linewidth=0.5)
#         plt.title(f"Quality scores across all bases: {sample_name}", fontsize=14)
#         plt.xlabel("Position in read (bp)", fontsize=12)
#         plt.ylabel("Quality score", fontsize=12)
#         plt.ylim(0, 40)
#         plt.xlim(1, df["position"].max())
#         plt.legend()
#         plt.tight_layout()
#         plot_path = os.path.join(output_plot_folder, f"{sample_name}_quality_plot.png")
#         plt.savefig(plot_path, dpi=300)
#         plt.close()

#         print(f"✅ Saved: {sample_name}_quality.csv and quality_plot.png")

# read count check

In [72]:
# import os
# import gzip
# from Bio import SeqIO

# # 분석 대상 폴더
# input_folder = "fastq_step/1_3_5_7_9_11/B_Qfiltered"  # 여기에 대상 폴더 경로를 입력하세요

# # 허용 확장자
# valid_extensions = [".fastq", ".fastq.gz", ".fasta", ".fasta.gz"]

# # 포맷 결정 함수
# def get_format(filename):
#     if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
#         return "fastq"
#     elif filename.endswith(".fasta") or filename.endswith(".fasta.gz"):
#         return "fasta"
#     else:
#         return None

# # 결과 저장 리스트
# read_counts = []

# # 파일 순회 및 read 수 카운트
# for filename in os.listdir(input_folder):
#     if any(filename.endswith(ext) for ext in valid_extensions):
#         file_path = os.path.join(input_folder, filename)
#         file_format = get_format(filename)
#         open_func = gzip.open if filename.endswith(".gz") else open

#         try:
#             with open_func(file_path, "rt") as handle:
#                 count = sum(1 for _ in SeqIO.parse(handle, file_format))
#             read_counts.append((filename, count))
#         except Exception as e:
#             print(f"❌ Error processing {filename}: {e}")

# # 📄 파일명 기준 정렬 후 출력
# read_counts.sort(key=lambda x: x[0].lower())  # 파일명 기준 (대소문자 구분 없이) 정렬
# for fname, count in read_counts:
#     print(f"{fname:40} : {count} reads")

# ID matching

In [73]:
import gzip
import glob
import os

def extract_matching_reads(r1_path, r2_path, out_r1_path, out_r2_path):
    def get_read_id(header):
        # FASTQ header에서 ID 추출
        return header.split()[0].replace('/1', '').replace('/2', '')

    r1_ids = set()
    r2_ids = set()

    with gzip.open(r1_path, 'rt') as r1_file:
        while True:
            header = r1_file.readline()
            if not header:
                break
            r1_ids.add(get_read_id(header.strip()))
            [r1_file.readline() for _ in range(3)]  # read 나머지 3줄 skip

    with gzip.open(r2_path, 'rt') as r2_file:
        while True:
            header = r2_file.readline()
            if not header:
                break
            r2_ids.add(get_read_id(header.strip()))
            [r2_file.readline() for _ in range(3)]

    matching_ids = r1_ids & r2_ids
    r1_only = r1_ids - r2_ids
    r2_only = r2_ids - r1_ids

    print(f"Processing {os.path.basename(r1_path)} and {os.path.basename(r2_path)}")
    print(f"Total R1 IDs: {len(r1_ids)}, Total R2 IDs: {len(r2_ids)}, Matching IDs: {len(matching_ids)}")
    print(f"IDs only in R1: {len(r1_only)}, IDs only in R2: {len(r2_only)}\n")

    # 결과 폴더 생성
    for out_path in [out_r1_path, out_r2_path]:
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    def write_matching_reads(input_path, output_path, matching_ids):
        with gzip.open(input_path, 'rt') as infile, gzip.open(output_path, 'wt') as outfile:
            while True:
                lines = [infile.readline() for _ in range(4)]
                if not lines[0]:
                    break
                read_id = get_read_id(lines[0].strip())
                if read_id in matching_ids:
                    outfile.writelines(lines)

    write_matching_reads(r1_path, out_r1_path, matching_ids)
    write_matching_reads(r2_path, out_r2_path, matching_ids)

# ----------------------
# 전체 파일에 대해 적용
# ----------------------

input_folder = "fastq_7_8_9_10_11_12/B_Qfiltered"
output_folder = "fastq_7_8_9_10_11_12/C_ID_matched"

# 모든 R1 파일 찾기
r1_files = glob.glob(os.path.join(input_folder, "*_R1_Qfiltered.fastq.gz"))

# 각 R1에 대해 짝이 맞는 R2를 찾고 작업 실행
for r1_file in r1_files:
    r2_file = r1_file.replace("_R1_Qfiltered.fastq.gz", "_R2_Qfiltered.fastq.gz")
    
    if os.path.exists(r2_file):
        # 결과 output 경로 설정
        base_name = os.path.basename(r1_file).replace("_R1_Qfiltered.fastq.gz", "")
        out_r1 = os.path.join(output_folder, f"{base_name}_ID_match_R1.fastq.gz")
        out_r2 = os.path.join(output_folder, f"{base_name}_ID_match_R2.fastq.gz")
        
        # 함수 실행
        extract_matching_reads(r1_file, r2_file, out_r1, out_r2)
    else:
        print(f"Warning: {r2_file} not found. Skipping.")

Processing 250910_batch20_10step_R1_Qfiltered.fastq.gz and 250910_batch20_10step_R2_Qfiltered.fastq.gz
Total R1 IDs: 572, Total R2 IDs: 430, Matching IDs: 349
IDs only in R1: 223, IDs only in R2: 81

Processing 250910_batch20_08step_R1_Qfiltered.fastq.gz and 250910_batch20_08step_R2_Qfiltered.fastq.gz
Total R1 IDs: 838, Total R2 IDs: 696, Matching IDs: 590
IDs only in R1: 248, IDs only in R2: 106

Processing 250910_batch20_07step_R1_Qfiltered.fastq.gz and 250910_batch20_07step_R2_Qfiltered.fastq.gz
Total R1 IDs: 1042, Total R2 IDs: 909, Matching IDs: 814
IDs only in R1: 228, IDs only in R2: 95

Processing 250910_batch20_09step_R1_Qfiltered.fastq.gz and 250910_batch20_09step_R2_Qfiltered.fastq.gz
Total R1 IDs: 852, Total R2 IDs: 692, Matching IDs: 605
IDs only in R1: 247, IDs only in R2: 87

Processing 250905_batch19_08step_R1_Qfiltered.fastq.gz and 250905_batch19_08step_R2_Qfiltered.fastq.gz
Total R1 IDs: 1167, Total R2 IDs: 899, Matching IDs: 839
IDs only in R1: 328, IDs only in R2: 6

# DNA Fragmentation R1(Front, Back), R2(Front, Back)

In [74]:
import gzip
import glob
import os

def split_fastq_by_position(r1_path, r2_path, n, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    sample_base = os.path.basename(r1_path).replace("_ID_match_R1.fastq.gz", "")
    r1_f_path = os.path.join(output_dir, f"{sample_base}_R1_F.fastq.gz")
    r1_b_path = os.path.join(output_dir, f"{sample_base}_R1_B.fastq.gz")
    r2_f_path = os.path.join(output_dir, f"{sample_base}_R2_F.fastq.gz")
    r2_b_path = os.path.join(output_dir, f"{sample_base}_R2_B.fastq.gz")

    with gzip.open(r1_path, 'rt') as r1_file, \
         gzip.open(r2_path, 'rt') as r2_file, \
         gzip.open(r1_f_path, 'wt') as r1_f_out, \
         gzip.open(r1_b_path, 'wt') as r1_b_out, \
         gzip.open(r2_f_path, 'wt') as r2_f_out, \
         gzip.open(r2_b_path, 'wt') as r2_b_out:

        while True:
            r1_lines = [r1_file.readline() for _ in range(4)]
            r2_lines = [r2_file.readline() for _ in range(4)]

            if not r1_lines[0] or not r2_lines[0]:
                break

            header1, seq1, plus1, qual1 = [line.strip() for line in r1_lines]
            header2, seq2, plus2, qual2 = [line.strip() for line in r2_lines]

            r1_f_out.write(f"{header1}\n{seq1[:151-n]}\n{plus1}\n{qual1[:151-n]}\n")
            r1_b_out.write(f"{header1}\n{seq1[-n:]}\n{plus1}\n{qual1[-n:]}\n")
            r2_f_out.write(f"{header2}\n{seq2[:151-n]}\n{plus2}\n{qual2[:151-n]}\n")
            r2_b_out.write(f"{header2}\n{seq2[-n:]}\n{plus2}\n{qual2[-n:]}\n")

    print(f"✅ 분리 완료: {sample_base} → {output_dir} (N={n})")

# -----------------------------------
# 전체 파일에 대해 split 적용하는 코드
# -----------------------------------

input_folder = "fastq_7_8_9_10_11_12/C_id_matched"
output_folder = "fastq_7_8_9_10_11_12/D_split_reads"
os.makedirs(output_folder, exist_ok=True)

# prefix별 N값 설정
sample_n_mapping = {
    "07step": 144,
    "08step": 124,
    "09step": 102,
    "10step": 82,
    "11step": 60,
    "12step": 40, 
}


# 모든 R1 파일 리스트 찾기
r1_files = glob.glob(os.path.join(input_folder, "*_ID_match_R1.fastq.gz"))

for r1_file in r1_files:
    r2_file = r1_file.replace("_R1.fastq.gz", "_R2.fastq.gz")

    if not os.path.exists(r2_file):
        print(f"⚠️ 짝이 맞는 R2 파일이 없습니다: {r2_file}")
        continue

    # 파일 이름에 맞는 N값 찾기
    matched_n = None
    for prefix, n_value in sample_n_mapping.items():
        if prefix in os.path.basename(r1_file):
            matched_n = n_value
            break

    if matched_n is None:
        print(f"⚠️ N값을 찾을 수 없습니다: {r1_file} → 스킵")
        continue

    # split 실행
    split_fastq_by_position(r1_file, r2_file, matched_n, output_folder)

✅ 분리 완료: 250910_batch20_09step → fastq_7_8_9_10_11_12/D_split_reads (N=102)
✅ 분리 완료: 250910_batch20_08step → fastq_7_8_9_10_11_12/D_split_reads (N=124)
✅ 분리 완료: 250910_batch20_10step → fastq_7_8_9_10_11_12/D_split_reads (N=82)
✅ 분리 완료: 250910_batch20_07step → fastq_7_8_9_10_11_12/D_split_reads (N=144)
✅ 분리 완료: 250905_batch19_08step → fastq_7_8_9_10_11_12/D_split_reads (N=124)


# R2 DNA reverse complementary

In [75]:
import gzip
import glob
import os
from Bio import SeqIO

def reverse_complement_fastq(input_fastq_path, output_fastq_path):
    with gzip.open(input_fastq_path, "rt") as infile, gzip.open(output_fastq_path, "wt") as outfile:
        for record in SeqIO.parse(infile, "fastq"):
            record.seq = record.seq.reverse_complement()
            record.letter_annotations["phred_quality"] = record.letter_annotations["phred_quality"][::-1]
            SeqIO.write(record, outfile, "fastq")
    print(f"✅ Reverse complemented: {os.path.basename(output_fastq_path)}")

# --------------------------------------
# 전체 파일에 대해 reverse complement 수행
# --------------------------------------

input_folder = "fastq_7_8_9_10_11_12/D_split_reads"
os.makedirs(input_folder, exist_ok=True)

# _R2_B.fastq.gz 또는 _R2_F.fastq.gz로 끝나는 파일만 찾기
input_files = glob.glob(os.path.join(input_folder, "*_R2_[BF].fastq.gz"))

for input_path in input_files:
    base = os.path.basename(input_path)
    name_without_ext = base.replace(".fastq.gz", "")  # .fastq.gz 제거
    output_path = os.path.join(input_folder, f"{name_without_ext}_revcomp.fastq.gz")
    
    reverse_complement_fastq(input_path, output_path)

✅ Reverse complemented: 250910_batch20_08step_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_10step_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_09step_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_07step_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 250905_batch19_08step_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 250905_batch19_08step_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_07step_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_08step_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_09step_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_10step_R2_F_revcomp.fastq.gz


In [76]:
import gzip
import glob
import os
from Bio import SeqIO

def reverse_complement_fastq(input_fastq_path, output_fastq_path):
    with gzip.open(input_fastq_path, "rt") as infile, gzip.open(output_fastq_path, "wt") as outfile:
        for record in SeqIO.parse(infile, "fastq"):
            record.seq = record.seq.reverse_complement()
            record.letter_annotations["phred_quality"] = record.letter_annotations["phred_quality"][::-1]
            SeqIO.write(record, outfile, "fastq")
    print(f"✅ Reverse complemented: {os.path.basename(output_fastq_path)}")

# --------------------------------------
# 전체 파일에 대해 reverse complement 수행
# --------------------------------------

input_folder = "fastq_7_8_9_10_11_12/D_split_reads"
os.makedirs(input_folder, exist_ok=True)

# _R2_B.fastq.gz 또는 _R2_F.fastq.gz로 끝나는 파일만 찾기
input_files = glob.glob(os.path.join(input_folder, "*_R2_[BF].fastq.gz"))

for input_path in input_files:
    base = os.path.basename(input_path)
    name_without_ext = base.replace(".fastq.gz", "")  # .fastq.gz 제거
    output_path = os.path.join(input_folder, f"{name_without_ext}_revcomp.fastq.gz")
    
    reverse_complement_fastq(input_path, output_path)

✅ Reverse complemented: 250910_batch20_08step_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_10step_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_09step_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_07step_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 250905_batch19_08step_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 250905_batch19_08step_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_07step_R2_B_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_08step_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_09step_R2_F_revcomp.fastq.gz
✅ Reverse complemented: 250910_batch20_10step_R2_F_revcomp.fastq.gz


In [77]:
import os
import glob
import subprocess

# === 폴더 설정 ===
input_folder = "fastq_7_8_9_10_11_12/D_split_reads"
output_folder = "fastq_7_8_9_10_11_12/E_merged_output"
os.makedirs(output_folder, exist_ok=True)

# === Prefix별 N값 설정 ===
sample_n_mapping = {
    "07step": 144,
    "08step": 124,
    "09step": 102,
    "10step": 82,
    "11step": 60,
    "12step": 40, 
}

# === R1_B 파일 리스트 찾기 ===
r1_files = glob.glob(os.path.join(input_folder, "*_R1_B.fastq.gz"))

print(f"🔎 Found {len(r1_files)} R1_B files.")

# === 각 R1_B 파일에 대해 ===
for r1_path in r1_files:
    sample_base = os.path.basename(r1_path).replace("_R1_B.fastq.gz", "")
    r2_path = os.path.join(input_folder, f"{sample_base}_R2_B.fastq.gz")

    if not os.path.exists(r2_path):
        print(f"⚠️ Matching R2_B file not found for {sample_base} → Skipping.")
        continue

    # 파일 이름에 맞는 N값 찾기
    matched_n = None
    for prefix, n_value in sample_n_mapping.items():
        if prefix in sample_base:
            matched_n = n_value
            break

    if matched_n is None:
        print(f"⚠️ No N value matched for {sample_base} → Skipping.")
        continue

    output_name = f"{sample_base}_FLASH"

    print(f"🔵 Running FLASH for sample: {sample_base} (N={matched_n})")

    try:
        subprocess.check_call([
            "flash",
            "-m", str(matched_n),   # 최소 overlap
            "-M", str(matched_n),   # 최대 overlap
            "-o", output_name,      # 결과 파일 prefix
            "-d", output_folder,    # 결과 저장 폴더
            r1_path,
            r2_path
        ])
        print(f"✅ FLASH merging complete → {os.path.join(output_folder, output_name)}.fastq")
    except subprocess.CalledProcessError as e:
        print(f"❌ FLASH merging failed for {sample_base}: {e}")

🔎 Found 5 R1_B files.
🔵 Running FLASH for sample: 250910_batch20_07step (N=144)
[FLASH] Starting FLASH v1.2.11
[FLASH] Fast Length Adjustment of SHort reads
[FLASH]  
[FLASH] Input files:
[FLASH]     fastq_7_8_9_10_11_12/D_split_reads/250910_batch20_07step_R1_B.fastq.gz
[FLASH]     fastq_7_8_9_10_11_12/D_split_reads/250910_batch20_07step_R2_B.fastq.gz
[FLASH]  
[FLASH] Output files:
[FLASH]     fastq_7_8_9_10_11_12/E_merged_output/250910_batch20_07step_FLASH.extendedFrags.fastq
[FLASH]     fastq_7_8_9_10_11_12/E_merged_output/250910_batch20_07step_FLASH.notCombined_1.fastq
[FLASH]     fastq_7_8_9_10_11_12/E_merged_output/250910_batch20_07step_FLASH.notCombined_2.fastq
[FLASH]     fastq_7_8_9_10_11_12/E_merged_output/250910_batch20_07step_FLASH.hist
[FLASH]     fastq_7_8_9_10_11_12/E_merged_output/250910_batch20_07step_FLASH.histogram
[FLASH]  
[FLASH] Parameters:
[FLASH]     Min overlap:           144
[FLASH]     Max overlap:           144
[FLASH]     Max mismatch density:  0.250000
[F

In [78]:
import os
import gzip
import glob
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def load_fastq_to_dict(file_path):
    """FASTQ 파일을 dict로 불러오기: key=read_id, value=(seq, qual)"""
    data = {}
    open_func = gzip.open if file_path.endswith(".gz") else open

    with open_func(file_path, "rt") as handle:
        for record in SeqIO.parse(handle, "fastq"):
            seq = str(record.seq)
            qual = record.letter_annotations["phred_quality"]
            data[record.id] = (seq, qual)
    return data

def assemble_fastq(r1_path, merged_path, r2_path, output_path):
    print(f"🔄 Assembling for sample: {os.path.basename(output_path)}")
    r1_dict = load_fastq_to_dict(r1_path)
    r2_dict = load_fastq_to_dict(r2_path)

    with open(merged_path, "r") as merged_file, gzip.open(output_path, "wt") as output_file:
        for record in SeqIO.parse(merged_file, "fastq"):
            read_id = record.id
            merged_seq = str(record.seq)
            merged_qual = record.letter_annotations["phred_quality"]

            if read_id not in r1_dict or read_id not in r2_dict:
                continue  # 둘 다 있어야 합침

            r1_seq, r1_qual = r1_dict[read_id]
            r2_seq, r2_qual = r2_dict[read_id]

            # 순서대로 이어붙이기: R1_F → FLASH → R2_F_revcomp
            full_seq = r1_seq + merged_seq + r2_seq
            full_qual = r1_qual + merged_qual + r2_qual

            new_record = SeqRecord(
                Seq(full_seq),
                id=read_id,
                description="",
                letter_annotations={"phred_quality": full_qual}
            )

            SeqIO.write(new_record, output_file, "fastq")

    print(f"✅ Assembled FASTQ saved: {output_path}")

# ===== 전체 처리 자동화 =====

# 경로 설정
input_merged_folder = "fastq_7_8_9_10_11_12/E_merged_output"
input_split_folder = "fastq_7_8_9_10_11_12/D_split_reads"
output_folder = "fastq_7_8_9_10_11_12/1_assemble"
os.makedirs(output_folder, exist_ok=True)

# 모든 merged 파일 리스트
merged_files = glob.glob(os.path.join(input_merged_folder, "*_FLASH.extendedFrags.fastq"))

print(f"🔍 Found {len(merged_files)} merged samples to assemble.")

for merged_file in merged_files:
    sample_base = os.path.basename(merged_file).replace("_FLASH.extendedFrags.fastq", "")

    r1_path = os.path.join(input_split_folder, f"{sample_base}_R1_F.fastq.gz")
    r2_path = os.path.join(input_split_folder, f"{sample_base}_R2_F_revcomp.fastq.gz")
    output_path = os.path.join(output_folder, f"{sample_base}_assemble.fastq.gz")

    if os.path.exists(r1_path) and os.path.exists(r2_path):
        assemble_fastq(r1_path, merged_file, r2_path, output_path)
    else:
        print(f"⚠️ Missing split files for {sample_base}, skipping.")

🔍 Found 5 merged samples to assemble.
🔄 Assembling for sample: 250910_batch20_08step_assemble.fastq.gz
✅ Assembled FASTQ saved: fastq_7_8_9_10_11_12/1_assemble/250910_batch20_08step_assemble.fastq.gz
🔄 Assembling for sample: 250910_batch20_07step_assemble.fastq.gz
✅ Assembled FASTQ saved: fastq_7_8_9_10_11_12/1_assemble/250910_batch20_07step_assemble.fastq.gz
🔄 Assembling for sample: 250905_batch19_08step_assemble.fastq.gz
✅ Assembled FASTQ saved: fastq_7_8_9_10_11_12/1_assemble/250905_batch19_08step_assemble.fastq.gz
🔄 Assembling for sample: 250910_batch20_09step_assemble.fastq.gz
✅ Assembled FASTQ saved: fastq_7_8_9_10_11_12/1_assemble/250910_batch20_09step_assemble.fastq.gz
🔄 Assembling for sample: 250910_batch20_10step_assemble.fastq.gz
✅ Assembled FASTQ saved: fastq_7_8_9_10_11_12/1_assemble/250910_batch20_10step_assemble.fastq.gz


# Convert fastq ➔ csv

In [79]:
import os
import gzip
import pandas as pd
from Bio import SeqIO

def fastq_to_csv(fastq_path, csv_path):
    data = []

    # .gz 확장자 여부에 따라 open 방식 선택
    open_func = gzip.open if fastq_path.endswith(".gz") else open

    with open_func(fastq_path, "rt") as handle:
        for record in SeqIO.parse(handle, "fastq"):
            read_id = record.id
            sequence = str(record.seq)
            quality = "".join(chr(q + 33) for q in record.letter_annotations["phred_quality"])
            data.append([read_id, sequence, quality])

    df = pd.DataFrame(data, columns=["Read_ID", "Sequence", "Quality_Score"])
    df.to_csv(csv_path, index=False)
    print(f"✅ Converted: {os.path.basename(fastq_path)} → {os.path.basename(csv_path)}")

def convert_all_fastq_in_folder(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if filename.endswith(".fastq") or filename.endswith(".fastq.gz"):
            input_path = os.path.join(input_folder, filename)
            csv_filename = filename.replace(".fastq.gz", ".csv").replace(".fastq", ".csv")
            output_path = os.path.join(output_folder, csv_filename)

            fastq_to_csv(input_path, output_path)

# 📌 사용 예시
input_folder = "fastq_7_8_9_10_11_12/1_assemble"
output_folder = "fastq_7_8_9_10_11_12/1_assemble/csv"

convert_all_fastq_in_folder(input_folder, output_folder)

✅ Converted: 250910_batch20_07step_assemble.fastq.gz → 250910_batch20_07step_assemble.csv
✅ Converted: 250910_batch20_09step_assemble.fastq.gz → 250910_batch20_09step_assemble.csv
✅ Converted: 250910_batch20_10step_assemble.fastq.gz → 250910_batch20_10step_assemble.csv
✅ Converted: 250905_batch19_08step_assemble.fastq.gz → 250905_batch19_08step_assemble.csv
✅ Converted: 250910_batch20_08step_assemble.fastq.gz → 250910_batch20_08step_assemble.csv


# fastq -> fasta

In [80]:
import os
import gzip
from Bio import SeqIO

# 필터링된 FASTQ.GZ 파일이 위치한 폴더
# 변환된 FASTA 파일을 저장할 폴더

input_folder = "fastq_7_8_9_10_11_12/1_assemble"
output_folder = "fastq_7_8_9_10_11_12/2_fastq_to_fasta"
os.makedirs(output_folder, exist_ok=True)  # 출력 폴더 생성

for filename in os.listdir(input_folder):
    # 예: "_filtered.fastq.gz"로 끝나는 파일만 변환
    if filename.endswith("_assemble.fastq.gz"):
        # 입력 FASTQ.GZ 파일 경로
        input_file = os.path.join(input_folder, filename)
        
        # 출력 FASTA 파일 경로 (확장자만 .fasta로 변경)
        output_file = os.path.join(
            output_folder,
            filename.replace("_assemble.fastq.gz", "_assemble.fasta")
        )

        # FASTQ.GZ 파일을 읽어서 FASTA로 변환
        records = []
        with gzip.open(input_file, "rt") as fastq_file:  # "rt" = read text mode
            for record in SeqIO.parse(fastq_file, "fastq"):
                records.append(record)

        # 변환된 시퀀스를 FASTA로 저장
        with open(output_file, "w") as fasta_file:
            SeqIO.write(records, fasta_file, "fasta")

        print(f"Conversion from {filename} → {os.path.basename(output_file)} is complete.")

print("All conversions are done.")

Conversion from 250910_batch20_07step_assemble.fastq.gz → 250910_batch20_07step_assemble.fasta is complete.
Conversion from 250910_batch20_09step_assemble.fastq.gz → 250910_batch20_09step_assemble.fasta is complete.
Conversion from 250910_batch20_10step_assemble.fastq.gz → 250910_batch20_10step_assemble.fasta is complete.
Conversion from 250905_batch19_08step_assemble.fastq.gz → 250905_batch19_08step_assemble.fasta is complete.
Conversion from 250910_batch20_08step_assemble.fastq.gz → 250910_batch20_08step_assemble.fasta is complete.
All conversions are done.


# Answer sequence - Sample matching

In [81]:
%%bash
set -euo pipefail
shopt -s nullglob

# 디렉토리 설정
ref_dir="step_reference"
query_dir="fastq_7_8_9_10_11_12/2_fastq_to_fasta"
output_dir="fastq_7_8_9_10_11_12/3_align_sam"
mkdir -p "$output_dir"

# 같은 참조는 한 번만 index
declare -A indexed

# *_07step_assemble.fasta 같은 케이스 우선, 없으면 일반 *_07step_*.fasta 도 수집
for query_file in "$query_dir"/*step*assemble.fasta "$query_dir"/*step*.fasta; do
  filename="$(basename "$query_file")"

  # 파일명에서 두 자리 step 추출 (예: _07step_, _12step_)
  if [[ "$filename" =~ _([0-9]{2})step_ ]]; then
    step="${BASH_REMATCH[1]}"
  else
    echo "⚠️ Skipping (no '_NNstep_' pattern): $filename"
    continue
  fi

  # 07~12만 처리
  case "$step" in
    07|08|09|10|11|12) ;;
    *) echo "⚠️ Skipping (step not in 07–12): $filename"; continue ;;
  esac

  reference_file="${ref_dir}/${step}step_reference.fasta"
  if [[ ! -f "$reference_file" ]]; then
    echo "⚠️ Missing reference: $reference_file"
    continue
  fi

  out="${output_dir}/${filename%.fasta}.sam"

  echo "🔄 Aligning: $filename → $(basename "$reference_file")"
  if [[ -z "${indexed[$reference_file]:-}" ]]; then
    bwa index "$reference_file"
    indexed[$reference_file]=1
  fi

  bwa mem -M -t 4 "$reference_file" "$query_file" > "$out"
  echo "✅ Done: $out"
done

🔄 Aligning: 250905_batch19_08step_assemble.fasta → 08step_reference.fasta


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/08step_reference.fasta
[main] Real time: 0.073 sec; CPU: 0.011 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 290 sequences (51620 bp)...
[M::mem_process_seqs] Processed 290 reads in 0.113 CPU sec, 0.041 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/08step_reference.fasta fastq_7_8_9_10_11_12/2_fastq_to_fasta/250905_batch19_08step_assemble.fasta
[main] Real time: 0.076 sec; CPU: 0.118 sec


✅ Done: fastq_7_8_9_10_11_12/3_align_sam/250905_batch19_08step_assemble.sam
🔄 Aligning: 250910_batch20_07step_assemble.fasta → 07step_reference.fasta


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/07step_reference.fasta
[main] Real time: 0.052 sec; CPU: 0.006 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 437 sequences (69046 bp)...
[M::mem_process_seqs] Processed 437 reads in 0.101 CPU sec, 0.037 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/07step_reference.fasta fastq_7_8_9_10_11_12/2_fastq_to_fasta/250910_batch20_07step_assemble.fasta
[main] Real time: 0.073 sec; CPU: 0.105 sec


✅ Done: fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_07step_assemble.sam
🔄 Aligning: 250910_batch20_08step_assemble.fasta → 08step_reference.fasta


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 127 sequences (22606 bp)...
[M::mem_process_seqs] Processed 127 reads in 0.033 CPU sec, 0.014 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/08step_reference.fasta fastq_7_8_9_10_11_12/2_fastq_to_fasta/250910_batch20_08step_assemble.fasta
[main] Real time: 0.039 sec; CPU: 0.037 sec


✅ Done: fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_08step_assemble.sam
🔄 Aligning: 250910_batch20_09step_assemble.fasta → 09step_reference.fasta


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.01 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/09step_reference.fasta
[main] Real time: 0.083 sec; CPU: 0.016 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 161 sequences (32200 bp)...
[M::mem_process_seqs] Processed 161 reads in 0.261 CPU sec, 0.083 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/09step_reference.fasta fastq_7_8_9_10_11_12/2_fastq_to_fasta/250910_batch20_09step_assemble.fasta
[main] Real time: 0.120 sec; CPU: 0.266 sec


✅ Done: fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_09step_assemble.sam
🔄 Aligning: 250910_batch20_10step_assemble.fasta → 10step_reference.fasta


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.01 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.01 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index step_reference/10step_reference.fasta
[main] Real time: 0.114 sec; CPU: 0.029 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 44 sequences (9680 bp)...
[M::mem_process_seqs] Processed 44 reads in 0.109 CPU sec, 0.035 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/10step_reference.fasta fastq_7_8_9_10_11_12/2_fastq_to_fasta/250910_batch20_10step_assemble.fasta
[main] Real time: 0.075 sec; CPU: 0.114 sec


✅ Done: fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_10step_assemble.sam
🔄 Aligning: 250905_batch19_08step_assemble.fasta → 08step_reference.fasta


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 290 sequences (51620 bp)...
[M::mem_process_seqs] Processed 290 reads in 0.112 CPU sec, 0.037 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/08step_reference.fasta fastq_7_8_9_10_11_12/2_fastq_to_fasta/250905_batch19_08step_assemble.fasta
[main] Real time: 0.074 sec; CPU: 0.116 sec


✅ Done: fastq_7_8_9_10_11_12/3_align_sam/250905_batch19_08step_assemble.sam
🔄 Aligning: 250910_batch20_07step_assemble.fasta → 07step_reference.fasta


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 437 sequences (69046 bp)...
[M::mem_process_seqs] Processed 437 reads in 0.111 CPU sec, 0.036 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/07step_reference.fasta fastq_7_8_9_10_11_12/2_fastq_to_fasta/250910_batch20_07step_assemble.fasta
[main] Real time: 0.063 sec; CPU: 0.115 sec


✅ Done: fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_07step_assemble.sam
🔄 Aligning: 250910_batch20_08step_assemble.fasta → 08step_reference.fasta


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 127 sequences (22606 bp)...
[M::mem_process_seqs] Processed 127 reads in 0.035 CPU sec, 0.018 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/08step_reference.fasta fastq_7_8_9_10_11_12/2_fastq_to_fasta/250910_batch20_08step_assemble.fasta
[main] Real time: 0.047 sec; CPU: 0.039 sec


✅ Done: fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_08step_assemble.sam
🔄 Aligning: 250910_batch20_09step_assemble.fasta → 09step_reference.fasta


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 161 sequences (32200 bp)...
[M::mem_process_seqs] Processed 161 reads in 0.233 CPU sec, 0.074 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/09step_reference.fasta fastq_7_8_9_10_11_12/2_fastq_to_fasta/250910_batch20_09step_assemble.fasta
[main] Real time: 0.107 sec; CPU: 0.237 sec


✅ Done: fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_09step_assemble.sam
🔄 Aligning: 250910_batch20_10step_assemble.fasta → 10step_reference.fasta


[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 44 sequences (9680 bp)...
[M::mem_process_seqs] Processed 44 reads in 0.107 CPU sec, 0.040 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -M -t 4 step_reference/10step_reference.fasta fastq_7_8_9_10_11_12/2_fastq_to_fasta/250910_batch20_10step_assemble.fasta
[main] Real time: 0.073 sec; CPU: 0.112 sec


✅ Done: fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_10step_assemble.sam


# sam to bam

In [82]:
%%bash

# Set the path to the directory containing SAM files
sam_dir="fastq_7_8_9_10_11_12/3_align_sam"
# Set the output directory for BAM files
bam_dir="fastq_7_8_9_10_11_12/4_align_bam"


# Make sure the output directory exists or create it if necessary
mkdir -p "$bam_dir"

# Convert SAM files to BAM
for sam_file in "$sam_dir"/*.sam; do
    bam_file="$bam_dir/$(basename "$sam_file" .sam).bam"
    samtools view -bS "$sam_file" -o "$bam_file"
    echo "Conversion from $sam_file to $bam_file is complete."
done

Conversion from fastq_7_8_9_10_11_12/3_align_sam/250905_batch19_08step_assemble.sam to fastq_7_8_9_10_11_12/4_align_bam/250905_batch19_08step_assemble.bam is complete.
Conversion from fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_07step_assemble.sam to fastq_7_8_9_10_11_12/4_align_bam/250910_batch20_07step_assemble.bam is complete.
Conversion from fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_08step_assemble.sam to fastq_7_8_9_10_11_12/4_align_bam/250910_batch20_08step_assemble.bam is complete.
Conversion from fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_09step_assemble.sam to fastq_7_8_9_10_11_12/4_align_bam/250910_batch20_09step_assemble.bam is complete.
Conversion from fastq_7_8_9_10_11_12/3_align_sam/250910_batch20_10step_assemble.sam to fastq_7_8_9_10_11_12/4_align_bam/250910_batch20_10step_assemble.bam is complete.


# bam to csv

In [83]:
import os
import pysam
import pandas as pd

# 입력 폴더 (BAM 파일이 위치한 경로)
input_folder = "fastq_7_8_9_10_11_12/4_align_bam"
# 출력 폴더 (CSV 파일을 저장할 경로, 필요하면 변경)
output_folder = "fastq_7_8_9_10_11_12/4_align_bam/csv"


# 출력 폴더가 없으면 생성
os.makedirs(output_folder, exist_ok=True)

# BAM -> CSV 변환 함수 (옵션 필드 포함)
def bam_to_csv(bam_file, output_folder):
    output_csv = os.path.join(output_folder, os.path.basename(bam_file).replace(".bam", ".csv"))
    
    # BAM 파일 읽기
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        records = []
        all_tags = set()  # 옵션 필드를 저장할 집합
        
        for read in bam:
            # 기본 필드
            record = {
                "QNAME": read.query_name,
                "FLAG": read.flag,
                "RNAME": bam.get_reference_name(read.reference_id) if read.reference_id >= 0 else "*",
                "POS": read.reference_start + 1,
                "MAPQ": read.mapping_quality,
                "CIGAR": read.cigarstring if read.cigarstring else "*",
                "RNEXT": bam.get_reference_name(read.next_reference_id) if read.next_reference_id >= 0 else "*",
                "PNEXT": read.next_reference_start + 1 if read.next_reference_start >= 0 else 0,
                "TLEN": read.template_length,
                "SEQ": read.query_sequence if read.query_sequence else "*",
                "QUAL": read.qual if read.qual else "*",
            }
            
            # 옵션 필드 추가
            for tag, value in read.tags:
                record[tag] = value
                all_tags.add(tag)

            records.append(record)
    
    # 데이터프레임 생성
    df = pd.DataFrame(records)

    # 옵션 필드가 없는 경우 NaN으로 처리
    df = df.fillna("*")

    # CSV 저장
    df.to_csv(output_csv, index=False)
    return output_csv

# 폴더에서 모든 BAM 파일 찾기
bam_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".bam")]

# 모든 BAM 파일을 CSV로 변환
csv_files = []
for bam_file in bam_files:
    csv_file = bam_to_csv(bam_file, output_folder)
    csv_files.append(csv_file)

# 변환된 CSV 파일 목록 출력
csv_files

['fastq_7_8_9_10_11_12/4_align_bam/csv/250910_batch20_10step_assemble.csv',
 'fastq_7_8_9_10_11_12/4_align_bam/csv/250910_batch20_08step_assemble.csv',
 'fastq_7_8_9_10_11_12/4_align_bam/csv/250910_batch20_07step_assemble.csv',
 'fastq_7_8_9_10_11_12/4_align_bam/csv/250905_batch19_08step_assemble.csv',
 'fastq_7_8_9_10_11_12/4_align_bam/csv/250910_batch20_09step_assemble.csv']

In [84]:
import os
import pandas as pd
from pathlib import Path

# 입력/출력 폴더
csv_dir = Path("fastq_7_8_9_10_11_12/4_align_bam/csv")
out_dir = csv_dir / "MAPQ_removed"
out_dir.mkdir(parents=True, exist_ok=True)

# ===== 임계값 (이하 ≤ 는 제거) =====
MAPQ_THRESHOLD = 10
# =================================

for in_path in sorted(csv_dir.glob("*.csv")):
    try:
        df = pd.read_csv(in_path)
    except Exception as e:
        print(f"⚠️  Read fail: {in_path.name} -> {e}")
        continue

    if "MAPQ" not in df.columns:
        print(f"⚠️  Skip (no MAPQ column): {in_path.name}")
        continue

    m = pd.to_numeric(df["MAPQ"], errors="coerce")
    # keep: MAPQ > cutoff (NaN은 기본적으로 keep; NaN도 제거하려면 .isna() 빼세요)
    keep_mask = (m > MAPQ_THRESHOLD) | m.isna()
    kept = int(keep_mask.sum())
    removed = int((~keep_mask).sum())

    out_path = out_dir / in_path.name
    df.loc[keep_mask].to_csv(out_path, index=False)
    print(f"✅ {in_path.name} -> kept={kept}, removed={removed}, saved: {out_path.name}")

✅ 250905_batch19_08step_assemble.csv -> kept=271, removed=19, saved: 250905_batch19_08step_assemble.csv
✅ 250910_batch20_07step_assemble.csv -> kept=425, removed=12, saved: 250910_batch20_07step_assemble.csv
✅ 250910_batch20_08step_assemble.csv -> kept=120, removed=7, saved: 250910_batch20_08step_assemble.csv
✅ 250910_batch20_09step_assemble.csv -> kept=136, removed=25, saved: 250910_batch20_09step_assemble.csv
✅ 250910_batch20_10step_assemble.csv -> kept=27, removed=17, saved: 250910_batch20_10step_assemble.csv


# Histogram

In [85]:
import os
import pandas as pd

# 📁 폴더 설정
input_folder = "fastq_7_8_9_10_11_12/4_align_bam/csv/MAPQ_removed"
histogram_folder = "fastq_7_8_9_10_11_12/5_align_histogram"
os.makedirs(histogram_folder, exist_ok=True)

# 📄 모든 CSV 파일 처리
files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

for file_name in files:
    file_path = os.path.join(input_folder, file_name)

    # 🔧 파일명 클렌징 (특정 문자열 제거)
    clean_name = file_name
    clean_name = clean_name.replace("assemble", "")
    clean_name = clean_name.replace("ID_match_FLASH.extendedFrags", "")
    clean_name = clean_name.replace("__", "_").strip("_")  # 중복/끝 _ 제거
    output_csv = os.path.join(histogram_folder, f"histogram_{clean_name}")

    try:
        df = pd.read_csv(file_path, dtype=str)
        if 'RNAME' not in df.columns:
            print(f"⚠️ Skipping file: {file_name} (no 'RNAME' column found)")
            continue

        # RNAME 집계 및 정규화
        rname_counts = df['RNAME'].value_counts().reset_index()
        rname_counts.columns = ['RNAME', 'Count']
        rname_counts.insert(0, 'File_Name', clean_name)
        rname_counts['Count'] = rname_counts['Count'].astype(int)
        total_count = rname_counts['Count'].sum()
        rname_counts['Normalized_Count'] = rname_counts['Count'] / total_count

        rname_counts.to_csv(output_csv, index=False)
        print(f"✅ Saved cleaned RNAME histogram: {output_csv}")

    except Exception as e:
        print(f"❌ Error processing file '{file_name}': {e}")

✅ Saved cleaned RNAME histogram: fastq_7_8_9_10_11_12/5_align_histogram/histogram_250910_batch20_09step_.csv
✅ Saved cleaned RNAME histogram: fastq_7_8_9_10_11_12/5_align_histogram/histogram_250905_batch19_08step_.csv
✅ Saved cleaned RNAME histogram: fastq_7_8_9_10_11_12/5_align_histogram/histogram_250910_batch20_07step_.csv
✅ Saved cleaned RNAME histogram: fastq_7_8_9_10_11_12/5_align_histogram/histogram_250910_batch20_08step_.csv
✅ Saved cleaned RNAME histogram: fastq_7_8_9_10_11_12/5_align_histogram/histogram_250910_batch20_10step_.csv


In [86]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# 📁 폴더 설정
histogram_folder = "fastq_7_8_9_10_11_12/5_align_histogram"
summary_folder = "fastq_7_8_9_10_11_12/5_align_histogram/graph_top5"
os.makedirs(summary_folder, exist_ok=True)

# 🔴 하이라이트 매핑 (suffix 기반)
highlight_mapping = {
    "_01step": "seq_0001_1",
    "_02step": "seq_0002_10",
    "_03step": "seq_0005_101",
    "_04step": "seq_0010_1010",
    "_05step": "seq_0021_10101",
    "_06step": "seq_0042_101010",
    "_07step": "seq_0085_1010101",
    "_08step": "seq_0170_10101010",
    "_09step": "seq_0341_101010101",
    "_10step": "seq_0682_1010101010",
    "_11step": "seq_1365_10101010101",
    "_12step": "seq_2730_101010101010",
}

# 📄 CSV 파일 리스트
csv_files = [f for f in os.listdir(histogram_folder) if f.startswith("histogram_") and f.endswith(".csv")]

# 🔁 파일 반복 처리
for file_name in csv_files:
    file_path = os.path.join(histogram_folder, file_name)
    try:
        df = pd.read_csv(file_path)
        if 'RNAME' not in df.columns or 'Normalized_Count' not in df.columns:
            print(f"⚠️ Skipping file: {file_name} (missing column)")
            continue

        # Top 5 RNAME 추출
        top_df = df.sort_values(by="Count", ascending=False).head(5).reset_index(drop=True)
        sample_name = file_name.replace("histogram_", "").replace(".csv", "")

        # 🔍 suffix 기반 하이라이트 RNAME 찾기
        highlight_rname = None
        for suffix, rname in highlight_mapping.items():
            if suffix in file_name:
                highlight_rname = rname
                break

        # 📊 그래프 생성
        plt.figure(figsize=(10, 6))
        bars = plt.bar(top_df["RNAME"], top_df["Normalized_Count"], color='blue')

        # 🔴 매칭되는 RNAME은 빨강색으로
        for bar, rname in zip(bars, top_df["RNAME"]):
            if rname == highlight_rname:
                bar.set_color('red')

        plt.title(f"Top 5 RNAME Histogram - {sample_name}")
        plt.xlabel("RNAME")
        plt.ylabel("Normalized Count")
        plt.xticks(rotation=45)
        plt.ylim(0, 1)
        plt.tight_layout()

        # 💾 저장
        output_png = os.path.join(summary_folder, file_name.replace(".csv", ".png"))
        output_svg = os.path.join(summary_folder, file_name.replace(".csv", ".svg"))
        plt.savefig(output_png)
        plt.savefig(output_svg)
        plt.close()

        print(f"✅ Saved plot: {output_png}, {output_svg}")

    except Exception as e:
        print(f"❌ Error processing {file_name}: {e}")

✅ Saved plot: fastq_7_8_9_10_11_12/5_align_histogram/graph_top5/histogram_250910_batch20_08step_.png, fastq_7_8_9_10_11_12/5_align_histogram/graph_top5/histogram_250910_batch20_08step_.svg
✅ Saved plot: fastq_7_8_9_10_11_12/5_align_histogram/graph_top5/histogram_250910_batch20_09step_.png, fastq_7_8_9_10_11_12/5_align_histogram/graph_top5/histogram_250910_batch20_09step_.svg
✅ Saved plot: fastq_7_8_9_10_11_12/5_align_histogram/graph_top5/histogram_250910_batch20_07step_.png, fastq_7_8_9_10_11_12/5_align_histogram/graph_top5/histogram_250910_batch20_07step_.svg
✅ Saved plot: fastq_7_8_9_10_11_12/5_align_histogram/graph_top5/histogram_250910_batch20_10step_.png, fastq_7_8_9_10_11_12/5_align_histogram/graph_top5/histogram_250910_batch20_10step_.svg
✅ Saved plot: fastq_7_8_9_10_11_12/5_align_histogram/graph_top5/histogram_250905_batch19_08step_.png, fastq_7_8_9_10_11_12/5_align_histogram/graph_top5/histogram_250905_batch19_08step_.svg


In [87]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

# 폴더 설정
histogram_folder = "fastq_7_8_9_10_11_12/5_align_histogram"
summary_folder = "fastq_7_8_9_10_11_12/6_align_summary"
os.makedirs(summary_folder, exist_ok=True)

# highlight 매핑 (접미사 기준)
highlight_mapping = {
    "_01step": "seq_0001_1",
    "_02step": "seq_0002_10",
    "_03step": "seq_0005_101",
    "_04step": "seq_0010_1010",
    "_05step": "seq_0021_10101",
    "_06step": "seq_0042_101010",
    "_07step": "seq_0085_1010101",
    "_08step": "seq_0170_10101010",
    "_09step": "seq_0341_101010101",
    "_10step": "seq_0682_1010101010",
    "_11step": "seq_1365_10101010101",
    "_12step": "seq_2730_101010101010",
}

# 회색 → 흰색 그라데이션 색상 함수
def blend_color(base_rgb, t):
    white = np.array([255, 255, 255])
    base = np.array(base_rgb)
    blended = (1 - t) * base + t * white
    return tuple(blended / 255)

base_rgb = (137, 137, 138)

# step 번호를 추출하여 오름차순 정렬을 위한 함수
def extract_step_number(name):
    match = re.search(r'_(\d+)step', name)
    return int(match.group(1)) if match else float('inf')

# sample별 데이터 로딩
sample_rname_dfs = {}
for file_name in os.listdir(histogram_folder):
    if file_name.startswith("histogram_") and file_name.endswith(".csv"):
        sample_name = file_name.replace("histogram_", "").replace(".csv", "")
        df = pd.read_csv(os.path.join(histogram_folder, file_name))
        if 'RNAME' not in df.columns or 'Count' not in df.columns:
            continue
        df['Sample'] = sample_name
        df['Count'] = df['Count'].astype(int)
        df['Normalized_Count'] = df['Count'] / df['Count'].sum()
        df = df.sort_values(by='Count', ascending=False).reset_index(drop=True)
        sample_rname_dfs[sample_name] = df

# sample_name을 step 기준으로 정렬
sorted_samples = sorted(sample_rname_dfs.items(), key=lambda x: extract_step_number(x[0]))

# 시각화
fig, ax = plt.subplots(figsize=(24, 12))

for sample_idx, (sample_name, df) in enumerate(sorted_samples):
    # highlight RNAME 찾기
    highlight_rname = None
    for suffix, rname in highlight_mapping.items():
        if suffix in sample_name:  # 정확한 끝이 아니라 포함 여부로 수정
            highlight_rname = rname
            break

    bottom = 0
    top_n = 5
    rest_sum = 0

    for rank, row in df.iterrows():
        rname = row['RNAME']
        height = row['Normalized_Count']

        if rname == highlight_rname:
            ax.bar(sample_name, height, bottom=bottom, color='red', edgecolor='black', linewidth=0.2)
            bottom += height
        elif rank < top_n:
            t = rank / (top_n - 1) if top_n > 1 else 0
            color = blend_color(base_rgb, t)
            ax.bar(sample_name, height, bottom=bottom, color=color, edgecolor='black', linewidth=0.2)
            bottom += height
        else:
            rest_sum += height

    if rest_sum > 0:
        ax.bar(sample_name, rest_sum, bottom=bottom, color='white', edgecolor='black', linewidth=0.2)

# 보조선, 스타일
ax.axhline(y=0.5, color='gray', linestyle='--', linewidth=1, label='y = 0.5')
ax.set_ylabel("Normalized Count", fontsize=20)
ax.set_xlabel("Sample", fontsize=20)
ax.set_title("Stacked Bar Chart (Red = Highlight, Gray→White = Top 5, Rest = One White Box)", fontsize=16)
ax.tick_params(axis='x', labelsize=20)
ax.tick_params(axis='y', labelsize=20)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# 저장
png_path = os.path.join(summary_folder, "stacked_bar_top5_gray_rest_white_box.png")
svg_path = os.path.join(summary_folder, "stacked_bar_top5_gray_rest_white_box.svg")
plt.savefig(png_path)
plt.savefig(svg_path)
plt.close()

print(f"✅ 저장 완료:\n - PNG: {png_path}\n - SVG: {svg_path}")

✅ 저장 완료:
 - PNG: fastq_7_8_9_10_11_12/6_align_summary/stacked_bar_top5_gray_rest_white_box.png
 - SVG: fastq_7_8_9_10_11_12/6_align_summary/stacked_bar_top5_gray_rest_white_box.svg


In [88]:
import os
import pandas as pd
import re

# === Highlight mapping (suffix -> RNAME) ===
highlight_mapping = {
    "_01step": "seq_0001_1",
    "_02step": "seq_0002_10",
    "_03step": "seq_0005_101",
    "_04step": "seq_0010_1010",
    "_05step": "seq_0021_10101",
    "_06step": "seq_0042_101010",
    "_07step": "seq_0085_1010101",
    "_08step": "seq_0170_10101010",
    "_09step": "seq_0341_101010101",
    "_10step": "seq_0682_1010101010",
    "_11step": "seq_1365_10101010101",
    "_12step": "seq_2730_101010101010",
}

# === 폴더 설정 ===
histogram_folder = "fastq_7_8_9_10_11_12/5_align_histogram"
summary_folder = "fastq_7_8_9_10_11_12/6_align_summary"
os.makedirs(summary_folder, exist_ok=True)
highlight_result_csv = os.path.join(summary_folder, "highlight_result.csv")

# === step 번호 추출 함수 ===
def extract_step_number(filename):
    match = re.search(r"_(\d+)step", filename)
    return int(match.group(1)) if match else float("inf")

# === Highlight 요약 정보 수집 ===
highlight_data = []
csv_files = [f for f in os.listdir(histogram_folder) if f.startswith("histogram_") and f.endswith(".csv")]

for file in csv_files:
    file_path = os.path.join(histogram_folder, file)
    try:
        df = pd.read_csv(file_path)
        file_name = file.replace("histogram_", "")

        # suffix 기반 highlight_rname 추출
        highlight_rname = ""
        for suffix, rname in highlight_mapping.items():
            if suffix in file_name:
                highlight_rname = rname
                break

        df['Count'] = df['Count'].astype(int)
        total_count = df['Count'].sum()

        highlight_count = df[df['RNAME'] == highlight_rname]['Count'].sum() if highlight_rname else 0
        highlight_percentage = (highlight_count / total_count) * 100 if total_count > 0 else 0

        sorted_counts = df['Count'].sort_values(ascending=False).values
        second_max_count = sorted_counts[1] if len(sorted_counts) >= 2 else (sorted_counts[0] if len(sorted_counts) == 1 else 0)
        highlight_vs_second_ratio = (highlight_count / second_max_count) if second_max_count > 0 else 0

        highlight_data.append([
            file_name,
            highlight_count,
            total_count,
            round(highlight_percentage, 2),
            highlight_rname,
            round(highlight_vs_second_ratio, 3),
            extract_step_number(file_name)
        ])

    except Exception as e:
        print(f"❌ Error processing file '{file}': {e}")

# === DataFrame 생성 및 step 기준 정렬 후 저장 ===
highlight_df = pd.DataFrame(highlight_data, columns=[
    'File',
    'Highlight_Count',
    'Total_Count',
    'Highlight_Percentage',
    'Highlight_RNAMEs',
    'Highlight_vs_SecondTop_Ratio',
    'Step_Number'
])

highlight_df = highlight_df.sort_values(by='Step_Number').drop(columns='Step_Number')
highlight_df.to_csv(highlight_result_csv, index=False)

print(f"📌 Highlight summary saved to: {highlight_result_csv}")

📌 Highlight summary saved to: fastq_7_8_9_10_11_12/6_align_summary/highlight_result.csv
