In [6]:
import os
import re
from pathlib import Path
import pandas as pd
import gzip

In [16]:
# Directory holding the merged FASTQ files, and the control sample inside it.
# NOTE(review): this is an absolute, machine-specific location — adjust it
# when running the notebook on another computer.
path = Path("/Users/leandro/Desktop/github/NGS-data/SeV-ORF1_040725/merged")
control_filename = "RTtest6-c1_S1_L001.merged.fastq.gz"
control_path = path.joinpath(control_filename)

# Case-insensitive sequence signatures searched for in the reads (ORF1RP).
# The edited signature differs from the wild-type one by the inserted "CTTA".
pattern_wt = re.compile(r"ccaggagaacttccccaatctagcaaggca", flags=re.IGNORECASE)
pattern_edit = re.compile(r"gagaacttccccaaCTTAtctagc", flags=re.IGNORECASE)

# Alternative signatures, currently disabled and kept for reference only
# (the second name looks like a typo for "pattern_wt_INF" — confirm before re-enabling):
#pattern_edit = re.compile(r"gagcACGGCAGAgCTTGg", re.IGNORECASE)
#patter_wt_INF = re.compile(r"gagcACGGCAGAggaaag", re.IGNORECASE)

def count_matches(file_path, pattern):
    """Count occurrences of ``pattern`` on the sequence lines of a FASTQ file.

    Args:
        file_path: Path (str or ``Path``) of the FASTQ file. A name ending in
            ".gz" is opened with ``gzip.open``; anything else with ``open``.
        pattern: Compiled regular expression to search for.

    Returns:
        Total number of non-overlapping matches found on the sequence lines,
        i.e. the second line of every 4-line FASTQ record. A read containing
        the pattern more than once contributes more than one to the total.
    """
    # Pick the opener from the file extension so both plain and gzipped work.
    opener = open
    if str(file_path).endswith(".gz"):
        opener = gzip.open

    with opener(file_path, 'rt') as handle:  # 'rt' = read as text
        return sum(
            len(pattern.findall(text))
            for line_no, text in enumerate(handle)
            if line_no % 4 == 1  # sequence line of each record
        )

def _tally_fastq(fastq_path, pattern):
    """Count reads and pattern hits in one gzipped FASTQ file.

    Args:
        fastq_path: Path of a gzip-compressed FASTQ file.
        pattern: Compiled regular expression to search for.

    Returns:
        ``(total_reads, hits)``: the number of record header lines (every
        4th line, starting with "@") and the number of pattern matches found
        on the sequence lines.
    """
    total_reads = 0
    hits = 0
    with gzip.open(fastq_path, 'rt') as f:  # text mode so str methods / regex accept the lines
        for i, line in enumerate(f):
            if i % 4 == 0 and line.startswith("@"):
                total_reads += 1
            elif i % 4 == 1:  # sequence line
                hits += len(pattern.findall(line))
    return total_reads, hits


# Count spontaneous background signal from control sample
if not control_path.exists():
    raise FileNotFoundError(f"Control sample not found at: {control_path}")

control_reads, spont = _tally_fastq(control_path, pattern_edit)
print(f" Background signal (from control): {spont}")

# Express the background as a per-read rate. Subtracting the control's raw hit
# count from every sample is only meaningful when all files have the same
# number of reads; with unequal depth it over- or under-corrects (and can
# produce negative efficiencies for shallow samples).
background_rate = spont / control_reads if control_reads > 0 else 0.0

# Analyze all .merged.fastq.gz files in the directory (the control included)
results = []
for fastq_path in sorted(path.glob("*.merged.fastq.gz")):
    sample = fastq_path.name
    total_reads, edited_reads = _tally_fastq(fastq_path, pattern_edit)

    if total_reads > 0:
        # Sample edit rate minus control background rate, as a percentage
        efficiency = round((edited_reads / total_reads - background_rate) * 100, 3)
    else:
        # No reads: efficiency is undefined rather than zero
        efficiency = None

    results.append({
        "Sample": sample,
        "Total_Reads": total_reads,
        "Edited_Reads": edited_reads,
        "Efficiency(%)": efficiency
    })

 Background signal (from control): 0


In [17]:
# Convert to DataFrame, highest editing efficiency first.
# The explicit column list keeps the frame well-formed (and sortable) even when
# no FASTQ file was found and `results` is empty. Rows whose efficiency is
# None (no reads) are placed last by sort_values.
df = pd.DataFrame(
    results,
    columns=["Sample", "Total_Reads", "Edited_Reads", "Efficiency(%)"],
).sort_values(by="Efficiency(%)", ascending=False)

# Export as CSV
output_csv = path / "editing_output.csv"
df.to_csv(output_csv, index=False)

print(f"✅ Results exported to: {output_csv}")

# Show DataFrame in notebook: only the last expression of a cell is rendered,
# so the preview has to come at the end.
df.head()

✅ Results exported to: /Users/leandro/Desktop/github/NGS-data/SeV-ORF1_040725/merged/editing_output.csv
