In [2]:
import os
import re
from pathlib import Path
import pandas as pd
import gzip

In [18]:
# Directory holding the merged FASTQ files.
# Defaults to the original local location; set the NGS_MERGED_DIR environment
# variable to point the notebook at another copy of the data without editing
# this cell (an unset or empty variable falls back to the default).
path = Path(
    os.environ.get("NGS_MERGED_DIR")
    or "/Users/leandro/Desktop/github/NGS-data/SeV-ORF1_251003/merged"
)
control_filename = "control-index5_S5_L001.merged.fastq.gz"
control_path = path / control_filename

# Patterns to be searched for in each read (matched case-insensitively).
# Active target: ORF1 (wt = unedited sequence, edit = edited sequence).
pattern_wt = re.compile(r"GAGAACTTCCCCAATCTAGCAAGG", re.IGNORECASE)   # ORF1
pattern_edit = re.compile(r"GAGAACTTCCCCAACTTATCTAGC", re.IGNORECASE) # ORF1
# Alternative target: HEK3. To analyse it, comment out the two ORF1 lines
# above and uncomment the two lines below.
#pattern_wt = re.compile(r"ACTGAGCACGTGATGGCAGA", re.IGNORECASE) # HEK3
#pattern_edit = re.compile(r"ACTGAGCACGcttgTGATGGCAGA", re.IGNORECASE) # HEK3

def count_matches(file_path, pattern):
    """Count occurrences of ``pattern`` on the sequence lines of a FASTQ file.

    The file is read as text and treated as four-line records: only the
    second line of each record (line index ``i`` with ``i % 4 == 1``) is
    searched. Every non-overlapping match is counted, so a single read
    containing the pattern twice contributes 2.

    Parameters
    ----------
    file_path : str or path-like
        FASTQ file to scan. A name ending in ``.gz`` is opened with
        ``gzip.open``; anything else with the built-in ``open``.
    pattern : compiled regular expression
        Object providing ``findall``, e.g. the result of ``re.compile``.

    Returns
    -------
    int
        Total number of matches over all sequence lines (0 for an empty file).
    """
    # Choose the opener from the file extension.
    if str(file_path).endswith(".gz"):
        opener = gzip.open
    else:
        opener = open

    with opener(file_path, 'rt') as handle:
        return sum(
            len(pattern.findall(text))
            for line_no, text in enumerate(handle)
            if line_no % 4 == 1  # sequence line of each record
        )


# Classify the reads of every *.merged.fastq.gz file found in `path`.
# Each file contributes one summary record to `results`.
results = []

for fastq_path in sorted(path.glob("*.merged.fastq.gz")):
    sample = fastq_path.name
    total_reads = wt_reads = edited_reads = 0

    with gzip.open(fastq_path, "rt") as handle:
        for line_no, text in enumerate(handle):
            # Position within the four-line record: 0 = header, 1 = sequence.
            slot = line_no % 4
            if slot == 0:
                # Only header lines that start with "@" count as a read.
                if text.startswith("@"):
                    total_reads += 1
                continue
            if slot != 1:
                continue

            read = text.strip()
            # The edited pattern is tested first; a read is counted as
            # wild-type only when the edited pattern is absent.
            if pattern_edit.search(read):
                edited_reads += 1
            elif pattern_wt.search(read):
                wt_reads += 1

    # Efficiency = edited / (edited + wt), as a percentage rounded to three
    # decimals; None when no read matched either pattern.
    classified = edited_reads + wt_reads
    if classified > 0:
        efficiency = round((edited_reads / classified) * 100, 3)
    else:
        efficiency = None

    results.append(dict(
        sample=sample,
        total_reads=total_reads,
        edited_reads=edited_reads,
        wt_reads=wt_reads,
        Efficiency=efficiency,
    ))

In [19]:
# Convert to DataFrame.
# The columns are listed explicitly so the frame keeps its schema even when
# `results` is empty (no FASTQ files matched); without them an empty list
# gives a column-less frame and the sort below raises KeyError: 'Efficiency'.
df = pd.DataFrame(
    results,
    columns=["sample", "total_reads", "edited_reads", "wt_reads", "Efficiency"],
)
# Highest efficiency first; missing values are placed last by sort_values'
# default na_position.
df = df.sort_values(by="Efficiency", ascending=False)

# Export as CSV
output_csv = path / "editing_output.csv"
df.to_csv(output_csv, index=False)

print(f"✅ Results exported to: {output_csv}")

# Show DataFrame in notebook. Jupyter only renders a bare expression when it
# is the last statement of the cell, so the preview has to come at the end.
df.head()

✅ Results exported to: /Users/leandro/Desktop/github/NGS-data/SeV-ORF1_251003/merged/editing_output.csv
