In [1]:
import os
import re
from pathlib import Path
import pandas as pd

In [11]:
# Directory holding the merged FASTQ files, plus the control sample inside it
path = Path("/Users/leandrojorqueravalero/Desktop/PhD/Miseq/RT_test-v2/feb-INF/merged")
control_filename = "feb-control_S24_L001.assembled.fastq"
control_path = path.joinpath(control_filename)

# Active search pattern; compiled case-insensitively so upper/lower case in the
# literal below does not affect matching
pattern_edit = re.compile(r"gagcACGGCAGAgCTTGg", flags=re.IGNORECASE)

# Alternative patterns, currently disabled (uncomment to use):
#pattern_wt_OOF = re.compile(r"gagcACGTGATGGCAGAg", re.IGNORECASE)
#pattern_edit_OOF = re.compile(r"gagcACGcttgTGATGGCAGAg", re.IGNORECASE)
#pattern_wt_INF = re.compile(r"gagcACGGCAGAggaaag", re.IGNORECASE)

# Function to count pattern matches in sequence lines only (line 2 in each FASTQ entry)
def count_matches(file_path, pattern):
    """Count occurrences of ``pattern`` on the sequence lines of a FASTQ file.

    The file is read as plain text and assumed to use 4-line records; only
    the second line of each record (0-based line index with ``i % 4 == 1``)
    is searched.

    Args:
        file_path: Path of the FASTQ file to scan.
        pattern: Compiled regular expression to look for.

    Returns:
        Total number of non-overlapping matches summed over all sequence
        lines (a line containing the pattern twice contributes 2).
    """
    with open(file_path, 'r') as handle:
        return sum(
            len(pattern.findall(record_line))
            for line_no, record_line in enumerate(handle)
            if line_no % 4 == 1  # sequence line of the record
        )

def _tally_fastq(fastq_path, pattern):
    """Return ``(total_reads, pattern_matches)`` for one FASTQ file.

    Assumes 4-line records. A read is counted when the first line of a record
    starts with "@"; matches are counted on the second (sequence) line only.
    """
    total_reads = 0
    pattern_matches = 0
    with open(fastq_path, 'r') as f:
        for i, line in enumerate(f):
            if i % 4 == 0 and line.startswith("@"):
                total_reads += 1
            elif i % 4 == 1:  # sequence line
                pattern_matches += len(pattern.findall(line))
    return total_reads, pattern_matches


# Count spontaneous background signal from control sample
if not control_path.exists():
    raise FileNotFoundError(f"Control sample not found at: {control_path}")

control_reads, spont = _tally_fastq(control_path, pattern_edit)
# Background is expressed as a per-read rate so it can be subtracted from
# samples sequenced at a different depth than the control. Subtracting the
# raw control count would over- or under-correct whenever read totals differ.
spont_rate = spont / control_reads if control_reads > 0 else 0.0
print(f" Background signal (from control): {spont}")

# Analyze all .assembled.fastq files (the control itself is included and
# should come out at ~0 % after background subtraction)
results = []
for fastq_path in sorted(path.glob("*.assembled.fastq")):
    sample = fastq_path.name
    total_reads, edited_reads = _tally_fastq(fastq_path, pattern_edit)

    if total_reads > 0:
        # Background-corrected percentage of reads carrying the pattern
        efficiency = round((edited_reads / total_reads - spont_rate) * 100, 3)
    else:
        # No reads in this file: efficiency is undefined
        efficiency = None

    results.append({
        "Sample": sample,
        "Total_Reads": total_reads,
        "Edited_Reads": edited_reads,
        "Efficiency(%)": efficiency
    })

 Background signal (from control): 0


In [12]:
# Convert to DataFrame, highest efficiency first. Columns are given explicitly
# so that an empty `results` list still yields a frame with the expected
# columns instead of making sort_values raise KeyError.
df = pd.DataFrame(
    results,
    columns=["Sample", "Total_Reads", "Edited_Reads", "Efficiency(%)"],
)
df = df.sort_values(by="Efficiency(%)", ascending=False)

# Export as CSV next to the input files
output_csv = path / "editing_output.csv"
df.to_csv(output_csv, index=False)

print(f"✅ Results exported to: {output_csv}")

# Show DataFrame in notebook. This must be the last expression of the cell:
# Jupyter only renders the value of the final expression, so a bare
# `df.head()` in the middle of the cell is silently discarded.
df.head()

✅ Results exported to: /Users/leandrojorqueravalero/Desktop/PhD/Miseq/RT_test-v2/feb-INF/merged/editing_output.csv
