In [1]:
# Standard library: os builds portable file paths, islice caps how much of an iterator is consumed
import os
from itertools import islice

# Import the SeqIO module from the Biopython library for reading sequence files
from Bio import SeqIO

# --- 1. Define the path to our data file ---
# Start with the first file of the read pair (_1.fastq).
# os.path.join builds the path with the separator of the current operating system.
file_path = os.path.join('..', 'data', '01_raw', 'SRR26872904_1.fastq')

# Number of records to preview. Defined once so the printed message, the loop
# limit and the stored list cannot drift apart.
N_PREVIEW_RECORDS = 5

# --- 2. Parse the FASTQ file and inspect the first few records ---
# Only a handful of records are printed for a quick look; the file is reported
# to contain over 1.5 million sequences, so printing everything is not an option.

print(f"Inspecting the first {N_PREVIEW_RECORDS} records from: {file_path}\n")

# The previewed records are stored so they can be analysed later.
# The variable name is kept unchanged in case other cells refer to it.
first_five_records = []

# SeqIO.parse() yields records one at a time instead of loading the whole file.
# islice() ends the iteration after N_PREVIEW_RECORDS items, so no record beyond
# the preview is parsed (a manual counter + break would read one extra record).
with open(file_path, "r") as handle:
    preview = islice(SeqIO.parse(handle, "fastq"), N_PREVIEW_RECORDS)
    for record_number, record in enumerate(preview, start=1):
        print("--- Record", record_number, "---")
        print("ID:", record.id)
        print("Sequence:", record.seq)
        print("Length:", len(record.seq))
        # letter_annotations["phred_quality"] holds one quality score per base
        print("Quality Scores (first 20):", record.letter_annotations["phred_quality"][:20])
        print("\n")
        first_five_records.append(record)

# --- 3. Initial Summary ---
# Count every record in the file to get a sense of its size.
# The whole file is parsed, so this step may take a while.
print("Calculating total number of sequences in the file...")

# Open the file again and tally the records one at a time as they are parsed,
# so the full file is never held in memory.
with open(file_path, "r") as handle:
    fastq_records = SeqIO.parse(handle, "fastq")
    total_records = sum(1 for _ in fastq_records)

print(f"\nTotal number of sequences in the file: {total_records}")



Inspecting the first 5 records from: ..\data\01_raw\SRR26872904_1.fastq

--- Record 1 ---
ID: SRR26872904.1
Sequence: CNGTCACGTTCGTTGCTTTCAAGGATGACAGACTCAGTCTGACAGTTGTTTATAACAACCCGACCTATGTCTTTTTACATCCTATTATCTGAAACGCAATTAAGCAGACAGATCACTCCACGAACTAAGAACGGCCATGCACCACCTATCA
Length: 151
Quality Scores (first 20): [37, 2, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37]


--- Record 2 ---
ID: SRR26872904.2
Sequence: ANGGGCACCACAAGAACGCGTGGAGCGTGTGGCTTAATTTGACTCAACGCGGGAAATCTTACCGGGTCCGGACACACTGAGGATTGACAGATATATGCACGTTCACGCCTTCGGGTGTGCGTGCTTAAAGATGCTAGTTCTTTCATGATTA
Length: 151
Quality Scores (first 20): [37, 2, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37]


--- Record 3 ---
ID: SRR26872904.3
Sequence: CNGTCACGTTCGTTGCCTTCAGGATGAGTGTATCACGACACCGCAGCGGGGCCACTCACAGTTAAGCGAGTGGGGAACCCCACGCGGTACGCTAAATGAATACCATCCATTGCGAAACGCAATTAAGCAGACAGATCACTCCACGAACTAA
Length: 151
Quality Scores (first 20): [37, 2, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,