In [1]:
# Import the 'SeqIO' module from the Biopython library
# SeqIO is designed for reading and writing sequence files
from Bio import SeqIO
import os

# islice lets us take just the first few records from the parser without
# a hand-rolled counter/break (standard library, no extra install needed).
from itertools import islice

# --- 1. Define the path to our data file ---
# The path is relative: from the 'notebooks' folder we go 'up' one level (..)
# and then down into 'data/raw'. It only resolves if the notebook's working
# directory is the 'notebooks' folder.
file_path = os.path.join("..", "data", "raw", "SRR35536287_1.fastq")

# --- 2. Set a limit and initialize a counter ---
# The file holds far too many reads to print them all, so we only preview
# the first few.
limit = 5
preview_len = 60  # how many bases / quality scores to show per record
count = 0         # stays 0 if the file contains no records

print(f"--- Reading first {limit} sequences from the file ---")

# --- 3. Loop through the FASTQ file ---
# SeqIO.parse reads a sequence file one record at a time, so the whole file
# is never loaded into memory. We open the file ourselves in a 'with' block
# so the handle is closed as soon as we stop reading, instead of relying on
# the abandoned parser being cleaned up later.
with open(file_path) as handle:
    first_records = islice(SeqIO.parse(handle, "fastq"), limit)

    # enumerate(..., start=1) makes 'count' the number of records printed.
    for count, record in enumerate(first_records, start=1):
        # Print the information for the current record
        print(f"ID: {record.id}")
        print(f"Sequence: {record.seq[:preview_len]}...")  # first 60 letters
        print(f"Length: {len(record.seq)} bp")  # bp stands for base pairs

        # Get the quality scores for the same leading bases
        quality_scores = record.letter_annotations["phred_quality"][:preview_len]
        print(f"Quality: {quality_scores}")
        print("-" * 30)  # A separator for readability

print(f"\nSuccessfully read {count} records.")

--- Reading first 5 sequences from the file ---
ID: SRR35536287.1
Sequence: GGTACTGGTTGAACAGTATATCCTCCGTTATCATCCTTTCCACTAGCATTTCCTTTTTCT...
Length: 301 bp
Quality: [12, 12, 32, 27, 34, 37, 37, 23, 36, 34, 31, 12, 34, 26, 11, 11, 34, 27, 37, 34, 35, 36, 37, 34, 36, 37, 10, 34, 36, 22, 34, 34, 11, 11, 27, 12, 36, 27, 31, 31, 34, 11, 34, 11, 11, 11, 26, 11, 11, 27, 11, 11, 11, 34, 11, 11, 26, 36, 36, 37]
------------------------------
ID: SRR35536287.2
Sequence: GGTACAGGATGAACAGTTTATCCTCCTTTGTGTTCGATTACTTGCCATTGTTCTCCTTCA...
Length: 301 bp
Quality: [24, 34, 34, 34, 34, 37, 37, 23, 11, 34, 34, 12, 34, 34, 11, 34, 36, 37, 37, 34, 34, 36, 37, 36, 37, 37, 38, 38, 38, 11, 11, 11, 28, 27, 37, 21, 11, 34, 34, 11, 26, 34, 11, 11, 21, 34, 11, 31, 11, 11, 27, 34, 34, 32, 34, 34, 37, 11, 26, 11]
------------------------------
ID: SRR35536287.3
Sequence: GGTACTGGTTGAACAGTATATCCTCCGTTATCAGCCTTTCCACAAGCAATTCCTGTTTCT...
Length: 299 bp
Quality: [12, 32, 34, 34, 34, 37, 35, 36, 36, 36, 21, 31, 27, 26, 27,