This example shows how to find the sequences in a FASTQ file that match a particular template, and how to convert them into a list of sequence counts.

In [None]:
import os

from pepars.fileio import fileio
from pepars.utils import FASTQ_File_Set
from pepars.alignment.Perfect_Match_Aligner import Perfect_Match_Aligner

In [None]:
# Example FASTQ data used for testing: the remote source and the local path it is saved to
REMOTE_FILE_URL = "https://caltech.box.com/shared/static/5a1zi1pawtn1x15tupr1pub01wqa5kfg.gz"
FASTQ_FILE_PATH = os.path.join("data", "example_reads.fastq.gz")

# Fetch the file; the download only happens if the file doesn't already exist locally
fileio.download_remote_file(
    REMOTE_FILE_URL,
    FASTQ_FILE_PATH,
)

In [None]:
# Template that reads are aligned against. Legend:
#   - Any IUPAC nucleotide code is allowed; degenerate nucleotides are treated as
#     part of the "variant" region
#   - X is a special character: that position is ignored
#   - I is a special character: it marks the location of a UID, and duplicate
#     UIDs should be counted as a single read
TEMPLATE = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXGAGTGCCCAANNKNNKNNKNNKNNKNNKNNKGCACAGGCGCXXXXXXXXXXXXXXXXXXXX"

# If any element of the variant sequence has a quality below this threshold, it is thrown away
VARIANT_QUALITY_THRESHOLD = 30

# A mismatch against the template is still counted as a match when its quality
# is below this threshold
TEMPLATE_MISMATCH_QUALITY_THRESHOLD = 20

In [None]:
# The aligner is given a list of file sets; here there is a single set
# holding just the one downloaded FASTQ file
FASTQ_file_sets = [
    FASTQ_File_Set([FASTQ_FILE_PATH]),
]

In [None]:
aligner = Perfect_Match_Aligner()

# Quality thresholds handed to the aligner, keyed by parameter name
alignment_parameters = dict(
    variant_sequence_quality_threshold=VARIANT_QUALITY_THRESHOLD,
    mismatch_quality_threshold=TEMPLATE_MISMATCH_QUALITY_THRESHOLD,
)

# Run the alignment; the call yields the per-sequence counts and a mapping of statistics.
# NOTE(review): `print` is passed as the fourth argument, presumably as a
# progress/logging callback - confirm against Perfect_Match_Aligner.align
sequence_counts, statistics = aligner.align(TEMPLATE, FASTQ_file_sets, alignment_parameters, print)

In [None]:
# Report each alignment statistic on its own line as "<name>: <value>",
# with the value shown to four decimal places
for key, value in statistics.items():
    print(key, "%.4f" % value, sep=": ")