In [None]:
import os
import pandas as pd
from pepars.utils import Illumina_FASTQ_File_Set
from pepars.utils import DNA as DNA_utils
from pepars.utils import FASTQ as FASTQ_utils
from pepars.analysis import DNA as DNA_analysis

from tqdm.notebook import tqdm

from Bio.Seq import reverse_complement 

This code is adapted from work by David Brown for the MCREATE paper, which was presented here: https://doi.org/10.1038/s41592-020-0799-7

The following code reads Illumina FASTQ files, extracts the sequences covering regions 452-458, 492-498, and 585-591 from each read pair, concatenates them into a single barcode, and counts how often each barcode occurs. The counts for each sample are then saved to a CSV file.

In [None]:
# Count how often each concatenated 452/492/585 barcode occurs in every sample
# and write one "<sample>_counts.csv" table per sample, most frequent first.
from collections import Counter

# Directory containing the raw FASTQ files.
file_path = 'raw'

# Directory (with trailing separator) that receives the per-sample CSV files.
path = "analysis/"

sample_names = ['2sll',
                '2svg',
                '3sll',
                '3svg']

# Read offsets of the three 21-nt windows that make up a barcode.
# NOTE(review): sequences[0] / sequences[1] are presumably the forward and
# reverse reads of a pair -- confirm against the pepars iterator.
REGION_452 = slice(29, 50)     # taken from sequences[0]
REGION_492 = slice(149, 170)   # taken from sequences[0]
REGION_585 = slice(27, 48)     # taken from sequences[1], reverse-complemented

# Create the output directory up front so a missing folder fails fast instead
# of after a full pass over the reads.
os.makedirs(path, exist_ok=True)

for index, sample in enumerate(sample_names):

    file_set = Illumina_FASTQ_File_Set(file_path, sample)

    counts = Counter()

    # Always release the FASTQ handles, even if iteration raises part-way.
    try:
        for sequences in tqdm(file_set.get_sequence_iterator()):

            seq_452 = sequences[0][REGION_452]
            seq_492 = sequences[0][REGION_492]
            seq_585 = reverse_complement(sequences[1][REGION_585])

            # NOTE(review): reads shorter than the windows above yield
            # truncated barcodes that are still counted, as before.
            seq = seq_452 + seq_492 + seq_585

            counts[seq] += 1
    finally:
        file_set.close()

    # most_common() sorts by count, descending; ties keep first-seen order,
    # matching a stable sort on the count with reverse=True.
    sorted_barcode_counts = counts.most_common()

    outfilename = path + sample + "_counts.csv"

    df = pd.DataFrame(sorted_barcode_counts, columns=['barcode', 'counts'])
    df.to_csv(outfilename, index=False)

    # Progress marker: zero-based position of the sample just finished.
    print(index)