In [None]:
import os
import pandas as pd
from pepars.utils import Illumina_FASTQ_File_Set
from pepars.utils import DNA as DNA_utils
from pepars.utils import FASTQ as FASTQ_utils
from pepars.analysis import DNA as DNA_analysis

from tqdm.notebook import tqdm

from Bio.Seq import reverse_complement 

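For the CAP-B analysis, we count how many reads contain each barcode sequence within each sequenced library. The FASTQ files are read from a folder named `raw`; for each file, the sample name (the text before the first underscore in the file name) must be listed in the `sample_names` variable. One `<sample>_counts.csv` file per sample is written to the `analysis` folder, and these output files can then be analyzed with `analysis_and_plotting.ipynb`.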

In [None]:
# Count how often each barcode occurs in every sample's FASTQ files and write
# one "<sample>_counts.csv" per sample, sorted from most to least frequent.
from collections import Counter

# Folder containing the raw FASTQ files
file_path = 'raw'

# Sample names used to locate each sample's FASTQ files in `file_path`
sample_names = ['CAP-B-pool',
                'CREATE-Brain-1',
                'CREATE-Brain-2',
                'CREATE-Liver-1',
                'CREATE-Liver-2',
                'T7-Brain-1',
                'T7-Brain-2',
                'T7-Liver-1',
                'T7-Liver-2']

# Output folder; create it up front so a missing folder does not make
# `to_csv` fail only after all reads of the first sample have been processed.
path = "analysis/"
os.makedirs(path, exist_ok=True)

index = 0

for sample in sample_names:

    file_set = Illumina_FASTQ_File_Set(file_path, sample)

    try:
        # This code determines which nucleotides are examined as counts:
        # characters 29..49 (21 characters) of the first element of each
        # item yielded by the iterator.
        # NOTE(review): presumably `sequences[0]` is the read-1 sequence
        # string -- confirm. If so, reads shorter than 50 characters would
        # yield truncated or empty barcodes, which are counted as-is.
        counts = Counter(
            sequences[0][29:50]
            for sequences in tqdm(file_set.get_sequence_iterator())
        )
    finally:
        # Always release the FASTQ files, even if reading fails part-way
        file_set.close()

    # We can sort the sequences by prevalence (most frequent first; barcodes
    # with equal counts keep the order in which they were first encountered)
    sorted_barcode_counts = counts.most_common()

    # Save the file
    outfilename = path + sample + "_counts.csv"

    df = pd.DataFrame(sorted_barcode_counts, columns=['barcode', 'counts'])
    df.to_csv(outfilename, index=False)

    # Print progress: zero-based position of the sample that just finished
    print(index)
    index += 1