In [1]:
from Bio.Seq import Seq
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import numpy as np
import re
import os
import gzip

In [9]:
i5_index = {
    "D501": "AGGCTATA",
    "D502": "GCCTCTAT",
    "D503": "AGGATAGG",
    "D504": "TCAGAGCC",
    "D505": "CTTCGCCT",
    "D506": "TAAGATTA",
    "D507": "ACGTCCTG",
    "D508": "GTCAGTAC"
}

for k, v in i5_index.items():
    i5_index[k] = str(Seq(v).reverse_complement())
    
i7_index = {
    "D701": "ATTACTCG",
    "D702": "TCCGGAGA", 
    "D703": "CGCTCATT", 
    "D704": "GAGATTCC", 
    "D705": "ATTCAGAA",
    "D706": "GAATTCGT",
    "D707": "CTGAAGCT",
    "D708": "TAATGCGC",
    "D709": "CGGCTATG",
    "D710": "TCCGCGAA",
    "D711": "TCTCGCGC",
    "D712": "AGCGATAG"
}

In [11]:
# BPS double barcode cross validation
dual_indices = [i7_index["D701"] + "_" +i5_index["D501"],
               i7_index["D702"] + "_" +i5_index["D502"],
               i7_index["D703"] + "_" +i5_index["D503"],
               i7_index["D704"] + "_" +i5_index["D504"],
               i7_index["D705"] + "_" +i5_index["D505"], 
               i7_index["D706"] + "_" +i5_index["D506"],
               i7_index["D707"] + "_" +i5_index["D507"],
               i7_index["D708"] + "_" +i5_index["D508"],
               i7_index["D712"] + "_" +i5_index["D508"],
               i7_index["D711"] + "_" +i5_index["D507"],
               i7_index["D710"] + "_" +i5_index["D506"],
               i7_index["D709"] + "_" +i5_index["D505"],
               i7_index["D708"] + "_" +i5_index["D504"],
               i7_index["D707"] + "_" +i5_index["D503"],
               i7_index["D706"] + "_" +i5_index["D502"],
               i7_index["D705"] + "_" +i5_index["D501"]
               ]



In [12]:
for i in dual_indices:
    vars()[i] = open(i + ".txt", "w")
f_file = FastqGeneralIterator(gzip.open("./Undetermined_S0_L001_R1_001.fastq.gz", "rt"))
r_file = FastqGeneralIterator(gzip.open("./Undetermined_S0_L001_R2_001.fastq.gz", "rt"))
seqtag_pos = 0
f_multitag_pos = 8
f_barcode_pos = 57
r_multitag_pos = 7
r_barcode_pos = 32
f_barcode_length = 34
r_barcode_length = 27

recipient_re = re.compile('\D*?(.ACA|G.CA|GA.A|GAC.)\D{4,7}?AA\D{4,7}?AA\D{4,7}?TT\D{4,7}?(.TCG|C.CG|CT.G|CTC.)\D*')
donor_re = re.compile('\D*?(.GGC|T.GC|TG.C|TGG.)\D{4,7}?AA\D{4,7}?TT\D{4,7}?(.CGG|G.GG|GC.G|GCG.)\D*')
recipient_f_clipper = re.compile('(.ACA|G.CA|GA.A|GAC.)')
donor_f_clipper = re.compile('(.GGC|T.GC|TG.C|TGG.)')
recipient_r_clipper = re.compile('(.CTC|G.TC|GC.C|GCT.)')
donor_r_clipper = re.compile('(.GCG|G.CG|GG.G|GGC.)')

min_qs = 25
quality_bps_count = 0

for f_record, r_record in zip(f_file, r_file):
    
    fr = f_record[1]
    rr = r_record[1]
    fq = [ord(i) -33 for i in list(f_record[2])]
    rq = [ord(i) - 33 for i in list(r_record[2])]
    recipient_tag_grep = recipient_re.match(str(fr)[f_barcode_pos : f_barcode_pos + f_barcode_length])
    donor_tag_grep = donor_re.match(str(rr)[r_barcode_pos : r_barcode_pos + r_barcode_length])
    
    if recipient_tag_grep is not None and donor_tag_grep is not None:
        if np.mean(fq[recipient_tag_grep.start() : recipient_tag_grep.end()]) >= min_qs and \
        np.mean(rq[donor_tag_grep.start() : donor_tag_grep.end()]) >= min_qs:
            index_read = f_record[0][-17:].replace("+", "_")
            f_multitag = fr[f_multitag_pos : f_multitag_pos + 6]
            f_seqtag = fr[0:8]
            r_seqtag = rr[0:7]
            r_multitag = rr[r_multitag_pos : r_multitag_pos + 9]
            quality_bps_count += 1
            if index_read in dual_indices:
                vars()[index_read].write(recipient_tag_grep.group()[4:-4] + "," + donor_tag_grep.group()[4:-4] 
                                     + "," + f_multitag + "," + r_multitag + "," + f_seqtag + r_seqtag + "\n")
#             vars()[index_read].write(rr + "\n")


for i in dual_indices:
    vars()[i].close()

In [140]:

print(quality_bps_count)

3790468
