In [33]:
import gzip
import glob
import re
import pickle
import pandas as pd
from Bio import SeqIO 
from collections import defaultdict

In [34]:
def read2bcd(seqs, gzfile):
    """Map each read of a gzipped FASTQ file to the barcodes it contains.

    Parameters
    ----------
    seqs : iterable of str
        Barcode sequences to look for. Duplicates are ignored.
    gzfile : str
        Path to a gzip-compressed FASTQ file.

    Returns
    -------
    dict
        ``{read.name: [barcode, ...]}`` containing only the reads in which
        at least one barcode occurs as an exact substring. For every read
        the barcodes are listed in the order they first appear in ``seqs``.
    """
    # dict.fromkeys de-duplicates like set() would, but keeps the caller's
    # order. Iterating a set of str can differ between interpreter runs
    # (hash randomisation), which made the per-read barcode order - and
    # therefore any file written from it - irreproducible.
    barcodes = tuple(dict.fromkeys(seqs))
    r2b = {}

    with gzip.open(gzfile, 'rt') as fastq:
        for read in SeqIO.parse(fastq, 'fastq'):
            read_seq = str(read.seq)  # convert Bio.Seq to str once per read
            hits = [bcd for bcd in barcodes if bcd in read_seq]
            if hits:
                # extend (not assign) so a repeated read name keeps accumulating hits
                r2b.setdefault(read.name, []).extend(hits)
    return r2b

In [35]:
# Label -> barcode sequence lookup (every barcode below is 20 nt long).
tfBcd = dict(
    HNF1B='ATCATATTGAATATGAAGCG',
    SOX17='TTTTTGTGAGTTTCAATACG',
    FOXA2='TAAGGTAAGTTTATTCCGGA',
    PDX1='ATCTCCTAAATTGCCGCTCC',
    SOX9='TGGATTACTTTGATAGCAGG',
    HNF6='TCTAGAATAAAAGCCTTTGT',
    GFP='GTACCGAGCTCGAATTCCAG',
)

In [36]:
# Glob pattern for the R2 FASTQ file(s) inside every Sample_Cust-* directory.
bcd_R2 = "Sample_Cust-*/*R2_001.fastq.gz"

In [38]:
# For each sample, collect the reads containing TF barcodes.
# Result: sm2r2b[sample] -> {read name: [barcode, ...]}

sm2r2b = defaultdict(dict)

# sorted() gives a reproducible processing order; glob.glob makes no ordering promise.
for R2 in sorted(glob.glob(bcd_R2)):
    sample = re.search('Sample_Cust-(.+?)/', R2).group(1)

    # Touch the key first so a sample without any barcode hit still gets an
    # (empty) entry, exactly as a plain assignment would have produced.
    sample_hits = sm2r2b[sample]

    # Merge rather than assign: the pattern can match several R2 files in one
    # sample directory, and assigning kept only the last file's reads.
    for read_name, barcodes in read2bcd(tfBcd.values(), R2).items():
        sample_hits.setdefault(read_name, []).extend(barcodes)
    

In [39]:
# Save every sample's read -> barcode hits as '<sample>_read2TF.tsv':
# a header line followed by one 'read<TAB>barcode' row per hit.
for sm, r2b in sm2r2b.items():
    with open(sm+'_read2TF.tsv', 'w') as f:
        f.write('Read\tBarcode\n')
        f.writelines(
            f'{read}\t{barcode}\n'
            for read, barcodes in r2b.items()
            for barcode in barcodes
        )

In [40]:
# Glob pattern for the R1 FASTQ file(s) inside every Sample_Cust-* directory.
bcd_R1 = "Sample_Cust-*/*R1_001.fastq.gz"

In [42]:
# Now map reads to cells.
# The cell barcode is the first 16 bp of the read.
# Result: sm2r2c[sample] -> {read name: [cell barcode, ...]}

CELL_BCD_LEN = 16  # number of leading bases taken as the cell barcode

sm2r2c = defaultdict(dict)

# sorted() gives a reproducible processing order; glob.glob makes no ordering promise.
for R1 in sorted(glob.glob(bcd_R1)):
    sample = re.search('Sample_Cust-(.+?)/', R1).group(1)

    # Accumulate into the sample's existing dict: the pattern can match several
    # R1 files in one sample directory, and assigning a fresh dict per file
    # kept only the reads of the last file.
    r2c = sm2r2c[sample]

    with gzip.open(R1, 'rt') as fastq:
        for read in SeqIO.parse(fastq, 'fastq'):
            r2c.setdefault(read.name, []).append(read.seq[:CELL_BCD_LEN])
        

In [43]:
# Save every sample's read -> cell-barcode mapping as '<sample>_read2cell.tsv':
# a header line followed by one 'read<TAB>barcode' row per entry.
for sm, r2c in sm2r2c.items():
    with open(sm+'_read2cell.tsv', 'w') as f:
        f.write('Read\tBarcode\n')
        f.writelines(
            f'{read_c}\t{barcode_c}\n'
            for read_c, barcodes_c in r2c.items()
            for barcode_c in barcodes_c
        )
