In [1]:
import gzip
import itertools
import subprocess
import os
import re
import progressbar
import time
from itertools import izip_longest, izip, islice, tee, repeat

In [2]:
__author__='Maggie Ruimin Sun'

Reference source code: `umitag.py`, written by Martin Aryee.

Source code can be found at https://github.com/aryeelab/umi/blob/dev/umitag.py

In [72]:
# Directory holding the undetermined input FASTQs, and the directory that
# receives the UMI-tagged output. Both end with a path separator.
source = '/home/yaneng/RSun/Data/qiagen-colon/undetermined/'
out_dir = '/home/yaneng/RSun/Data/qiagen-colon/umi_tagged/'

# Inputs: the two paired-end read files and the two index-read files.
read1 = os.path.join(source, 'QIAGEN-2959YJ_S2_L001_R1_001_undetermined.fq')
read2 = os.path.join(source, 'QIAGEN-2959YJ_S2_L001_R2_001_undetermined.fq')
index1 = os.path.join(source, 'QIAGEN-2959YJ_S2_L001_I1_001_undetermined.fq')
index2 = os.path.join(source, 'QIAGEN-2959YJ_S2_L001_I2_001_undetermined.fq')

# Outputs: UMI-tagged R1 and R2 files.
read1_out = os.path.join(out_dir, 'QIAGEN-2959YJ_S2_L001_R1_001_umi.fq')
read2_out = os.path.join(out_dir, 'QIAGEN-2959YJ_S2_L001_R2_001_umi.fq')

In [32]:
def read_fq(filename):
    """Iterate over a FASTQ file, yielding one four-line record at a time.

    Files whose name ends in ``.gz`` are opened with ``gzip``; every other
    file is opened with the built-in ``open``.

    Args:
        filename: Path of the FASTQ file to read.

    Yields:
        A list ``[l1, l2, l3, l4]`` with four consecutive lines exactly as
        read (line terminators are kept). Iteration ends when the first line
        of a record comes back empty (end of file). If the file ends in the
        middle of a record, the missing lines are yielded as empty strings.
    """
    # Literal suffix test: a regex such as '.gz$' treats the dot as a
    # wildcard and would also send names like 'reads_fqgz' through gzip.
    if filename.endswith('.gz'):
        fastq = gzip.open(filename, 'rb')
    else:
        fastq = open(filename)

    with fastq as f:
        while True:
            l1 = f.readline()
            if not l1:
                break
            l2 = f.readline()
            l3 = f.readline()
            l4 = f.readline()

            yield [l1, l2, l3, l4]

In [68]:
def get_umi(r1, r2, i1, i2, prefix_len=6):
    """Build the UMI tag for one read pair.

    The tag joins the index-2 sequence line (used as the molecular barcode)
    with the leading bases of the read-1 and read-2 sequence lines.

    Args:
        r1: Read-1 record; item 1 is its sequence line.
        r2: Read-2 record; item 1 is its sequence line.
        i1: Index-1 record. Not used; kept so the signature stays the same.
        i2: Index-2 record; item 1 (stripped) is the molecular barcode.
        prefix_len: Number of leading bases taken from each read (default 6).

    Returns:
        The string ``'(<barcode>_<r1 prefix>_<r2 prefix>)'``. A read with
        fewer than ``prefix_len`` bases contributes ``'N' * prefix_len``
        instead of a prefix.
    """
    molecular_barcode = i2[1].strip()

    prefixes = []
    for record in (r1, r2):
        # Strip before measuring so the line terminator ('\n' or '\r\n', or
        # none on the last line of a file) never counts as a base and never
        # ends up inside the tag.
        bases = record[1].strip()
        if len(bases) < prefix_len:
            prefixes.append('N' * prefix_len)
        else:
            prefixes.append(bases[:prefix_len])

    return '(%s_%s_%s)' % (molecular_barcode, prefixes[0], prefixes[1])

In [70]:
def umitag(read1, read2, index1, index2, read1_out, read2_out, out_dir):
    """Append a UMI tag to every read header and write the reads sorted by tag.

    The four input files are read in lock-step with ``read_fq``. For each
    record set, ``get_umi`` builds a tag that is appended (after a space) to
    the R1 and R2 header lines. Tagged records go to ``<output>.tmp`` files,
    which are then sorted by an external ``paste | sort | tr`` pipeline into
    the final outputs and removed. The number of processed read pairs is
    printed.

    Record sets whose R1 or R2 header does not start with ``'@'`` are also
    logged to ``error_reads.fastq`` inside ``out_dir``; they are still written
    to the tagged outputs.

    Args:
        read1: Path of the R1 FASTQ file.
        read2: Path of the R2 FASTQ file.
        index1: Path of the index-1 FASTQ file.
        index2: Path of the index-2 FASTQ file.
        read1_out: Path of the sorted, tagged R1 output.
        read2_out: Path of the sorted, tagged R2 output.
        out_dir: Directory for ``error_reads.fastq``; created if missing.

    Raises:
        subprocess.CalledProcessError: If a sort pipeline returns a non-zero
            exit status.
    """
    # shlex.quote exists on Python 3; Python 2 ships the same helper as
    # pipes.quote.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    r1_umitagged_unsorted_file = read1_out + '.tmp'
    r2_umitagged_unsorted_file = read2_out + '.tmp'
    # os.path.join works whether or not out_dir ends with a separator.
    error_file = os.path.join(out_dir, 'error_reads.fastq')

    # itertools.izip on Python 2; the built-in zip is already lazy on Python 3.
    lazy_zip = getattr(itertools, 'izip', zip)
    records = lazy_zip(read_fq(read1), read_fq(read2),
                       read_fq(index1), read_fq(index2))

    # Create UMI-tagged R1 and R2 FASTQs. The with-block closes all three
    # handles (including the error log) even if tagging raises.
    Nreads = 0
    with open(r1_umitagged_unsorted_file, 'w') as r1_umitagged, \
            open(r2_umitagged_unsorted_file, 'w') as r2_umitagged, \
            open(error_file, 'w') as ferr:
        for r1, r2, i1, i2 in records:
            molecular_id = get_umi(r1, r2, i1, i2)
            if r1[0][0] != '@' or r2[0][0] != '@':
                # Log the malformed pair as tab-separated R1/R2 columns.
                ferr.write(str(Nreads)+'\t'+r1[0].strip()+'\t'+r2[0].strip()+'\t'+molecular_id+'\n')
                ferr.write(r1[1].strip()+'\t'+r2[1].strip()+'\n')
                ferr.write('+\t+\n')
                ferr.write(r1[3].strip()+'\t'+r2[3].strip()+'\n')

            r1[0] = '%s %s\n' % (r1[0].rstrip(), molecular_id)
            r2[0] = '%s %s\n' % (r2[0].rstrip(), molecular_id)
            for line in r1:
                r1_umitagged.write(line.strip()+'\n')

            for line in r2:
                r2_umitagged.write(line.strip()+'\n')
            Nreads += 1
    print(Nreads)

    # Sort FASTQs based on molecular barcodes: join each 4-line record onto
    # one line, sort on whitespace field 3 then field 1, and split it back.
    # NOTE(review): field 3 is the UMI tag only if the original header holds
    # exactly one whitespace-separated comment field - confirm for the inputs.
    sort_jobs = ((r1_umitagged_unsorted_file, read1_out),
                 (r2_umitagged_unsorted_file, read2_out))
    for unsorted_file, sorted_file in sort_jobs:
        # Quote the paths so spaces or shell metacharacters cannot break the
        # command line.
        cmd = 'cat '+quote(unsorted_file) + ' | paste - - - - | sort -k 3,3 -k 1,1 | tr "\t" "\n" > ' + quote(sorted_file)
        subprocess.check_call(cmd, shell=True, env=os.environ.copy())

    os.remove(r1_umitagged_unsorted_file)
    os.remove(r2_umitagged_unsorted_file)

In [11]:
def main():
    """Run UMI tagging on the input and output paths configured in this notebook."""
    # Pass out_dir unchanged: it is already a full path and is the directory
    # that read1_out/read2_out live in. Concatenating it onto `source` would
    # create an unrelated nested directory and leave the real output
    # directory uncreated.
    umitag(read1, read2, index1, index2, read1_out, read2_out, out_dir)

In [69]:
# Start the tagging run only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

1608349
