In [2]:
# Parsing and exploration of FASTQ identifiers
import sys
import re
import os
import shutil

def splitIdentifiers(idFile):
    """Split every FASTQ identifier line of ``idFile`` into its components.

    Each line is expected to look like
    ``@ERR013138.1 IL39_4668:5:1:1035:1408/1``. An optional ``#<digits>``
    tag in front of the ``/`` is accepted by the pattern but is not stored.

    Args:
        idFile: Path of a text file holding one identifier per line.

    Returns:
        A dict mapping each component name ("whole_identifier", "names",
        "instrument", "flow_cell_lane", "flow_cell_number", "x_coord",
        "y_coord", "pair_member") to a list of strings. Every list has one
        entry per input line, in input order. An empty file yields empty
        lists.

    Raises:
        SystemExit: If a line does not match the pattern. The message is
            written to stderr and the exit status is non-zero.
    """
    # example id: @ERR013138.1 IL39_4668:5:1:1035:1408/1
    # Regex pattern based on
    # https://en.wikipedia.org/wiki/FASTQ_format#Illumina_sequence_identifiers
    # (interactive copy: https://regex101.com/r/RnuEOJ/3)
    # NOTE(review): the '.' inside the name group is unescaped, so it matches
    # any character rather than only a literal dot. It is left as-is so the
    # function keeps accepting exactly the same inputs as before.
    p = re.compile(r'(\@\w*.\d+)\s(\w+):(\d+):(\d+):(\d+):(\d+)(#\d+)*\/(\d+)$')

    # Output key -> regex group number. Group 7 (the optional '#<digits>'
    # tag) is matched but deliberately not stored.
    group_of = {
        "whole_identifier": 0,
        "names": 1,
        "instrument": 2,
        "flow_cell_lane": 3,
        "flow_cell_number": 4,
        "x_coord": 5,
        "y_coord": 6,
        "pair_member": 8,
    }
    id_components = {key: [] for key in group_of}

    with open(idFile, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            ident = line.rstrip('\n')
            match = p.match(ident)
            if not match:
                # A string argument makes sys.exit() print to stderr and exit
                # with status 1; a bare sys.exit() would report success.
                sys.exit(
                    "ERROR: Pattern matching failed on identifiers! "
                    f"(line {line_number}: {ident!r})"
                )
            for key, group in group_of.items():
                id_components[key].append(match.group(group))

    return id_components

def writeIdentifiers(id_components, dirName='identifierComponents'):
    """Write each component list to its own text file inside ``dirName``.

    One file named ``<key>.txt`` is produced per key of ``id_components``,
    holding one item per line. The default directory is the one that
    ``reassembleIdentifiers`` reads from.

    Args:
        id_components: Mapping of component name to an iterable of strings.
        dirName: Output directory. WARNING: if it already exists it is
            deleted together with everything in it, then recreated, so no
            stale files from an earlier run survive.
    """
    try:
        os.mkdir(dirName)
    except FileExistsError:
        shutil.rmtree(dirName)
        os.mkdir(dirName)

    for key, items in id_components.items():
        # Build the path from dirName so the directory that was created and
        # the directory that is written to can never disagree.
        with open(os.path.join(dirName, key + '.txt'), 'w') as f:
            f.writelines(item + '\n' for item in items)

def reassembleIdentifiers():
    """Rebuild identifiers from the per-component files as a sanity check.

    Reads ``identifierComponents/<component>.txt`` for every component
    (including ``whole_identifier``, which is loaded but not used in the
    output) and writes one rebuilt identifier per line to
    ``reassembled.txt`` in the current working directory.
    """
    component_files = (
        "whole_identifier",
        "names",
        "instrument",
        "flow_cell_lane",
        "flow_cell_number",
        "x_coord",
        "y_coord",
        "pair_member",
    )
    # Colon-separated fields, in the order they appear after the name.
    location_fields = (
        "instrument",
        "flow_cell_lane",
        "flow_cell_number",
        "x_coord",
        "y_coord",
    )

    columns = {}
    for component in component_files:
        with open(f'identifierComponents/{component}.txt', 'r') as handle:
            columns[component] = [row.rstrip('\n') for row in handle]

    with open('reassembled.txt', 'w') as out:
        # example id: @ERR013138.1 IL39_4668:5:1:1035:1408/1
        for idx, name in enumerate(columns["names"]):
            location = ':'.join(columns[field][idx] for field in location_fields)
            pair = columns["pair_member"][idx]
            out.write(f'{name} {location}/{pair}\n')

# Parse the identifiers and write one text file per component.
result = splitIdentifiers('identifiers.txt')
writeIdentifiers(result)
# reassembleIdentifiers() # Can do this to check parsing didn't mess up the files


# Print the first 5 records
for key in result:
    print(key, ' : ')
    # Slice rather than index 0..4 so an input with fewer than five
    # identifiers prints what exists instead of raising IndexError.
    for value in result[key][:5]:
        print(value)
    print()


whole_identifier  : 
@ERR013138.1 IL39_4668:5:1:1035:1408/1
@ERR013138.2 IL39_4668:5:1:1035:8133/1
@ERR013138.3 IL39_4668:5:1:1035:6544/1
@ERR013138.4 IL39_4668:5:1:1035:2520/1
@ERR013138.5 IL39_4668:5:1:1036:15336/1

names  : 
@ERR013138.1
@ERR013138.2
@ERR013138.3
@ERR013138.4
@ERR013138.5

instrument  : 
IL39_4668
IL39_4668
IL39_4668
IL39_4668
IL39_4668

flow_cell_lane  : 
5
5
5
5
5

flow_cell_number  : 
1
1
1
1
1

x_coord  : 
1035
1035
1035
1035
1036

y_coord  : 
1408
8133
6544
2520
15336

pair_member  : 
1
1
1
1
1

