In [17]:
# This program processes the sequenced data strands and translates them into ASCII text.
# Data sequence processing workflow:
# 1. Identify and exclude the PCR primers. The PCR primers also correspond to the Chapter number.
# 2. Apply RS (Reed-Solomon) correction to the data sequence and extract the data sequence.
# 3. Decode the segment number (the first four bases after the forward primer).
# 4. Convert the data sequence to hex data.
# 5. Convert the hex data to text.


from reedsolo import RSCodec, ReedSolomonError

# Shared Reed-Solomon codec used to correct each read further down.
# Per the reedsolo API the first argument is the number of error-correction
# symbols (bytes) per message; the decoding loop appends exactly two RS bytes
# (16 bits) to every message before calling rsc.decode().
rsc = RSCodec(2)

%store -r primerLibrary
%store -r referenceStrands
%store -r array_data_payload

def converter(seq):
    """Encode a DNA sequence as a bit string, two bits per base.

    Mapping: A -> 00, C -> 01, G -> 10, T -> 11.
    Any character outside A/C/G/T raises KeyError.
    """
    two_bit_code = {'A': '00', 'C': '01', 'G': '10', 'T': '11'}
    return ''.join(two_bit_code[nucleotide] for nucleotide in seq)

def deconverter(seq):
    """Decode a bit string back into DNA, one base per two bits.

    Inverse of the A=00, C=01, G=10, T=11 mapping. A chunk that is not one
    of those four two-character codes (including a trailing single bit)
    raises KeyError.
    """
    base_for_bits = {'00': 'A', '01': 'C', '10': 'G', '11': 'T'}
    decoded = []
    for start in range(0, len(seq), 2):
        decoded.append(base_for_bits[seq[start:start + 2]])
    return ''.join(decoded)

def oligoToBase3(seq):
    """Translate a DNA sequence into a string of base-3 digits.

    The digit assigned to a base depends on the base that precedes it, so a
    base never follows itself in a valid sequence. The first base is decoded
    as if it were preceded by 'C'.

    Returns the ternary digit string, or -1 as soon as a base maps to the
    forbidden digit '3' (a repeat of the previous base, an 'N', or a leading
    'C').
    """
    # Digit table to use for the NEXT base, keyed by the base just consumed.
    tables_by_previous = {
        'C': {'G': '0', 'T': '1', 'A': '2', 'C': '3', 'N': '3'},
        'G': {'T': '0', 'A': '1', 'C': '2', 'G': '3', 'N': '3'},
        'T': {'A': '0', 'C': '1', 'G': '2', 'T': '3', 'N': '3'},
        'A': {'C': '0', 'G': '1', 'T': '2', 'A': '3', 'N': '3'},
    }
    active_table = tables_by_previous['C']
    digits = []
    for base in seq:
        digit = active_table[base]
        if digit == '3':
            return -1
        digits.append(digit)
        active_table = tables_by_previous[base]
    return ''.join(digits)

def ternaryToDecimal(n):
    """Return the integer value of the base-3 digit string ``n``.

    The leftmost digit is the most significant. An empty string yields 0.
    """
    total = 0
    # Horner's scheme: shift the running value one ternary place, add the digit.
    for digit in n:
        total = total * 3 + int(digit)
    return total

def most_frequent(List):
    """Return the element that occurs most often in ``List``.

    When several elements share the highest count, the one whose first
    occurrence comes earliest in ``List`` is returned.

    Raises:
        IndexError: if ``List`` is empty.
    """
    from collections import Counter

    if len(List) == 0:
        raise IndexError('most_frequent() requires a non-empty list')

    try:
        # One counting pass instead of calling List.count() for every element.
        counts = Counter(List)
    except TypeError:
        # Unhashable elements cannot be counted with a dict; fall back to
        # pairwise counting. max() keeps the first element with the top count.
        return max(List, key=List.count)

    # Counter preserves first-seen order and max() returns the first maximum,
    # so ties resolve to the earliest element.
    return max(counts, key=counts.get)

def binaryToHex(binary_string):
    """Convert a string of '0'/'1' characters to a '0x'-prefixed hex string.

    Leading zero bits are preserved: the result has one hex digit for every
    four input bits (rounded up). Plain hex(int(...)) would drop leading
    zeros, which can yield an odd number of digits or lose leading bytes
    when the value is turned back into bytes.

    Raises:
        ValueError: if ``binary_string`` is empty or not a valid base-2 literal.
    """
    decimal_representation = int(binary_string, 2)
    hex_width = (len(binary_string) + 3) // 4
    return '0x' + format(decimal_representation, 'x').zfill(hex_width)

def hexToText(hex_string):
    """Decode a '0x'-prefixed hex string into UTF-8 text.

    The first two characters (the '0x' prefix) are discarded. If an odd
    number of hex digits remains, a leading '0' is restored first: hex()
    drops leading zero nibbles, and bytes.fromhex() needs whole bytes.

    Raises:
        ValueError: if the digits are not valid hexadecimal.
        UnicodeDecodeError: if the bytes are not valid UTF-8.
    """
    hex_digits = hex_string[2:]
    if len(hex_digits) % 2:
        hex_digits = '0' + hex_digits
    return bytes.fromhex(hex_digits).decode("utf-8")



%store -r array_data

# Read the paired-end FASTQ files. The context manager closes both handles as
# soon as the lines are in memory (the files were previously left open).
with open('amplicon-4_S6_L001_R1_001.fastq', 'r') as file_R1, \
        open('amplicon-4_S6_L001_R2_001.fastq', 'r') as file_R2:
    Lines_R1 = file_R1.readlines()
    Lines_R2 = file_R2.readlines()

complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

# Only every fourth line starting from the second one (index 1, 5, 9, ...) is
# used: in a four-line FASTQ record that is the base-call line.
# At most the first 151 characters are kept and the newline is removed.
new_Lines_R1 = [line[0:151].strip('\n') for line in Lines_R1[1::4]]

# R2 is reverse-complemented so that it reads in the same direction as R1.
# Characters without a complement entry (e.g. 'N') are kept unchanged.
new_Lines_R2 = [
    "".join(complement.get(base, base) for base in reversed(line[0:151].strip('\n')))
    for line in Lines_R2[1::4]
]

print('Total number of reads: ' + str(len(new_Lines_R1)))

# Merged (consensus) sequence for every read pair that could be reconciled.
new_Lines_Combo = []

# Read numbers (indices into new_Lines_R1) of the pairs stored in
# new_Lines_Combo, in the same order.
read_index_1 = []

for i, read_R1 in enumerate(new_Lines_R1):
    read_R2 = new_Lines_R2[i]

    # Pick out those with 149-nt read overlap.
    if read_R1[0:149] == read_R2[0:149]:  # Ensure paired-end reads agree.
        new_Lines_Combo.append(read_R1[0:149])
        read_index_1.append(i)

    # If the payload is exactly 142 nt long, some reads fail the test above
    # only because of mismatches outside the payload region, so accept a
    # 142-nt agreement as well.
    elif read_R1[0:142] == read_R2[0:142]:
        new_Lines_Combo.append(read_R1[0:142])
        read_index_1.append(i)

    # Otherwise the insert is longer than one read: look for the longest
    # suffix of R1 that equals a prefix of R2 and stitch the reads together.
    else:
        for j in range(0, len(read_R1)):
            overlap = len(read_R1) - j
            if read_R1[j:] == read_R2[0:overlap]:
                # Append only the part of R2 that lies beyond the overlap.
                # Indexing from the front avoids the "-0" slice, which would
                # have appended all of R2 again when R2 is fully contained in
                # the R1 suffix.
                new_Lines_Combo_Unit = read_R1 + read_R2[overlap:]

                # Exclude those longer than 170 nt. The merged length is
                # len(R2) + j, which only grows with j, so no later offset
                # can pass either.
                if len(new_Lines_Combo_Unit) > 170:
                    break

                new_Lines_Combo.append(new_Lines_Combo_Unit)
                read_index_1.append(i)
                break


Lines = new_Lines_Combo

# Get the number of valid reads
res_read_index_1 = [*set(read_index_1)]


primerLength = 21

# Result tables indexed as [chapter][segment]: 40 chapters x 100 segments.
dataStrandsOccup = [[None]*100 for _ in range(40)]      # 1 once a read for this slot matched a known payload
dataStrandstoText = [[None]*100 for _ in range(40)]     # decoded text per slot (filled later)
dataStrands = [[None]*100 for _ in range(40)]           # consensus sequence per slot (filled later)
candi_dataStrands = [[[] for _ in range(100)] for _ in range(40)]  # all RS-corrected candidates per slot
valid_reads = [[[0] for _ in range(100)] for _ in range(40)]       # number of candidates per slot

RS_before = 0
RS_after = 0
read_index_2 = []   # read numbers that matched a reference strand before RS correction
read_index_3 = []   # read numbers that matched a reference payload after RS correction

read_number = -1

for read_number, data in enumerate(Lines):
    data = data.strip('\n')   # Remove any newline characters left on the sequence.

    # Extract forward and reverse PCR primer sequences and identify the Chapter Number.
    chapterStartSequence = data[0:primerLength]
    chapterEndSequence = data[(len(data) - primerLength):]
    try:
        chapterStart = primerLibrary.index(chapterStartSequence)
        chapterEnd = primerLibrary.index(chapterEndSequence)
    except ValueError:
        # One of the primers is not in the library: not a usable read.
        continue

    # The reverse primer of a chapter sits at the mirrored position of the library.
    chapterEnd = len(primerLibrary) - chapterEnd - 1
    if chapterStart != chapterEnd:
        continue
    Chapter = chapterStart

    data = data[primerLength:(len(data)-primerLength)]

    # Count matched sequence before RS correction
    for m in range(0, 40):
        for n in range(0, 100):
            if data == array_data[m][n]:
                read_index_2.append(read_index_1[read_number])   # Store Read Number with successful decoding attempt before RS correction

    RS = data[-12:]  # Extract RS Sequence (base-3)

    # Payload without the 4-nt segment number and the 12-nt RS tail, handled in 7-nt units.
    data_to_be_modulated = data[4:(len(data)-12)]
    data_modulated = ''
    for i in range(0, len(data_to_be_modulated) // 7):
        data_modulated_segment = data_to_be_modulated[i*7:(i+1)*7]
        # Repeat the Pointer Base (7th base) so every unit is 8 nt = 2 bytes and checkable by RS.
        data_modulated += data_modulated_segment + data_modulated_segment[6]

    data = data[0:4] + data_modulated

    # Convert RS Sequence to ternary number, then to binary number (one byte per 6-nt group).
    RS_segment_binary_total = []
    for i in range(0, len(RS) // 6):
        RS_segment_base3 = oligoToBase3(RS[i*6:(i+1)*6])
        if RS_segment_base3 == -1:
            break
        # Zero-pad to at least 8 bits; values above 255 produce more than 8 bits
        # and are rejected by the length check below.
        RS_segment_binary_total.append(format(ternaryToDecimal(RS_segment_base3), '08b'))
    RS_segment_binary_total = ''.join(RS_segment_binary_total)

    if len(RS_segment_binary_total) != 16:
        continue

    try:
        binaryConverted = converter(data)    # Convert Data Sequence to binary number
    except KeyError:
        # The payload contains a base other than A/C/G/T (e.g. an 'N' call).
        # Skip the read instead of aborting the whole run.
        continue
    binaryConverted = binaryConverted + RS_segment_binary_total  # Combine the data and RS code in binary form

    binaryList = [int(binaryConverted[i:i + 8], 2) for i in range(0, len(binaryConverted), 8)]
    bytesList = bytes(binaryList)

    try:
        RSDecoded = rsc.decode(bytesList)[0]   # RS correction
    except Exception:
        # Any decoding failure (normally ReedSolomonError) means the read is
        # uncorrectable. Unlike a bare except, this still lets
        # KeyboardInterrupt stop the loop.
        continue

    bytes_as_bits = ''.join(format(byte, '08b') for byte in RSDecoded)
    baseDeconverted = deconverter(bytes_as_bits)   # Convert corrected bytes back to DNA sequences

    # Extract and identify Segment Number
    segmentSequence = baseDeconverted[0:4]
    segmentBase3 = oligoToBase3(segmentSequence)
    if segmentBase3 == -1:
        continue
    segmentBase10 = ternaryToDecimal(segmentBase3)
    Segment = segmentBase10

    # In each 8-nt unit keep the 6 Index Bases and the Pointer Base; the 8th
    # base is the duplicated pointer that was added above for RS checking.
    baseDeconverted = baseDeconverted[4:]
    characterLength = len(baseDeconverted) // 8
    corrected_data = ''.join(baseDeconverted[i*8:i*8 + 7] for i in range(0, characterLength))

    # Count matched sequence after RS correction. Occupancy is recorded when
    # the corrected payload equals any entry of array_data_payload.
    for m in range(0, 40):
        for n in range(0, 100):
            if corrected_data == array_data_payload[m][n]:
                dataStrandsOccup[Chapter][Segment] = 1
                read_index_3.append(read_index_1[read_number])   # Store Read Number with successful decoding attempt after RS correction

    candi_dataStrands[Chapter][Segment].append(corrected_data) # Store the RS-corrected sequence in candidate sequence list
    valid_reads[Chapter][Segment][0] += 1

# For every (chapter, segment) slot that has candidates, keep only the
# sequence appearing most frequently. Slots without candidates stay None;
# the explicit emptiness check replaces a bare except that used to swallow
# the IndexError raised by most_frequent() on an empty list.
for m in range(0, 40):
    for n in range(0, 100):
        if dataStrands[m][n] is None and candi_dataStrands[m][n]:
            dataStrands[m][n] = most_frequent(candi_dataStrands[m][n])
            
# Decode every consensus strand into text and store it in dataStrandstoText.
for m in range(0, 40):
    for n in range(0, 100):
        if dataStrands[m][n] is None:
            continue

        baseDeconverted = dataStrands[m][n]
        getSegmentData = []
        characterLength = len(baseDeconverted) // 7
        for i in range(0, characterLength):
            # Each 7-nt unit is 6 Index Bases followed by 1 Pointer Base.
            characterWhole = baseDeconverted[i*7:(i+1)*7]
            characterIndexSeq = characterWhole[0:6]
            characterPointerSeq = characterWhole[6]
            ternaryCharacterIndex = oligoToBase3(characterIndexSeq)
            if ternaryCharacterIndex == -1:
                continue
            characterIndex = ternaryToDecimal(ternaryCharacterIndex)   # Convert ternary Index Number to decimal number

            # NOTE(review): index 720 itself is accepted here; confirm that
            # every reference strand really has an entry at position 720.
            if characterIndex > 720:
                continue

            # Locate Combination by looking up the Index Number and Pointer from the decoded Reference Strands:
            # take the base at this index from every reference strand ...
            getReference = ''.join(strand[characterIndex] for strand in referenceStrands)
            # ... and emit one bit per strand: '1' where it equals the Pointer
            # Base, '0' otherwise (exactly 16 bits are expected).
            getCharacterData = [None]*16
            for k, referenceBase in enumerate(getReference):
                getCharacterData[k] = '1' if referenceBase == characterPointerSeq else '0'
            getSegmentData.append(''.join(getCharacterData))
        getSegmentData = ''.join(getSegmentData)

        # Eliminate redundant '00100000's (space padding added to equalise
        # oligo length for batch synthesis). A trailing byte is removed only
        # while BOTH of the last two bytes are spaces, so one space is kept
        # and a real character that follows a space is never deleted.
        while getSegmentData[-16:] == '0010000000100000':
            getSegmentData = getSegmentData[:-8]

        # Nothing decodable in this strand: binaryToHex('') would raise.
        if not getSegmentData:
            continue

        # Convert binary data to hex data
        getSegmentData_hex = binaryToHex(getSegmentData)

        # Convert hex data to ASCII texts. Malformed hex and invalid UTF-8
        # both surface as ValueError (UnicodeDecodeError is a subclass);
        # such segments are left as None.
        try:
            getSegmentData_text = hexToText(getSegmentData_hex)
        except ValueError:
            continue
        dataStrandstoText[m][n] = getSegmentData_text

            
# Print the recovered text of every chapter that has at least one decoded
# segment. Segments that could not be decoded are None and are skipped
# explicitly rather than through a bare except.
for i, chapterSegments in enumerate(dataStrandstoText):
    text = ''.join(segmentText for segmentText in chapterSegments if segmentText is not None)
    if text != '':
        print('This is Chapter ' + str(i))
        print(text)


# For every chapter, report the first occupied segment whose successor is
# not occupied. The final segment of a row has no successor and counts as
# "last" too; indexing j+1 there used to raise IndexError.
for i in range(0, len(dataStrandsOccup)):
    segmentCount = len(dataStrandsOccup[i])
    for j in range(0, segmentCount):
        if dataStrandsOccup[i][j] != 1:
            continue
        if j + 1 >= segmentCount or dataStrandsOccup[i][j+1] is None:
            print('Chapter ' + str(i) + " is found")
            print("Last segment is " + str(j))
            break
    
    
# Deduplicate the read numbers collected before and after RS correction,
# then report how many distinct reads fall into each category.
res_read_index_2 = list(set(read_index_2))
res_read_index_3 = list(set(read_index_3))
for summaryLabel, distinctReads in (
    ('Total number of valid reads: ', res_read_index_1),
    ('Exact matched sequences before RS correction: ', res_read_index_2),
    ('Matched sequences after RS correction: ', res_read_index_3),
):
    print(summaryLabel + str(len(distinctReads)))


# Print, for each chapter with at least one accepted read, the chapter number
# followed by the total number of reads summed over all of its segments.
for i, chapterCounters in enumerate(valid_reads):
    read_number = sum(segmentCounter[0] for segmentCounter in chapterCounters)
    if read_number != 0:
        print(i)
        print(read_number)
        
        

Total number of reads: 7069
This is Chapter 0
Digital production, transmission and storage have revolutionized how we access and use information but have also made archiving an increasingly complex task that requires active, continuing maintenance of digital media. This challenge has focused some interest on DNA as an attractive target for information storage because of its capacity for high-density information encoding, longevity under easily achieved conditions and proven track record as an information bearer. Previous DNA-based information storage approaches have encoded only trivial amounts of information or were not amenable to scaling-up, and used no robust error-correction and lacked examination of their cost-efficiency for large-scale information archival. Here we describe a scalable method that can reliably store more information than has been handled before. We encoded computer files totalling 739 kilobytes of hard-disk storage and with an estimated Shannon information of 5.2