In [1]:
# This program processes the sequenced data strands and translates them into ASCII text.
# Data sequence processing workflow:
# 1. Identify and strip the PCR primers. The primer pair also identifies the Chapter number.
# 2. Apply Reed-Solomon (RS) error correction to the data sequence and extract the payload.
# 3. Decode the segment number (the first four bases after the forward primer).
# 4. Convert the data sequence to hex data.
# 5. Convert the hex data to text.


from reedsolo import RSCodec, ReedSolomonError

# Shared Reed-Solomon codec used for every strand. RSCodec's first argument is the
# number of error-correction symbols, i.e. 2 parity bytes are expected per message.
rsc = RSCodec(2)

%store -r primerLibrary
%store -r referenceStrands
%store -r array_data_payload

def converter(seq):
    """Encode a DNA sequence as a bit string, two bits per base.

    The mapping is A -> '00', C -> '01', G -> '10', T -> '11'. Any other
    symbol (for example 'N') raises KeyError.
    """
    bits_for_base = dict(A='00', C='01', G='10', T='11')
    return ''.join(map(bits_for_base.__getitem__, seq))

def deconverter(seq):
    """Decode a bit string back into DNA bases, one base per two bits.

    Inverse of the 2-bit encoding: '00' -> A, '01' -> C, '10' -> G, '11' -> T.
    A chunk that is not one of these four pairs (including a trailing single
    bit) raises KeyError.
    """
    base_for_bits = {'00': 'A', '01': 'C', '10': 'G', '11': 'T'}
    decoded = []
    position = 0
    while position < len(seq):
        decoded.append(base_for_bits[seq[position:position + 2]])
        position += 2
    return ''.join(decoded)

def oligoToBase3(seq):
    """Translate a DNA sequence into a string of ternary digits.

    The digit assigned to a base depends on the base that precedes it, so the
    lookup table rotates after every position. The first base is read with the
    same table that is used after a 'C'.

    Returns the ternary digits as a string, or the integer -1 as soon as a
    position maps to the forbidden digit '3' (a base repeated twice in a row,
    a leading 'C', or an 'N'). Symbols outside A/C/G/T/N raise KeyError.
    """
    # digit_after[previous_base][current_base] -> ternary digit ('3' = invalid)
    digit_after = {
        'C': {'G': '0', 'T': '1', 'A': '2', 'C': '3', 'N': '3'},
        'G': {'T': '0', 'A': '1', 'C': '2', 'G': '3', 'N': '3'},
        'T': {'A': '0', 'C': '1', 'G': '2', 'T': '3', 'N': '3'},
        'A': {'C': '0', 'G': '1', 'T': '2', 'A': '3', 'N': '3'},
    }
    previous = 'C'
    digits = []
    for base in seq:
        digit = digit_after[previous][base]
        if digit == '3':
            return -1
        digits.append(digit)
        # A valid digit implies base is one of A/C/G/T, so it is a valid table key.
        previous = base
    return ''.join(digits)

def ternaryToDecimal(n):
    """Return the integer value of a string of base-3 digits.

    The most significant digit comes first. An empty string yields 0.
    """
    value = 0
    # Horner's scheme: shift the running value one ternary place, add the digit.
    for digit in n:
        value = value * 3 + int(digit)
    return value

def binaryToHex(binary_string):
    """Convert a string of bits into a '0x'-prefixed hexadecimal string.

    The result always carries one hex digit per four input bits (rounded up),
    so leading zero bits are preserved. Plain ``hex(int(...))`` drops them,
    which turns a first byte below 0x10 into an odd number of hex digits
    (rejected by ``bytes.fromhex``) and silently loses a leading 0x00 byte.

    Raises ValueError if binary_string is empty or is not a base-2 literal.
    """
    decimal_representation = int(binary_string, 2)
    hex_digit_count = (len(binary_string) + 3) // 4
    # '#0Nx' zero-pads to a total width of N characters, '0x' prefix included.
    return format(decimal_representation, '#0{}x'.format(hex_digit_count + 2))

def hexToText(hex_string):
    """Decode a '0x'-prefixed hexadecimal string into text.

    The first two characters are discarded as the prefix; the remainder is
    parsed as hex bytes and decoded as UTF-8. ValueError is raised for
    malformed hex (including an odd number of digits) and for bytes that are
    not valid UTF-8.
    """
    payload = hex_string[2:]
    raw_bytes = bytes.fromhex(payload)
    return str(raw_bytes, "utf-8")

%store -r array_data

# Read input FASTQ files (paired-end: R1 and R2 hold the same reads in the same order).
# The context manager closes both handles even if reading fails.
with open('amplicon-3_S5_L001_R1_001.fastq', 'r') as file_R1, \
        open('amplicon-3_S5_L001_R2_001.fastq', 'r') as file_R2:
    Lines_R1 = file_R1.readlines()
    Lines_R2 = file_R2.readlines()

# Bases without an entry (e.g. 'N') are passed through unchanged by complement.get().
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

# A FASTQ record spans four lines and the sequence is the second one, hence [1::4].
# Each sequence is cut to at most 151 characters and stripped of its newline.
new_Lines_R1 = [line[0:151].strip('\n') for line in Lines_R1[1::4]]

# R2 is stored as its reverse complement so that it reads in the same direction as R1.
new_Lines_R2 = []
for line in Lines_R2[1::4]:
    reverse_complement_R2 = "".join(complement.get(base, base) for base in reversed(line[0:151].strip('\n')))
    new_Lines_R2.append(reverse_complement_R2)

print('Total number of reads: ' + str(len(new_Lines_R1)))


primerLength = 21

# Bookkeeping tables indexed as [Chapter][Segment] (40 chapters x 100 segments).
dataStrandsOccup = [[None]*100 for _ in range(40)]        # 1 once a segment was decoded and matched
dataStrandstoText = [[None]*100 for _ in range(40)]       # decoded text of each segment
sequenced_count_array = [[0]*100 for _ in range(40)]      # reads matching after RS correction
before_count_array = [[0]*100 for _ in range(40)]         # raw R1 reads with valid primers and segment number
afterProcess_count_array = [[0]*100 for _ in range(40)]   # merged reads with valid primers and segment number
afterProcess2_count_array = [[0]*100 for _ in range(40)]  # merged reads matching exactly before RS correction

# Count, per Chapter and Segment, the raw R1 reads that carry a matching primer pair.
for data in new_Lines_R1:
    # Extract forward and reverse PCR primer sequence and identify the Chapter Number
    chapterStartSequence = data[0:primerLength]
    chapterEndSequence = data[(len(data) - primerLength):]
    try:
        # .index() raises ValueError when the sequence is not an exact primer
        # (assumes primerLibrary is a list/tuple of primer strings).
        chapterStart = primerLibrary.index(chapterStartSequence)
        chapterEnd = primerLibrary.index(chapterEndSequence)
    except ValueError:
        continue

    # The end primer of a chapter sits at the mirrored position of primerLibrary,
    # so both ends agree only when start == len - 1 - end.
    chapterEnd = len(primerLibrary) - chapterEnd - 1
    if chapterStart != chapterEnd:
        continue
    Chapter = chapterStart

    # Strip both primers; the first four remaining bases encode the segment number.
    data = data[primerLength:(len(data)-primerLength)]
    segmentSequence = data[0:4]
    segmentBase3 = oligoToBase3(segmentSequence)
    if segmentBase3 == -1:
        continue
    segmentBase10 = ternaryToDecimal(segmentBase3)
    Segment = segmentBase10

    before_count_array[Chapter][Segment] += 1
    
    
    
    
    
# Merged (R1 + R2) strands, at most one entry per read pair.
new_Lines_Combo = []

# Read Number (index into new_Lines_R1) of every entry of new_Lines_Combo, kept in lockstep with it.
read_index_1 = []

for i in range(0, len(new_Lines_R1)):
    read_R1 = new_Lines_R1[i]
    read_R2 = new_Lines_R2[i]
    head_R1 = read_R1[0:149]

    # Strands of at most 149 nt are covered completely by both reads:
    #  - reads shorter than 151 nt: R1 and the reverse-complemented R2 coincide;
    #  - reads of 151 nt: R2 is shifted by two bases, so the strands overlap on exactly 149 nt.
    # The tests are combined so that a read pair is accepted once and never falls
    # through to the overlap search below.
    if head_R1 == read_R2[0:149] or head_R1 == read_R2[2:151]:
        new_Lines_Combo.append(head_R1)
        read_index_1.append(i)
        continue

    # Longer strands: find the first offset j at which the tail of R1 equals the head of R2
    # and append the part of R2 that lies beyond the overlap. Slicing R2 from `overlap`
    # is correct whether R1 and R2 have equal or different lengths, and avoids the
    # `[-0:]` pitfall (which would append the whole of R2).
    for j in range(0, len(read_R1)):
        overlap = len(read_R1) - j
        if read_R1[j:] != read_R2[0:overlap]:
            continue
        new_Lines_Combo_Unit = read_R1 + read_R2[overlap:]
        # Exclude those longer than 170 nt. The merged length is j + len(R2), so it only
        # grows with j and the search can stop here.
        if len(new_Lines_Combo_Unit) > 170:
            break

        new_Lines_Combo.append(new_Lines_Combo_Unit)
        read_index_1.append(i)
        break

Lines = new_Lines_Combo

# Count, per Chapter and Segment, the merged reads that carry a matching primer pair.
for data in Lines:
    data = data.strip('\n')   # Defensive: remove any newline symbols left on the read.
    # Extract forward and reverse PCR primer sequence and identify the Chapter Number
    chapterStartSequence = data[0:primerLength]
    chapterEndSequence = data[(len(data) - primerLength):]
    try:
        # .index() raises ValueError when the sequence is not an exact primer
        # (assumes primerLibrary is a list/tuple of primer strings).
        chapterStart = primerLibrary.index(chapterStartSequence)
        chapterEnd = primerLibrary.index(chapterEndSequence)
    except ValueError:
        continue

    # The end primer of a chapter sits at the mirrored position of primerLibrary,
    # so both ends agree only when start == len - 1 - end.
    chapterEnd = len(primerLibrary) - chapterEnd - 1
    if chapterStart != chapterEnd:
        continue
    Chapter = chapterStart

    # Strip both primers; the first four remaining bases encode the segment number.
    data = data[primerLength:(len(data)-primerLength)]
    segmentSequence = data[0:4]
    segmentBase3 = oligoToBase3(segmentSequence)
    if segmentBase3 == -1:
        continue
    segmentBase10 = ternaryToDecimal(segmentBase3)
    Segment = segmentBase10

    afterProcess_count_array[Chapter][Segment] += 1

# Get the number of valid reads
res_read_index_1 = [*set(read_index_1)]


RS_before = 0
RS_after = 0
read_index_2 = []   # Read Numbers whose payload matched array_data exactly before RS correction
read_index_3 = []   # Read Numbers whose RS-corrected payload matched array_data_payload

read_number = -1

# Decode every merged read: primers -> Chapter, RS correction, segment number,
# character look-up in the reference strands, and finally text.
for data in Lines:
    # Position of this entry in Lines; read_index_1[read_number] is its original Read Number.
    read_number += 1
    data = data.strip('\n')   # Defensive: remove any newline symbols left on the read.
    # Extract forward and reverse PCR primer sequence and identify the Chapter Number
    chapterStartSequence = data[0:primerLength]
    chapterEndSequence = data[(len(data) - primerLength):]
    try:
        # .index() raises ValueError when the sequence is not an exact primer
        # (assumes primerLibrary is a list/tuple of primer strings).
        chapterStart = primerLibrary.index(chapterStartSequence)
        chapterEnd = primerLibrary.index(chapterEndSequence)
    except ValueError:
        continue

    # The end primer of a chapter sits at the mirrored position of primerLibrary.
    chapterEnd = len(primerLibrary) - chapterEnd - 1
    if chapterStart != chapterEnd:
        continue
    Chapter = chapterStart

    data = data[primerLength:(len(data)-primerLength)]
    # Count matched sequence before RS correction
    for m in range(0, 40):
        for n in range(0, 100):
            if data == array_data[m][n]:
                afterProcess2_count_array[m][n] += 1
                read_index_2.append(read_index_1[read_number])   # Store Read Number with successful decoding attempt before RS correction

    RS = data[-12:]  # Extract RS Sequence (base-3)

    # Between the 4-nt segment number and the 12-nt RS sequence the strand is read in
    # 7-nt units. Each unit gets its 7th base repeated so that it becomes an 8-nt
    # (16-bit) unit (presumably to keep the strand byte-aligned for the RS codec).
    # A trailing partial unit is dropped.
    data_to_be_modulated = data[4:(len(data)-12)]
    data_modulated = ''
    for unit_start in range(0, 7 * (len(data_to_be_modulated) // 7), 7):
        data_modulated_segment = data_to_be_modulated[unit_start:unit_start + 7]
        data_modulated = data_modulated + data_modulated_segment + data_modulated_segment[6]

    data = data[0:4] + data_modulated

    # Convert RS Sequence to ternary number, then to binary number (6 nt -> one byte)
    RS_segment_binary_total = []
    for rs_start in range(0, 6 * (len(RS) // 6), 6):
        RS_segment_base3 = oligoToBase3(RS[rs_start:rs_start + 6])
        if RS_segment_base3 == -1:
            break
        # Values above 255 need more than 8 bits and are rejected by the length check below.
        RS_segment_binary_total.append(format(ternaryToDecimal(RS_segment_base3), '08b'))
    RS_segment_binary_total = ''.join(RS_segment_binary_total)

    # Exactly two valid parity bytes are required.
    if len(RS_segment_binary_total) != 16:
        continue

    try:
        binaryConverted = converter(data)    # Convert Data Sequence to binary number
    except KeyError:
        # The payload contains a symbol other than A/C/G/T (e.g. an 'N' base call).
        # Skip the read instead of aborting the whole run.
        continue
    binaryConverted = binaryConverted + RS_segment_binary_total  # Combine the data and RS code in binary form

    binaryList = [int(binaryConverted[k:k + 8], 2) for k in range(0, len(binaryConverted), 8)]
    bytesList = bytes(binaryList)

    try:
        RSDecoded = rsc.decode(bytesList)[0]   # RS correction; element 0 is the corrected message
    except Exception:
        # Uncorrectable strand (normally ReedSolomonError). Any decoder failure only
        # discards this read; KeyboardInterrupt/SystemExit are no longer swallowed.
        continue

    bytes_as_bits = ''.join(format(byte, '08b') for byte in RSDecoded)
    baseDeconverted = deconverter(bytes_as_bits)   # Convert corrected bytes back to DNA sequences
    # Extract and identify Segment Number
    segmentSequence = baseDeconverted[0:4]
    segmentBase3 = oligoToBase3(segmentSequence)
    if segmentBase3 == -1:
        continue
    segmentBase10 = ternaryToDecimal(segmentBase3)
    Segment = segmentBase10

    # In each segment, get Index Bases and Pointer Base in 8-nt unit
    baseDeconverted = baseDeconverted[4:]
    corrected_data = ''   # corrected payload in its original 7-nt units (index + pointer)
    characterLength = len(baseDeconverted) // 8
    getSegmentData = []
    for unit in range(0, characterLength):
        characterWhole = baseDeconverted[unit*8:(unit+1)*8]
        characterIndexSeq = characterWhole[0:6]
        characterPointerSeq = characterWhole[6]
        corrected_data += characterIndexSeq + characterPointerSeq
        ternaryCharacterIndex = oligoToBase3(characterIndexSeq)
        if ternaryCharacterIndex == -1:
            continue
        characterIndex = ternaryToDecimal(ternaryCharacterIndex)   # Convert ternary Index Number to decimal number

        # Index Numbers above 720 are ignored.
        if characterIndex > 720:
            continue

        # Locate Combination by looking up the Index Number and Pointer from the decoded Reference Strands:
        # bit k is '1' when reference strand k carries the pointer base at this index.
        getReference = []
        for strand in range(0, len(referenceStrands)):
            getReference.append(referenceStrands[strand][characterIndex])
        getReference = ''.join(getReference)
        getCharacterData = [None]*16   # assumes exactly 16 reference strands -- TODO confirm
        for bit in range(0, len(getReference)):
            if getReference[bit] == characterPointerSeq:
                getCharacterData[bit] = '1'
            else:
                getCharacterData[bit] = '0'
        getCharacterData = ''.join(getCharacterData)
        getSegmentData.append(getCharacterData)
    getSegmentData = ''.join(getSegmentData)

    # Eliminate redundant '00100000's which adds up oligo length to ensure the quality of batch production in oligo synthesis.
    # The last byte is dropped for as long as the byte in front of it is a space (0x20).
    while (len(getSegmentData) > 0):
        if (getSegmentData[-16:-8] == '00100000'):
            getSegmentData = getSegmentData[0: len(getSegmentData)-8]
        else:
            break

    # Convert binary data to hex data, then hex data to ASCII texts
    try:
        getSegmentData_hex = binaryToHex(getSegmentData)
        getSegmentData_text = hexToText(getSegmentData_hex)
    except ValueError:
        # Nothing decodable: empty bit string, malformed hex, or bytes that are not
        # valid UTF-8 (UnicodeDecodeError is a ValueError).
        continue
    dataStrandstoText[Chapter][Segment] = getSegmentData_text

    # Count matched sequence after RS correction
    for m in range(0, 40):
        for n in range(0, 100):
            if corrected_data == array_data_payload[m][n]:
                dataStrandsOccup[Chapter][Segment] = 1
                sequenced_count_array[Chapter][Segment] += 1
                read_index_3.append(read_index_1[read_number])   # Store Read Number with successful decoding attempt after RS correction
    


# Print the decoded text of every chapter that has at least one decoded segment.
for i in range(0, len(dataStrandstoText)):
    # Segments are concatenated in segment order; None marks a segment that was
    # never decoded and is skipped explicitly (no exception swallowing needed).
    text = ''.join(segment for segment in dataStrandstoText[i] if segment is not None)
    if text != '':
        print('This is Chapter ' + str(i))
        print(text)


# For every chapter, report the first decoded segment that is followed by a gap.
for i in range(0, len(dataStrandsOccup)):
    chapter_row = dataStrandsOccup[i]
    for j in range(0, len(chapter_row)):
        if chapter_row[j] != 1:
            continue
        # The last column has no successor; treat the end of the row as a gap
        # instead of indexing past it.
        if j + 1 == len(chapter_row) or chapter_row[j + 1] is None:
            print('Chapter ' + str(i) + " is found")
            print("Last segment is " + str(j))
            break
                
                
# Dump the four count rows of every chapter that has at least one read
# matching after RS correction.
for chapter_index, chapter_counts in enumerate(sequenced_count_array):
    if not any(count != 0 for count in chapter_counts):
        continue
    print('Chapter ' + str(chapter_index) + " is found")
    for count_table in (before_count_array, afterProcess_count_array,
                        afterProcess2_count_array, sequenced_count_array):
        print(count_table[chapter_index])

            
            
def _to_read_fractions(count_array, total_reads, threshold=0.0005):
    """Convert the counts of count_array, in place, into fractions of total_reads.

    Returns a tuple (smallest, below):
      smallest -- the smallest fraction greater than zero, or 100 if there is none;
      below    -- how many fractions are greater than zero but under threshold.
    With total_reads == 0 every fraction is set to 0.0 instead of dividing by zero.
    """
    smallest = 100
    below = 0
    for row in count_array:
        for position, count in enumerate(row):
            fraction = count / total_reads if total_reads else 0.0
            row[position] = fraction
            if fraction > 0:
                if fraction < smallest:
                    smallest = fraction
                if fraction < threshold:
                    below += 1
    return smallest, below


# Normalise the three count tables by the number of merged reads and collect,
# for each table, the smallest non-zero fraction and the number of segments
# whose fraction is below 0.0005.
min_afterProcess_count_array, count_afterProcess_count_array = _to_read_fractions(afterProcess_count_array, len(Lines))
min_afterProcess2_count_array, count_afterProcess2_count_array = _to_read_fractions(afterProcess2_count_array, len(Lines))
min_sequenced_count_array, count_sequenced_count_array = _to_read_fractions(sequenced_count_array, len(Lines))

        
# Dump the three normalised rows of every chapter (first 40 chapters, first
# 100 segments) that has at least one non-zero entry after RS correction.
for i in range(0, 40):
    chapter_has_reads = any(sequenced_count_array[i][j] != 0 for j in range(0, 100))
    if chapter_has_reads:
        print('Chapter ' + str(i) + " is found")
        for fraction_table in (afterProcess_count_array, afterProcess2_count_array, sequenced_count_array):
            print(fraction_table[i])
    
    
    
# Collapse the stored Read Numbers to unique values: a read counts once, however
# many times it was recorded before or after RS correction.
res_read_index_2 = list(set(read_index_2))
res_read_index_3 = list(set(read_index_3))

for label, unique_reads in (
    ('Total number of valid reads: ', res_read_index_1),
    ('Exact matched sequences before RS correction: ', res_read_index_2),
    ('Matched sequences after RS correction: ', res_read_index_3),
):
    print(label + str(len(unique_reads)))

# Smallest non-zero fractions first, then the below-threshold segment counts.
for statistic in (
    min_afterProcess_count_array,
    min_afterProcess2_count_array,
    min_sequenced_count_array,
    count_afterProcess_count_array,
    count_afterProcess2_count_array,
    count_sequenced_count_array,
):
    print(statistic)

Total number of reads: 14121
This is Chapter 12
We provide a simple description of the most general collective Gaussian attack in continuous-varblquantu  cryptography. In the scenario of such general attacks, we analyze the asymptotic secret-key rates which are achievable with coherent states, joint measurements of the quadratures and one-way classical communication.
This is Chapter 14
Device-independent cryptography goes beyond conventional quantum cryptography by providing security that holds independently of the quality of the underlying physical devices. Device-independent protocols are based on the quantum phenomena of non-locality and the violation of Bell inequalities. This high level of security could so far only be established under conditions which are not achievable experimentally. Here we present a property of entropy, termed "entropy accumulation", which asserts that the total amount of entropy of a large system is the sum of its parts. We use this property to prove the se