In [3]:
# This program aims to generate reference sequences
# The structure of a reference sequence looks like this:
# 1. 16 bulk sequences which represents 16 bits as a block defined in the hexdata preparation step. The length of this sequence equals to the number of Groups found by Step 1.
#    In this work, the length of each Reference Strand before division equals 717, which is padded to 720 for the convenience of RS correction.
# 2. The original Reference Sequence is too long for oligo synthesis and sequencing. Therefore, we divide the sequence into 9 small segments, each containing 80 bases.
# 3. Position Symbol. Each strand shall contain two position symbols, one for bit position notation and one for segment notation. The generation of position symbols follows the algorithm of base-3 conversion (Goldman et al., 2013, Nature). Each symbol is made up of 4 nt and has a maximum capacity of 3^4 = 81, which means we can have at most 81 bit positions and 81 segments.
# 4. Reed-Solomon error correction code. A 12-nt sequence mapping to the base-3 converted RS code. The code can correct up to 1 base-calling error or multiple errors in a single 4-base block.
#    The error correction code does not cover the PCR primer sequences, so the total length of the oligo it covers is 80 + 4 + 4 + 12 = 100 nt
# 5. Universal PCR forward and reverse primers which are selected from 'primerLibrary'.  


import math
from array import *
import collections
from reedsolo import RSCodec

# IPython storemagic: restore variables that an earlier notebook saved with `%store`.
# primerLibrary is indexed below to pick the forward/reverse PCR primers.
%store -r primerLibrary

# Groups is the grouping produced by a previous step; each group is a sequence of
# 1-3 "combinations", and each combination is a sequence of row indices into `oligos`.
%store -r Groups

# Number of reference oligos (rows of `oligos`), i.e. the 16 bit positions of a block.
arrayGap = 16
# NOTE(review): sumConsecutive is not read anywhere in the visible code - confirm before removing.
sumConsecutive = 0

# Reed-Solomon codec that appends 2 error-correction symbols (bytes) to each message
rsc = RSCodec(2)

# Index of the forward primer in primerLibrary; the reverse primer is taken from the
# mirrored position len(primerLibrary) - primerIndex - 1.
primerIndex = 40   

def DNA_reverse_complement(DNA):
    """Return the reverse complement of a DNA sequence.

    The sequence is read back to front and A<->T, C<->G are swapped.
    Any symbol other than the four upper-case bases is passed through unchanged.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    flipped = []
    for nucleotide in reversed(DNA):
        flipped.append(pairing.get(nucleotide, nucleotide))
    return ''.join(flipped)

def converter(seq):
    """Encode a DNA sequence as a bit string, two bits per base.

    Mapping: A -> 00, C -> 01, G -> 10, T -> 11.
    Raises KeyError for any symbol outside 'ACGT'.
    """
    twoBitCode = {'A': '00', 'C': '01', 'G': '10', 'T': '11'}
    return ''.join(twoBitCode[nucleotide] for nucleotide in seq)

def deconverter(seq):
    """Decode a bit string back into DNA, consuming two bits per base.

    Mapping: 00 -> A, 01 -> C, 10 -> G, 11 -> T.
    Raises KeyError if a chunk is not one of the four bit pairs (this includes
    the trailing single character of an odd-length input).
    """
    bitPairToBase = {'00': 'A', '01': 'C', '10': 'G', '11': 'T'}
    decoded = []
    for start in range(0, len(seq), 2):
        decoded.append(bitPairToBase[seq[start:start + 2]])
    return ''.join(decoded)

def GCPercent(seq):
    """Return the fraction (0.0 - 1.0) of symbols in seq that are 'C' or 'G'.

    Raises ZeroDivisionError for an empty sequence.
    """
    gcTotal = sum(1 for nucleotide in seq if nucleotide in ('C', 'G'))
    return gcTotal / float(len(seq))

def ternary(n):
    """Return the base-3 representation of a non-negative integer as a string.

    Args:
        n: non-negative integer to convert.

    Returns:
        The ternary digits of n, most significant first, without padding
        (``ternary(0) == '0'``).

    Raises:
        ValueError: if n is negative. With floor division a negative value
            never reaches 0 (divmod(-1, 3) == (-1, 2)), so the conversion
            loop would otherwise never terminate.
    """
    if n < 0:
        raise ValueError('ternary() requires a non-negative integer, got %r' % (n,))
    if n == 0:
        return '0'
    digits = []
    while n:
        n, remainder = divmod(n, 3)
        # Remainders come out least-significant first; reversed below.
        digits.append(str(remainder))
    return ''.join(reversed(digits))
    
def base3ToOligo(seq):
    """Translate a string of ternary digits into a DNA string.

    The base emitted for a digit depends on the base emitted just before it,
    and the three candidates offered never include that previous base, so the
    output contains no two identical adjacent bases. Encoding starts as if
    the previous base had been 'C'.

    Raises KeyError for any character other than '0', '1' or '2'.
    """
    # previous base -> {ternary digit -> next base}
    nextBaseAfter = {
        'C': {'0': 'G', '1': 'T', '2': 'A'},
        'G': {'0': 'T', '1': 'A', '2': 'C'},
        'T': {'0': 'A', '1': 'C', '2': 'G'},
        'A': {'0': 'C', '1': 'G', '2': 'T'},
    }
    previousBase = 'C'
    emitted = []
    for digit in seq:
        previousBase = nextBaseAfter[previousBase][digit]
        emitted.append(previousBase)
    return ''.join(emitted)

def seqCorrectionForRS(seq):
    """Pad seq so that its length is a multiple of 4.

    The padding is the shortest prefix of 'ACT' that completes the last block
    of four: 'ACT' for remainder 1, 'AC' for remainder 2, 'A' for remainder 3.
    A sequence whose length is already a multiple of 4 is returned unchanged.
    """
    remainder = len(seq) % 4
    if remainder == 0:
        return seq
    return seq + 'ACT'[:4 - remainder]
    
# Extracting Groups with different numbers of Combinations
# Each filter is evaluated exactly once. (Previously they sat inside a
# `for k in range(len(Groups))` loop that recomputed the same three lists on
# every iteration and left them undefined when Groups was empty.)
length1 = [group for group in Groups if len(group) == 1]
length2 = [group for group in Groups if len(group) == 2]
length3 = [group for group in Groups if len(group) == 3]

# Groups reordered by number of combinations: all 1-combination groups first,
# then the 2-combination groups, then the 3-combination groups.
# Groups of any other size are not carried over.
GroupsNew = length1 + length2 + length3

# Define Base Mapping Matrix
# Row index = which combination of a group a bit position belongs to;
# character index = group column modulo 4.
baseMappingMatrix = ['ACGT', 'CGTA', 'GTAC', 'TACG']

# Define space for reference oligos
# oligos[row][column]: one row per reference oligo, one column per group.
oligos = [[None for _ in range(len(Groups))] for _ in range(arrayGap)]
array_reference = [[None for _ in range(9)] for _ in range(16)]

# Map base mapping matrix to reference oligos
# GroupsNew lists the 1-, 2- and 3-combination groups in that order, so the
# position of a group in GroupsNew is its column in `oligos`.
for column, group in enumerate(GroupsNew):
    phase = column % 4

    # Rows named by the n-th combination of the group receive the base from matrix row n.
    for combinationIndex, rowIndices in enumerate(group):
        for rowIndex in rowIndices:
            oligos[rowIndex][column] = baseMappingMatrix[combinationIndex][phase]

    # Every row not named by any combination receives the base from the next matrix row.
    fillBase = baseMappingMatrix[len(group)][phase]
    for row in oligos:
        if row[column] is None:
            row[column] = fillBase

# Assemble Reference Strands, i.e. add Segment Number, Reference Number, RS correction codes and PCR primers
# Layout of every assembled strand:
#   forward primer | reference-number symbol (4 nt) | segment-number symbol (4 nt)
#   | payload segment | RS parity (2 bytes x 6 ternary digits = 12 nt) | reverse primer
segmentLength = 80
GCTotal = 0
count = 0
trimmed_whole_array = []
for k in range(0, arrayGap):

    # Collapse the per-column bases of reference oligo k into a single string.
    oligos[k] = ''.join(oligos[k])
    trimmed_unit = ''

    # The reference-number symbol depends only on k, so it is built once per oligo.
    ternaryReferenceNumber = ternary(k).zfill(4)
    ternaryReferenceNumberSeq = base3ToOligo(ternaryReferenceNumber)

    # Ceiling division: number of segments needed to cover the oligo.
    # (int(len/80)+1 produced an extra, empty segment whenever the length was an
    # exact multiple of 80, which then indexed past the end of array_reference[k].)
    segmentCount = (len(oligos[k]) + segmentLength - 1) // segmentLength

    for i in range(0, segmentCount):
        referenceSegment = oligos[k][segmentLength*i:segmentLength*(i+1)]

        # Pad the (possibly short) last segment to a multiple of 4 nt so that,
        # at 2 bits per base, the strand converts to whole bytes for the RS encoder.
        referenceSegment = seqCorrectionForRS(referenceSegment)

        ternarySegmentNumber = ternary(i).zfill(4)
        ternarySegmentNumberSeq = base3ToOligo(ternarySegmentNumber)
        referenceSegment = ternaryReferenceNumberSeq + ternarySegmentNumberSeq + referenceSegment

        binaryConverted = converter(referenceSegment)

        # One integer per 8 bits (= 4 bases).
        binaryList = [int(binaryConverted[offset:offset + 8], 2) for offset in range(0, len(binaryConverted), 8)]

        byteArray = rsc.encode(binaryList)

        # Only the 2 trailing parity bytes are stored; each byte (0-255) fits in
        # 6 ternary digits (3^6 = 729) and is written as 6 nt.
        byte_to_ternary_oligo = []
        for byte in byteArray[-2:]:
            byte_to_ternary = ternary(byte).zfill(6)
            byte_to_ternary_oligo.append(base3ToOligo(byte_to_ternary))

        byte_to_ternary_oligo = ''.join(byte_to_ternary_oligo)

        baseDeconverted = primerLibrary[primerIndex] + referenceSegment + byte_to_ternary_oligo + primerLibrary[len(primerLibrary)-primerIndex-1]
        count += 1
        print('#' + str(count))
        print(referenceSegment[8:])
        print(baseDeconverted)
        # Calculating GC content of each strand
        GC = GCPercent(baseDeconverted)
        print('GC content of this strand: ' + str(GC))

        GCTotal += GC
        # NOTE(review): 29:109 presumably skips a 21-nt forward primer plus the two
        # 4-nt position symbols and keeps the 80-nt payload - confirm the primer length.
        trimmed = baseDeconverted[29:109]
        array_reference[k][i] = trimmed
        trimmed_unit += trimmed

    trimmed_whole_array.append(trimmed_unit)

# IPython storemagic: persist the results so that later notebooks can restore
# them with `%store -r`.
# trimmed_whole_array: one concatenated string of trimmed segments per reference oligo.
# array_reference: the same trimmed segments, indexed [reference oligo][segment].
%store trimmed_whole_array
%store array_reference
# Calculating average GC content
# GCTotal is the sum of the per-strand GC fractions and count the number of strands assembled.
GCAverage = GCTotal/count
print('Average GC content: ' + str(GCAverage))


#1
CGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTA
TCAACTGGTGATTCGTGCAACGTACGTACCGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAGTCTGTGTCTGTACGGTAGCTTCCTGTATGCCT
GC content of this strand: 0.5
#2
CGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTA
TCAACTGGTGATTCGTGCAACGTACGTAGCGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAGCTAGAGCTAGCACGGTAGCTTCCTGTATGCCT
GC content of this strand: 0.5070422535211268
#3
CGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTA
TCAACTGGTGATTCGTGCAACGTACGTATCGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAGATCATGATGTCACGGTAGCTTCCTGTATGCCT
GC content of this strand: 0.4859154929577465
#4
CGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTA
TCAACTGGTGATTCGTGCAACGTACGTCGCGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAGCG