In [16]:
import random
import primer3
import math
from array import *
import collections
from reedsolo import RSCodec
import itertools

def GCPercent(seq):                       # To calculate GC content of a DNA sequence
    counts = collections.Counter(seq)
    percent = (counts['C'] + counts['G'])/float(len(seq)) 
    return percent

def ternary(n):      # To convert a binary number to a ternary number (front '0's not included)
    if n == 0:
        return '0'
    nums = []
    while n:
        n, r = divmod(n, 3)
        nums.append(str(r))
    return ''.join(reversed(nums))
    
def base3ToOligo(seq):           # Base-3 oligo conversion to map a ternary number to a DNA sequence.(Goldman et al., 2013, Nature)
    base3ToOligo = {'0': 'C', '1': 'T', '2': 'G'}
    base3ToOligoConverted = []
    for i in range (0, len(seq)):
        base3ToOligoConverted.append(base3ToOligo[seq[i]])
        if base3ToOligo[seq[i]] == 'A':
            base3ToOligo = {'0': 'C', '1': 'T', '2': 'G'}
        elif base3ToOligo[seq[i]] == 'C':
            base3ToOligo = {'0': 'T', '1': 'G', '2': 'A'}
        elif base3ToOligo[seq[i]] == 'T':
            base3ToOligo = {'0': 'G', '1': 'A', '2': 'C'}
        elif base3ToOligo[seq[i]] == 'G':
            base3ToOligo = {'0': 'A', '1': 'C', '2': 'T'}
    return ''.join(base3ToOligoConverted)

def DNA_reverse_complement(DNA):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement.get(base, base) for base in reversed(DNA))


a = 'ACTG' 

pre_screened_sum = 0
pre_screened_sum_2 = 0
original_list = []
pre_screened_list = []
pre_screened_list_2 = []
my_list = []

pre_original_list = list(item for item in itertools.product(a, repeat = 7))   # Generate all possibilities of a 7-nt sequence

for List in pre_original_list:
    original_list.append(''.join(List))
    continue

for output in original_list:

    counts = collections.Counter(output)
    for i in range (0, 5):
  
        if (output[i] == output[i+1] == output[i+2]):     # Exclude homopolymers >= 3
                 
            pre_screened_list.append(''.join(output))
            pre_screened_sum += 1

            break


my_list = [x for x in original_list if (x not in pre_screened_list)]  

for output in my_list:
    
    counts = collections.Counter(output)
    
    if (output[0] == output[1] or output[5] == output[6] or (counts['C'] + counts['G'])/float(len(output)) < 0.40 or (counts['C'] + counts['G'])/float(len(output)) > 0.60):
        # The first two and the last two bases in a short sequence should not be the same, otherwise the incorporation ofa third base from a new short sequence may induce homopolymers >= 3
        # Screening GC content 40-60%
        pre_screened_list_2.append(''.join(output))
        pre_screened_sum_2 += 1      


Library = [x for x in my_list if (x not in pre_screened_list_2)]  

combinationLength = 717 
dataIndexSequence = []

for i in range (0, combinationLength):
    ternaryNumber = ternary(i)
    ternaryNumber = '0'*(6-len(ternaryNumber)) + ternaryNumber      # The index contains 7 bases, so the converted ternary number should be prefixed with '0's to ensure the length is 7
    dataIndexSequence.append(base3ToOligo(ternaryNumber) + 'A')
    dataIndexSequence.append(base3ToOligo(ternaryNumber) + 'T')
    dataIndexSequence.append(base3ToOligo(ternaryNumber) + 'C')
    dataIndexSequence.append(base3ToOligo(ternaryNumber) + 'G')
    
Library = [x for x in Library if (x not in dataIndexSequence)]      # To make sure PCR primers doesn't contain data index sequences

complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
screenedIndexComplementSegment = []

for i in range (0, combinationLength):      # To make sure the primer doesn't contain a complementary sequence to the data index sequence
    seq = dataIndexSequence[i]
    reverse_complement = "".join(complement.get(base, base) for base in reversed(seq))
    screenedIndexComplementSegment.append(reverse_complement)   

Library = [x for x in Library if (x not in screenedIndexComplementSegment)]  

count = 0
selectedSegmentCombination = []
primerLibrary = []


l = list(range(0, len(Library)-len(Library)%3))    # Create a list having the maximum number of library sequences which is a fold of 3

while(l != []):
    while(count <= 2):
        selected = random.choice(l)        # Randomly select a library sequence by its index
        selectedSegment = Library[selected]
        selectedSegmentCombination.append(''.join(selectedSegment))
        l.remove(selected)
        count += 1
    selectedSegmentCombination = ''.join(selectedSegmentCombination)
    primerLibrary.append(selectedSegmentCombination)
    selectedSegmentCombination = []
    count = 0


# Primer screening procedures start from here
# Melting temperature between 60-65 ℃
meltingTempExcluded = []
for i in range (0, len(primerLibrary)):
    meltingTemp = primer3.calcTm(primerLibrary[i], mv_conc=0, dv_conc=2.5, dna_conc=4000)
    if meltingTemp < 60 or meltingTemp > 65:
        meltingTempExcluded.append(primerLibrary[i])

primerLibrary = [x for x in primerLibrary if (x not in meltingTempExcluded)]    


# No hairpin structure above 37 ℃
hairpinExcluded = []
for i in range (0, len(primerLibrary)):
    res = primer3.calcHairpin(primerLibrary[i], mv_conc=0, dv_conc=2.5, dna_conc=4000)
    if res.tm >= 37:
        hairpinExcluded.append(primerLibrary[i])

primerLibrary = [x for x in primerLibrary if (x not in hairpinExcluded)]           


# No primer homodimers formed above 37 ℃
homodimerExcluded = []
for i in range (0, len(primerLibrary)):
    res = primer3.calcHomodimer(primerLibrary[i], mv_conc=0, dv_conc=2.5, dna_conc=4000)
    if res.tm >= 37:
        homodimerExcluded.append(primerLibrary[i])

primerLibrary = [x for x in primerLibrary if (x not in homodimerExcluded)]  


# No primer heterodimers formed above 37 ℃ 
heterodimerExcluded = []
for i in range (0, len(primerLibrary)):
    for j in range (i+1, len(primerLibrary)):
        res = primer3.calcHeterodimer(primerLibrary[i], primerLibrary[j], mv_conc=0, dv_conc=2.5, dna_conc=4000)
        if res.tm >= 37:
            heterodimerExcluded.append(primerLibrary[j])
    primerLibrary = [x for x in primerLibrary if (x not in heterodimerExcluded)] 
    heterodimerExcluded = []
 

# Hamming distance between any two primers should >= 6 (Organick et al., 2018, Nature Biotechnology)
Hamming = 0
HammingExcluded = []

for i in range (0, len(primerLibrary)):
    for j in range (i+1, len(primerLibrary)):
        if (j >= len(primerLibrary)):
            break
        for k in range (0, 21):
            if primerLibrary[i][k] != primerLibrary[j][k]:
                Hamming += 1
        if Hamming < 6:
            HammingExcluded.append(primerLibrary[j])
        primerLibrary = [x for x in primerLibrary if (x not in HammingExcluded)] 
        HammingExcluded = []
        Hamming = 0
        

# There should be no more than 10 bp inter-sequence complementarity between any two primers 
similarityExcluded = []

for i in range (0, len(primerLibrary)):
        for j in range (i+1, len(primerLibrary)):
            if (j >= len(primerLibrary)):
                break
            for m in range (0, 12):
                for n in range (0, 12):
                    if primerLibrary[i][m:m+10] == DNA_reverse_complement(primerLibrary[j])[n:n+10]:
                        similarityExcluded.append(primerLibrary[j])
            primerLibrary = [x for x in primerLibrary if (x not in similarityExcluded)] 
            similarityExcluded = []

# GC content of primers 45-55%
GCExcluded = []

for i in range (0, len(primerLibrary)):
    if (GCPercent(primerLibrary[i]) < 0.45 or GCPercent(primerLibrary[i]) > 0.55):
        GCExcluded.append(primerLibrary[i])
primerLibrary = [x for x in primerLibrary if (x not in GCExcluded)] 

print(len(primerLibrary))
print(primerLibrary)

# You can choose to enable the following code in order to store your own generated primer library
'''
%store primerLibrary
'''



        

187
['ACTCTCACTCTAACGCACTTG', 'GCTAAGCTCATGGTTCACAAG', 'CGTTGTGAGCCTTATAACGCT', 'GCCGAATACTGTTGGATCTTC', 'CTACTTGTCCAGGTACTCCGT', 'GAAGAAGACCAGTAGTTCCGT', 'GCTCAACTCGTAAGTAGACCA', 'GTCCGATACTTCCAGCACTTC', 'CAAGACAACTGATCGAATGGC', 'GACATTCTGGAGGAGAGGTTA', 'CGGAGTACAACGAGACTTCAC', 'ACACAGTACGAGCATGGACTA', 'GTCTCCATCCTCTACGATTCA', 'GAACTAGCATGAAGGAACCAG', 'GCCTTGATAAGCTGGATCCTC', 'GCACAAGTACGAAGACAGACA', 'CACCATCACTAAGCGAGGTTC', 'GTTGAGACACTAAGTGCCGAT', 'GCGGATAAGCACATTCAGGTA', 'GCCTAAGTCTACCAGTCCAAT', 'TGGTCGTATGGATCAGACAAC', 'ATGGTGAAGCGTGATCCTATG', 'GTCACGAATCTCCTGCTTCTG', 'AGATGTCATTAGCGAGGACTG', 'GCTTGATATTCGGTAGCAGTC', 'CACTCCACACCTCTGCTAAGA', 'ACTCAGTTAACGTCCTAAGCG', 'TGGCTCTAGAACCATGGATGT', 'ATGGAGAGCTTGCTGAAGTTG', 'CTCTTACGAGCTTCCGGTGTA', 'GTCCACACAATCGAGATAGGA', 'ACGAGGATAGGTTCTCATGGA', 'TCAACCAATGACGTACGGAGA', 'TACCACTTCCTCGTATCGATC', 'CGTCCATTGCATTCAGACTGA', 'GAAGAACAGCTTACGATGGCT', 'TCTCGGTTGGTATCAGTTGGA', 'GTCTTACACTACGCTCGACCT', 'TGAGGATGCTTGGTATGACAC', 'GCAATCTACCTAACCTACC

'\n%store primerLibrary\n'