In [1]:
'''
cutFinder
by Leo d'Espaux <leodespaux@gmail.com>
with help from William Zhuang, Kai Li

Finds cut sites within a user-input sequence, then checks those candidates against
chromosome files discarding sequences found more than once. 
Output is a list of candidate 23mers in your target sequence.

Details:
*For now only looks in the sense strand, but checks both sense and antisense on the genome.
*The program lists these sequences as it finds them. 
*Usually, we get a cut site every ~25nt.
*Note your target can be a sequence that's not originally in the genome.

Current as of 9/1/15

'''

# import libraries
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Entrez



def findCutSites():

    print("I have the following genomes: S288c, CENpk, PO1f, CLIB, SAKL")
    species=input("Or type server to fetch the genome. ")
    print("")
    print("Hold tight while I fetch that genome...")
    print("")
    
    if species=="PO1f":
        allRecords=[SeqIO.read("Yl_PO1f_A.fasta", "fasta"), SeqIO.read("Yl_PO1f_B.fasta", "fasta"),
                    SeqIO.read("Yl_PO1f_C.fasta", "fasta"), SeqIO.read("Yl_PO1f_D.fasta", "fasta"),
                    SeqIO.read("Yl_PO1f_E.fasta", "fasta"), SeqIO.read("Yl_PO1f_F.fasta", "fasta")]

    elif species=="CLIB":
        allRecords=[SeqIO.read("Yali0A_contig.fasta", "fasta"), SeqIO.read("Yali0B_contig.fasta", "fasta"),
            SeqIO.read("Yali0C_contig.fasta", "fasta"), SeqIO.read("Yali0D_contig.fasta", "fasta"),
            SeqIO.read("Yali0E_contig.fasta", "fasta"), SeqIO.read("Yali0F_contig.fasta", "fasta")]

    elif species=="SAKL":
        allRecords=[SeqIO.read("chromosomeA.fasta", "fasta"), SeqIO.read("chromosomeB.fasta", "fasta"),
                    SeqIO.read("chromosomeC.fasta", "fasta"), SeqIO.read("chromosomeD.fasta", "fasta"),
                    SeqIO.read("chromosomeE.fasta", "fasta"), SeqIO.read("chromosomeF.fasta", "fasta")]
        
    elif species=="S288c":
        allRecords=[SeqIO.read("Scer01.fasta", "fasta"), SeqIO.read("Scer02.fasta", "fasta"),
                    SeqIO.read("Scer03.fasta", "fasta"), SeqIO.read("Scer04.fasta", "fasta"),
                    SeqIO.read("Scer05.fasta", "fasta"), SeqIO.read("Scer06.fasta", "fasta"),
                    SeqIO.read("Scer07.fasta", "fasta"), SeqIO.read("Scer08.fasta", "fasta"),
                    SeqIO.read("Scer09.fasta", "fasta"), SeqIO.read("Scer10.fasta", "fasta"),
                    SeqIO.read("Scer11.fasta", "fasta"), SeqIO.read("Scer12.fasta", "fasta"),
                    SeqIO.read("Scer13.fasta", "fasta"), SeqIO.read("Scer14.fasta", "fasta"),
                    SeqIO.read("Scer15.fasta", "fasta"), SeqIO.read("Scer16.fasta", "fasta")]
        
    elif species=="CENpk":
        allRecords=[SeqIO.read("CENPK113-7D_CH1.fasta", "fasta"), SeqIO.read("CENPK113-7D_CH2.fasta", "fasta"),
                    SeqIO.read("CENPK113-7D_CH3.fasta", "fasta"), SeqIO.read("CENPK113-7D_CH4.fasta", "fasta"),
                    SeqIO.read("CENPK113-7D_CH5.fasta", "fasta"), SeqIO.read("CENPK113-7D_CH6.fasta", "fasta"),
                    SeqIO.read("CENPK113-7D_CH7.fasta", "fasta"), SeqIO.read("CENPK113-7D_CH8.fasta", "fasta"),
                    SeqIO.read("CENPK113-7D_CH9.fasta", "fasta"), SeqIO.read("CENPK113-7D_CH10.fasta", "fasta"),
                    SeqIO.read("CENPK113-7D_CH11.fasta", "fasta"), SeqIO.read("CENPK113-7D_CH12.fasta", "fasta"),
                    SeqIO.read("CENPK113-7D_CH13.fasta", "fasta"), SeqIO.read("CENPK113-7D_CH14.fasta", "fasta"),
                    SeqIO.read("CENPK113-7D_CH15.fasta", "fasta"), SeqIO.read("CENPK113-7D_CH16.fasta", "fasta"),
                    SeqIO.read("CENPK113-7D_mitochondria.fasta", "fasta")]
        
    print("I found the following genomic records: ")
    for record in allRecords:
        print( record.description)
    print("")


    # ask for target and see if it's present and unique
    print("OK, what sequence do you want to find cut sites in: ")
    targetSeq=Seq(input().replace(" ","").upper()) #get rid of spaces and make all caps
    
    print("")
    targetCount=checkUnique(allRecords, targetSeq)
    print("Found instances: ")
    print(targetCount)
    
    
    
    #some sensible checks
    if targetCount==0:
        print("Sorry your sequence is not found in this genome")
        exit()
    elif targetCount>1:
        print("Sorry there are more than one instances of your sequence in this genome.")
        exit()
    
    elif targetCount==1:
        print("Your sequence is found in the genome once, proceeding...")
    
    print("")
    print("Here are some suitable cut sites for your target")

    #make an empty list
    cutters=[]    
    
    # let's look through our locus to find cut sites. We end at len-24, since we are looking thru 23-nts,  
    # and because python includes the first index but not the last one
    uniqueCount=0
    degenerateCount=0
    NAGcount=0 
    
    for i in range(len(targetSeq)-24):

        # sub is an array containing all 23mers
        sub=targetSeq[i:i+23]
        
        # first, cut sequences are N20NGG, cannot contain TTTTTT
        if sub[21]=="G" and sub[22]=='G' and not('TTTTTT' in sub):
           
            # and also, if the last 12 bases exists followed by any NGG more than once, toss it.
            # we define a CheckUnique function below which returns 1 if the input is found only once,
            # returns 2 if it's found more than once, and 0 if it's not found at all. 
        
            if (checkUnique(allRecords,sub[8:20]+"AGG")+checkUnique(allRecords,sub[8:20]+"TGG")+checkUnique(allRecords,sub[8:20]+"CGG")+checkUnique(allRecords,sub[8:20]+"GGG")) > 1:
                degenerateCount=degenerateCount+1
                
            
            # also, don't want any N9-20 followed by NAG
            elif (checkUnique(allRecords,sub[8:20]+"AAG")+checkUnique(allRecords,sub[8:20]+"TAG")+checkUnique(allRecords,sub[8:20]+"CAG")+checkUnique(allRecords,sub[8:20]+"GAG")) > 0:
                NAGcount=NAGcount+1
                
            
            else:
                # print the cut site that passes all tests
                uniqueCount=uniqueCount+1
                print (sub[0:20])
    print("\n"+"Done finding sequences. Good luck!")  
    print("degenerate, NAG counts: ")
    print(degenerateCount)
    print(NAGcount)

 

    
            
            
            
def checkUnique(searchRecords, string):
# Here we take in a string sequence and see if it's unique in the records (chromosomes)
# defined earlier. If unique, returns 1; if not found, 0; and if found more than once, 2.
        
        nfound=0
        
        for record in searchRecords:
            nfound=nfound+record.seq.count(str(string))
            if nfound>1:
                return 2
                break
            # note that we should look in the other strand, too    
            nfound=nfound+record.seq.reverse_complement().count(str(string))
            if nfound>1:
                return 2
                break
            
        if nfound == 0:
            return 0
            
        if nfound == 1:
            return 1
            
            
findCutSites()
                    
                        
                    



I have the following genomes: S288c, CENpk, PO1f, CLIB, SAKL
Or type server to fetch the genome. tcctcttccactggtacctctaaggtcgtttctgaaacctcctctaccatcgtcgatgacattccaagattgtctgctaattaccacggtgacttgtggcatcataacgtcattcaaaccttggaaactccatttagagaatcttctacttatcaagagagagctgatgaattggttgtcaagatcaaggatatgttcaacgccttgggtgatggtgatatctctccatctgcttatgatactgcctgggtcgctagattggctaccatctcttccgacggttccgaaaagccaagattcccacaagccttaaattgggtttttaacaaccaattgcaagacggttcttggggtattgaatctcatttctctttgtgtgatagattgttgaacaccactaactccgtcattgccttgtctgtttggaagactggtcactctcaagttcaacaaggtgccgaattcattgccgaaaacttgagattattgaacgaagaagatgaattgtctccagacttccaaatcatttttccagctttgttgcaaaaggccaaggccttaggtatcaacttgccatacgacttgccattcatcaagtacttgtctactaccagagaagctagattgactgacgtctccgctgctgctgacaacattccagccaacatgttgaatgccttggaaggtttagaagaagtcattgattggaacaagattatgagattccaatctaaagacggttcttttttgtcttcccctgcttctactgcttgtgtcttgatgaacaccggtgatgagaagtgtttcactttcttgaataacttgttggataaattcggtggttgtgttccatgtatgtattccattgatttattggaaagattgtctttagttgacaatatt

UnboundLocalError: local variable 'allRecords' referenced before assignment