In [None]:
'''
cutFinder
by Leo d'Espaux  

contributions by: Xingkai Li

Finds cut sites within a user-input sequence, using chromosome files that are either saved locally
or retrieved from NCBI using Entrez. Rather than discarding candidate cut sequences
with more than one instance per chromosome, the "_Count" version provides a list of all sequences
together with their instance counts (Ncounts) and NAG counts (NAGcounts).

Current as of 5/2/16



DIRECTIONS:
Click on this text, press play above, and wait. If stuck, press the restart button, and repeat. 






'''

# import libraries
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Entrez



def findCutSites():
    """Interactively list candidate guide sequences inside a target sequence.

    Loads the chromosome records of the selected genome from FASTA files in
    the working directory (or through fetchNCBI when species is "server"),
    reads a target sequence from standard input, reports how many times the
    target occurs in the genome, and prints every candidate found on either
    strand of the target as [guide, NGG count, NAG count].

    Returns:
        None. Results are printed.

    Raises:
        ValueError: if ``species`` is not a known genome key.
    """
    # One FASTA file per chromosome for each genome that is stored locally.
    genomeFiles = {
        "PO1f": ["Yl_PO1f_%s.fasta" % letter for letter in "ABCDEF"],
        "CLIB": ["Yali0%s_contig.fasta" % letter for letter in "ABCDEF"],
        "SAKL": ["chromosome%s.fasta" % letter for letter in "ABCDEF"],
        "s288c": ["Scer%02d.fasta" % number for number in range(1, 17)],
    }

    # First, let's get the genome files and check that they load correctly.
    # The genome is currently fixed; to ask the user instead, replace the
    # assignment with input("Or type server to fetch the genome. ").
    species = "s288c"
    print("")
    print("Hold tight while I fetch that genome...")
    print("")

    if species == "server":
        # NOTE(review): fetchNCBI is not defined in the visible part of this
        # file -- confirm it exists before enabling this branch.
        allRecords = fetchNCBI()
    elif species in genomeFiles:
        allRecords = [SeqIO.read(fileName, "fasta") for fileName in genomeFiles[species]]
    else:
        # Fail here with a clear message instead of hitting an unbound
        # allRecords further down.
        raise ValueError("Unknown species: " + repr(species))

    # Let's print the records to make sure they're good
    print("I found the following genomic records: ")
    for record in allRecords:
        print(record.description)
    print("")

    # Ask for the target sequence and see if it's present and unique.
    print("OK, what sequence do you want to find cut sites in: ")
    targetSeq = Seq(input().replace(" ", ""))  # also want to remove annoying spaces

    print("\nLet me do some due diligence...\n")
    targetCount = checkUnique(allRecords, targetSeq)
    print("Found "+str(targetCount)+" instances of your target sequence in all the chromosomes (+ and - strands)\n")

    # Nothing to search if the target is absent from the genome. Return
    # rather than calling exit(): exit() is an interactive-shell helper and
    # would also shut down a notebook kernel.
    if targetCount == 0:
        print("Sorry your sequence is not found in this genome.")
        return

    # Now let's find the cut sites we want
    print("\nAnd here is a list of the guides, instances of each found as N12-20NGG, and as N12-20NAG, in all chromosomes:\n")

    # Candidates are collected from the forward strand of the target first,
    # then from its reverse complement.
    cutters = []
    findCandidates(targetSeq, allRecords, cutters)
    findCandidates(targetSeq.reverse_complement(), allRecords, cutters)

    print(cutters)

    if not cutters:
        print("Shit out of luck. No cutters.")
    

def findCandidates(targetSeq, allRecords, cutters):
    """Append every N20-NGG candidate found in targetSeq to cutters.

    A 23-nt window (20-nt guide followed by a 3-nt PAM) is slid along
    targetSeq one base at a time. A window is a candidate when its last two
    bases are "GG" and it does not contain "TTTTTT". For each candidate, the
    12 nt directly before the PAM (window[8:20]) are counted across
    allRecords followed by each of the four NGG PAMs and, separately, each
    of the four NAG PAMs.

    Args:
        targetSeq: sequence to scan. Only the strand given is scanned.
        allRecords: chromosome records, passed through to checkUnique.
        cutters: list that is extended in place; each candidate is appended
            as [20-nt guide as str, NGG count, NAG count].

    Returns:
        None.
    """
    windowLength = 23

    # A window starting at i covers [i, i + 23), so the last valid start is
    # len - 23 and range() needs len - 22 as its exclusive stop. (Stopping at
    # len - 24 skipped the final two windows.) A target shorter than one
    # window gives an empty range.
    for start in range(len(targetSeq) - windowLength + 1):
        window = targetSeq[start:start + windowLength]

        # Cut sequences are N20NGG and cannot contain TTTTTT.
        if window[21] != "G" or window[22] != "G" or "TTTTTT" in window:
            continue

        # The part that matters for uniqueness is the last 12 nt before the
        # PAM (per the original author's note: DiCarlo NAR 2013).
        seed = window[8:20]

        # Instances of the seed followed by any NGG PAM ...
        degenerateCount = 0
        for base in "ATCG":
            degenerateCount += checkUnique(allRecords, seed + base + "GG")

        # ... and followed by any NAG PAM, which we also don't want.
        NAGcount = 0
        for base in "ATCG":
            NAGcount += checkUnique(allRecords, seed + base + "AG")

        cutters.append([str(window[0:20]), degenerateCount, NAGcount])
                    
  
            
            
            
def checkUnique(allRecords, string):
    """Count how often a sequence occurs across all records, on both strands.

    Args:
        allRecords: iterable of chromosome records; each must expose a
            ``seq`` with ``count`` and ``reverse_complement`` methods.
        string: the sequence to look for; it is converted with ``str``.

    Returns:
        The total number of hits: 0 if not found, 1 if unique, otherwise the
        number of instances.
    """
    query = str(string)

    # Each record is a chromosome; hits are counted on the strand as stored
    # and on its reverse complement.
    return sum(
        chromosome.seq.count(query) + chromosome.seq.reverse_complement().count(query)
        for chromosome in allRecords
    )
            
            
# Start the interactive cut-site search as soon as this script/cell is executed.
findCutSites()
                    
                        
        
        
        
        
   







'''

Hold on... It's loading...


''' 
        
        
        
        
    




Hold tight while I fetch that genome...

I found the following genomic records: 
tpg|BK006935.2| [organism=Saccharomyces cerevisiae S288c] [strain=S288c] [moltype=genomic] [chromosome=I] [note=R64-1-1]
tpg|BK006936.2| [organism=Saccharomyces cerevisiae S288c] [strain=S288c] [moltype=genomic] [chromosome=II] [note=R64-1-1]
tpg|BK006937.2| [organism=Saccharomyces cerevisiae S288c] [strain=S288c] [moltype=genomic] [chromosome=III] [note=R64-1-1]
tpg|BK006938.2| [organism=Saccharomyces cerevisiae S288c] [strain=S288c] [moltype=genomic] [chromosome=IV] [note=R64-1-1]
tpg|BK006939.2| [organism=Saccharomyces cerevisiae S288c] [strain=S288c] [moltype=genomic] [chromosome=V] [note=R64-1-1]
tpg|BK006940.2| [organism=Saccharomyces cerevisiae S288c] [strain=S288c] [moltype=genomic] [chromosome=VI] [note=R64-1-1]
tpg|BK006941.2| [organism=Saccharomyces cerevisiae S288c] [strain=S288c] [moltype=genomic] [chromosome=VII] [note=R64-1-1]
tpg|BK006934.2| [organism=Saccharomyces cerevisiae S288c] [strai