In [5]:
'''
****************************
* Protein analyser class
* contains the high level analysis routines applied to scoring and other low-level analysis done by 
* Proteome Scorer class
* has a range of analyses.
* descriptions are made available to the user, who selects an analysis to run
* selected index is returned and appropriate analysis run
* first version started 15 Sep 2018
*
* 22 Sep 2018 
* added random seed setting capability, passed into an updated version of scorer
* allows test mode and external control to be set and recorded
* need to import time module for this
*
* 4 Nov 2018
* added ability to replace rogue amino acids
* need arose due to an 'O' in Methanosarcina mazei pro
*
* 8 Nov 2018
* changed the indexing of locations of the scores as the new version of the scorer class
* this accessed by a variable bextended = True to add 'XXXXXX' on the front and 'XXXX' on the end
* defaults to true
*
* 20 Nov 2018
* wholesale changing of the histogram section
* 22 Nov 2018 - take a cut - '1.0 Candidate
****************************
'''

#import datetime - imported in Proteome Core
#import random - this is imported in Proteome Scorer
import csv
import time

class ProteomeAnalyser():

    '''
    ****************************
    * High-level analyses built on top of a scorer object.
    *
    * data layout used throughout (as read by the code below):
    *   proteome = [details, proteins]   - details[0] is used as the file name stem
    *   protein  = [header, sequence]    - header[0] and header[1] are written as the
    *                                      first two csv columns, sequence is scored
    *
    * every analysis writes its output under self.filepath, which is joined by plain
    * string concatenation and so must end with a path separator.
    * file names are stamped with datestamp(), a function expected to be supplied by
    * the surrounding environment (Proteome Core) - it is not defined in this class.
    ****************************
    '''

    def __init__(self, logger, scorer, btestmode = False, filepath = None):

        '''
        ****************************
        * initialises the Proteome analyser
        * logger    - object with write_log_entry(text); records what happens
        * scorer    - object that performs the scoring; must expose versionid
        * btestmode - True fixes the random seed at 1, otherwise it is taken from the clock
        * filepath  - output directory (must end with a separator);
        *             None keeps the original hard-coded location
        ****************************
        '''

        self.logger = logger
        self.scorer = scorer
        self.versionid = '1.0'

        if filepath is None:
            filepath = '/Users/johnslee/Documents/'
        self.filepath = filepath

        if btestmode:
            self.randomseed = 1
        else:
            self.randomseed = int(time.time())

        # NOTE(review): the second entry is labelled 'logger version' but reports
        # this analyser's own versionid - label kept as-is, confirm the intent.
        versionlist = 'scorer version = ' + scorer.versionid
        versionlist += ', logger version = ' + self.versionid
        self.logger.write_log_entry(versionlist)
        self.logger.write_log_entry('random seed set to ' + str(self.randomseed))

        self.numberofbuckets = 37# number of buckets in a histogram (0, 0.5, ... 18.0)


    def available_analysis(self):

        '''
        ****************************
        * returns the list of available analyses as [number, title, description]
        * each number must be matched by a branch in run_analysis()
        * which is at the bottom of the class
        ****************************
        '''

        return [
            [0, 'Proteome Score','Creates a csv file with a full list of FFAT scores for each protein'],
            [1, 'Bottom Ten','Creates a csv file the lowest 10 scores for each protein plus location'],
            [2, 'Bottom Ten - Sorted','Creates a csv file the lowest 10 scores for each protein, sorted by lowest score. Very slow'],
            [3, 'Histogram','Creates a csv file of a histogram of the FFAT scores of a proteome'],
            [4, 'Histogram + randomised histogram (full proteome)','a histogram of the FFAT scores of a proteome plus a randomised equivalent histogram (by full proteome)'],
            [5, 'Histogram + randomised histogram (by protein)','a histogram of the FFAT scores of a proteome plus a randomised equivalent histogram (by protein)'],
            [6, 'Reordered Histogram','a histogram of the FFAT scores of a proteome plus a reordered weighting equivalent histogram'],
            [7, 'Histogram + rendomised preserving acids','a histogram of the FFAT scores of a proteome plus a randomised equivalent preserving acids in low-scoring flanks' ],
        ]


    # ------------------------------------------------------------------
    # private helpers shared by the analyses below
    # ------------------------------------------------------------------

    def _output_name(self, stem, extension):
        '''builds an output file name: filepath + stem + datestamp() + extension'''
        return self.filepath + stem + datestamp() + extension

    def _start_offset(self, bextended):
        '''index shift applied to reported locations: 6 when the sequence was
        scored with the 'X' padding on the front, otherwise 0'''
        if bextended:
            return 6
        return 0

    def _lowest_scores(self, protein, numscores, bextended):
        '''scores one protein and returns at most numscores entries from the
        scorer's ordered list; entry[0] is written as the score and entry[1]
        as the (unshifted) location'''
        myresults = self.scorer.score_protein(protein[1], bextended)
        return self.scorer.protein_score_ordered(myresults, numscores)[0 : numscores]

    def _rank_order(self, allbottomten):
        '''returns the protein indices ordered by lowest score, ties broken by
        second-lowest score, remaining ties left in input order.
        proteins with fewer than two scores sort after those that have them.'''

        def score_at(entries, irank):
            if irank < len(entries):
                return entries[irank][0]
            return float('inf')

        # sorted() is stable, so a (rank 0, rank 1) key gives the same order as
        # sorting on rank 1 first and then on rank 0.
        return sorted(range(len(allbottomten)),
                      key = lambda s: (score_at(allbottomten[s], 0),
                                       score_at(allbottomten[s], 1)))

    def _empty_bars(self):
        '''fresh histogram: one [score, count] pair per bucket, scores in steps of 0.5'''
        return [[i / 2.0, 0] for i in range(0, self.numberofbuckets)]

    def _tally_scores(self, scores, bars):
        '''adds each score to the count of bucket int(score * 2).
        scores outside the buckets are reported on stdout and not counted.'''
        for res in scores:
            idx = int(res * 2.0)
            if idx < 0:
                # a negative index would otherwise silently count from the top end
                print('score below low bound ' + str(res))
            elif idx < len(bars):
                bars[idx][1] += 1
            else:
                print('score beyond high bound ' + str(res))

    def _write_cumulative_bars(self, fout, bars, total):
        '''writes one 'score: count, running%' line per bar.
        the percentage is the running share of total; it stays at 0.0 when total is 0.'''
        running = 0.0
        for bar in bars:
            if total:
                running = running + 100.0 * bar[1]/total
            fout.write(str(bar[0]) + ': ' + str(bar[1]) + ', ' + str(running)[:6] + '%\n')

    def _write_randomised_histograms(self, proteome, foutname, csvtitle, originalseed, numhistograms, randomise):
        '''
        ********************************
        * shared body of the two randomised histogram analyses
        * builds the histogram of the proteome, then numhistograms histograms of
        * randomised copies (seed incremented by 1 for each copy), and writes one
        * csv row per bucket: score, original count, then the count of each simulation
        * randomise - callable (proteins, seed) returning the randomised proteins
        ********************************
        '''
        orgdetails = proteome[0]
        myproteins = proteome[1]

        originalbars = self.make_histogram(proteome)[1]

        randombars = []
        for offset in range(0, numhistograms):
            myrandomisedproteins = randomise(myproteins, originalseed + offset)
            randombars.append(self.make_histogram([orgdetails, myrandomisedproteins])[1])

        csvtoprow = ['bucket','count']
        for i in range(0, numhistograms):
            csvtoprow.append('sim #' + str(i))

        with open(foutname,"w") as fout:
            cr = csv.writer(fout, delimiter=',',quoting=csv.QUOTE_ALL)
            cr.writerow(csvtitle)
            cr.writerow(csvtoprow)
            for i in range(0, self.numberofbuckets):
                mycsv = [originalbars[i][0], originalbars[i][1]]
                for simulated in randombars:
                    mycsv.append(simulated[i][1])
                cr.writerow(mycsv)


    # ------------------------------------------------------------------
    # raw scores and bottom-ten analyses
    # ------------------------------------------------------------------

    def create_proteomescore_csv(self, proteomes):

        '''
        ****************************
        * takes a list of proteomes
        * for each proteome scores every protein and writes one csv row per protein:
        * the two header fields followed by all its scores
        *
        * analysis details:
        * number - 0
        * title -  'Proteome Score'
        * description - 'Creates a csv file with a full list of FFAT scores for each protein'
        *
        ****************************
        '''

        bextended = True

        for proteome in proteomes:

            orgfilename = proteome[0][0]

            self.logger.write_log_entry('running proteome score on proteome ' + orgfilename)

            foutname = self._output_name(orgfilename + '_rawscores_', '.csv')
            ctr = 0

            # 'with' guarantees the file is closed even if scoring raises
            with open(foutname,"w") as fout:
                cr = csv.writer(fout, delimiter=',',quoting=csv.QUOTE_ALL)
                for protein in proteome[1]:
                    ctr += 1
                    myresults = self.scorer.score_protein(protein[1], bextended)
                    mycsv = [protein[0][0],protein[0][1]]
                    mycsv.extend(myresults)
                    cr.writerow(mycsv)

            self.logger.write_log_entry('finished scoring ' + orgfilename + ' - ' + str(ctr) + ' proteins')
            if bextended:
                self.logger.write_log_entry('scoring done with X amino acids appended front and back')

        self.logger.write_log_entry('finished proteome score')

    def write_generated_proteome_file(self, proteins, filename):
        '''
        ********************************
        * 22 Sep 2018
        * writes generated proteins to a txt file in a Fasta like way
        * each header is marked ' randomised version' to avoid confusing
        * the file with an actual proteome
        ********************************
        '''

        self.logger.write_log_entry('writing randomised proteome ' + filename)

        foutname = self._output_name(filename, '.txt')

        with open(foutname,"w") as fout:
            for protein in proteins:
                # construct Fasta-like header
                # >YAL010C MDM10 SGDID:S000000008, Chr I fr...
                proteinheader = '>' + protein[0][0] + ' ' + protein[0][1] + ' randomised version'
                fout.write(proteinheader + '\n')
                fout.write(protein[1] + '\n')

    def bottom_ten(self, proteomes, bextended = True):

        '''
        ********************************
        * scores each protein and writes its lowest 10 scores plus their locations
        * row layout: header[0], header[1], sequence length, scores, '', locations
        * rows are written in proteome order - not sorted
        *
        * analysis details:
        * number - 1
        * title -  'Bottom Ten'
        * description - 'Creates a csv file the lowest 10 scores for each protein plus location'
        *
        * bextended - passed to the scorer; when True the reported locations are
        *             shifted back by 6 to allow for the 'X' padding on the front
        ********************************
        '''

        numscores = 10
        istartindex = self._start_offset(bextended)

        for proteome in proteomes:

            orgfilename = proteome[0][0]
            foutname = self._output_name(orgfilename + '_bottomten_', '.csv')

            with open(foutname,"w") as fout:
                cr = csv.writer(fout, delimiter=',',quoting=csv.QUOTE_ALL)

                self.logger.write_log_entry('running bottom_ten on proteome ' + orgfilename)

                for protein in proteome[1]:
                    thisbottomten = self._lowest_scores(protein, numscores, bextended)

                    mycsv = [protein[0][0], protein[0][1], len(protein[1])]
                    mycsv.extend(entry[0] for entry in thisbottomten)
                    mycsv.append('')
                    mycsv.extend(entry[1] - istartindex for entry in thisbottomten)
                    cr.writerow(mycsv)

                self.logger.write_log_entry('finished sorted and ranking proteins in ' + orgfilename)

        self.logger.write_log_entry('finished bottom ten')

    def bottom_ten_sorted(self, proteomes, bextended = True):

        '''
        ********************************
        * as bottom_ten, but the rows are ordered by each protein's lowest score,
        * ties broken by its second-lowest score
        * row layout: header[0], header[1], sequence length, '', scores, '', locations
        *
        * analysis details:
        * number - 2
        * title -  'Bottom Ten - sorted'
        * description - 'Creates a csv file the lowest 10 scores for each protein, sorted by lowest score'
        *
        * bextended - passed to the scorer; when True the reported locations are
        *             shifted back by 6 to allow for the 'X' padding on the front
        ********************************
        '''

        numscores = 10
        istartindex = self._start_offset(bextended)

        for proteome in proteomes:

            orgfilename = proteome[0][0]
            foutname = self._output_name(orgfilename + '_bottomten_sorted_', '.csv')

            with open(foutname,"w") as fout:
                cr = csv.writer(fout, delimiter=',',quoting=csv.QUOTE_ALL)

                self.logger.write_log_entry('running bottom_ten_sorted on proteome ' + orgfilename)

                myproteins = proteome[1]
                allmybottomten = [self._lowest_scores(protein, numscores, bextended)
                                  for protein in myproteins]

                for s in self._rank_order(allmybottomten):
                    mycsv = [myproteins[s][0][0], myproteins[s][0][1], len(myproteins[s][1]), '']
                    mycsv.extend(entry[0] for entry in allmybottomten[s])
                    mycsv.append('')
                    mycsv.extend(entry[1] - istartindex for entry in allmybottomten[s])
                    cr.writerow(mycsv)

                self.logger.write_log_entry('finished sorted and ranking proteins in ' + orgfilename)

        self.logger.write_log_entry('finished bottom ten - sorted')


    # ------------------------------------------------------------------
    # histogram routines
    # these count the occurrence of FFAT scores for a set of proteins
    # make_histogram is the basis: one proteome in, bucket counts out
    # ------------------------------------------------------------------

    def make_histogram(self, proteome):

        '''
        ********************************
        * creates a histogram of FFAT scores for a single proteome
        * 20 Nov 2018
        * buckets are labelled 0, 0.5, 1.0 ... and a score lands in bucket int(score * 2)
        * scores that fall outside the buckets are printed and not counted
        * returns [proteome_detail, [[score, count], [score, count], ...]]
        ********************************
        '''

        bars = self._empty_bars()

        for protein in proteome[1]:
            self._tally_scores(self.scorer.score_protein(protein[1]), bars)

        return [proteome[0], bars]

    def proteome_histogram(self, proteomes):

        '''
        ********************************
        * writes the histogram of each proteome to a csv file,
        * one row per bucket: score, count
        ********************************
        '''

        for proteome in proteomes:

            buckets = self.make_histogram(proteome)

            foutname = self._output_name(proteome[0][0] + '_histogram_', '.csv')

            with open(foutname,"w") as fout:
                cr = csv.writer(fout, delimiter=',',quoting=csv.QUOTE_ALL)
                for bucket in buckets[1]:
                    cr.writerow(bucket)


    def histogram_randomised(self, proteomes, originalseed, numhistograms, bbyprotein = False):

        '''
        ********************************
        * writes a csv comparing the histogram of each proteome with the
        * histograms of numhistograms randomised versions of it
        *
        * originalseed  - seed for the first randomisation, incremented by 1 each loop
        * numhistograms - number of randomised histograms to produce
        * bbyprotein    - passed to scorer.randomise_proteins; also selects the file name
        ********************************
        '''

        def randomise(proteins, seed):
            return self.scorer.randomise_proteins(proteins, seed, bbyprotein)

        for proteome in proteomes:

            orgfilename = proteome[0][0]

            csvtitle = ['histogram_randomised','seed = ' + str(originalseed), orgfilename, datestamp()]
            self.logger.write_log_entry('running histogram_randomised on proteome ' + orgfilename)

            if bbyprotein:
                foutname = self._output_name(orgfilename + '_histogram_by_protein_', '.csv')
            else:
                foutname = self._output_name(orgfilename + '_histogram_full_proteome_', '.csv')

            self._write_randomised_histograms(proteome, foutname, csvtitle, originalseed, numhistograms, randomise)

    def histogram_reordered(self, proteomes, reorder):

        '''
        ********************************
        * writes a txt file per proteome with two histograms:
        * the standard scoring, then the scoring with the weighting reordered
        * according to reorder (via scorer.score_protein_reordered)
        * each line is 'score: count, running%' where the percentage is the
        * cumulative share of the number of standard scores
        *
        * reorder - must hold each number from 0 to 6 once and only once;
        *           otherwise a warning is printed and nothing is written
        ********************************
        '''

        # list(...) so the comparison is list against list on every python version
        if sorted(reorder) != list(range(0,7)):
            print('warning - reorder sequence does not meet requirements of each number between 0 and 6 once and only once')
            return

        seq = ''.join(str(r + 1) for r in reorder)

        for proteome in proteomes:

            orgfilename = proteome[0][0]
            foutname = self.filepath + orgfilename + '_histogram_reordered_' + seq + '_' + datestamp() + '.txt'

            with open(foutname,"w") as fout:

                self.logger.write_log_entry('running histogram plus reordered histogram - results in ' + foutname)

                myproteins = proteome[1]

                # first pass - standard scoring
                ctrscore = 0
                bars = self._empty_bars()
                for protein in myproteins:
                    myresults = self.scorer.score_protein(protein[1])
                    ctrscore += len(myresults)
                    self._tally_scores(myresults, bars)

                fout.write(orgfilename + ' histogram - ' + str(ctrscore) + ' scores' + '\n')
                fout.write('-------------\n')
                self._write_cumulative_bars(fout, bars, ctrscore)
                fout.write('-------------\n\n')

                # second pass - reordered scoring
                # ctrscore is reused as the total as it shouldn't change just on re-ordering
                bars = self._empty_bars()
                for protein in myproteins:
                    self._tally_scores(self.scorer.score_protein_reordered(protein[1], reorder), bars)

                fout.write(orgfilename + ' histogram - reordered - ' + seq + '\n')
                fout.write('-------------\n')
                self._write_cumulative_bars(fout, bars, ctrscore)
                fout.write('-------------\n')

        self.logger.write_log_entry('finished running histogram routine ')
        self.logger.write_log_entry('-----------------------------------')

    def histogram_randomised_preserved_acidflank(self, proteomes,originalseed, numhistograms, cutoff = 0.5, bbyprotein = False):

        '''
        ********************************
        * as histogram_randomised, but the randomised versions come from
        * scorer.randomise_proteins_preserve_acidic_flank, which is also given cutoff
        *
        * originalseed  - seed for the first randomisation, incremented by 1 each loop
        * numhistograms - number of randomised histograms to produce
        * cutoff        - passed to the scorer and recorded in the csv title and the log
        * bbyprotein    - passed to the scorer; also selects the file name
        ********************************
        '''

        def randomise(proteins, seed):
            return self.scorer.randomise_proteins_preserve_acidic_flank(proteins, seed, cutoff, bbyprotein)

        for proteome in proteomes:

            orgfilename = proteome[0][0]

            csvtitle = ['histogram_randomised_preserved_acidflank','seed = ' + str(originalseed), orgfilename,'cut off = '+ str(cutoff), datestamp()]
            self.logger.write_log_entry('running histogram_randomised_preserved_acidflank on proteome ' + orgfilename)
            self.logger.write_log_entry('seed = ' + str(originalseed) + ',  cut off = '+ str(cutoff))

            if bbyprotein:
                foutname = self._output_name(orgfilename + '_acidflankmask_histogram_by_protein_', '.csv')
            else:
                foutname = self._output_name(orgfilename + '_acidflankmask_histogram_full_proteome_', '.csv')

            self._write_randomised_histograms(proteome, foutname, csvtitle, originalseed, numhistograms, randomise)


    def run_analysis(self, analysisnumber, proteomes):

        '''
        ****************************
        * runs the analysis whose number is listed in available_analysis()
        * an unknown number is reported on stdout and nothing is run
        ****************************
        '''

        if analysisnumber == 0:
            self.create_proteomescore_csv(proteomes)
        elif analysisnumber == 1:
            self.bottom_ten(proteomes)
        elif analysisnumber == 2:
            self.bottom_ten_sorted(proteomes)
        elif analysisnumber == 3:
            self.proteome_histogram(proteomes)
        elif analysisnumber == 4:
            numhistograms = 10
            thisseed = 1
            self.histogram_randomised(proteomes, thisseed, numhistograms, False )
        elif analysisnumber == 5:
            numhistograms = 10
            thisseed = 1
            self.histogram_randomised(proteomes, thisseed, numhistograms, True)
        elif analysisnumber == 6:
            newsequence = [2,5,6,1,0,4,3]
            self.histogram_reordered(proteomes, newsequence)
        elif analysisnumber == 7:
            numhistograms = 10
            thisseed = 1
            self.histogram_randomised_preserved_acidflank(proteomes, thisseed, numhistograms, 0.5, False)
        else:
            print('unrecognised analysis index '+ str(analysisnumber))
        
'''
****************************
* end of the analyser class
****************************
'''


'\n****************************\n* end of the analyser class\n****************************\n'