In [3]:
'''
****************************
* Proteome Test Harness
* 
* set of functions to test the various modules
* associated with the Yeast/Proteome project
* 14 Sep 2018
****************************
'''

#import datetime - imported in Proteome Core
#import random - this is imported in Proteome Scorer
import csv

# names of the three notebooks this harness depends on
# NOTE(review): these variables are not referenced anywhere else in this cell;
# the %run lines below repeat the same literals, so keep the two in step.
filenamecore = 'Proteome Core Dev.ipynb'
filenamescorer = 'Proteome Scorer Dev.ipynb'
filenameanalyser = 'Proteome Analyser Dev.ipynb'

# IPython %run magics: execute each notebook before the harness functions are used.
# presumably these supply ProteomeReader, ProteomeActivityLogger, SequenceAnalysis,
# ProteomeAnalyser and datestamp(), none of which are defined in this cell - verify.
%run 'Proteome Core Dev.ipynb'
%run 'Proteome Scorer Dev.ipynb'
%run 'Proteome Analyser Dev.ipynb'

def strip_identifiers(lineinfile):
    '''
    ***************************
    * returns the first two comma separated fields of a line as [first, second]
    * the pair is used as the row identifier when comparing csv files
    *
    * the split is on raw commas - quoting is not interpreted
    * raises ValueError if the line holds fewer than two commas
    ***************************
    '''

    first_comma = lineinfile.index(',')
    # resume the search just past the first comma to find the end of field two
    second_comma = lineinfile.index(',', first_comma + 1)

    return [lineinfile[:first_comma], lineinfile[first_comma + 1:second_comma]]

def compare_csv_files(filename1, filename2):

    '''
    ********************************
    * one-way comparison of two csv files that should be the same:
    * every row of filename1 is looked up in filename2 and differences are noted.
    * to do a full two-way comparison need to do compare_csv_files(f1,f2)
    * followed by compare_csv_files(f2,f1)
    *
    * approach as follows
    * a row is identified by its first two columns (see strip_identifiers).
    * each row of filename2 can be claimed by at most one row of filename1, and
    * rows sharing an identifier are paired off in file order, so the files may
    * have rows missing on either side, e.g.
    * f1(r1,r2,r4) and f2(r1,r2,r3,r4), or f1(r1,r2,r3), f2(r1,r3)
    *
    * filename1 - path of the file being checked
    * filename2 - path of the file to check against
    *
    * returns [bpass, regressionstatus]
    *   bpass            - True when every row of filename1 has an identical row in filename2
    *   regressionstatus - list of strings: a header line, then one line per problem row
    *
    * strip_identifiers raises ValueError for any line with fewer than two commas
    ********************************
    '''

    # local import so this routine does not depend on the notebook's import cell
    from collections import deque

    regressionstatus = ['regression of ' + filename1 + ' against ' + filename2]
    bpass = True

    # both files are closed on the way out, including when a malformed line raises
    with open(filename1, 'r') as f1, open(filename2, 'r') as f2:

        # index filename2 by row identifier. Each queue holds the lines not yet
        # claimed for that identifier, in file order. This still keeps all of
        # filename2 in memory, but avoids rescanning it for every row of filename1.
        unmatched = {}
        for line2 in f2:
            key = tuple(strip_identifiers(line2))
            unmatched.setdefault(key, deque()).append(line2)

        for rownum, line1 in enumerate(f1, 1):

            ref1 = strip_identifiers(line1)
            candidates = unmatched.get(tuple(ref1))

            if not candidates:
                # identifier absent from filename2, or every such row already claimed
                bpass = False
                regressionstatus.append('row ' + str(rownum) + ' not matched - ' + ref1[0] + ', ' + ref1[1])
                continue

            # claim the earliest unclaimed row so that it cannot be matched twice
            if line1 != candidates.popleft():
                bpass = False
                regressionstatus.append('row ' + str(rownum) + ' found match but contents do not match - ' + ref1[0] + ', ' + ref1[1])

    return [bpass, regressionstatus]

def _count_trailing_acids(protein, validchars):
    # counts residues scanning back from the end of the sequence; '-' and chr(0)
    # are skipped, any other character not in validchars ends the scan
    count = 0
    for char in reversed(protein):
        if char in validchars:
            count += 1
        elif char != '-' and char != chr(0):
            break
    return count


def readproteomefile(filedetails, fpath='/Users/johnslee/Documents/', ftype='.txt'):
    '''
    ***************************
    * is an independent check on the ability of ProteinReader() to properly read proteins
    * this programme takes in text files of the various forms of proteomes
    * and returns the number of proteins and the length of each protein.
    *
    * filedetails - list of files in the form of the library contents
    *               ie [filename, reference name, readformat]; only the filename is used here
    * fpath       - directory prefix put in front of each filename (must end with a separator)
    * ftype       - extension appended to each filename
    *
    * returns one entry per file: [full file name, number of proteins, [length of each protein]]
    *
    * some structure has to be assumed. the main one is that each new protein starts
    * with a line containing '>'
    * slight complication: the 'dashed' format has lines such as '---A--' so the dashes
    * have to be removed to leave just the A's
    *
    * NOTE(review): the last protein of a file is measured differently from the others.
    * It counts every valid character and is always recorded, even when empty, whereas
    * earlier proteins are measured back from the end up to the first unexpected
    * character and are skipped when empty. This is kept as found - confirm it is intended.
    ***************************
    '''

    # include all possible letters in aminoacidchars as do not want to terminate the read because of a wild card
    aminoacidchars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ*'

    filemeasures = []

    for details in filedetails:

        proteincounts = []
        fname = fpath + details[0] + ftype
        thisprotein = ''

        # the with block closes the file even if reading it raises
        with open(fname, 'r') as proteomefile:

            for line in proteomefile:

                if '>' in line:
                    # new protein. May be the first in which case thisprotein is just ''
                    if thisprotein:
                        # hoping that there are no cases where - is in an annoying place other than for 'dashed' format
                        # OrthoDB has a strange end character equal to chr(0)
                        proteincounts.append(_count_trailing_acids(thisprotein, aminoacidchars))
                    thisprotein = ''

                else:
                    thissegment = line.strip('\n').strip(chr(0))
                    thisprotein += thissegment.strip('-')

        # whatever follows the final header is the last protein
        proteincounts.append(sum(1 for char in thisprotein if char in aminoacidchars))

        filemeasures.append([fname, len(proteincounts), proteincounts])

    return filemeasures
        
def FileRetrieveTests(testmode, reader):

    '''
    ***************************
    * runs comparisons on a filelist between the formal file retrieval process of the
    * reader passed in and a basic text file reader (readproteomefile) with a minimum
    * of assumptions.
    *
    * testmode - not used by this routine; kept so existing callers are unaffected
    * reader   - object providing availableproteomes() and read_files(list of indices)
    *
    * returns a list of result strings ending with an overall pass / fail line.
    * the summary line for each proteome and every mismatch are also printed.
    *
    * a proteome fails when the two readers disagree on the number of proteins
    * or on the length of the protein at any position they both have.
    ***************************
    '''

    strresults = []
    strresults.append('File retrieve comparison test - Core module against basic file retrieval routine.')
    strresults.append(' ')

    filelist = reader.availableproteomes()

    # test every available proteome; a real list so it behaves the same on Python 2 and 3
    testlist = list(range(len(filelist)))

    proteomesfortest = reader.read_files(testlist)
    comparisonproteomes = readproteomefile([filelist[t] for t in testlist])

    # let the comparisons begin.
    # assume number of proteomes is the same.

    bpass = True

    for i in range(len(proteomesfortest)):

        proteomename = filelist[testlist[i]][0]
        # assumes each proteome is [header, list of proteins] and each protein is
        # [identifiers, sequence] - taken from how they are indexed here, confirm against the reader
        thisproteome = proteomesfortest[i][1]
        checkcount = comparisonproteomes[i][1]
        checklengths = comparisonproteomes[i][2]

        strtext = proteomename
        strtext += ' number of proteins:'
        strtext += ' reader =  ' + str(len(thisproteome))
        strtext += ' check = ' + str(checkcount)
        strresults.append(strtext)
        print(strtext)

        # a different number of proteins is a failure in its own right; previously it
        # was only visible in the summary line and the test could still pass
        if len(thisproteome) != checkcount:
            bpass = False
            strtext = proteomename + ': protein count mismatch - ' + str(len(thisproteome))
            strtext += ' vs ' + str(checkcount) + ' in test file'
            strresults.append(strtext)
            print(strtext)

        # lengths can only be compared for positions present in both readings
        for j in range(min(len(thisproteome), checkcount)):
            if len(thisproteome[j][1]) != checklengths[j]:
                bpass = False
                strtext = proteomename + ': protein length mismatch - ' + thisproteome[j][0][0] + ' - ' + str(j)
                strtext += ' ' + str(len(thisproteome[j][1]))
                strtext += ' vs ' + str(checklengths[j]) + ' in test file'
                strresults.append(strtext)
                print(strtext)

    if bpass:
        strresults.append('File retrieval test passed')
    else:
        strresults.append('*** File retrieval test failed ***')

    return strresults
    

def _write_csv_comparison(fout, ftest, fbaseline):
    # checks the rows of the newly generated ftest against the stored fbaseline
    # (one direction only) and writes the outcome to the open results file fout
    csvcompareresults = compare_csv_files(ftest, fbaseline)

    fout.write('comparison of ' + fbaseline + ' against ' + ftest + '\n')

    if csvcompareresults[0]:
        fout.write(' Test Passed' + '\n')
    else:
        for result in csvcompareresults[1]:
            fout.write(result + '\n')

    fout.write('\n')


def RunSystemRegressionTest():

    '''
    ***************************
    * 11th Nov 2018
    * Full regression test
    *
    * records the component versions, runs the file retrieval comparison, then
    * regenerates two csv outputs and compares each with a stored 20181108 baseline.
    * results are written to Regression_Test_<datestamp>.txt under myanalysis.filepath
    * and the location is recorded in the activity log. returns nothing.
    *
    * components whose versions are recorded
    * Proteome Core, which reads files and has utilities, and is loaded separately
    * Proteome Scorer, which produces FFAT scores for protein chains
    * Proteome Analyser, which has the scorer passed to it, does high level analysis and generates files
    ***************************
    '''

    btestmode = False
    bprintmode = True

    mylogger = ProteomeActivityLogger(btestmode, bprintmode)
    myreader = ProteomeReader(btestmode)
    myscorer = SequenceAnalysis()
    myanalysis = ProteomeAnalyser(mylogger, myscorer, btestmode)

    mylogger.write_log_entry('Regression test')
    mylogger.write_log_entry('Analyser version - ProteomeAnalyser -' + myanalysis.versionid)
    mylogger.write_log_entry('Scorer version - SequenceAnalysis -' + myscorer.versionid)
    mylogger.write_log_entry('Reader version - ProteomeReader - ' + myreader.versionid)

    foutname = myanalysis.filepath + 'Regression_Test_' + datestamp() + '.txt'
    mylogger.write_log_entry('results in ' + foutname)

    # the with block flushes and closes the results file even if a test step
    # raises, so whatever was written before the failure is kept
    with open(foutname, "w") as fout:

        fout.write('Full System Regression Test' + '\n')
        fout.write('components' + '\n')
        fout.write('Core - ' + myreader.versionid + '\n')
        fout.write('Scorer - ' + myscorer.versionid + '\n')
        fout.write('Analyser - ' + myanalysis.versionid + '\n')

        for result in FileRetrieveTests(btestmode, myreader):
            fout.write(result + '\n')
        fout.write('\n')

        # now compare the csv files
        # fcsv* are the stored baselines, ftest* the names given to today's output
        # presumably run_analysis writes the ftest* files - confirm in Proteome Analyser
        fcsv1 = myreader.fpath + 'uniprot_proteome_yeastx6049_fasta_rawscores_20181108.csv'
        ftest1 = myreader.fpath + 'uniprot_proteome_yeastx6049_fasta_rawscores_' + datestamp() + '.csv'
        fcsv2 = myreader.fpath + 'uniprot_proteome_yeastx6049_fasta_bottomten_sorted_20181108.csv'
        ftest2 = myreader.fpath + 'uniprot_proteome_yeastx6049_fasta_bottomten_sorted_' + datestamp() + '.csv'
        myproteome = myreader.read_files([1])

        myanalysis.run_analysis(0, myproteome)
        _write_csv_comparison(fout, ftest1, fcsv1)

        myanalysis.run_analysis(2, myproteome)
        _write_csv_comparison(fout, ftest2, fcsv2)

    print('finished RunSystemRegressionTest')
    
    
def AcidTractScore():
    '''
    ***************************
    * Generates a csv file of acid tract scores
    * does it on a specific test file, # 8 in the test list
    *
    * for every protein one header row is written (two identifiers and the sequence
    * length) followed by one row per scored position holding:
    *   residue, acid tract score, mask at maxscore 0.0, randomised residue for that mask,
    *   mask at maxscore 0.5, randomised residue for that mask
    *
    * output goes to acidic_tract_score_test<datestamp>.csv in the hard-coded
    * documents folder. returns nothing.
    ***************************
    '''

    btestmode = True

    # settings shared by both masking runs, the same for every protein
    randomseed = 1
    bbyprotein = True

    # NOTE(review): the logger is not used below; it is still constructed in case
    # creating it has side effects - confirm whether it can be dropped
    mylogger = ProteomeActivityLogger(False, True)
    myreader = ProteomeReader(btestmode)
    myscorer = SequenceAnalysis()

    myproteomes = myreader.read_files([8])

    fpath = '/Users/johnslee/Documents/'
    foutname = fpath + 'acidic_tract_score_test' + datestamp() + '.csv'

    # the with block makes sure the csv is flushed and closed, also on error
    with open(foutname, "w") as fout:

        cr = csv.writer(fout, delimiter=',', quoting=csv.QUOTE_ALL)

        for proteome in myproteomes:

            for protein in proteome[1]:

                myresults = myscorer.score_protein_acidic_tract(protein[1])

                # the calls stay in this order in case the scorer keeps random state between them
                mymask0 = myscorer.generate_mask_preserve_acidic_flank([protein], 0.0, bbyprotein)
                randomproteins0 = myscorer.randomise_proteins_masked([protein], mymask0, randomseed, bbyprotein)

                mymask05 = myscorer.generate_mask_preserve_acidic_flank([protein], 0.5, bbyprotein)
                randomproteins05 = myscorer.randomise_proteins_masked([protein], mymask05, randomseed, bbyprotein)

                # header row for this protein
                cr.writerow([protein[0][0], protein[0][1], len(protein[1])])

                for i in range(len(myresults)):
                    cr.writerow([
                        protein[1][i],
                        myresults[i],
                        mymask0[0][i],
                        randomproteins0[0][1][i],
                        mymask05[0][i],
                        randomproteins05[0][1][i],
                    ])

    print('finished writing ' + foutname)
                
    
def _mask_a_respected(original, randomised, logger):
    # True when 'A' sits at exactly the same positions in both sequences;
    # writes a log entry for every position where that is not the case
    brespected = True

    for j in range(len(original)):

        if original[j] == 'A':
            if randomised[j] != 'A':
                brespected = False
                logger.write_log_entry('test failure - A transformed when should have been masked')

        elif randomised[j] == 'A':
            brespected = False
            logger.write_log_entry('test failure - A present when should not have been present')

    return brespected


def TestMaskingAndRandomisation():
    '''
    ***************************
    * Functional test on screening and randomisation of proteome and proteins
    * check mask is fully observed
    * check number of acids fully preserved
    *
    * every 'A' is masked, then the proteins are randomised twice:
    *   by protein      - the count of each acid must be unchanged within every protein
    *   whole proteome  - the count of each acid must be unchanged over the proteome
    * in both runs an 'A' must stay where it was and must not appear anywhere else.
    *
    * outcomes are written to the activity log. returns nothing.
    ***************************
    '''

    randomseed = 1
    btestmode = False

    mylogger = ProteomeActivityLogger(False, True)
    myreader = ProteomeReader(btestmode)
    myscorer = SequenceAnalysis()

    myproteomes = myreader.read_files([1])

    mylogger.write_log_entry('Functional test - masking and randomisation')
    mylogger.write_log_entry('scorer version ' + myscorer.versionid)
    mylogger.write_log_entry('reader version ' + myreader.versionid)

    proteins = myproteomes[0][1]
    acids = myscorer.aminoacids

    # test masking.
    # use the letter A: the mask is 1 wherever the sequence has an A, 0 elsewhere
    maska = [[1 if acid == 'A' else 0 for acid in protein[1]] for protein in proteins]

    # have the mask
    # now test randomisation by protein
    newproteins = myscorer.randomise_proteins_masked(proteins, maska, randomseed, True)

    bfullpass = True

    for i in range(len(newproteins)):

        for acid in acids:
            if proteins[i][1].count(acid) != newproteins[i][1].count(acid):
                bfullpass = False
                mylogger.write_log_entry('mismatch in number for acid ' + acid + ' in protein ' + str(i))

        if not _mask_a_respected(proteins[i][1], newproteins[i][1], mylogger):
            bfullpass = False

    if bfullpass:
        mylogger.write_log_entry('mask and randomisation by protein - test passed')
    else:
        mylogger.write_log_entry('mask and randomisation by protein - test failed')

    # now whole proteome
    newproteins = myscorer.randomise_proteins_masked(proteins, maska, randomseed, False)

    bfullpass = True

    # one independent [before, after] pair per acid. [[0,0]] * n would repeat the SAME
    # inner list n times, so every acid would share one running total and the per-acid
    # comparison below could never tell the acids apart
    acidcount = [[0, 0] for acid in acids]

    for i in range(len(newproteins)):

        for j, acid in enumerate(acids):
            acidcount[j][0] += proteins[i][1].count(acid)
            acidcount[j][1] += newproteins[i][1].count(acid)

        if not _mask_a_respected(proteins[i][1], newproteins[i][1], mylogger):
            bfullpass = False

    for j, acid in enumerate(acids):
        if acidcount[j][0] != acidcount[j][1]:
            bfullpass = False
            mylogger.write_log_entry('mismatch in number for acid ' + acid)

    if bfullpass:
        mylogger.write_log_entry('mask and randomisation for whole proteome - test passed')
    else:
        mylogger.write_log_entry('mask and randomisation for whole proteome - test failed')

    print('finished TestMaskingAndRandomisation')
    
# run the full regression test when this cell is executed
RunSystemRegressionTest()

#TestMaskingAndRandomisation()
 
#AcidTractScore()

New activity log version 1.2 Candidate release
0: 20190128 18:21. scorer version = Dev on 1.0, logger version = Dev on 1.0
1: 20190128 18:21. random seed set to 1548699711
2: 20190128 18:21. Regression test
3: 20190128 18:21. Analyser version - ProteomeAnalyser -Dev on 1.0
4: 20190128 18:21. Scorer version - SequenceAnalysis -Dev on 1.0
5: 20190128 18:21. Reader version - ProteomeReader - 1.1 Candidate release
6: 20190128 18:21. results in /Users/johnslee/Documents/Regression_Test_20190128.txt
replaced 1 instance of a rogue amino acid in MTTB1
replaced 1 instance of a rogue amino acid in MTBB1
replaced 1 instance of a rogue amino acid in MTMB
replaced 1 instance of a rogue amino acid in MTTB2
replaced 1 instance of a rogue amino acid in MTBB2
replaced 1 instance of a rogue amino acid in MTBB3


IndexError: string index out of range

In [None]:
[[0,0]] * 5