# In [1]:
'''
****************************
* Proteome Core
* 
* as the project grows so the need to organise properly grows.
* so some key functionality is being split out into this file.
* The intention is that it can be properly tested and version controlled,
* and then loaded in other notebooks through the magic command %run
*
* 2 Sep 2018
* first separation and consolidation
* 
* main access is through ProteomeReader.read_files() which takes a list of integers
* the function then returns a list of file contents with each element being that of the corresponding item in the list
* the numbers refer to the sequence of files in a text file currently living at /Users/johnslee/proteomes.txt
* for each item in the list read_files returns a two element list
* [entry from the list of files, proteome]
* each proteome consists of a list of proteins [protein1, protein2, protein3]
* each protein consists of a header and the code
* header = [protein name, protein reference]
*
* Development version 10 Sep 2018
* includes test file loading as an option on start up
* includes activity logger as a separate class - 14 Sep 2018
* 
* 16 Sep 2018 
* new file format called here 'ACBD5' - for file formats from Joe Costello at Exeter
*
* 23 Sep Release 1.1
* test file loading as an option
* inclusion of logger
* new file type readACBD5
* removal of * from Fasta-style files
*
* Dev version
*
* 4 Nov 2018
* added ability to replace rogue amino acids
* need arose due to an 'O' in the Methanosarcina mazei proteome, which Tim says should be replaced with 'X'
****************************
'''
import datetime

def datestamp():
    '''
    ****************************
    * returns today's local date as an eight character string 'YYYYMMDD'
    * used to build the name of the daily log file and to stamp log entries
    ****************************
    '''
    today = datetime.date.today()
    return today.strftime("%Y%m%d")

def timestamp():
    '''
    ****************************
    * returns the current local time of day as a five character string 'HH:MM'
    * (24 hour clock, no seconds)
    ****************************
    '''
    now = datetime.datetime.now()
    return now.strftime("%H:%M")

class ProteomeActivityLogger(object):

    '''
    ****************************
    * logging class that allows other programmes to create a log of activity.
    * has two modes - test and prod - ie non test
    * test mode writes to 'ProteomeTestlog<YYYYMMDD>.txt', prod to 'Proteomelog<YYYYMMDD>.txt'
    * the file is opened in append mode, so the first logger of the day creates the
    * file and later loggers on the same day add to it.
    * has a printmode that allows all entries going to the log file to be printed on screen
    *
    * every line is flushed to disk as soon as it is written, so the log survives
    * even if the calling notebook never closes the logger.
    * call close() when finished, or use the logger in a 'with' statement.
    *
    * attributes
    * versionid - version string written on the first line of each session
    * fpath     - directory the log lives in
    * counter   - number of entries written so far through write_log_entry
    * printmode - True if every line is also printed on screen
    * f         - the open log file
    ****************************
    '''

    def __init__(self, testmode = False, printmode = False, fpath = None):

        '''
        ****************************
        * opens (or creates) today's log and writes the session header line
        * testmode  - True selects the test log, False the production log
        * printmode - True echoes every logged line on screen
        * fpath     - directory for the log file. must end with a path separator
        *             because the file name is simply appended to it.
        *             defaults to '/Users/johnslee/Documents/'
        ****************************
        '''

        self.versionid = '1.2'
        self.fpath = '/Users/johnslee/Documents/' if fpath is None else fpath
        self.counter = 0
        self.printmode = printmode

        if testmode:
            stem = 'ProteomeTestlog'
            firstline = 'New test log'
        else:
            stem = 'Proteomelog'
            firstline = 'New activity log'

        # append mode: an existing log for the same day is extended, not overwritten
        self.f = open(self.fpath + stem + datestamp() + '.txt', 'a')

        self._emit(firstline + ' version ' + self.versionid)

    def _emit(self, text):

        '''
        ****************************
        * writes one line to the log file and, in printmode, to the screen
        ****************************
        '''

        self.f.write(text + '\n')  # python will convert \n to os.linesep
        # flush straight away so nothing is lost if the logger is never closed
        self.f.flush()
        if self.printmode:
            # single argument in brackets prints the same under python 2 and 3
            print(text)

    def write_log_entry(self, entry):

        '''
        ****************************
        * adds one entry to the log
        * entry is a string; it is written as
        * '<counter>: <YYYYMMDD> <HH:MM>. <entry>'
        * and the counter is then increased by one
        ****************************
        '''

        prelim = str(self.counter) + ': ' + datestamp() + ' ' + timestamp() + '. '
        self._emit(prelim + entry)
        self.counter += 1

    def close(self):

        '''
        ****************************
        * closes the log file. safe to call more than once.
        ****************************
        '''

        if not self.f.closed:
            self.f.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # always release the file; returning None lets any exception carry on
        self.close()
        
class ProteomeReader(object):

    '''
    ****************************
    * class ProteomeReader
    * reads proteome files
    * maintains a list of files taken from a text file proteomes.txt
    * so new proteomes are added in that file.
    * info in file is [filename,organism name,readindex]
    * has five types of files it handles.
    * readindex = 0 for fasta, 1 = uniref, 2 = dashed, 3 = OrthoDB, 4 = ACBD5
    *
    * every reader returns a list of proteins, each protein being a two element
    * list [header, code] where header is a list of strings and code is a string.
    *
    * attributes
    * versionid       - version string, to be changed by the programmer for each release
    * proteomelibrary - list of [filename, organism name, readindex]
    * fpath           - directory holding the proteome files
    * ftype           - file extension added to each filename
    ****************************
    '''

    def __init__(self, testmode = False, libraryfile = None, fpath = None):

        '''
        ***************************
        * builds a library of available proteomes
        * assumes all have the same path and type
        * also sets the version control which should be changed by the programmer for each release.
        *
        * testmode    - True reads /Users/johnslee/proteomes_test_files.txt,
        *               False reads /Users/johnslee/proteomes.txt
        * libraryfile - optional path of the library file; overrides testmode when given
        * fpath       - optional directory holding the proteome files. must end with a
        *               path separator as file names are simply appended to it.
        *               defaults to /Users/johnslee/Documents/
        *
        * each non blank line of the library file must be
        * filename,organism name,readindex
        * blank lines are skipped. any other line raises ValueError rather than
        * quietly reusing values left over from the line before.
        ***************************
        '''

        self.versionid = '1.1'

        if libraryfile is not None:
            filenameproteomes = libraryfile
        elif testmode:
            filenameproteomes = '/Users/johnslee/proteomes_test_files.txt'
        else:
            filenameproteomes = '/Users/johnslee/proteomes.txt'

        self.proteomelibrary = []

        with open(filenameproteomes, 'r') as f:

            for linenumber, line in enumerate(f, 1):

                if not line.strip():
                    continue

                # split on the first two commas only, as before:
                # name and description are kept exactly as written
                fields = line.split(',', 2)

                if len(fields) != 3:
                    raise ValueError('line %d of %s is not filename,organism name,readindex: %r'
                                     % (linenumber, filenameproteomes, line))

                thisfilename, thisfiledescription, thisfilemode = fields

                try:
                    thisfilemode = int(thisfilemode.strip())
                except ValueError:
                    raise ValueError('line %d of %s has a readindex that is not a whole number: %r'
                                     % (linenumber, filenameproteomes, line))

                self.proteomelibrary.append([thisfilename, thisfiledescription, thisfilemode])

        self.fpath = '/Users/johnslee/Documents/' if fpath is None else fpath
        self.ftype = '.txt'

    def read_files(self, flist):

        '''
        ****************************
        * 28 Aug 2018
        * rejig of how protein files are read in
        * flist is a list of numbers referring to indices in the file library
        * reads in the content and returns a single list
        * with one entry per file
        * so returns [file1, file2]
        * file1 = [proteome library entry, proteins]
        * proteome library entry = ['file name','organism name',read method]
        * proteins = [protein1, protein2]
        * protein1 = [header, code] - header is a list of strings, code is a string
        * so the code for the 5th protein of the 3rd file is in proteins[2][1][4][1]
        *
        * rogue amino acids are replaced in every file read (see replace_target_acids)
        *
        * raises IndexError if a number in flist is not in the library
        * raises ValueError if the read method of a library entry is not 0 to 4
        ****************************
        '''

        readers = {
            0: self.readFASTA,
            1: self.readUniRef50,
            2: self.readDashed,
            3: self.readOrthoDB,
            4: self.readACBD5,
        }

        proteins = []

        for f in flist:

            myfile = self.proteomelibrary[f]

            reader = readers.get(myfile[2])
            if reader is None:
                # without this check the proteins of the previous file would be reused
                raise ValueError('unknown readindex ' + repr(myfile[2])
                                 + ' for proteome file ' + repr(myfile[0]))

            myproteins = reader(self.fpath + myfile[0] + self.ftype)

            # 4 Nov 2018 replace target acids
            self.replace_target_acids(myproteins)

            proteins.append([myfile, myproteins])

        return proteins

    def _keep_chars(self, text, allowed):

        '''
        ****************************
        * returns text with every character that is not in allowed removed
        ****************************
        '''

        return ''.join([char for char in text if char in allowed])

    def _parse_fasta_header(self, line):

        '''
        ****************************
        * turns one FASTA header line (a line containing '>') into a header list
        *
        * with two or more '|' in the line it is treated as
        * >db|reference|NAME_SPECIES description
        * and returns [NAME, reference]. NAME is the part of the first word after the
        * second '|' that comes before its '_'. with no '_' in that word NAME is
        * 'unknown protein'
        *
        * otherwise the text after '>' is split on single spaces and returns
        * [first word, second word, rest of line]
        * the rest keeps its leading space. missing parts are empty strings.
        ****************************
        '''

        if line.count('|') >= 2:

            rest = line[line.index('|') + 1:]           # 'O95139|NDUB6_HUMAN NADH deh'
            proteinname, _, rest = rest.partition('|')  # 'O95139' and 'NDUB6_HUMAN NADH deh'
            proteinname = proteinname.strip()

            words = rest.split()
            entry = words[0] if words else ''           # 'NDUB6_HUMAN'

            cut = entry.find('_')                       # find gives -1, index would raise
            if cut >= 0:
                return [entry[:cut], proteinname]

            return ['unknown protein', proteinname]

        header = line[line.index('>') + 1:].rstrip('\r\n')

        proteinID, _, remainder = header.partition(' ')
        proteinname, gap, comment = remainder.partition(' ')

        return [proteinID, proteinname, gap + comment]

    def readFASTA(self, fname):

        '''
        ****************************
        * reads a file of proteins in FASTA format
        * each protein is in two parts: a header line containing '>' and then
        * one or more lines of code.
        *
        * returns a list proteins
        * each item is an individual protein that contains two elements
        * the first element is the header list, the second is a string of the code.
        *
        * August 2018
        * names are not fixed length, for example have these two
        * 'YPR159C-A YPR159C-A SGDID:S000028725'
        * 'YPR159W KRE6 SGDID:S000006363'
        * so read up to a space. header becomes ['YPR159W', 'KRE6', ' SGDID:S000006363']
        * a header with fewer than three words is read with the missing words left empty.
        *
        * 2 Sep 2018
        * from human have
        * sp|O95139|NDUB6_HUMAN NADH dehydrogenase [ubiquinone] 1 beta subcomplex subunit 6 OS=Homo sapiens OX=9606 GN=NDUFB6 PE=1 SV=3
        * MTGYTPDEKLRLQQLRELRRRWLKDQELSPREPVLPPQKMGPMEKFWNKFLENKSPWRKM...
        * becomes [['NDUB6','O95139'],'MTGYTPDEKLRLQQLRELRRRWLKDQELSPREPVLPPQKMGPMEKFWNKFLENKSPWRKM...']
        *
        * 23 Sep 2018
        * some files have an asterisk on the end of the acid chain - remove here.
        *
        * lines before the first header are ignored.
        * an empty file, or one with no header, returns an empty list.
        ****************************
        '''

        badchar = '*'

        proteins = []
        thisprotein = []
        thissequence = ''

        with open(fname, 'r') as f:

            for line in f:

                if '>' in line:

                    # so have a new protein
                    if thisprotein:
                        # complete the protein we have been building
                        thisprotein.append(thissequence)
                        proteins.append(thisprotein)

                    # now start a new protein
                    thisprotein = [self._parse_fasta_header(line)]
                    thissequence = ''

                else:

                    # 23 Sep 2018 strip out *. usually on the end.
                    thissequence += line.strip('\n\r').replace(badchar, '')

        if thisprotein:
            thisprotein.append(thissequence)
            proteins.append(thisprotein)

        return proteins

    def readUniRef50(self, fname):

        '''
        ********************************
        * quite a lot of files coming in an unpleasant format with UniRef50 in the line
        * this gives it a proper knife-and-forking to get the protein out
        * but may lose some of the associated detail and mangle some of the names.
        * 4 Sep 2018
        * first line contains all the details, then the code follows
        * a line containing '>UniRef' is a header, every other line is code
        *
        * only letters, digits, space and = > _ - are kept from any line
        * in the header the text after 'RepID=' is split at its first '_'
        * into [protein ID, protein name]; with no '_' both are the whole text
        * a header without 'RepID=' prints a warning and gives ['unknown', 'unknown']
        *
        * returns a list of [[protein ID, protein name], code]
        * a protein with no code is left out.
        * code found before the first header is kept under ['unknown', 'unknown']
        ********************************
        '''

        charset = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 =>_-'

        myproteins = []

        thissequence = ''
        proteinID = proteinname = 'unknown'

        with open(fname, 'r') as f:

            for line in f:

                cleaned = self._keep_chars(line, charset)

                if '>UniRef' in line:  # got new protein

                    if thissequence:
                        myproteins.append([[proteinID, proteinname], thissequence])
                        thissequence = ''

                    if 'RepID=' in cleaned:
                        tail = cleaned[cleaned.index('RepID=') + 6:]
                        if '_' in tail:
                            proteinID, _, proteinname = tail.partition('_')
                        else:
                            proteinID = proteinname = tail
                    else:
                        print('non=spec code in Uniref')
                        proteinID = proteinname = 'unknown'

                else:

                    thissequence += cleaned

        if thissequence:
            myproteins.append([[proteinID, proteinname], thissequence])

        return myproteins

    def readOrthoDB(self, fname):

        '''
        ********************************
        * 2 Sep 2018
        * another format OrthoDB
        * a line containing pub_gene_id is a header, for example
        * 226230:00043d {"pub_gene_id":"J4TV27", "pub_og_id":"EOG092C5A9Y", "og_name":"Transcription factor Opi1","level":4751,
        *  "description":"Transcription factor Opi1"}
        * every other line is code, for example
        * MSESQRLGLSEEEVEA
        * together these become [['J4TV27','EOG092C5A9Y'],'MSESQRLGLSEEEVEA']
        *
        * only letters, digits, space and = > _ : { } , * are kept from any line,
        * so the quotes around the values are dropped.
        * the protein ID is the text between the ':' after pub_gene_id and the next ','
        * the protein name is the text between the following ':' and the next ','
        * nothing else on a header line is read, so code must be on its own lines.
        *
        * returns a list of [[protein ID, protein name], code]
        * a protein with no code is left out.
        * code found before the first header is kept with an empty ID and name.
        * raises ValueError if a header does not have the ':' and ',' described above.
        ********************************
        '''

        charset = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 =>_:{},*'

        myproteins = []

        thissequence = ''
        proteinID = proteinname = ''

        with open(fname, 'r') as f:

            for line in f:

                mycode = self._keep_chars(line, charset)

                if 'pub_gene_id' in mycode:

                    # new protein
                    if thissequence:
                        myproteins.append([[proteinID, proteinname], thissequence])
                        thissequence = ''

                    fields = mycode[mycode.index('pub_gene_id'):]  # pub_gene_id:J4TV27, pub_og_id:EOG092C5A9Y,
                    fields = fields[fields.index(':') + 1:]        # J4TV27, pub_og_id:EOG092C5A9Y,
                    proteinID = fields[:fields.index(',')]

                    fields = fields[fields.index(':') + 1:]        # EOG092C5A9Y,
                    proteinname = fields[:fields.index(',')]

                else:

                    thissequence += mycode

        if thissequence:
            myproteins.append([[proteinID, proteinname], thissequence])

        return myproteins

    def readDashed(self, fname):

        '''
        ********************************
        * reads files where the code is padded out with dashes
        * this gives it a proper knife-and-forking to get the protein out
        * but may lose some of the associated detail and mangle some of the names.
        *
        * have input >tr|I7CJK2|I7CJK2_NATSJ Uncharacterized protein OS=Natrinema sp. (strain J7-2) GN=NJ7G_2660 PE=4 SV=1
        *   LFDAQRTAVKQSQQLLKQGMT--------AQRTV-DTMVGTGLTGQESLQRYQLEVAQAA-------------THGTLSAMAA-----------------
        *   --------------------------------------------------MLPG---DD--A--TEAHQAVDESFEQLKRTHAAVYDMLERDLEQGVDAT
        *
        * which we turn into [['I7CJK2','NATSJ'],'LFDAQR...']
        * a line containing '>' is a header, every other line is code.
        * only capital letters are kept from the code lines.
        * in the header the protein ID is the text between the first and second '|'
        * and the protein name is the text between the next '_' and the next space.
        * a part that cannot be found is set to 'unknown' and is no longer
        * carried over from the protein before.
        * should encourage getting rid of this format
        *
        * returns a list of [[protein ID, protein name], code]
        * a protein with no code is left out.
        * code found before the first header is kept under ['unknown', 'unknown']
        ********************************
        '''

        charset = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 =>|_'
        AAcharset = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

        myproteins = []

        thissequence = ''
        proteinID = proteinname = 'unknown'

        with open(fname, 'r') as f:

            for line in f:

                if '>' in line:

                    if thissequence:  # already have a protein assembled
                        myproteins.append([[proteinID, proteinname], thissequence])
                        thissequence = ''

                    # start afresh so nothing is inherited from the last protein
                    proteinID = proteinname = 'unknown'

                    myheader = self._keep_chars(line, charset)

                    if '|' in myheader:

                        myheader = myheader[myheader.index('|') + 1:]

                        if '|' in myheader:
                            # up to the second | is the ID
                            proteinID, _, myheader = myheader.partition('|')

                        if '_' in myheader:
                            # the name starts after the underscore
                            myheader = myheader[myheader.index('_') + 1:]

                        if ' ' in myheader:
                            # and runs up to the next space
                            proteinname = myheader[:myheader.index(' ')]

                else:

                    thissequence += self._keep_chars(line, AAcharset)

        if thissequence:  # have the last protein assembled and need to add.
            myproteins.append([[proteinID, proteinname], thissequence])

        return myproteins

    def readACBD5(self, fname):

        '''
        ****************************
        * format ACBD5 called for files received from Joe Costello at Exeter University
        * which are two files with ACBD5 in the title
        * very simple format: first line is just the protein '>A0FKI7
        * then the amino acid code
        * 16 Sep 2018
        *
        * returns a list of [[protein ID, protein ID], code]
        * the ID is everything after '>' and is used as the name as well.
        * lines before the first header are ignored.
        * an empty file, or one with no header, returns an empty list.
        ****************************
        '''

        proteins = []
        thisprotein = []
        thissequence = ''

        with open(fname, 'r') as f:

            for line in f:

                if '>' in line:

                    # so have a new protein
                    # we need to complete the current protein and add to the list ...
                    if thisprotein:
                        thisprotein.append(thissequence)
                        proteins.append(thisprotein)

                    # now start a new protein
                    # format is simple - '>' then the protein name eg >A0FKI7
                    proteinID = line[line.index('>') + 1:].strip()
                    thisprotein = [[proteinID, proteinID]]
                    thissequence = ''

                else:

                    thissequence += line.strip('\n\r')

        # add final protein
        if thisprotein:
            thisprotein.append(thissequence)
            proteins.append(thisprotein)

        return proteins

    def replace_target_acids(self, proteins, targetacids = None):

        '''
        ****************************
        * 4 Nov 2018
        * replaces specific amino acids with other amino acids in the proteome.
        * need arose to replace 'O' with 'X' in Methanosarcina Mazei proteome
        * but make a general case
        *
        * proteins    - list of [header, code]. the code of each protein is changed in place.
        * targetacids - optional list of pairs, with the first item being replaced with
        *               the second item. defaults to [['O','X']]
        *
        * prints one line for every protein in which something was replaced.
        * returns nothing.
        ****************************
        '''

        if targetacids is None:
            targetacids = [['O', 'X']]

        for protein in proteins:

            ctr = 0

            for target in targetacids:
                found = protein[1].count(target[0])
                if found:
                    # one pass over the code instead of one pass per replacement
                    protein[1] = protein[1].replace(target[0], target[1])
                    ctr += found

            # should log this, but currently no log file available in this module
            if ctr == 1:
                print('replaced 1 instance of a rogue amino acid in ' + protein[0][0])
            elif ctr > 1:
                print('replaced ' + str(ctr) + ' instances of rogue amino acids in ' + protein[0][0])

    def availableproteomes(self):

        '''
        ************************
        * returns the library of available proteomes
        * a list of [filename, organism name, readindex]
        ************************
        '''

        return self.proteomelibrary

    def showlibrary(self):

        '''
        ************************
        * utility to return available proteomes
        * gives the same list as availableproteomes
        ************************
        '''

        return self.proteomelibrary

    # ****************************
    # * end of the proteome reader class
    # ****************************
