#### Annotation of the extended set consists of reading and merging the HMMER scan and multi-FASTA data

In [1]:
%matplotlib inline
import sys, os
sys.path.append("/Users/guillaumelaunay/work/DVL/python3/pyproteinsExt/src")
sys.path.append("/Users/guillaumelaunay/work/DVL/python3/pyproteins/src")
%load_ext autoreload
## Plain fasta utility functions

import gzip, io
import urllib.request

def mFastaParseZip(inputFile):
    """Parse a gzip-compressed multi-FASTA file.

    The archive is opened in binary mode and wrapped in a text layer (default
    encoding) so that it can be consumed line by line.

    :param inputFile: path of the gzipped multi-FASTA file.
    :return: the record dictionary built by ``mFastaParseStream``.
    """
    with io.TextIOWrapper(gzip.open(inputFile, 'r')) as textHandle:
        return mFastaParseStream(textHandle)

def mFastaParseUrl(url):
    """Download a multi-FASTA document and parse it.

    The payload is read in full and decoded as UTF-8 before being parsed.

    :param url: location handed to ``urllib.request.urlopen``.
    :return: the record dictionary built by ``mFastaParseStream``.
    """
    # The context manager closes the connection even if read()/decode() fails.
    with urllib.request.urlopen(url) as fp:
        mystr = fp.read().decode("utf8")
    # splitlines() handles "\n" and "\r\n" endings alike, so no stray carriage
    # return ends up inside headers or sequences.
    return mFastaParseStream(mystr.splitlines())

def mFastaParseStream(stream):
    """Parse multi-FASTA text supplied as an iterable of lines.

    Lines may or may not carry their line terminator ("\\n" or "\\r\\n").
    Blank lines are ignored wherever they occur.

    :param stream: iterable of text lines (open text file, list of str, ...).
    :return: dict mapping each record id to
        ``{'header': <full header line>, 'sequence': <concatenated sequence>}``.
        The id is the first whitespace-delimited token of the header line
        without its leading '>'.
    :raises ValueError: if an id occurs twice, or if sequence data shows up
        before the first header line.
    """
    data = {}
    # Sequence fragments are gathered per record and joined once at the end,
    # which avoids rebuilding an ever-growing string for each input line.
    chunksById = {}
    currentId = None

    for lineNumber, line in enumerate(stream, start=1):
        # Strip the terminator first so that truly blank lines ("\n", "\r\n")
        # are recognised as empty and skipped.
        s = line.rstrip('\r\n')
        if not s:
            continue

        if s.startswith('>'):
            currentId = s.split()[0][1:]
            if currentId in data:
                raise ValueError(
                    "Duplicated fasta identifier '%s' (line %d)" % (currentId, lineNumber)
                )
            data[currentId] = {'header': s, 'sequence': ''}
            chunksById[currentId] = []
            continue

        if currentId is None:
            raise ValueError(
                "Sequence data found before any fasta header (line %d)" % lineNumber
            )
        chunksById[currentId].append(s)

    for recordId, chunks in chunksById.items():
        data[recordId]['sequence'] = ''.join(chunks)
    return data

### Read-in hmmr scan

In [9]:
%autoreload 2
import pyproteinsExt.hmmrContainerFactory as hm

mainContainer = {}

fileName="/Volumes/arwen/mobi/group/NOX_GL/extendedSet/NOX_noEukaryota_PB_NR_hmmscan.out"
hscan = hm.parse(inputFile=fileName)
# Fetch the per-protein view once instead of calling T() on every iteration.
# NOTE(review): assumes T() returns an equivalent mapping on each call -- confirm.
hmmrByProtein = hscan.T()
print( len(hmmrByProtein), 'proteins to reannotate' )

# Create one record per protein: the hmmr data is stored now, the 'fasta'
# slot is left empty and filled once the matching multi-FASTA file is read.
for e in hmmrByProtein:
    if e in mainContainer:
        raise ValueError("Known id ", e)

    # A stray trailing character on this line used to make the cell a SyntaxError.
    mainContainer[e] = { 'hmmr' :  hmmrByProtein[e], 'tmhmm' : { 'fasta' : None } }
    

3274 proteins to reannotate


### Read-in corresponding mfasta

In [14]:
# Parse the multi-FASTA file that matches the hmmscan run.
fastaContainer = None
fastaPath = '/Volumes/arwen/mobi/group/NOX_GL/extendedSet/NOX_noEukaryota_PB_NR.fasta'
with open(fastaPath, 'r') as f:
    fastaContainer = mFastaParseStream(f)
print(len(fastaContainer), 'available fasta entries')

3274 available fasta entries


### Merge the two

In [20]:
# Attach to every hmmr record the fasta record that carries the same id;
# a missing id aborts the merge.
for _id, entry in mainContainer.items():
    if _id not in fastaContainer:
        raise ValueError(_id, 'missing in fasta container')
    entry['tmhmm']['fasta'] = fastaContainer[_id]
        

#### Extract TaxonID

In [22]:
import re
def getTaxID(datum):
    """Extract the ``OX=<digits>`` field of an entry's fasta header and store it.

    :param datum: container entry; its ``['tmhmm']['fasta']['header']`` string is read.
    :return: None. The digits are stored, as a string, under ``datum['taxid']``.
    :raises ValueError: if the header holds no ``OX=<digits>`` field.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    reTaxID = re.compile(r'OX=([\d]+)')
    header = datum['tmhmm']['fasta']['header']
    m = reTaxID.search(header)
    if not m:
        raise ValueError('Cant parse taxid from', header)
    datum['taxid'] = m.group(1)
    
# Annotate every entry in place with the taxid found in its fasta header.
for _id, datum in mainContainer.items():
    getTaxID(datum)

#### Inspect NCBI Taxonomy

In [23]:
import pyproteinsExt.ontology
# Build the ontology wrapper from the local NCBI taxonomy OWL file.
taxonTree = pyproteinsExt.ontology.Ontology(
    file='/Users/guillaumelaunay/work/databases/ontology/ncbitaxon.owl'
)

###### Flag non-Eukaryota entries

In [33]:
cnt = 0
u = 0
for _id in mainContainer:
    u += 1
    taxid=mainContainer[_id]['taxid']
    n = taxonTree.onto.search(iri='http://purl.obolibrary.org/obo/NCBITaxon_' + taxid)
    if not n:
        # Taxon unknown to the ontology: the entry is flagged False, so it is
        # excluded wherever 'isNoEukaryota' is used as a filter.
        print ('Cant find Taxon node for', taxid)
        mainContainer[_id]['isNoEukaryota'] = False
        continue

    # The entry is Eukaryota as soon as one labelled node of its lineage
    # carries that name; unlabelled nodes are ignored.
    isEukaryota = any(
        t.label and t.label[0] == 'Eukaryota'
        for t in taxonTree._getLineage(n[0])
    )
    # Dedicated name for the flag: calling it `bool` would shadow the builtin
    # for every cell executed afterwards in the same kernel.
    isNoEukaryota = not isEukaryota
    if isNoEukaryota:
        cnt += 1
    mainContainer[_id]['isNoEukaryota'] = isNoEukaryota

print("Total number of bacterial sequences", cnt, u)

Cant find Taxon node for 2083010
Cant find Taxon node for 2109333
Cant find Taxon node for 2109333
Cant find Taxon node for 2083010
Cant find Taxon node for 2083010
Cant find Taxon node for 2116516
Cant find Taxon node for 2116516
Cant find Taxon node for 2107699
Cant find Taxon node for 2107702
Cant find Taxon node for 2107699
Cant find Taxon node for 2107702
Cant find Taxon node for 2126737
Total number of bacterial sequences 2493 3274


#### Deserialize the seed data set, to mark seed members in the extended data set

In [42]:
import pickle, time
import time

def save(data, tag=None, saveDir="/Users/guillaumelaunay/work/projects/NOX"):
    """Pickle ``data`` into a time-stamped file and print where it went.

    The file is named ``NOX_annotation_[<tag>_]<YYYYmmdd-HHMMSS>.pickle``.

    :param data: any picklable object.
    :param tag: optional label inserted in the file name.
    :param saveDir: destination directory; defaults to the project directory
        so existing calls keep writing to the same place.
    :return: None.
    """
    timestr = time.strftime("%Y%m%d-%H%M%S")
    fTag = "NOX_annotation_" + tag + "_" if tag else "NOX_annotation_"
    fSerialDump = fTag + timestr + ".pickle"
    target = saveDir + '/' + fSerialDump
    with open(target, 'wb') as f:
        pickle.dump(data, f)
    print('data structure saved to', target)

def load(fileName, saveDir="/Users/guillaumelaunay/work/projects/NOX"):
    """Restore a pickled object and report how many elements it holds.

    :param fileName: name of the pickle file inside ``saveDir``.
    :param saveDir: directory holding the file; defaults to the project
        directory so existing calls keep reading from the same place.
    :return: the unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files that were
    # produced by a trusted source.
    # The `with` block closes the file handle, which a bare open() never did.
    with open(saveDir + "/" + fileName, "rb") as f:
        d = pickle.load(f)
    print("restore a annotated container of ", len(d), "elements")
    return d

seedContainer = load('NOX_annotation_fullPFAM_20180625-111432.pickle')

# Flag every entry of the extended set: 'isSeed' is True only for
# non-Eukaryota entries whose id is also present in the seed container.
nBact=0
nSeed=0
for _id in mainContainer:
    mainContainer[_id]['isSeed'] = False
    if not mainContainer[_id]['isNoEukaryota']:
        continue
    nBact +=1
    if _id not in seedContainer:
        continue
    mainContainer[_id]['isSeed'] = True
    nSeed += 1

print(nSeed, ' seed elements marked among ', nBact, ' bacterial entries (', len(mainContainer), ' total )')
# Report the real number of matches; this line used a counter that was never
# incremented and therefore always printed 0.
print(nSeed,' bacterial seed elements were found among ', len(seedContainer), 'entries')

restore a annotated container of  4915 elements


KeyError: 'tr|A0A1H5PJ42|A0A1H5PJ42_9ACTN'

In [39]:
# Sanity check: recount both flags straight from the container and dump any
# entry that lacks the 'isNoEukaryota' key.
seedCount = 0
noEukaryotaCount = 0
for _id, entry in mainContainer.items():
    if entry['isSeed']:
        seedCount += 1
    if 'isNoEukaryota' not in entry:
        print(entry)
    if entry['isNoEukaryota']:
        noEukaryotaCount += 1
print(seedCount, ' seed elements marked among ', noEukaryotaCount, ' bacterial entries (', len(mainContainer), ' total )')

472  seed elements marked among  2493  bacterial entries ( 3274  total )


In [35]:
# Persist the annotated container; save() prints the path of the pickle it wrote.
save(mainContainer, tag='extendedSet_fullPFAM')

data structure saved to /Users/guillaumelaunay/work/projects/NOX/NOX_annotation_extendedSet_fullPFAM_20180628-155755.pickle
