### Creating the seed data set

Starting from complete trEMBL dataset <span style='background:#f7f3f7;padding:0.4em;border-radius:2px; border:solid bgrey 1px'>arwen:/mobi/group/NOX_CH/data/uniprot_trembl.fasta.gz</span> which is a symbolic link for `arwen:/mobi/group/databases/flat/uniprot_trembl_2019_02.fasta.gz`
 *  Split the dataset in small volumes
     * script: <span style="color:green">**split.py**</span>
     * Usage:
     Create and go to the `/mobi/group/NOX_GL/volumes` 
```console
    ROOT_DIR=/mobi/group/NOX_CH
    SCRIPT_DIR=/mobi/group/NOX_CH/nox-analysis/scripts
    $SCRIPT_DIR/split.py $ROOT_DIR/data/uniprot_trembl.fasta.gz
```

 * Run the HMMR and TMHMM annotations
    * script: <span style="color:green">**runHMMR_slurm.sh**</span>
    * Usage:  
  
```console
    mkdir $ROOT_DIR/seedSet
    mkdir $ROOT_DIR/seedSet/work
    $SCRIPT_DIR/runHMMR_slurm.sh $ROOT_DIR/volumes $ROOT_DIR/seedSet/work $ROOT_DIR/data/profiles
```

 * Use this notebook to parse the _work_ folder (see **Parsing all data files** section)

    * Filter-out non eukaryotic entries and dump the corresponding fasta sequence in folder <span style='background:#f7f3f7;padding:0.4em;border-radius:2px; border:solid bgrey 1px'>/mobi/group/NOX_CH/seedSet/NOX_noEukaryota</span> (create directory before)
         

 * Concatenate all fasta sequences in a single file
```console
     cd $ROOT_DIR/seedSet/NOX_noEukaryota
     for i in $(ls); do sed -i -e '$a\' $i; done
     cat $ROOT_DIR/seedSet/NOX_noEukaryota/*.fasta > $ROOT_DIR/seedSet/NOX_noEukaryota.mfasta
 ```

* Perform full Pfam annotation

```console
     sbatch $SCRIPT_DIR/runHMMSCAN.sbatch /mobi/group/databases/hmmr/Pfam-A.hmm $ROOT_DIR/seedSet/NOX_noEukaryota.mfasta $ROOT_DIR/seedSet/NOX_noEukaryota_hmmscan.out
```

 * Enrich the datacontainer with these new annotation, see **Full Pfam annotation** section

In [1]:
%matplotlib inline
import sys, os
import copy
sys.path.append("/Users/chilpert/Work/pyproteinsExt/src")
sys.path.append("/Users/chilpert/Work/pyproteins/src")
%load_ext autoreload
%autoreload 2

In [8]:
import gzip, io
import urllib.request

def mFastaParseZip(inputFile):
    data = None
    with io.TextIOWrapper(gzip.open(inputFile, 'r')) as f:
        data = mFastaParseStream(f)
    return data

def mFastaParseUrl(url):
    fp = urllib.request.urlopen(url)
    mybytes = fp.read()
    #mFastaParseStream(fp)
    mystr = mybytes.decode("utf8")
    fp.close()
    data = mFastaParseStream(mystr.split('\n'))
    
#    print(mystr)
    return data

def mFastaParseStream(stream):
    
    data = {}    
    headPtr = ''
    for line in stream:
        #print (line)
        if line == '':
            continue
        s = line.replace('\n','')
        if s.startswith('>'):
            headPtr = s.split()[0][1:]
            
            if headPtr in data:
                raise ValueError('Smtg wrong')
            data[headPtr] = {'header': s, 'sequence' : '' }
            
            continue
        data[headPtr]['sequence'] += s
    return data

#mFastaParseUrl('http://www.uniprot.org/uniprot/S4Z6V5.fasta')
#data = mFastaParse('/Volumes/arwen/home/ygestin/prositetask-backup/alignTrembl/bibl/Trembl_47/Trembl_47.fasta.gz')
#test=None
#with open('/Volumes/arwen/mobi/group/NOX_GL/work/uniprot_trembl_v11/hmmsearch.fasta', 'r') as f:
#    test = mFastaParseStream(f)

In [6]:
import re

def num(s):
    try:
        return int(s)
    except ValueError:
        return float(s)
    
    
reTMH = re.compile('^(\# ){0,1}([\S]+)[\s]+([\S].*)[\s]+([\d\.]+)$')
def loadTMHMM(lDir):
    
    fastaContainer = None
    with open( lDir+ '/hmmsearch.fasta', 'r') as f:
        fastaContainer = mFastaParseStream(f)
    
    file = lDir+ '/tmhmm.out'
    data = {}
    with open(file, 'r') as f:
        for l in f:
            m = reTMH.search(l)
            if m:
                _id = m.groups()[1] 
                if _id not in data:
                    if _id not in fastaContainer:
                        raise ValueError("Misisng fasta for tmhmm prediction")
                    data[_id] = {'hCount':0 ,
                                'helix':[], 'fasta' : fastaContainer[_id],
                                'mask': '-' * len(fastaContainer[_id]['sequence'])
                                }
                
                if not m.groups()[2].startswith('TMHMM2'):
                    data[_id][re.sub('[\s]*:[\s]*$', '',m.groups()[2])] = num(m.groups()[3])
                    continue
                
                
                m2 = m.groups()[2].split('\t')
                if not m2:
                    raise ValueError('could not parse helix line')
                helixCoor =  {'volume' : m2[1], 
                              'start'  : num(m2[2].replace(' ', '')),
                              'stop'   : num(m.groups()[3]) 
                            }
                data[_id]['helix'].append(helixCoor)
                
                
                data[_id]['helix'].append(helixCoor)
                #print (data[_id]['mask']) 
                l_1 = len(data[_id]['mask'])
                buf = list(data[_id]['mask'])
                symbol = None
                if helixCoor['volume'] == 'TMhelix':
                    data[_id]['hCount'] += 1
                    #symbol = 'H'
                    symbol = str(data[_id]['hCount']) if data[_id]['hCount'] < 10 else str(data[_id]['hCount'])[-1]
                elif helixCoor['volume'] == 'inside':
                    symbol = 'i'
                elif helixCoor['volume'] == 'outside':
                    symbol = 'e'
                else :
                    raise ValueError("unknown symbol " + helixCoor['volume'])

                i=helixCoor['start'] - 1
                j=helixCoor['stop']
                #print(i,j,len(buf))
                toAdd = symbol * (j - i)
                buf[i:j] =  list(toAdd)#helixCoor['stop'] - helixCoor['start'] + 1
                data[_id]['mask'] = ''.join(buf)
                if len(data[_id]['mask']) != l_1:
                    print("ERROR ", _id, l_1, len(data[_id]['mask']), '>>', i, j, '<<')
                    print (len(buf[i:j]), len(list(toAdd)), symbol, '-->', toAdd )
                #print(data[_id]['mask'])
    
    #        Hcluster(data)
    return data
#d = loadTMHMM('/Volumes/arwen/home/ygestin/prositetask-backup/alignTrembl/bibl/Trembl_47')
#d = loadTMHMM('/Volumes/arwen/mobi/group/NOX_GL/work_sample/uniprot_trembl_v11')
#d

In [2]:
def HIS_clust(data, min=2, max=7):
    for _id in data:
        data[_id]['Htest'] = {'status' : False, 'data' : [] }

        #Discard unwanted numbe of helices
        if data[_id]['hCount'] < min or data[_id]['hCount'] > max:
            #print('Wrong helices number ', _id, data[_id]['hCount'])
            continue
        
        H_status = []
        iMax = len(data[_id]['mask'])
        # internal error check
        if len(data[_id]['mask']) != len(data[_id]['fasta']['sequence']) :
            print( len(data[_id]['mask']), len(data[_id]['fasta']['sequence']) )
            print(_id, data[_id])
            raise ValueError("")
        # Select only residues that are Histidine within TMH
        for i in range(0, iMax):
            if data[_id]['mask'][i] == "i" or  data[_id]['mask'][i] == "e":
                continue
            if not data[_id]['fasta']['sequence'][i] == "H":
                continue
            H_status.append( [i, data[_id]['mask'][i], False] )
        # Pairwise comparaison between Histidine of the same helix, marking pairs separated by 12 to 14 residues
        for i in range (0, len(H_status) - 1):
            for j in range (i + 1, len(H_status)):
                if H_status[i][1] != H_status[j][1]:
                    continue
                d = H_status[i][0] - H_status[j][0]
                if d >= 12 or d <= 14:
                    H_status[i][2] = True
                    H_status[j][2] = True
        
        #print(H_status)
        # Only keep marked histidine
        H_status = [ x for x in H_status if x[2] ]
        # Create a dicitinary where keys are Helices numbers
        H_groups = {}
        for x in H_status:
            if not x[2]:
                continue
            if x[1] not in H_groups:
                H_groups[x[1]]=[]
            H_groups[x[1]].append(x)
        
        # The test is passed if at least two distinct helices feature at least one correctly spaced histidine pair
        # ie : if the helice dictionary has more than 1 entrie
        #print(H_status)
        #print("-->", H_groups)
        HisTestBool = True if len(H_groups) > 1 else False
        
        data[_id]['Htest']['status'] = HisTestBool
        data[_id]['Htest']['data'] = H_groups
    return data

#m = HIS_clust(d)
#print(len([ m[x] for x in m if m[x]['Htest']['status'] ]), len(m))

In [5]:
import pickle, time
import time

def save(data, tag=None):
    saveDir="/Volumes/arwen/mobi/group/NOX_CH/pickle_saved"
    timestr = time.strftime("%Y%m%d-%H%M%S")
    fTag = "NOX_annotation_" + tag + "_" if tag else "NOX_annotation_"
    fSerialDump = fTag + timestr + ".pickle"
    with open(saveDir + '/' + fSerialDump, 'wb') as f:
        pickle.dump(data, f)
    print('data structure saved to', saveDir + '/' + fSerialDump)

def load(fileName):
    saveDir="/Volumes/arwen/mobi/group/NOX_CH/pickle_saved"
    d = pickle.load( open(saveDir + "/" + fileName, "rb" ) )
    print("restore a annotated container of ", len(d), "elements")
    return d

# Parsing all data files 

### Parsing HMMR data
NB: There are stdout of 3 consecutive hmmr calls

All in a single **data** container

In [4]:
import pyproteinsExt.hmmrContainerFactory as hm
import glob
dataDir=glob.glob('/Volumes/arwen/mobi/group/NOX_CH/seedSet/work/uniprot_trembl_v*')

data = hm.parse(inputFile=dataDir[0] + '/hmmsearch.out')
i=0

for iDir in dataDir[1:]:
    #print(iDir)
    data += hm.parse(inputFile=iDir + '/hmmsearch.out')
    i += 1

search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search
search

In [None]:
### Discard domains with evalue > 1e-3 

In [72]:
data.evalue_filter(1e-3)

In [65]:
data.evalue_filter(1e-3)

In [73]:
T_all=data.T()
T=data.T(filter=True)

257799
98292


In [84]:
print("Proteins with at least 1 domain",len(T_all))
print("Proteins with at least 1 domain with evalue filter at 1e-3",len(T))

Proteins with at least 1 domain 178539
Proteins with at least 1 domain with evalue filter at 1e-3 64021


## Loading TMHMM data

In [9]:
dataTMHMM = {}
for lDir in dataDir:
    d = loadTMHMM(lDir)
    if set( dataTMHMM.keys() ) & set( d.keys() ):
        print('doublons')
    dataTMHMM.update(d)

In [15]:
dataTMHMM = HIS_clust(dataTMHMM)
print(len(dataTMHMM))

178540


178540


In [48]:
dataFilter=[p for p in dataTMHMM if dataTMHMM[p]['Htest']['status']]
print(len(dataFilter))

10989


In [None]:
all_prot=set([p for p in dataFilter])
print(len(all_prot))

In [57]:
import pyproteinsExt.tmhmmContainerFactory as tmhmm 
data_tmhmm=tmhmm.parse(inputFile=dataDir[0]+'/tmhmm.out')
#print(len(data_tmhmm))
for iDir in dataDir[1:]:
    data_tmhmm+=tmhmm.parse(inputFile=iDir+'/tmhmm.out')

In [47]:
all_prot2=set([obj.prot for obj in data_tmhmm.entries])

In [52]:
def mFastaParseStream(stream):
    data = {}    
    headPtr = ''
    for line in stream:
        #print (line)
        if line.startswith("#No protein detected by HMMR"): 
            return data
        if line == '':
            continue
        s = line.replace('\n','')
        if s.startswith('>'):
            headPtr = s.split()[0][1:]
            
            if headPtr in data:
                raise ValueError('Smtg wrong')
            data[headPtr] = {'header': s, 'sequence' : '' }
            
            continue
        data[headPtr]['sequence'] += s
    return data

def parseFasta(lDir): 
    dic={}
    with open( lDir+ '/hmmsearch.fasta', 'r') as f:
        fastaContainer = mFastaParseStream(f)
        for p in fastaContainer: 
            dic[p]=fastaContainer[p]
            
    return dic 

In [53]:
dataFasta={}
for lDir in dataDir:
    d = parseFasta(lDir)
    dataFasta.update(d)

In [58]:
data_tmhmm.filter_nb_helix(2,7)
print(len(data_tmhmm))
data_tmhmm.filter(tmhmm.filter_bi_histidine,dataFasta)
print(len(data_tmhmm))

56999
TO
10989


##### Transform a PFAM domain indexed data structure in a protein indexed data structure
Then filter out the protein that feature the 3 domains


#### All data

In [87]:
D_all = {}
fad=0
nad=0
ferric=0
for protein in T_all:
    if len(T_all[protein]) == 3:
           D_all[protein] = T_all[protein]
    for dom in T_all[protein]: 
        if dom == "PF08022_full":
            fad+=1
        elif dom == "PF01794_full": 
            ferric+=1
        elif dom == "PF08030_full": 
            nad+=1
        else: 
            print("OOOO")
        #if dom == "PF08022_full":
            
print('Number of proteins entries featuring FAD',fad)
print('Number of proteins entries featuring NAD',nad)
print('Number of proteins entries featuring Ferric reductase',ferric)
print('Size of their intersection',len(D_all))

Number of proteins entries featuring FAD 77203
Number of proteins entries featuring NAD 121386
Number of proteins entries featuring Ferric reductase 59209
Size of their intersection 18020


#### Filtered data

In [88]:
D = {}
fad=0
nad=0
ferric=0
for protein in T:
    if len(T[protein]) == 3:
           D[protein] = T[protein]
    for dom in T[protein]: 
        if dom == "PF08022_full":
            fad+=1
        elif dom == "PF01794_full": 
            ferric+=1
        elif dom == "PF08030_full": 
            nad+=1
        else: 
            print("OOOO")
        #if dom == "PF08022_full":
            
print('Number of proteins entries featuring FAD',fad)
print('Number of proteins entries featuring NAD',nad)
print('Number of proteins entries featuring Ferric reductase',ferric)
print('Size of their intersection',len(D))

Number of proteins entries featuring FAD 31593
Number of proteins entries featuring NAD 37456
Number of proteins entries featuring Ferric reductase 29243
Size of their intersection 14022


## Merge TMHMM & HMMR data

  * Proteins with the 3 domain types
  * Their TMHMM status


#### All data

In [90]:
merged_all = {}
for _id in D_all:
    if _id not in dataTMHMM:
        print('Missing protein ID' + _id)
    if not dataTMHMM[_id]['Htest']['status']:
        continue
    merged_all[_id] = {
        'hmmr' : D_all[_id],
        'tmhmm' : dataTMHMM[_id]
    }
    
print('Number of protein entries featuring FAD,NAD and Ferric transferase domains', len(D))
print('Number of protein featuring 2 to 7 TMH and 2 bi-histine', len(dataTMHMM))
print('Size of their intersection', len(merged_all))

Number of protein entries featuring FAD,NAD and Ferric transferase domains 14022
Number of protein featuring 2 to 7 TMH and 2 bi-histine 178540
Size of their intersection 5972


#### Filtered data

In [91]:
merged = {}
for _id in D:
    if _id not in dataTMHMM:
        print('Missing protein ID' + _id)
    if not dataTMHMM[_id]['Htest']['status']:
        continue
    merged[_id] = {
        'hmmr' : D[_id],
        'tmhmm' : dataTMHMM[_id]
    }
    
print('Number of protein entries featuring FAD,NAD and Ferric transferase domains', len(D))
print('Number of protein featuring 2 to 7 TMH and 2 bi-histine', len(dataTMHMM))
print('Size of their intersection', len(merged))

Number of protein entries featuring FAD,NAD and Ferric transferase domains 14022
Number of protein featuring 2 to 7 TMH and 2 bi-histine 178540
Size of their intersection 5043


### Discard Eukaryota

#### Extract TaxonID

In [101]:
def getTaxID(datum):
    reTaxID = re.compile('OX=([\d]+)')
    m = reTaxID.search(datum['tmhmm']['fasta']['header'])
    if not m:
        raise ValueError('Cant parse taxid from', datum['tmhmm']['fasta']['header'])
    datum['taxid'] = m.groups()[0]
    
for _id in merged:
    getTaxID(merged[_id])

for _id in merged_all:
    getTaxID(merged_all[_id])

#### Flag Non Eukaryota phylum members

In [102]:
from ete3 import NCBITaxa
ncbi=NCBITaxa() 

##### All data

In [103]:
unclassified=0
archaea=0
bacteria=0
eukaryota=0
not_found=0
for _id in merged_all: 
    bool=True
    taxid=merged_all[_id]['taxid']
    #print(taxid)
    try : 
        lineage=ncbi.get_lineage(taxid)
        lineage_rank=ncbi.get_rank(lineage)
        superkingdom=[taxid for taxid in lineage_rank if lineage_rank[taxid]=='superkingdom']
        if superkingdom : 
            name=ncbi.get_taxid_translator(superkingdom)[superkingdom[0]]
            if name == "Eukaryota":
                bool=False
                eukaryota+=1
            elif name == "Bacteria":
                bacteria+=1
            elif name == "Archaea": 
                archaea+=1
            else: 
                print("OOO")
        else: 
            unclassified+=1
        merged_all[_id]['isNoEukaryota']=bool
            
    except : 
        not_found+=1

print("Eukaryota",eukaryota)
print("Bacteria",bacteria)
print("Archaea",archaea)
print("Unclassified",unclassified)
print("Not found", not_found)

Eukaryota 5116
Bacteria 848
Archaea 3
Unclassified 2
Not found 3


##### Filtered data 

In [104]:
unclassified=0
archaea=0
bacteria=0
eukaryota=0
not_found=0
for _id in merged: 
    bool=True
    taxid=merged[_id]['taxid']
    #print(taxid)
    try : 
        lineage=ncbi.get_lineage(taxid)
        lineage_rank=ncbi.get_rank(lineage)
        superkingdom=[taxid for taxid in lineage_rank if lineage_rank[taxid]=='superkingdom']
        if superkingdom : 
            name=ncbi.get_taxid_translator(superkingdom)[superkingdom[0]]
            if name == "Eukaryota":
                bool=False
                eukaryota+=1
            elif name == "Bacteria":
                bacteria+=1
            elif name == "Archaea": 
                archaea+=1
            else: 
                print("OOO")
        else: 
            unclassified+=1
        merged[_id]['isNoEukaryota']=bool
            
    except : 
        not_found+=1

print("Eukaryota",eukaryota)
print("Bacteria",bacteria)
print("Archaea",archaea)
print("Unclassified",unclassified)
print("Not found", not_found)

Eukaryota 4870
Bacteria 169
Archaea 0
Unclassified 1
Not found 3


In [120]:
save(merged,"filter")

data structure saved to /Volumes/arwen/mobi/group/NOX_CH/pickle_saved/NOX_annotation_filter_20190516-163544.pickle


#### Just keep non Eukaryota sequences in datacontainer

##### All data

In [98]:
data=merged_all
data_noEuk_all={}
for k in data:
    if not 'isNoEukaryota' in data[k]:
        continue
    if data[k]['isNoEukaryota']:
        data_noEuk_all[k] = data[k]

##### Filtered data

In [99]:
data=merged
data_noEuk={}
for k in data:
    if not 'isNoEukaryota' in data[k]:
        continue
    if data[k]['isNoEukaryota']:
        data_noEuk[k] = data[k]

In [100]:
print(len(data_noEuk_all),"NOX proteins found with no filter.")
print(len(data_noEuk),"NOX proteins found with evalue filter")

853 NOX proteins found with no filter.
170 NOX proteins found with evalue filter


In [109]:
proteins_all=set([p for p in data_noEuk_all])
proteins_filtered=set([p for p in data_noEuk])
deleted_proteins=proteins_all.difference(proteins_filtered)

##### Write deleted proteins with their taxonomy in a file

In [118]:
o=open("/Volumes/arwen/mobi/group/NOX_CH/deleted_proteins_with_domain_filter.tsv",'w')
o.write("#Protein\tTaxid\tTaxname\n")
for p in deleted_proteins: 
    taxid=data_noEuk_all[p]['taxid']
    taxname=ncbi.get_taxid_translator([taxid]).get(int(taxid),"no name")
    o.write(p+"\t"+taxid+"\t"+taxname+"\n")

In [121]:
save(data_noEuk,"filter_noEukaryota")

data structure saved to /Volumes/arwen/mobi/group/NOX_CH/pickle_saved/NOX_annotation_filter_noEukaryota_20190516-163551.pickle


## Now we work only with filtered proteins

#### Filter obsolete Uniprot entry 

In [145]:
data=load("NOX_annotation_filter_noEukaryota_20190516-163551.pickle")

restore a annotated container of  170 elements


In [146]:
import pyproteinsExt.uniprot as uniprot
uColl = uniprot.getUniprotCollection()
uColl.setCache(location="/Users/chilpert/cache/uniprot")
uniprot.getPfamCollection().setCache(location="/Users/chilpert/cache/pfam")
new_data={}
c=0
not_found=[]
for p in data :
    p_id=p.split("|")[1]
    try : 
        obj=uColl.get(p_id)
        new_data[p]=data[p]
        new_data[p]['RefSeq']={}
        new_data[p]['RefSeq']['genome']=obj.Genome.RefSeqRef
        new_data[p]['RefSeq']['protein']=obj.Genome.RefSeqProteinRef
        new_data[p]['EMBL']={}
        new_data[p]['EMBL']['genome']=obj.Genome.EMBLRef 
        new_data[p]['EMBL']['protein']=obj.Genome.EMBLProteinRef
        #new_data[p]['Uniprot_domains']=obj.domains
        
    except : 
        c+=1
        not_found.append(p_id)
        continue

Changing cache location to /Users/chilpert/cache/uniprot
Reindexing /Users/chilpert/cache/uniprot
Acknowledged 163 entries (/Users/chilpert/cache/uniprot)
Changing cache location to /Users/chilpert/cache/pfam
Reindexing /Users/chilpert/cache/pfam
Acknowledged 159 entries (/Users/chilpert/cache/pfam)
got to fetch A0A3L9B3M0
got to fetch A0A2Z5AMG3
got to fetch A0A168SHW4
got to fetch A0A3L4QTP8
got to fetch A0A3L2F2W5
got to fetch A0A3L3SJU7
got to fetch A0A3L3SPV6


In [149]:
print(len(new_data),"with Uniprot entry.")  

163 with Uniprot entry.


In [147]:
save(new_data,"noEukaryota_noObsolete")

data structure saved to /Volumes/arwen/mobi/group/NOX_CH/pickle_saved/NOX_annotation_noEukaryota_noObsolete_20190516-172955.pickle


#### Save non Eukaryota sequences in given directory

In [78]:
data=load("NOX_annotation_noEukaryota_noObsolete_20190516-172955.pickle")

restore a annotated container of  163 elements


In [84]:
import re
saveDir="/Volumes/arwen/mobi/group/NOX_CH/seedSet/NOX_noEukaryota"
def mFastaSplitDump(data, saveDir, fileTag='default' ,distinct=True):
    c = 1
    f = None
    if not distinct:
        f = open(saveDir + '/'+ fileTag + '_all.fasta', 'w')
        
    for _id in data:
        if distinct:
            f = open(saveDir + '/'+ fileTag + '_' + str(c) + '.fasta', 'w')
        c += 1
        f.write(data[_id]['tmhmm']['fasta']['header']+"\n")
        f.write(re.sub("(.{81})", "\\1\n", data[_id]['tmhmm']['fasta']['sequence'], 0, re.DOTALL))
        if distinct:
            f.close()
    if not distinct:    
        f.close()

In [85]:
mFastaSplitDump(data, saveDir, 'NOX_noEukaryota')

### Full Pfam annotation

#### Parse hmm and discard domains

In [86]:
import pyproteinsExt.hmmrContainerFactory as hm

def parse_and_discard(data,evalue_threshold=None):
    new_data=copy.deepcopy(data)
    for p in new_data: 
        new_data[p]['hmmr']={}
    fileName="/Volumes/arwen/mobi/group/NOX_CH/seedSet/NOX_noEukaryota_hmmscan.out" 
    hscan=hm.parse(inputFile=fileName)
    bool_filter=False
    if evalue_threshold: 
        hscan.evalue_filter(evalue_threshold)
        bool_filter=True 
    transpose=hscan.T(filter=bool_filter)    
    for e in transpose: 
        new_data[e]['hmmr']=transpose[e]
        
    print(new_data['tr|A0A2M7JHP3|A0A2M7JHP3_9DELT']['hmmr'].keys())
    return new_data    

In [88]:
# Evalue threshold 1e-3
data10_3=parse_and_discard(data,1e-3)
# Evalue threshold 1e-1
data10_1=parse_and_discard(data,1e-1)
# All 
data_all=parse_and_discard(data)

dict_keys(['NAD_binding_1', 'Ferric_reduct', 'FAD_binding_8', 'NAD_binding_6', 'FAD_binding_6'])
dict_keys(['NAD_binding_1', 'Ferric_reduct', 'FAD_binding_8', 'NAD_binding_6', 'FAD_binding_6'])
dict_keys(['NAD_binding_1', 'Ferric_reduct', 'FAD_binding_8', 'NAD_binding_6', 'FAD_binding_6'])


In [89]:
save(data10_3,"filter_fullPfam_filteredDomains1e-3")
save(data10_1,"filter_fullPfam_filteredDomains1e-1")
save(data_all,"filter_fullPfam_allDomains")

data structure saved to /Volumes/arwen/mobi/group/NOX_CH/pickle_saved/NOX_annotation_filter_fullPfam_filteredDomains1e-3_20190517-162404.pickle
data structure saved to /Volumes/arwen/mobi/group/NOX_CH/pickle_saved/NOX_annotation_filter_fullPfam_filteredDomains1e-1_20190517-162404.pickle
data structure saved to /Volumes/arwen/mobi/group/NOX_CH/pickle_saved/NOX_annotation_filter_fullPfam_allDomains_20190517-162405.pickle


### Other


In [204]:
import re
reMotifNADPH = re.compile('G[ISVL]G[VIAF][TAS][PYTA]')
reMotifFAD = re.compile('H[PSA]F[TS][LIMV]')

NAD_miss = 0
FAD_miss = 0
Both_miss = 0
for p in merged_restore:
    seq = merged_restore[p]['tmhmm']['fasta']['sequence']
    m = reMotifNADPH.search(seq)
    n = reMotifFAD.search(seq)
    merged_restore[p]['NADPH_reg'] = True if m else False
    merged_restore[p]['FAD_reg']   = True if n else False

    if not m:
        NAD_miss += 1
        if not n:
            Both_miss += 1
    if not n:
        FAD_miss += 1

print('Total Number of filtered sequence', len(merged_restore))
print('Number of negative to:')
print('*The NAD pattern',str(NAD_miss), '\n*The FAD pattern', str(FAD_miss), '\n*Both patterns ', Both_miss)

Total Number of filtered sequence 386
Number of negative to:
*The NAD pattern 54 
*The FAD pattern 147 
*Both patterns  16
