In [12]:
import glob
import sys, os
sys.path.append("/Users/chilpert/Dev/pyproteinsExt/src")
sys.path.append("/Users/chilpert/Dev/pyproteins/src")
import pyproteinsExt.topology as topology
from ete3 import NCBITaxa

# 1. Predict NOX

Swissprot dataset : `arwen:/mobi/group/databases/flat/uniprot_sprot_2019_07.fasta.gz`

Split dataset : 
```console 
ROOT_DIR=/mobi/group/NOX_eukaryotes
SCRIPT_DIR=/mobi/group/NOX_clean/nox-analysis/scripts

mkdir $ROOT_DIR/volumes
cd $ROOT_DIR/volumes
python $SCRIPT_DIR/split.py /mobi/group/databases/flat/uniprot_sprot_2019_07.fasta.gz
```

Run hmmr/thmhmm : 
```console
mkdir $ROOT_DIR/work
$SCRIPT_DIR/runHMMR_slurm.sh $ROOT_DIR/volumes $ROOT_DIR/work /mobi/group/NOX_clean/data/profiles
```

In [1]:
def parsing_results(dataDir):
    dataDir_length=len(dataDir)
    c=1
    print(c,"/",dataDir_length)
    dataContainer=topology.parse(hmmrOut=dataDir[0]+"/hmmsearch.out",tmhmmOut=dataDir[0]+"/tmhmm.out",fastaOut=dataDir[0]+"/hmmsearch.fasta")
    for dir in dataDir[1:]:
        c+=1
        if c%10==0:
            print(c,"/",dataDir_length)
        dataContainer=dataContainer.addParsing(topology.parse(hmmrOut=dir+"/hmmsearch.out",tmhmmOut=dir+"/tmhmm.out",fastaOut=dir+"/hmmsearch.fasta"))
    return dataContainer

In [10]:
dataDir=glob.glob('/Volumes/arwen/mobi/group/NOX_eukaryotes/work/uniprot_sprot*')
dataContainer=parsing_results(dataDir)

1 / 2


In [17]:
def filter_three_domains(entry): 
    '''Filter proteins that contains 3 domains'''
    domains=set([hmm_obj.domain for hmm_obj in entry.hmmr])
    if len(domains)==3:
        return True
    return False

def filter_nb_helix(entry,**kwargs):
    '''Filter proteins that have between min_helix and max_helix transmembrane helix'''
    min_helix=kwargs["min_helix"]
    max_helix=kwargs["max_helix"]
    if entry.tmhmm.nb_helix>=min_helix and entry.tmhmm.nb_helix<=max_helix:
        return True 
    return False

def filter_bi_histidine(entry):
    '''
    Filter proteins that have the bi-histidine pattern. Bi-histidine pattern is when we have 2 histidines separate
    by 12 to 14 residues, in 2 distinct helixes. 
    '''
    seq=entry.fasta.seq
    topo_seq=entry.tmhmm.topology_seq
    H_status = []
    if len(topo_seq)!=len(seq):
        raise Exception("Topology seq and amino acids seq with different size. Check !")
    for i in range(len(seq)):
        if topo_seq[i]=="i" or topo_seq[i]=="o":
            continue
        if not seq[i]=="H":
            continue
        H_status.append( [i, topo_seq[i], False] )     
    for i in range (len(H_status) - 1):
        for j in range (i + 1, len(H_status)):
            if H_status[i][1] != H_status[j][1]:
                continue
            d = H_status[j][0] - H_status[i][0]
            if d >= 12 and d <= 14:
                H_status[i][2] = True
                
    # Only keep marked histidine
    H_status = [ x for x in H_status if x[2] ]
    # Create a dicitinary where keys are Helices numbers
    H_groups = {}
    for x in H_status:
        if not x[2]:
            continue
        if x[1] not in H_groups:
            H_groups[x[1]]=[]
        H_groups[x[1]].append(x)
        
    # The test is passed if at least two distinct helices feature at least one correctly spaced histidine pair
    # ie : if the helice dictionary has more than 1 entrie
    HisTestBool = True if len(H_groups) > 1 else False     
    return HisTestBool

def filter_evalue(entry,**kwargs):
    '''Just keep proteins that have an evalue <= threshold for all domains'''
    threshold=kwargs["threshold"]
    conserve=0
    domains=set()
    for h in entry.hmmr: 
        if float(h.hit.iEvalue) <= threshold:
            domains.add(h.domain)
    if len(domains)==3:
        return True
    return False     

def filter_evalue_hit(hit,**kwargs):
    '''Don't keep hit inside hmm hits whith evalue > threshold''' 
    threshold=kwargs["threshold"]
    if float(hit.hit.iEvalue)<=threshold: 
        return True 
    return False

def function_get_taxid(e):
    taxid = e.fasta.header.split("OX=")[1].split(" ")[0]
    return taxid

def filter_NOX(data_container):
    print("Initial proteins :", len(data_container))
    '''All filters function to get predicted NOX proteins'''
    #3 domains 
    filterThreeDomains=data_container.filter(filter_three_domains)
    print("Number of proteins with 3 domains : ",len(filterThreeDomains))

    # Helix filter
    filterHelix=filterThreeDomains.filter(filter_nb_helix,min_helix=2,max_helix=7).filter(filter_bi_histidine)
    print("Number of proteins after helix filter : ",len(filterHelix))

    #Evalue 1e-3 filter 
    filterEvalue3=filterHelix.filter(filter_evalue,threshold=1e-3).filter_hit(filter_evalue_hit,threshold=1e-3)
    print("Number of non-eukaryotic proteins after evalue 1e-3 filter :",len(filterEvalue3))
    return filterEvalue3

In [19]:
dataFiltered = filter_NOX(dataContainer)
mfasta = dataFiltered.proteins_mfasta()
o=open("/Volumes/arwen/mobi/group/NOX_eukaryotes/predicted_NOX_proteins.mfasta",'w')
o.write(mfasta)
o.close()

Initial proteins : 668
Number of proteins with 3 domains :  84
Number of proteins after helix filter :  17
Number of non-eukaryotic proteins after evalue 1e-3 filter : 17


# 2. Annotate domains
```console
mkdir -p $ROOT_DIR/Domains_annotation

cd $ROOT_DIR/Domains_annotation

sbatch $SCRIPT_DIR/runHMMSCAN.sbatch /mobi/group/databases/hmmr/Pfam-A_32.hmm $ROOT_DIR/predicted_NOX_proteins.mfasta $ROOT_DIR/Domains_annotation/predicted_NOX_proteins_hmmscan.out
```

In [22]:
dataFiltered.complete_hmmr("/Volumes/arwen/mobi/group/NOX_eukaryotes/Domains_annotation/predicted_NOX_proteins_hmmscan.out")
dataFiltered.create_domain_entries()

In [23]:
def filterEvalue_hits(hit,**kwargs):
    threshold=kwargs["threshold"]
    if float(hit.hit.iEvalue)<=threshold:
        return True
    return False

In [24]:
data_evalue3=dataFiltered.filter_hit(filterEvalue_hits,threshold=1e-3)

In [25]:
data_evalue3.compute_overlapped_domains(10)

In [37]:
def best_evalue(list_hit):
    evalues=[float(h.hit.iEvalue) for h in list_hit]
    best_evalue=min(evalues)
    conserve_hit=[h for h in list_hit if float(h.hit.iEvalue)==best_evalue]
    for ch in conserve_hit:
        print(ch.hit)
    if len(conserve_hit)>1: 
        raise Exception("best_evalue conserve_hit>1. Check.")
    return conserve_hit[0] 

def filter_overlapped_domains(hit):
    print("H", hit)
    core_domains=["ferric_reduct_prokaryotes","nad_binding_prokaryotes","fad_binding_prokaryotes"]
    if hit.domain in core_domains: 
        return True
    if not hit.overlapped_hits: 
        return True
    overlap_hits_core=[h for h in hit.overlapped_hits if h.domain in core_domains]
    if not overlap_hits_core:
        conserve_hit=best_evalue([hit]+hit.overlapped_hits)
        if conserve_hit==hit:
            return True
    return False

In [38]:
data_overlap_domains=data_evalue3.filter_hit(filter_overlapped_domains)

H <pyproteinsExt.hmmrContainerFactory.HMMObj object at 0x111a792e8>
{'hmmID': 'PF01794_full', 'aliID': 'sp|Q95L74|CY24B_BISBI', 'header': '1  score: 117.8 bits;  conditional E-value: 4.6e-36', 'score': '117.8', 'bias': '5.1', 'cEvalue': '4.6e-36', 'iEvalue': '9.1e-33', 'hmmFrom': '4', 'hmmTo': '123', 'aliFrom': '57', 'aliTo': '219', 'envFrom': '54', 'envTo': '220', 'acc': '0.93', 'hmmStringLetters': 'aallnlnllllla..lrn..tplr.......lltgipldklltfHrligrlilllallHailhlvndlprssa................................dseskleslvktpevltGivalllllllattslpvirrlsyevFwytHhlfvivflll', 'matchString': 'aa+ln+n++l+l+  +rn  ++lr       + ++ +ld++ltfH++++++i+l++++H+i+hl+n  ++ +a                                + e+ l+  v+  +++tG+v++l+l+l++t+s++ irr+++evFwytHhlfvi+f++l', 'aliStringLetters': 'AACLNFNCMLILLpvCRNllSFLRgssaccsTRIRRQLDRNLTFHKMVAWMIALHTAIHTIAHLFNVEWCVNArvnnsdpysialsdigdkpnetylnfvrqrikNPEGGLYVAVTLLAGITGVVITLCLILIITSSTKTIRRSYFEVFWYTHHLFVIFFIGL', 'hmmSymbolStuff': {}, 'aliSymbolStuff': {'PP': '89*********88

Exception: best_evalue conserve_hit>1. Check.