In [1]:
import os, sys
import glob
import pickle
import gzip
sys.path.append("/Users/chilpert/Dev/pyproteinsExt/src")
sys.path.append("/Users/chilpert/Dev/pyproteins/src")
import pyproteinsExt.topology as topology
import pyproteinsExt.ena as ena
from ete3 import NCBITaxa
import time
%load_ext autoreload
%autoreload 2

In [2]:
def save(data, tag=None, saveDir="/Volumes/arwen/mobi/group/NOX_ecoli_full/pickle_saved"):
    '''
    Pickle `data` into a timestamped file and print where it was written.

    data    : any picklable object.
    tag     : optional label inserted in the file name, giving
              "NOX_ecoli_<tag>_<timestamp>.pickle"; when None or empty the
              name is "NOX_ecoli_<timestamp>.pickle".
    saveDir : directory that receives the pickle (must already exist).
              Defaults to the project pickle directory.
    '''
    timestr = time.strftime("%Y%m%d-%H%M%S")
    fTag = "NOX_ecoli_" + tag + "_" if tag else "NOX_ecoli_"
    fSerialDump = os.path.join(saveDir, fTag + timestr + ".pickle")
    with open(fSerialDump, 'wb') as f:
        pickle.dump(data, f)
    print('data structure saved to', fSerialDump)

def load(fileName, saveDir="/Volumes/arwen/mobi/group/NOX_ecoli_full/pickle_saved"):
    '''
    Restore an object previously pickled in `saveDir` and report its size.

    fileName : name of the pickle file inside `saveDir`.
    saveDir  : directory holding the pickle. Defaults to the project pickle directory.

    Returns the unpickled object (it must support len()).
    '''
    # SECURITY: pickle.load can execute arbitrary code; only load files you produced yourself.
    with open(os.path.join(saveDir, fileName), "rb") as f:
        d = pickle.load(f)
    print("restore a annotated container of ", len(d), "elements")
    return d

The purpose of this notebook is to construct two datasets : E.coli genomes with NOX and E.coli genomes without NOX.

## 1. Download E.coli genomes

* Download RefSeq bacteria assembly_summary_refseq from NCBI ftp

* Extract E.coli from assembly_summary

In `arwen`:
```console
cd /mobi/group/databases/refseq95_proteins
wget -O refseq95_bacteria_assembly_summary.txt https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt
awk -F "\t" '{ if ($7 == 562) print }' refseq95_bacteria_assembly_summary.txt > refseq95_ecoli_assembly_summary.txt
```
17142 organisms

* Download E.coli proteins using slurm (and rewrite headers by adding assembly accession), create volumes of 10 E.coli
In `arwen`:
```console
DB=/mobi/group/databases/refseq95_proteins/ecoli
mkdir -p $DB
bash /home/chilpert/Dev/arwen_scripts/download_proteins_from_assembly_summary_slurm.sh refseq95_ecoli_assembly_summary.txt $DB 10
```

* Check if all proteins have been downloaded 
```console
bash /home/chilpert/Dev/arwen_scripts/verif_download.sh $DB > empty_fasta.txt
```
If `empty_fasta.txt` contains lines, delete these entries and relaunch the download. Repeat until `empty_fasta.txt` is empty. You may have to download some entries manually (example: sometimes the ftp link is not the last element of the assembly_summary line (very rare, 3 cases for refseq95 over 17k) and this is not handled for now).

```console
for f in $(cut -f 2 empty_fasta.txt); do rm $f; done
bash /home/chilpert/Dev/arwen_scripts/download_proteins_from_assembly_summary_slurm.sh refseq95_ecoli_assembly_summary.txt $DB 10
```

* Concatenate E.coli proteins inside volumes: create one fasta file with the proteins of the 10 E.coli of each volume. Slurm concatenation jobs cause problems (see the warning below), so prefer the version without slurm.

SLURM : (/!\ WARNING : CG status problem when try it)
```console
bash /home/chilpert/Dev/arwen_scripts/concatenate_volumes_slurm.sh $DB /mobi/group/NOX_ecoli_full/volumes_sbatch
mv /mobi/group/NOX_ecoli_full/volumes_sbatch/volume*/*.faa.gz /mobi/group/NOX_ecoli_full/volumes/
```

WITHOUT SLURM : 
```console
OUTDIR=/mobi/group/NOX_ecoli_full/volumes_sbatch
for volume in $(ls -d $DB/volume*); do 
    volume=$(readlink -f $volume)
    volume_name=$(echo $volume | rev | cut -f 1 -d "/" | rev)
    mkdir -p $OUTDIR/$volume_name
    new_volume=$OUTDIR/$volume_name
    zcat $volume/fasta/*.gz | gzip > $new_volume/$volume_name\_protein.faa.gz
done
mv /mobi/group/NOX_ecoli_full/volumes_sbatch/volume*/*.faa.gz /mobi/group/NOX_ecoli_full/volumes/
```

* Concatenate the volumes 10 by 10 (i.e. 100 E.coli per file) to have fewer hmm jobs.
```console
mkdir -p /mobi/group/NOX_ecoli_full/volumes_concat10
python /home/chilpert/Dev/arwen_scripts/concat_by_number.py /mobi/group/NOX_ecoli_full/volumes 10 /mobi/group/NOX_ecoli_full/volumes_concat10
```


## 2. Predict NOX proteins

### Launch script
In `arwen`:
```console
ROOT_DIR=/mobi/group/NOX_ecoli_full
SCRIPT_DIR=/mobi/group/NOX_clean/nox-analysis/scripts
bash $SCRIPT_DIR/runHMMR_slurm.sh $ROOT_DIR/volumes_concat10 $ROOT_DIR/work /mobi/group/NOX_clean/data/refined_profiles_3
```
### Parse results

In [3]:
def parsing_results(dataDir):
    '''
    Parse every result directory and merge the parsings into one container.

    dataDir : list of directory paths; each directory must contain
              "hmmsearch.out", "tmhmm.out" and "hmmsearch.fasta".

    Returns the container obtained by parsing the first directory and
    successively calling addParsing() with the parsing of each other directory.
    Raises IndexError when dataDir is empty.

    Progress is printed for the first directory and then every 10 directories.
    '''
    if not dataDir:
        raise IndexError("parsing_results: no result directory given (empty dataDir)")

    def parse_volume(volume_dir):
        # One directory = one hmmsearch/tmhmm run
        return topology.parse(hmmrOut=volume_dir+"/hmmsearch.out",
                              tmhmmOut=volume_dir+"/tmhmm.out",
                              fastaOut=volume_dir+"/hmmsearch.fasta")

    dataDir_length=len(dataDir)
    c=1
    print(c,"/",dataDir_length)
    dataContainer=parse_volume(dataDir[0])
    # Start at the second directory: the first one is already in dataContainer.
    # (The previous loop iterated over all of dataDir and so parsed and added dataDir[0] twice.)
    for volume_dir in dataDir[1:]:
        c+=1
        if c%10==0:
            print(c,"/",dataDir_length)
        dataContainer=dataContainer.addParsing(parse_volume(volume_dir))
    return dataContainer

# Collect every "volume*" result directory of the work folder and merge their parsings.
# Note: glob.glob() returns paths in arbitrary order.
dataDir=glob.glob('/Volumes/arwen/mobi/group/NOX_ecoli_full/work/volume*')
dataContainer=parsing_results(dataDir)

1 / 172
10 / 172
20 / 172
30 / 172
40 / 172
50 / 172
60 / 172
70 / 172
80 / 172
90 / 172
100 / 172
110 / 172
120 / 172
130 / 172
140 / 172
150 / 172
160 / 172
170 / 172


### Filter NOX proteins

In [4]:
def filter_three_domains(entry):
    '''Return True when the hmm hits of the protein cover exactly 3 distinct domains.'''
    distinct_domains = {hit.domain for hit in entry.hmmr}
    return len(distinct_domains) == 3

def filter_nb_helix(entry,**kwargs):
    '''
    Return True when the number of predicted transmembrane helices of the protein
    lies in [min_helix, max_helix] (both bounds included, both keyword arguments required).
    '''
    lower_bound = kwargs["min_helix"]
    upper_bound = kwargs["max_helix"]
    helix_count = entry.tmhmm.nb_helix
    return lower_bound <= helix_count <= upper_bound

def filter_bi_histidine(entry):
    '''
    Return True when the protein shows the bi-histidine pattern.

    A helix "carries the pattern" when it contains two histidines separated by
    12 to 14 residues. The protein passes when at least two distinct helices
    carry the pattern.

    Positions whose topology character is "i" or "o" are ignored; any other
    topology character is used as the helix label of the position.
    Raises Exception when the sequence and the topology string differ in length.
    '''
    residues = entry.fasta.seq
    topology_string = entry.tmhmm.topology_seq
    if len(topology_string) != len(residues):
        raise Exception("Topology seq and amino acids seq with different size. Check !")

    # Histidine positions (ascending) grouped by the helix label they fall in
    histidines_by_helix = {}
    for position, (residue, label) in enumerate(zip(residues, topology_string)):
        if label in ("i", "o") or residue != "H":
            continue
        histidines_by_helix.setdefault(label, []).append(position)

    # Helices holding at least one correctly spaced histidine pair
    helices_with_pair = set()
    for label, positions in histidines_by_helix.items():
        if any(12 <= later - earlier <= 14
               for rank, earlier in enumerate(positions)
               for later in positions[rank + 1:]):
            helices_with_pair.add(label)

    return len(helices_with_pair) > 1

def filter_evalue(entry,**kwargs):
    '''
    Return True when exactly 3 distinct domains of the protein have at least one
    hit with an i-Evalue <= threshold (required keyword argument).

    Hits above the threshold are ignored, so a domain is lost only when none of
    its hits passes.
    '''
    threshold=kwargs["threshold"]
    significant_domains={h.domain for h in entry.hmmr if float(h.hit.iEvalue) <= threshold}
    return len(significant_domains)==3

def filter_evalue_hit(hit,**kwargs):
    '''Return True for a hmm hit whose i-Evalue is <= threshold (required keyword argument), False otherwise.'''
    limit = kwargs["threshold"]
    return float(hit.hit.iEvalue) <= limit

def filter_NOX(data_container):
    '''
    Apply all filter functions to get the predicted NOX proteins.

    Steps, each one applied on the result of the previous one:
      1. proteins with 3 domains (filter_three_domains)
      2. 2 to 7 transmembrane helices (filter_nb_helix) and bi-histidine pattern (filter_bi_histidine)
      3. e-value <= 1e-3 at protein level (filter_evalue), then removal of the
         individual hits above 1e-3 (filter_evalue_hit)

    The number of remaining proteins is printed after each step.
    Returns the container produced by the last step.
    '''
    print("Initial proteins :", len(data_container))
    #3 domains
    filterThreeDomains=data_container.filter(filter_three_domains)
    print("Number of proteins with 3 domains : ",len(filterThreeDomains))

    # Helix filter
    filterHelix=filterThreeDomains.filter(filter_nb_helix,min_helix=2,max_helix=7).filter(filter_bi_histidine)
    print("Number of proteins after helix filter : ",len(filterHelix))

    #Evalue 1e-3 filter
    filterEvalue3=filterHelix.filter(filter_evalue,threshold=1e-3).filter_hit(filter_evalue_hit,threshold=1e-3)
    print("Number of proteins after evalue 1e-3 filter :",len(filterEvalue3))
    return filterEvalue3

In [5]:
# Run the whole NOX filter chain on the parsed results; the counts after each step are printed.
dataFiltered = filter_NOX(dataContainer)

Initial proteins : 126901
Number of proteins with 3 domains :  546
Number of proteins after helix filter :  526
Number of proteins after evalue 1e-3 filter : 526


In [11]:
# Pickle the filtered container (untagged, timestamped file name) so it can be restored with load().
save(dataFiltered)

data structure saved to /Volumes/arwen/mobi/group/NOX_ecoli_full/pickle_saved/NOX_ecoli_20190823-102442.pickle


#### E.coli accession with NOX

In [60]:
# Assembly accession of each predicted NOX = part of the protein identifier before the first "|"
list_ecoli_NOX = [entry.prot.split("|")[0] for entry in dataFiltered]
ecoli_NOX = set(list_ecoli_NOX)

# One (still empty) information record per assembly; assemblies with several predicted NOX are reported
ecoli_NOX_info = {}
for acc in ecoli_NOX:
    count = list_ecoli_NOX.count(acc)
    if count > 1:
        print(acc, "has", count, "predicted NOX")
    ecoli_NOX_info[acc] = dict(strain=None, taxid=None, isolate=None, name=None)

GCF_900196555.1 has 2 predicted NOX


#### Check GCF_900196555.1
```
GCF_900196555.1	PRJNA224116	SAMEA104140562	FZJD00000000.1	na	562	562	Escherichia coli		F2_83	latest	Scaffold	Major	Full	2017/09/09	F2_83	Ausgem	GCA_900196555.1	identical	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/196/555/GCF_900196555.1_F2_83
```

In [67]:
# Show header and sequence of the predicted NOX proteins of GCF_900196555.1,
# the only assembly reported above with more than one predicted NOX.
nox2 = [e for e in dataFiltered if "GCF_900196555.1" in e.prot]
for n in nox2:
    print(n.fasta.header)
    print(n.fasta.seq)

GCF_900196555.1|WP_096317702.1 iron reductase [Escherichia coli]
MKKITCLGIATFIIACVIYLLPQFYILLTIQGDGWLLRKEFILFSGVVAWVFMTLAIVLPLRLPALESMTGGLDKGFILHKWAGIITLTTGVLHWMMKIVPKWLAQQGWITPQQKIKHMAGSAPEWSIELASMGQTIAEWAIYILIVLCIVSLSKKIPYHVFRYIHKIFPLIYLSITFHTLTILSKTSWWSSSSVLIILILAVIGTVSAFISLFQQIGKNRKILATVILAECHSDTIDITLQLEKPLYHHSGQFAFVRFGDSPEQHPYTIASSPDNPLTLRFVIKALGDYTRHLVETLTPGVKAEVEGPYGCFDFESKSERQIWVAGGIGITPFLSRLTALAQQGGTEIPTELWYCGRHEPSRALRELCAEAKVILHTIDTRTHERLSSEILLKTMSPDENVGVWFCGPASFGRMLHRDMKKRSVPFQYDNFSLR
GCF_900196555.1|WP_096317731.1 iron reductase [Escherichia coli]
MGLLESVMRKITYLGIATFLIACLIYFIPQLNILLSIQGDRWLLRKEFILFSGVVAWVFMTLAMVLSLRLPALESMTGGLDKGFILHKWAGIITLTTGVLHWMMKIVPKWLAQQGWIAPQHKVRHMAGSAPEWPIELASMGQTIAEWAIYILIVLCIISLSKKIPYHIFRFIHKLFPLIYLSITFHTLTILSKTSWWSSISVLIILILAAIGTVSSFISLFQLIGKKRKIFATVIRTECHRDTIDITLQLEKTLHYHSGQFAFVRFGDSPEQHPYTIASSPVNPLTLRFVIKALGDDTRHLVETLTPGVKAEVEGPYGCFNFESKSERQIWVAGGIGITPFLSRLAALAQQGGTEIPTELWYCGRHEPSGALTELCTKAKVRLHTINTRTQERLSSEILLQTMSSDEKVGVWFCGPASFCRILQRDMKKRSVPF

In [49]:
# Fill ecoli_NOX_info from the E.coli assembly summary (tab-separated, one assembly per line).
# Columns used (0-based): 0 = assembly accession, 5 = taxid, 7 = organism name,
# 8 = strain field, 9 = isolate.
# The file is opened in a `with` block so that it is always closed (it was left open before).
with open("/Volumes/arwen/mobi/group/databases/refseq95_proteins/refseq95_ecoli_assembly_summary.txt" , "r") as f:
    for l in f:
        l_split = l.rstrip().split("\t")
        gcf = l_split[0]
        if gcf in ecoli_NOX_info:
            taxid = l_split[5]
            organism_name = l_split[7]
            strain = l_split[8]
            isolate = l_split[9]
            ecoli_NOX_info[gcf]["taxid"] = taxid
            ecoli_NOX_info[gcf]["name"] = organism_name
            ecoli_NOX_info[gcf]["strain"] = strain
            ecoli_NOX_info[gcf]["isolate"] = isolate

In [50]:
# How many distinct taxids / organism names / "name|strain|isolate" combinations among E.coli with NOX ?
taxids = {info["taxid"] for info in ecoli_NOX_info.values()}
names = {info["name"] for info in ecoli_NOX_info.values()}
name_strain_isolate = [
    "|".join((info["name"], info["strain"], info["isolate"]))
    for info in ecoli_NOX_info.values()
]
print(len(taxids), "different taxids")
print(len(names), "different names")
print(len(set(name_strain_isolate)), "different combination name+strain")

# Report the combinations shared by several assemblies
for ns in set(name_strain_isolate):
    count = name_strain_isolate.count(ns)
    if count != 1:
        print(ns, count)

108 different taxids
108 different names
523 different combination name+strain
Escherichia coli|| 3


#### Save NOX

In [51]:
# Write the predicted NOX proteins to a single multi-fasta file.
mfasta = dataFiltered.proteins_mfasta()
with open("/Volumes/arwen/mobi/group/NOX_ecoli_full/predictedNOX_ecoli.mfasta", "w") as o:
    o.write(mfasta)

## 3. Check E.coli redundancy
Is there redundancy in the E.coli dataset? Two E.coli are considered redundant if they have the same proteins. To check that, all E.coli proteins are clustered, and each E.coli is associated with the clusters its proteins belong to. If 2 organisms have the same clusters, they are redundant. To speed up the comparison of the 17k cluster lists, we use bit comparison (python module gmpy2). Each organism is associated with one bit vector whose length is the number of clusters: a bit is 0 if the cluster is absent and 1 if it is present.

### All proteins clustering

**In arwen:**  
<span style="color:red">See how to install modules and install mmseqs</span>.  
For now, mmseqs conda version is used, path to conda bin is directly in script.
```console
ROOTDIR=/mobi/group/NOX_ecoli_full
sbatch /home/chilpert/Dev/arwen_scripts/run_mmseqs.sbatch $ROOTDIR/redundancy all_ecoli $ROOTDIR/volumes_concat10/volume*.fasta.gz
```
Once it's done, other clustering thresholds are tested (re-use database created in previous step)
```console
sbatch /home/chilpert/Dev/arwen_scripts/run_mmseqs_just_cluster.sbatch $ROOTDIR/redundancy/db/all_ecoli.mmseqsdb 0.9 $ROOTDIR/redundancy all_ecoli_sid90
sbatch /home/chilpert/Dev/arwen_scripts/run_mmseqs_just_cluster.sbatch $ROOTDIR/redundancy/db/all_ecoli.mmseqsdb 1 $ROOTDIR/redundancy all_ecoli_sid100
```

### Pairs of redundant E.coli
Quite long script, so treatment is done separately and not in this notebook.

**In arwen**:  
Activate a conda environment with gmpy2 installed. You can launch the 3 treatments in same time. 
```console
python /home/chilpert/Dev/NOX/process_clustering.py $ROOTDIR/redundancy/all_ecoli_cluster.tsv $ROOTDIR/redundancy/redundant_pairs.tsv
```

### Redundant groups

In [38]:
# One file of redundant pairs per clustering threshold
f_dic = {"default": "/Volumes/arwen/mobi/group/NOX_ecoli_full/redundancy/redundant_pairs.tsv",
        "id80" : "/Volumes/arwen/mobi/group/NOX_ecoli_full/redundancy/redundant_pairs_sid80.tsv",
        "id90" : "/Volumes/arwen/mobi/group/NOX_ecoli_full/redundancy/redundant_pairs_sid90.tsv"}

def group_redundant_pairs(pair_lines):
    '''
    Build groups of redundant organisms from lines "org1<TAB>org2[...]".

    Two organisms end up in the same group when they are linked by a chain of
    pairs. When a pair links two groups that already exist, the groups are
    merged (they were previously both extended but kept separate, which gave
    overlapping groups). Lines with fewer than two columns are skipped.

    Returns a list of sets, ordered by first appearance of each group.
    '''
    groups = []
    for line in pair_lines:
        fields = line.rstrip().split("\t")
        if len(fields) < 2:
            continue
        org1, org2 = fields[0], fields[1]
        index1 = next((i for i, g in enumerate(groups) if org1 in g), None)
        index2 = next((i for i, g in enumerate(groups) if org2 in g), None)
        if index1 is None and index2 is None:
            groups.append({org1, org2})
        elif index2 is None:
            groups[index1].add(org2)
        elif index1 is None:
            groups[index2].add(org1)
        elif index1 != index2:
            # The pair bridges two existing groups: keep the oldest one, absorb the other
            kept, absorbed = sorted((index1, index2))
            groups[kept] |= groups[absorbed]
            del groups[absorbed]
    return groups

dic_group = {}
for t in f_dic:
    with open(f_dic[t], "r") as f:
        list_group = group_redundant_pairs(f)
    dic_group[t] = list_group
    print("==", t)
    print(len(list_group), "redundant groups")
    print("------")
    for g in list_group:
        print(g)
    print()

== default
9 redundant groups
------
{'GCF_000009565.1', 'GCF_000022665.1'}
{'GCF_002953815.1', 'GCF_002952875.1', 'GCF_002952915.1', 'GCF_002953795.1', 'GCF_002953015.1', 'GCF_002953095.1'}
{'GCF_002953875.1', 'GCF_002953035.1', 'GCF_002952955.1', 'GCF_002953075.1', 'GCF_002953775.1', 'GCF_002952895.1'}
{'GCF_001308165.1', 'GCF_001308065.1'}
{'GCF_002834485.1', 'GCF_002835175.1'}
{'GCF_002116475.1', 'GCF_002116555.1'}
{'GCF_900096855.1', 'GCF_900096835.1'}
{'GCF_003830815.1', 'GCF_003831055.1'}
{'GCF_000233875.1', 'GCF_000233895.1'}

== id80
4 redundant groups
------
{'GCF_000009565.1', 'GCF_000022665.1'}
{'GCF_900096855.1', 'GCF_900096835.1', 'GCF_900096805.1'}
{'GCF_001308165.1', 'GCF_001308065.1'}
{'GCF_002834485.1', 'GCF_002835175.1'}

== id90
4 redundant groups
------
{'GCF_000009565.1', 'GCF_000022665.1'}
{'GCF_900096855.1', 'GCF_900096835.1', 'GCF_900096805.1'}
{'GCF_001308165.1', 'GCF_001308065.1'}
{'GCF_002834485.1', 'GCF_002835175.1'}



#### Check group properties 
```console
bash /home/chilpert/Dev/NOX/redundant_info.sh GCF_000009565.1 GCF_000022665.1
```
Do this with accessions for each group

* **Group 1**

GCF_000009565.1: 469008, strain=BL21(DE3), 4333 proteins
```
>GCF_000009565.1|WP_000002283.1 MULTISPECIES: carbon-phosphorus lyase complex subunit PhnJ [Enterobacteriaceae] 
>GCF_000009565.1|WP_000002542.1 MULTISPECIES: S26 family signal peptidase [Enterobacteriaceae] 
>GCF_000009565.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_000009565.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria] 
>GCF_000009565.1|WP_000003071.1 MULTISPECIES: lysine--tRNA ligase [Proteobacteria]
```

GCF_000022665.1: 469008, strain=BL21(DE3), 4333 proteins
```
>GCF_000022665.1|WP_000002283.1 MULTISPECIES: carbon-phosphorus lyase complex subunit PhnJ [Enterobacteriaceae] 
>GCF_000022665.1|WP_000002542.1 MULTISPECIES: S26 family signal peptidase [Enterobacteriaceae] 
>GCF_000022665.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_000022665.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria] 
>GCF_000022665.1|WP_000003071.1 MULTISPECIES: lysine--tRNA ligase [Proteobacteria]
```

* **Group2**

GCF_002953815.1: 562, strain=4FA, 4462 proteins
```
>GCF_002953815.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002953815.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002953815.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002953815.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002953815.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

GCF_002952875.1: 562, strain=6FA, 4461 proteins
```
>GCF_002952875.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002952875.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002952875.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002952875.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002952875.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

GCF_002952915.1: 562, strain=3FA, 4462 proteins
```
>GCF_002952915.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002952915.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002952915.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002952915.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002952915.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

GCF_002953795.1: 562, strain=5FA, 4462 proteins
```
>GCF_002953795.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002953795.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002953795.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002953795.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002953795.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

GCF_002953015.1: 562, strain=9FA, 4461 proteins
```
>GCF_002953015.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002953015.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002953015.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002953015.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002953015.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

GCF_002953095.1: 562, strain=1FA, 4462 proteins
```
>GCF_002953095.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002953095.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002953095.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002953095.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002953095.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

* **Group3**

GCF_002953875.1: 562, strain=2A, 4461 proteins
```
>GCF_002953875.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002953875.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002953875.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002953875.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002953875.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

GCF_002953035.1: 562, strain=8A, 4461 proteins
```
>GCF_002953035.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002953035.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002953035.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002953035.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002953035.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```


GCF_002952955.1: 562, strain=2_0, 4461 proteins
```
>GCF_002952955.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002952955.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002952955.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002952955.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002952955.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```


GCF_002953075.1: 562, strain=2FA, 4461 proteins
```
>GCF_002953075.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002953075.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002953075.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002953075.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002953075.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```


GCF_002953775.1: 562, strain=7A, 4461 proteins
```
>GCF_002953775.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002953775.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002953775.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002953775.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002953775.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

GCF_002952895.1: 562, strain=6A, 4460 proteins
```
>GCF_002952895.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_002952895.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_002952895.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_002952895.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002952895.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

* **Group 4**

GCF_001308165.1: 562, strain=K-12 substr. MG1655_TMP32XR2, 4466 proteins
```
>GCF_001308165.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_001308165.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_001308165.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_001308165.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_001308165.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

GCF_001308065.1: 511145, strain=K-12 substr. MG1655, 4466 proteins
```
>GCF_001308065.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_001308065.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_001308065.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_001308065.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_001308065.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

* **Group 5**

GCF_002834485.1: 562, strain=KCJK6605, 5273 proteins
```
>GCF_002834485.1|WP_000002107.1 MULTISPECIES: ASCH domain-containing protein [Enterobacteriaceae] 
>GCF_002834485.1|WP_000002261.1 DUF1364 domain-containing protein [Escherichia coli] 
>GCF_002834485.1|WP_000002283.1 MULTISPECIES: carbon-phosphorus lyase complex subunit PhnJ [Enterobacteriaceae] 
>GCF_002834485.1|WP_000002446.1 MULTISPECIES: LysR family transcriptional regulator [Enterobacteriaceae] 
>GCF_002834485.1|WP_000002542.1 MULTISPECIES: S26 family signal peptidase [Enterobacteriaceae]
```

GCF_002835175.1: 562, strain=KCJK6613, 5273 proteins
```
>GCF_002835175.1|WP_000002107.1 MULTISPECIES: ASCH domain-containing protein [Enterobacteriaceae] 
>GCF_002835175.1|WP_000002261.1 DUF1364 domain-containing protein [Escherichia coli] 
>GCF_002835175.1|WP_000002283.1 MULTISPECIES: carbon-phosphorus lyase complex subunit PhnJ [Enterobacteriaceae] 
>GCF_002835175.1|WP_000002446.1 MULTISPECIES: LysR family transcriptional regulator [Enterobacteriaceae] 
>GCF_002835175.1|WP_000002542.1 MULTISPECIES: S26 family signal peptidase [Enterobacteriaceae]
```

* **Group 6**

GCF_002116475.1: 562, strain=AUH_IMP167, 5114 proteins
```
>GCF_002116475.1|WP_000002279.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase PhnJ [Enterobacteriaceae] 
>GCF_002116475.1|WP_000002446.1 MULTISPECIES: LysR family transcriptional regulator [Enterobacteriaceae] 
>GCF_002116475.1|WP_000002542.1 MULTISPECIES: signal peptidase I [Enterobacteriaceae] 
>GCF_002116475.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002116475.1|WP_000002960.1 MULTISPECIES: ribonuclease E inhibitor RraB [Enterobacteriaceae]
```


GCF_002116555.1: 562, strain=AUH_IMP161, 5114 proteins
```
>GCF_002116555.1|WP_000002279.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase PhnJ [Enterobacteriaceae] 
>GCF_002116555.1|WP_000002446.1 MULTISPECIES: LysR family transcriptional regulator [Enterobacteriaceae] 
>GCF_002116555.1|WP_000002542.1 MULTISPECIES: signal peptidase I [Enterobacteriaceae] 
>GCF_002116555.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_002116555.1|WP_000002960.1 MULTISPECIES: ribonuclease E inhibitor RraB [Enterobacteriaceae]
```

* **Group 7**

GCF_900096855.1: 562, , 4443 proteins
```
>GCF_900096855.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_900096855.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_900096855.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_900096855.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_900096855.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

GCF_900096835.1: 562, , 4442 proteins
```
>GCF_900096835.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_900096835.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_900096835.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_900096835.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_900096835.1|WP_000002953.1 MULTISPECIES: regulator of ribonuclease activity B [Proteobacteria]
```

* **Group 8**

GCF_003830815.1: 562, strain=PN42, 4440 proteins
```
>GCF_003830815.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_003830815.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_003830815.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_003830815.1|WP_000002787.1 MULTISPECIES: conjugal transfer protein TraP [Enterobacteriaceae] 
>GCF_003830815.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria]
```

GCF_003831055.1: 562, strain=PN45, 4440 proteins
```
>GCF_003831055.1|WP_000002303.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase [Proteobacteria] 
>GCF_003831055.1|WP_000002474.1 MULTISPECIES: transcriptional regulator [Proteobacteria] 
>GCF_003831055.1|WP_000002541.1 MULTISPECIES: S26 family signal peptidase [Proteobacteria] 
>GCF_003831055.1|WP_000002787.1 MULTISPECIES: conjugal transfer protein TraP [Enterobacteriaceae] 
>GCF_003831055.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria]
```

* **Group 9**

GCF_000233875.1: 885276, strain=clone D i2, 4627 proteins
```
>GCF_000233875.1|WP_000002278.1 alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase PhnJ [Escherichia coli] 
>GCF_000233875.1|WP_000002542.1 MULTISPECIES: signal peptidase I [Enterobacteriaceae] 
>GCF_000233875.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_000233875.1|WP_000002950.1 MULTISPECIES: ribonuclease E inhibitor RraB [Enterobacteriaceae] 
>GCF_000233875.1|WP_000003071.1 MULTISPECIES: lysine--tRNA ligase [Proteobacteria]
```

GCF_000233895.1: 885275, strain=clone D i14, 4627 proteins
```
>GCF_000233895.1|WP_000002278.1 alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase PhnJ [Escherichia coli] 
>GCF_000233895.1|WP_000002542.1 MULTISPECIES: signal peptidase I [Enterobacteriaceae] 
>GCF_000233895.1|WP_000002907.1 MULTISPECIES: diacylglycerol kinase [Proteobacteria] 
>GCF_000233895.1|WP_000002950.1 MULTISPECIES: ribonuclease E inhibitor RraB [Enterobacteriaceae] 
>GCF_000233895.1|WP_000003071.1 MULTISPECIES: lysine--tRNA ligase [Proteobacteria]
```


### Redundancy in E.coli with NOX ?

In [53]:
# All accessions that belong to a redundant group (default clustering)
default_list_group = dic_group["default"]
redundant_accession = set().union(*default_list_group)

In [55]:
# Accessions that are both in a redundant group and among the E.coli with a predicted NOX
print(redundant_accession.intersection(ecoli_NOX))

set()


No strict redundancy in E.coli with NOX

In [26]:
# Accession (first tab-separated column) of every E.coli assembly of the summary file
with open("/Volumes/arwen/mobi/group/databases/refseq95_proteins/refseq95_ecoli_assembly_summary.txt", "r") as f:
    all_ecoli_accession = [l.split("\t")[0] for l in f]

In [84]:
# Second dataset: every E.coli accession that has no predicted NOX
ecoli_no_NOX = set(all_ecoli_accession).difference(ecoli_NOX)

In [85]:
# Number of E.coli accessions without predicted NOX
print(len(ecoli_no_NOX))

16619


## Try to use all NOX sequence for profile

In [9]:
# Parse the results stored under hmmr_all_NOX/.
# Note: this rebinds dataDir, which previously held the work/ directories.
dataDir = glob.glob('/Volumes/arwen/mobi/group/NOX_ecoli_full/hmmr_all_NOX/volume*')
dataProfile = parsing_results(dataDir)

1 / 172
10 / 172
20 / 172
30 / 172
40 / 172
50 / 172
60 / 172
70 / 172
80 / 172
90 / 172
100 / 172
110 / 172
120 / 172
130 / 172
140 / 172
150 / 172
160 / 172
170 / 172


In [26]:
def filter_evalue_one_profile(entry,**kwargs):
    '''
    Keep an entry as soon as one of its hits is below the e-value threshold.

    kwargs:
        threshold -- maximum accepted iEvalue (required, KeyError if missing).

    Returns True when at least one element of entry.hmmr has
    float(hit.iEvalue) <= threshold, False otherwise (including when
    entry.hmmr is empty).
    '''
    max_evalue = kwargs["threshold"]
    return any(float(hmm_hit.hit.iEvalue) <= max_evalue for hmm_hit in entry.hmmr)

# Entries with at least one hit whose iEvalue is <= 1e-1.
dataFilterEvalue = dataProfile.filter(filter_evalue_one_profile, threshold = 1e-1)
print(len(dataFilterEvalue), "after evalue filter")
# NOTE(review): the structure filters start again from dataProfile, not from dataFilterEvalue,
# so the e-value filter above does not feed into this count -- confirm this is intended.
# NOTE(review): filter_nb_helix and filter_bi_histidine are defined in a cell that appears
# further down in the notebook; that cell must be run first.
dataFiltered_NOX_profile = dataProfile.filter(filter_nb_helix, min_helix = 2, max_helix = 7).filter(filter_bi_histidine)
print(len(dataFiltered_NOX_profile), "after structure filter")
# Compare protein identifiers with the selection held in dataFiltered (built outside this section).
set_data_begin = set([e.prot for e in dataFiltered])
set_data_NOX_profile = set([e.prot for e in dataFiltered_NOX_profile])
print(len(set_data_begin.intersection(set_data_NOX_profile)), "common proteins before and after NOX profile")

57964 after evalue filter
526 after structure filter
526 common proteins before and after NOX profile


## Try to refine domains

In [148]:
# Ask dataFiltered for the multi-fasta of each of the three domains, in the same
# order as before (nad, fad, ferric).
mfasta_nad, mfasta_fad, mfasta_ferric = (
    dataFiltered.get_domain_mfasta(domain_name)
    for domain_name in (
        "nad_binding_prokaryotes",
        "fad_binding_prokaryotes",
        "ferric_reduct_prokaryotes",
    )
)

In [149]:
# Destination files for the three domain multi-fasta exports.
nad_output, fad_output, ferric_output = (
    "/Volumes/arwen/mobi/group/NOX_ecoli_full/refined_profiles/nad_binding_prokaryotes.mfasta",
    "/Volumes/arwen/mobi/group/NOX_ecoli_full/refined_profiles/fad_binding_prokaryotes.mfasta",
    "/Volumes/arwen/mobi/group/NOX_ecoli_full/refined_profiles/ferric_reduct_prokaryotes.mfasta",
)

In [151]:
# Write each domain multi-fasta to its destination file (nad, then fad, then ferric).
for mfasta_path, mfasta_text in (
    (nad_output, mfasta_nad),
    (fad_output, mfasta_fad),
    (ferric_output, mfasta_ferric),
):
    with open(mfasta_path, "w") as o:
        o.write(mfasta_text)

In [25]:
# Collect every "volume*" path under refined_work/ and parse them.
# dataDir is rebound here: an earlier cell assigned it the hmmr_all_NOX volume list.
dataDir = glob.glob('/Volumes/arwen/mobi/group/NOX_ecoli_full/refined_work/volume*')
dataRefined = parsing_results(dataDir)
# filter_NOX is defined outside this section; the counts it prints appear in the cell output.
dataRefinedFilter = filter_NOX(dataRefined)

1 / 172
10 / 172
20 / 172
30 / 172
40 / 172
50 / 172
60 / 172
70 / 172
80 / 172
90 / 172
100 / 172
110 / 172
120 / 172
130 / 172
140 / 172
150 / 172
160 / 172
170 / 172
Initial proteins : 48524
Number of proteins with 3 domains :  547
Number of proteins after helix filter :  526
Number of proteins after evalue 1e-3 filter : 526


In [28]:
# Protein identifiers kept with the refined profiles, compared with set_data_begin.
set_data_domains_profile = {entry.prot for entry in dataRefinedFilter}
print(len(set_data_domains_profile & set_data_begin), "common proteins before and after refined profiles.")

526 common proteins before and after refined profiles.


## Presence/absence matrix

### Which cluster is NOX ? 

In [10]:
# Restore the saved NOX container. The context manager closes the file handle,
# which the previous inline open() never did.
# pickle can execute arbitrary code: only load files produced by this project.
with open("/Volumes/arwen/mobi/group/NOX_ecoli_full/pickle_saved/NOX_ecoli_20190823-102442.pickle", "rb") as pickle_file:
    ecoliNOX = pickle.load(pickle_file)
# Identifiers of all proteins held in the container.
ecoliNOX_prot = {entry.prot for entry in ecoliNOX}

#### Parse the mmseqs clustering of all proteins and write it in a more readable format
In `arwen`:
```console
python /home/chilpert/Dev/arwen_scripts/parse_mmseqs_clusters.py /mobi/group/NOX_ecoli_full/redundancy/all_ecoli_cluster.tsv /mobi/group/NOX_ecoli_full/all_ecoli_cluster.tsv /mobi/group/NOX_ecoli_full/presence_absence/all_ecoli_cluster_parse.tsv 
```
Copy the clustering results to the computer where Jupyter is running, so that they can be read faster. 

In [44]:
# Build dic_cluster (cluster id -> set of member proteins) and list the clusters
# that contain at least one protein of ecoliNOX_prot.
clusters_with_NOX = []
dic_cluster = {}
with open("/Users/chilpert/Results/NOX_genetic_profile/all_ecoli_cluster_parse.tsv", "r") as f:
    for line_number, line in enumerate(f, start=1):
        # Progress trace every 10000 lines.
        if line_number % 10000 == 0:
            print(line_number)
        # Line layout read here: <cluster id> TAB <protein>;<protein>;...
        fields = line.rstrip().split("\t")
        cluster = fields[0]
        prots = set(fields[1].split(";"))
        dic_cluster[cluster] = prots
        # isdisjoint stops at the first shared protein instead of building the full intersection.
        if not prots.isdisjoint(ecoliNOX_prot):
            clusters_with_NOX.append(cluster)
print("NOX cluster:", clusters_with_NOX)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
NOX cluster: ['15649']


All NOX proteins are in cluster 15649. 
**Are there other proteins in this cluster?**

In [12]:
# Members of cluster 15649, and those of them that are not in ecoliNOX_prot.
cluster15649_prots = dic_cluster['15649']
prot_no_NOX = cluster15649_prots - ecoliNOX_prot
print(len(cluster15649_prots), "proteins in cluster 15649")
print(len(prot_no_NOX), '"not NOX" proteins')

546 proteins in cluster 15649
20 "not NOX" proteins


**Look at these 20 proteins in detail**  
Do they have NOX domains?  
Do they have transmembrane helices?  
Do they have the bi-histidine pattern?  
Why don't they pass the NOX filters?  

In [13]:
def filter_prot(entry, **kwargs):
    '''
    Keep entries whose protein identifier belongs to a given collection.

    kwargs:
        list_prot -- collection of protein identifiers to keep (required).

    Returns True when entry.prot is in list_prot, False otherwise.
    Raises Exception when list_prot is not supplied.
    '''
    if "list_prot" not in kwargs:
        raise Exception("Give list_prot argument")
    return entry.prot in kwargs["list_prot"]

def filter_three_domains(entry): 
    '''Return True when the hits in entry.hmmr cover exactly 3 distinct domains, False otherwise.'''
    distinct_domains = {hmm_hit.domain for hmm_hit in entry.hmmr}
    return len(distinct_domains) == 3


def filter_nb_helix(entry,**kwargs):
    '''
    Keep entries whose number of transmembrane helices lies in a closed interval.

    kwargs:
        min_helix -- lowest accepted value of entry.tmhmm.nb_helix (required).
        max_helix -- highest accepted value of entry.tmhmm.nb_helix (required).

    Returns True when min_helix <= entry.tmhmm.nb_helix <= max_helix, False otherwise.
    '''
    lower_bound = kwargs["min_helix"]
    upper_bound = kwargs["max_helix"]
    helix_count = entry.tmhmm.nb_helix
    return lower_bound <= helix_count <= upper_bound

def filter_bi_histidine(entry):
    '''
    Keep entries that show the bi-histidine pattern.

    A helix "has a pair" when it carries two histidines whose positions in the
    sequence differ by 12 to 14 residues (both bounds included). The entry passes
    when at least two distinct helices each have such a pair.

    Reads entry.fasta.seq (residues) and entry.tmhmm.topology_seq (one label per
    residue). Positions labelled "i" or "o" are ignored; any other label is used
    as the identifier of the helix carrying the residue.

    Raises Exception when the two sequences do not have the same length.
    '''
    residues = entry.fasta.seq
    topology = entry.tmhmm.topology_seq
    if len(topology) != len(residues):
        raise Exception("Topology seq and amino acids seq with different size. Check !")

    # Histidine positions grouped by the label of the helix they sit in.
    his_by_helix = {}
    for position, (residue, label) in enumerate(zip(residues, topology)):
        if residue == "H" and label not in ("i", "o"):
            his_by_helix.setdefault(label, []).append(position)

    # Count the helices that hold at least one correctly spaced histidine pair.
    # Positions are stored in increasing order, so later - earlier is always positive.
    helices_with_pair = 0
    for positions in his_by_helix.values():
        has_pair = any(
            12 <= later - earlier <= 14
            for rank, earlier in enumerate(positions)
            for later in positions[rank + 1:]
        )
        if has_pair:
            helices_with_pair += 1

    return helices_with_pair > 1

def filter_evalue(entry,**kwargs):
    '''
    Keep entries for which 3 distinct domains each have a hit below the threshold.

    kwargs:
        threshold -- maximum accepted iEvalue (required, KeyError if missing).

    Only hits of entry.hmmr with float(hit.iEvalue) <= threshold are considered;
    the entry passes when those hits cover exactly 3 distinct domains.
    Returns True or False.
    '''
    threshold=kwargs["threshold"]
    significant_domains = {
        h.domain for h in entry.hmmr if float(h.hit.iEvalue) <= threshold
    }
    return len(significant_domains) == 3

In [14]:
# Just keep "new NOX" proteins
dataClusterNOX = dataContainer.filter(filter_prot, list_prot = prot_no_NOX)
# Test each filter step
filterThreeDomains = dataClusterNOX.filter(filter_three_domains)
print("3 domains", len(filterThreeDomains))
filterEvalue = dataClusterNOX.filter(filter_evalue, threshold = 1e-3)
print("Evalue", len(filterEvalue))
filterNbHelix = dataClusterNOX.filter(filter_nb_helix, min_helix = 2, max_helix = 7)
print("Helix number", len(filterNbHelix))
filterBiHistidine = dataClusterNOX.filter(filter_bi_histidine)
print("Bi-histidine", len(filterBiHistidine))

3 domains 20
Evalue 20
Helix number 20
Bi-histidine 0


None of the 20 proteins passes the bi-histidine filter. Check their topology.

In [19]:
import re
# Populates e.helix_fragments on every entry of the container (method defined outside this notebook).
dataClusterNOX.separate_seq_into_fragments()
# For each protein, print every helix fragment name followed by the 0-based
# positions of the histidines inside that fragment's sequence.
for e in dataClusterNOX:
    print("==", e.prot)
    for h in e.helix_fragments:
        print(h["name"])
        H_position = [m.start() for m in re.finditer('H', h["seq"])]
        print(H_position)
    print()

== GCF_004000315.1|WP_024189813.1
TMhelix_1
[]
TMhelix_2
[]
TMhelix_3
[1]
TMhelix_4
[0, 3, 6, 19]
TMhelix_5
[]

== GCF_002537695.1|WP_024189813.1
TMhelix_1
[]
TMhelix_2
[]
TMhelix_3
[1]
TMhelix_4
[0, 3, 6, 19]
TMhelix_5
[]

== GCF_900448805.1|WP_024189813.1
TMhelix_1
[]
TMhelix_2
[]
TMhelix_3
[1]
TMhelix_4
[0, 3, 6, 19]
TMhelix_5
[]

== GCF_004766675.1|WP_024189813.1
TMhelix_1
[]
TMhelix_2
[]
TMhelix_3
[1]
TMhelix_4
[0, 3, 6, 19]
TMhelix_5
[]

== GCF_003144955.1|WP_024189813.1
TMhelix_1
[]
TMhelix_2
[]
TMhelix_3
[1]
TMhelix_4
[0, 3, 6, 19]
TMhelix_5
[]

== GCF_000488015.1|WP_024189813.1
TMhelix_1
[]
TMhelix_2
[]
TMhelix_3
[1]
TMhelix_4
[0, 3, 6, 19]
TMhelix_5
[]

== GCF_002227095.1|WP_089553608.1
TMhelix_1
[1]
TMhelix_2
[0, 3, 6, 19]
TMhelix_3
[]

== GCF_000350685.1|WP_024189813.1
TMhelix_1
[]
TMhelix_2
[]
TMhelix_3
[1]
TMhelix_4
[0, 3, 6, 19]
TMhelix_5
[]

== GCF_002226395.1|WP_024189813.1
TMhelix_1
[]
TMhelix_2
[]
TMhelix_3
[1]
TMhelix_4
[0, 3, 6, 19]
TMhelix_5
[]

== GCF_002226495.1

The 20 proteins that don't pass the histidine filter each have one helix with a bi-histidine pair and one helix with only a single histidine.

## Presence/absence matrix all E.coli

In `arwen`: 
```console
python /home/chilpert/Dev/NOX/presence_absence_ecoli.py /mobi/group/NOX_ecoli_full/presence_absence/all_ecoli_cluster_parse.tsv /mobi/group/NOX_ecoli_full/presence_absence/all_ecoli_presence_absence_matrix.tsv
```

Final clusters presence absence matrix : `/mobi/group/NOX_ecoli_full/presence_absence/all_ecoli_presence_absence_matrix.tsv`