In [2]:
import glob
import gzip
import os
import pickle
import sys
from collections import Counter

from ete3 import NCBITaxa

# Local checkouts must be on sys.path before the pyproteinsExt imports below.
sys.path.append("/Users/chilpert/Dev/pyproteinsExt/src")
sys.path.append("/Users/chilpert/Dev/pyproteins/src")
import pyproteinsExt.topology as topology
import pyproteinsExt.ena as ena
%load_ext autoreload
%autoreload 2

The purpose of this notebook is to construct two datasets: E.coli genomes with NOX and E.coli genomes without NOX.

## 1. Download E.coli genomes

* Download RefSeq bacteria assembly_summary_refseq from NCBI ftp

* Extract E.coli from assembly_summary

In `arwen`:
```console
cd /mobi/group/databases/refseq95_proteins
wget -O refseq95_bacteria_assembly_summary.txt https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt
awk -F "\t" '{ if ($7 == 562) print }' refseq95_bacteria_assembly_summary.txt > refseq95_ecoli_assembly_summary.txt
```
17142 organisms

* Download E.coli proteins using slurm (and rewrite headers by adding assembly accession), create volumes of 10 E.coli
In `arwen`:
```console
DB=/mobi/group/databases/refseq95_proteins/ecoli
mkdir -p $DB
bash /home/chilpert/Dev/download_proteins_from_assembly_summary_slurm.sh refseq95_ecoli_assembly_summary.txt $DB 10
```

* Check if all proteins have been downloaded 
```console
bash /home/chilpert/Dev/verif_download.sh $DB > empty_fasta.txt
```
If `empty_fasta.txt` contains any lines, delete these entries and relaunch the download. Repeat until `empty_fasta.txt` is empty. You may have to download some entries manually (for example, the ftp link is sometimes not the last element of the assembly_summary line — very rare, 3 cases out of 17k for refseq95 — and this is not handled for now).

```console
for f in $(cut -f 2 empty_fasta.txt); do rm $f; done
bash /home/chilpert/Dev/download_proteins_from_assembly_summary_slurm.sh refseq95_ecoli_assembly_summary.txt $DB 10
```

* Concatenate E.coli proteins inside volumes: create one fasta file with the proteins of 10 E.coli per volume. The Slurm concatenation jobs cause problems (see the warning below), so do not use them; run the concatenation manually instead.
```console
SLURM : (WARNING : CG status problem when try it)
bash /home/chilpert/Dev/concatenate_volumes_slurm.sh $DB /mobi/group/NOX_ecoli_full/volumes_sbatch
mv /mobi/group/NOX_ecoli_full/volumes_sbatch/volume*/*.faa.gz /mobi/group/NOX_ecoli_full/volumes/
MANUAL : 
...
```

* Concatenate the volumes 10 by 10 (i.e. 100 E.coli per file) to have fewer hmm jobs.
```console
mkdir -p /mobi/group/NOX_ecoli_full/volumes_concat10
python /home/chilpert/Dev/arwen_scripts/concat_by_number.py /mobi/group/NOX_ecoli_full/volumes 10 /mobi/group/NOX_ecoli_full/volumes_concat10
```


## 2. Predict NOX proteins

### Launch script
In `arwen`:
```console
ROOT_DIR=/mobi/group/NOX_ecoli_full
SCRIPT_DIR=/mobi/group/NOX_clean/nox-analysis/scripts
bash $SCRIPT_DIR/runHMMR_slurm.sh $ROOT_DIR/volumes_concat10 $ROOT_DIR/work /mobi/group/NOX_clean/data/refined_profiles_3
```
### Parse results

In [3]:
def parsing_results(dataDir):
    '''
    Parse every result directory once and merge everything into a single container.

    Each directory is expected to hold "hmmsearch.out", "tmhmm.out" and
    "hmmsearch.fasta"; the three files are handed to topology.parse and the
    resulting containers are merged with addParsing.

    dataDir : sequence of directory paths (without trailing "/").
    Returns the merged container.
    Raises IndexError when dataDir is empty (same exception type as before,
    but with an explicit message).

    Progress ("done / total") is printed for the first directory and then
    every 10 directories.
    '''
    def _parse_directory(directory):
        # One topology.parse call per directory, reading its three output files.
        return topology.parse(hmmrOut=directory+"/hmmsearch.out",
                              tmhmmOut=directory+"/tmhmm.out",
                              fastaOut=directory+"/hmmsearch.fasta")

    dataDir_length=len(dataDir)
    if dataDir_length==0:
        raise IndexError("parsing_results: dataDir is empty, nothing to parse")

    dataContainer=None
    for count, directory in enumerate(dataDir, start=1):
        if count==1 or count%10==0:
            print(count,"/",dataDir_length)
        parsed=_parse_directory(directory)
        # The first directory seeds the container; it must not be merged a second time.
        if dataContainer is None:
            dataContainer=parsed
        else:
            dataContainer=dataContainer.addParsing(parsed)
    return dataContainer

# glob.glob returns paths in arbitrary (filesystem-dependent) order; sorting them
# makes the parsing order, and therefore every order-dependent output below,
# reproducible from one run to the next.
dataDir=sorted(glob.glob('/Volumes/arwen/mobi/group/NOX_ecoli_full/work/volume*'))
dataContainer=parsing_results(dataDir)

1 / 172
10 / 172
20 / 172
30 / 172
40 / 172
50 / 172
60 / 172
70 / 172
80 / 172
90 / 172
100 / 172
110 / 172
120 / 172
130 / 172
140 / 172
150 / 172
160 / 172
170 / 172


### Filter NOX proteins

In [6]:
def filter_three_domains(entry):
    '''Return True when the HMM hits of the entry cover exactly 3 distinct domains.'''
    distinct_domains = {hmm_hit.domain for hmm_hit in entry.hmmr}
    return len(distinct_domains) == 3

def filter_nb_helix(entry,**kwargs):
    '''
    Return True when the number of transmembrane helices of the entry lies
    within [min_helix, max_helix] (both bounds included).

    Both "min_helix" and "max_helix" must be given as keyword arguments;
    a missing one raises KeyError.
    '''
    lower_bound = kwargs["min_helix"]
    upper_bound = kwargs["max_helix"]
    return lower_bound <= entry.tmhmm.nb_helix <= upper_bound

def filter_bi_histidine(entry):
    '''
    Return True when the entry shows the bi-histidine pattern.

    A helix "carries a pair" when it contains two histidines whose positions
    differ by 12 to 14 (inclusive). The pattern is present when at least two
    distinct helices each carry such a pair.

    Positions whose topology character is "i" or "o" are ignored; any other
    topology character is used as the label of the helix the residue belongs to.

    Raises Exception when the topology string and the amino-acid sequence
    do not have the same length.
    '''
    sequence = entry.fasta.seq
    topology_seq = entry.tmhmm.topology_seq
    if len(topology_seq) != len(sequence):
        raise Exception("Topology seq and amino acids seq with different size. Check !")

    # Positions of the histidines found inside helices, grouped by helix label.
    histidines_by_helix = {}
    for position, (residue, label) in enumerate(zip(sequence, topology_seq)):
        if label in ("i", "o") or residue != "H":
            continue
        histidines_by_helix.setdefault(label, []).append(position)

    # Count the helices holding at least one correctly spaced histidine pair.
    helices_with_pair = 0
    for positions in histidines_by_helix.values():
        has_pair = any(
            12 <= later - earlier <= 14
            for rank, earlier in enumerate(positions)
            for later in positions[rank + 1:]
        )
        if has_pair:
            helices_with_pair += 1

    return helices_with_pair > 1

def filter_evalue(entry,**kwargs):
    '''
    Return True when exactly 3 distinct domains have at least one hit whose
    iEvalue is <= threshold.

    "threshold" must be given as a keyword argument (KeyError otherwise).
    '''
    threshold = kwargs["threshold"]
    significant_domains = {
        hmm_hit.domain
        for hmm_hit in entry.hmmr
        if float(hmm_hit.hit.iEvalue) <= threshold
    }
    return len(significant_domains) == 3

def filter_evalue_hit(hit,**kwargs):
    '''
    Return True when the iEvalue of the hit is <= threshold, i.e. when the
    hit should be kept.

    "threshold" must be given as a keyword argument (KeyError otherwise).
    '''
    threshold = kwargs["threshold"]
    return float(hit.hit.iEvalue) <= threshold

def filter_NOX(data_container):
    '''
    Apply all the filters used to get predicted NOX proteins.

    Filters are chained in this order, and the number of remaining proteins
    is printed after each stage:
      1. filter_three_domains
      2. filter_nb_helix (min_helix=2, max_helix=7), then filter_bi_histidine
      3. filter_evalue (threshold=1e-3), then filter_evalue_hit applied to the
         individual hits through filter_hit (threshold=1e-3)

    data_container : container exposing filter(), filter_hit() and len().
    Returns the container obtained after the last filter.
    '''
    print("Initial proteins :", len(data_container))
    #3 domains 
    filterThreeDomains=data_container.filter(filter_three_domains)
    print("Number of proteins with 3 domains : ",len(filterThreeDomains))

    # Helix filter
    filterHelix=filterThreeDomains.filter(filter_nb_helix,min_helix=2,max_helix=7).filter(filter_bi_histidine)
    print("Number of proteins after helix filter : ",len(filterHelix))

    #Evalue 1e-3 filter 
    filterEvalue3=filterHelix.filter(filter_evalue,threshold=1e-3).filter_hit(filter_evalue_hit,threshold=1e-3)
    print("Number of proteins after evalue 1e-3 filter :",len(filterEvalue3))
    return filterEvalue3

In [5]:
# Run the whole NOX filter chain on the parsed results; filter_NOX prints the
# number of proteins remaining after each stage.
dataFiltered = filter_NOX(dataContainer)

Initial proteins : 126901
Number of proteins with 3 domains :  546
Number of proteins after helix filter :  526
Number of non-eukaryotic proteins after evalue 1e-3 filter : 526


#### E.coli accession with NOX

In [32]:
# The part of the protein identifier before the first "|" is used as the
# accession of the genome the protein comes from.
ecoli_NOX = {entry.prot.split("|")[0] for entry in dataFiltered}
# One metadata record per accession, every field starting empty (None).
ecoli_NOX_info = {
    accession: {'strain': None,'taxid': None, "isolate":None, "name":None}
    for accession in ecoli_NOX
}

In [33]:
# Fill ecoli_NOX_info from the assembly summary: column 0 is matched against
# the accessions, columns 5, 7, 8 and 9 are stored as taxid, name, strain and isolate.
# The file is opened in a `with` block so the handle is always closed.
with open("/Volumes/arwen/mobi/group/databases/refseq95_proteins/refseq95_ecoli_assembly_summary.txt" , "r") as summary_file:
    for line in summary_file:
        # Only strip the line terminator: a bare rstrip() would also remove
        # trailing tabs, i.e. empty trailing columns.
        fields = line.rstrip("\r\n").split("\t")
        gcf = fields[0]
        if gcf in ecoli_NOX_info:
            ecoli_NOX_info[gcf]["taxid"] = fields[5]
            ecoli_NOX_info[gcf]["name"] = fields[7]
            ecoli_NOX_info[gcf]["strain"] = fields[8]
            ecoli_NOX_info[gcf]["isolate"] = fields[9]

In [49]:
# Diversity of the genomes carrying a predicted NOX: distinct taxids, distinct
# organism names and distinct "name|strain|isolate" combinations.
taxids = {info["taxid"] for info in ecoli_NOX_info.values()}
names = {info["name"] for info in ecoli_NOX_info.values()}
name_strain_isolate = [info["name"] + "|" + info["strain"] + "|" + info["isolate"] for info in ecoli_NOX_info.values()]
# Count every combination in a single pass instead of calling list.count once
# per distinct value (which is quadratic).
combination_counts = Counter(name_strain_isolate)
print(len(taxids), "different taxids")
print(len(names), "different names")
# The key also contains the isolate, so the label says so.
print(len(combination_counts), "different combination name+strain+isolate")
# Combinations shared by several genomes, listed in sorted order so the output is stable.
for combination, count in sorted(combination_counts.items()):
    if count != 1:
        print(combination, count)

108 different taxids
108 different names
523 different combination name+strain
Escherichia coli|| 3


#### Look if we have identical proteins

In [62]:
# Pairwise comparison of the sequences of the filtered proteins: for each protein i,
# print the first later protein j with exactly the same sequence, then stop scanning
# for that i. Because of the `break`, a sequence shared by several proteins is
# reported as a chain of consecutive pairs rather than as every possible pair.
# The scan is quadratic in the number of filtered proteins.
for i in range(len(dataFiltered)):
    for j in range(i+1, len(dataFiltered)):
        if dataFiltered[i].fasta.seq == dataFiltered[j].fasta.seq:
            print(dataFiltered[i].prot, dataFiltered[j].prot)
            break

GCF_001518365.1|WP_001305829.1 GCF_002520325.1|WP_001305829.1
GCF_000937355.1|WP_072042835.1 GCF_000938175.1|WP_072042835.1
GCF_000938175.1|WP_072042835.1 GCF_000939135.1|WP_072042835.1
GCF_002520325.1|WP_001305829.1 GCF_002521855.1|WP_001305829.1
GCF_002521855.1|WP_001305829.1 GCF_003721735.1|WP_001305829.1
GCF_003721735.1|WP_001305829.1 GCF_000408385.1|WP_001305829.1
GCF_900195765.1|WP_072169412.1 GCF_002510355.1|WP_072169412.1
GCF_000408385.1|WP_001305829.1 GCF_000781605.1|WP_001305829.1
GCF_000408405.1|WP_001571863.1 GCF_003333875.1|WP_001571863.1
GCF_003333875.1|WP_001571863.1 GCF_003019035.1|WP_001571863.1
GCF_000781605.1|WP_001305829.1 GCF_000781635.1|WP_001305829.1
GCF_000781635.1|WP_001305829.1 GCF_004170565.1|WP_001305829.1
GCF_004170565.1|WP_001305829.1 GCF_003145085.1|WP_001305829.1
GCF_003145085.1|WP_001305829.1 GCF_003145115.1|WP_001305829.1
GCF_003145115.1|WP_001305829.1 GCF_000219515.2|WP_001305829.1
GCF_000219515.2|WP_001305829.1 GCF_003591905.1|WP_001305829.1
GCF_0035

GCF_000797775.1|WP_072024624.1 GCF_001519525.1|WP_072024624.1
GCF_000459795.1|WP_001305829.1 GCF_000458055.1|WP_001305829.1
GCF_003318475.1|WP_075208819.1 GCF_005399185.1|WP_075208819.1
GCF_002587005.1|WP_001571863.1 GCF_002474725.1|WP_001571863.1
GCF_000458055.1|WP_001305829.1 GCF_000458155.1|WP_001305829.1
GCF_000458155.1|WP_001305829.1 GCF_000456165.1|WP_001305829.1
GCF_002231115.1|WP_021557189.1 GCF_000460255.1|WP_021557189.1
GCF_002474725.1|WP_001571863.1 GCF_002511295.1|WP_001571863.1
GCF_000456165.1|WP_001305829.1 GCF_006230795.1|WP_001305829.1
GCF_900195445.1|WP_072169412.1 GCF_002485275.1|WP_072169412.1
GCF_002511295.1|WP_001571863.1 GCF_002291025.1|WP_001571863.1
GCF_000460255.1|WP_021557189.1 GCF_003859345.1|WP_021557189.1
GCF_006230795.1|WP_001305829.1 GCF_002917255.1|WP_001305829.1
GCF_002917255.1|WP_001305829.1 GCF_000617985.2|WP_001305829.1
GCF_000617985.2|WP_001305829.1 GCF_000776335.1|WP_001305829.1
GCF_003859345.1|WP_021557189.1 GCF_004284275.1|WP_021557189.1
GCF_0015

In [67]:
# Visual check: print the sequences of two entries reported as identical by the
# scan above (same protein accession, two different genome accessions).
print(dataFiltered.entries["GCF_001518365.1|WP_001305829.1"].fasta.seq)
print()
print(dataFiltered.entries["GCF_002520325.1|WP_001305829.1"].fasta.seq)


MKKANLTGTLVCFVACLIFLSSGDIFREKPIALIWFLRQNILFFTGTLAWCLMTLTMCLSLRSSWLNKVLGGLDKAWRLHKWAGICAIAFAFAHWLDEKLPQLFVAFGWLTHPGKIVDINLTPVQENWLHAGLLVGECAMFIMIAMIFVSLSKKVPYHLFHLVHRLFPVFYLAIAFHVFTALFKSYWWETPAAYLLILITIPGVFAAFISLLKLNGSKNKHQATIKNIVNHPGQITEVTLELEHAIDYSPGQFAFLTFAHSKESHPFTIASYTKEKNTLRFAIKHLGDYTSTLASSIKIGQSAFVEGPWGKFDFTLPCSHQVWIAGGIGITPFIAQLEYRKHHGASFVPVDLWYCVSRSEDAWYIDKLTSLCAQARVTLHLLDAHKGERLQVHYLTDKIANKGDTHFWFCGPQSFAKALSKGLYENGIASQNFHFDRFCMR

MKKANLTGTLVCFVACLIFLSSGDIFREKPIALIWFLRQNILFFTGTLAWCLMTLTMCLSLRSSWLNKVLGGLDKAWRLHKWAGICAIAFAFAHWLDEKLPQLFVAFGWLTHPGKIVDINLTPVQENWLHAGLLVGECAMFIMIAMIFVSLSKKVPYHLFHLVHRLFPVFYLAIAFHVFTALFKSYWWETPAAYLLILITIPGVFAAFISLLKLNGSKNKHQATIKNIVNHPGQITEVTLELEHAIDYSPGQFAFLTFAHSKESHPFTIASYTKEKNTLRFAIKHLGDYTSTLASSIKIGQSAFVEGPWGKFDFTLPCSHQVWIAGGIGITPFIAQLEYRKHHGASFVPVDLWYCVSRSEDAWYIDKLTSLCAQARVTLHLLDAHKGERLQVHYLTDKIANKGDTHFWFCGPQSFAKALSKGLYENGIASQNFHFDRFCMR


#### Save NOX

In [53]:
# Write the text returned by proteins_mfasta() for the filtered proteins to disk.
# Opening with "w" overwrites any existing file at that path.
mfasta = dataFiltered.proteins_mfasta()
with open("/Volumes/arwen/mobi/group/NOX_ecoli_full/predictedNOX_ecoli.mfasta", "w") as o:
    o.write(mfasta)

In [78]:
# Distinct protein identifiers (the full `prot` string) among the filtered entries.
# NOTE(review): if only the protein accession (the part after "|") was intended,
# the identifier has to be split first - confirm.
ecoli_NOX_refseq = {entry.prot for entry in dataFiltered}

In [92]:
# Load the previously saved TrEMBL NOX annotation container.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
# The `with` block makes sure the file handle is closed once the data is loaded.
with open("/Volumes/arwen/mobi/group/NOX_clean/pickle_saved/NOX_annotation_neighborhood10_20190808-181008.pickle", "rb") as pickle_file:
    data_trembl = pickle.load(pickle_file)

In [83]:
# Taxonomy handle used by get_species for lineage, rank and name lookups.
ncbi = NCBITaxa()
def get_species(taxid):
    '''
    Return (species_taxid, species_name) for the species-rank ancestor found
    in the lineage of `taxid`.

    Raises IndexError when no taxid of the lineage has the rank "species".
    '''
    rank_by_taxid = ncbi.get_rank(ncbi.get_lineage(taxid))
    species_candidates = [
        lineage_taxid
        for lineage_taxid in rank_by_taxid
        if rank_by_taxid[lineage_taxid] == "species"
    ]
    species_taxid = species_candidates[0]
    name_by_taxid = ncbi.get_taxid_translator([species_taxid])
    return species_taxid, name_by_taxid[species_taxid]

In [93]:
# Identifiers of the TrEMBL entries whose species-level taxid is 562.
ecoli_NOX_trembl = {
    entry.prot
    for entry in data_trembl
    if get_species(entry.taxo.taxid)[0] == 562
}

In [95]:
# Number of TrEMBL entries selected by the species-taxid test (taxid 562).
print(len(ecoli_NOX_trembl))

23


In [87]:
# Number of distinct identifiers collected in ecoli_NOX_refseq.
print(len(ecoli_NOX_refseq))

11


In [98]:
# Show the cross-references stored for each selected TrEMBL entry
# (uniprot_xref attribute of the entry).
for p in ecoli_NOX_trembl:
    uniprot_entry = data_trembl.entries[p].uniprot_xref
    print(uniprot_entry)

{'EMBL': {'ASUD01000080': 'EOV04566.1'}, 'RefSeq': {'NZ_KE136818.1': 'WP_001571863.1'}}
{'EMBL': {'UGAB01000002': 'STF41292.1', 'UFZP01000007': 'STF52849.1'}, 'RefSeq': {}}
{'EMBL': {'LOGT01000057': 'KYS98070.1', 'MIWY01000005': 'OEN36937.1'}, 'RefSeq': {}}
{'EMBL': {'CP006632': 'AKK49692.1'}, 'RefSeq': {'NZ_CP006632.1': 'WP_001305829.1'}}
{'EMBL': {'RNVT01000009': 'MIC60481.1'}, 'RefSeq': {}}
{'EMBL': {'MOIL01000010': 'OJP13881.1'}, 'RefSeq': {}}
{'EMBL': {'ASUC01000038': 'EOU90179.1'}, 'RefSeq': {'NZ_KE136770.1': 'WP_001571863.1'}}
{'EMBL': {'UFZN01000003': 'STF52161.1'}, 'RefSeq': {'NZ_LRXQ01000033.1': 'WP_001329701.1'}}
{'EMBL': {'CU928163': 'CAR14422.1'}, 'RefSeq': {'NC_011751.1': 'YP_002413941.1'}}
{'EMBL': {'ANSQ01000006': 'ELC15145.1'}, 'RefSeq': {}}
{'EMBL': {'ROFO01000002': 'MGQ47977.1'}, 'RefSeq': {'NZ_BDLM01000054.1': 'WP_075843042.1'}}
{'EMBL': {'ADTR01000108': 'EFK22299.1'}, 'RefSeq': {'NZ_GG772649.1': 'WP_001329701.1'}}
{'EMBL': {'ADKD01000017': 'OSL82761.1'}, 'RefSeq': 

In [161]:
# Get the ENA collection and point its cache to a local directory.
# NOTE(review): the cache path is machine-specific - adapt it before re-running.
enaColl=ena.getENACollection()
enaColl.setCache("/Users/chilpert/cache/ena")

Get ENA Collection
Acknowledged 0 entries (/Users/chilpert)
Changing cache location to /Users/chilpert/cache/ena
Reindexing /Users/chilpert/cache/ena
Acknowledged 810 entries (/Users/chilpert/cache/ena)


In [165]:
# For each selected TrEMBL entry, fetch the ENA record of its first EMBL
# cross-reference and print the metadata of that record.
for p in ecoli_NOX_trembl: 
    print(p)
    e = data_trembl.entries[p]
    # Only the first EMBL identifier is used, even when the entry has several;
    # an entry without any EMBL cross-reference raises IndexError here.
    embl_id = list(e.uniprot_xref["EMBL"].keys())[0]
    ena_entry = enaColl.get(embl_id, charge_features = False)
    print(ena_entry.metadata)

tr|S0YK37|S0YK37_ECOLX
{'Project': 'PRJNA157629', 'Sample': 'SAMN00847673'}
tr|A0A376LV77|A0A376LV77_ECOLX
{'Project': 'PRJEB6403', 'Sample': 'SAMEA3307894'}
tr|A0A161R3H5|A0A161R3H5_ECOLX
{'Project': 'PRJNA285020', 'Sample': 'SAMN04002660'}
tr|A0A0G3KBG1|A0A0G3KBG1_ECOLX
{'Project': 'PRJNA64999', 'Sample': 'SAMN02469619'}
tr|A0A3L5Q8A7|A0A3L5Q8A7_ECOLX
{'Project': 'PRJNA292663', 'Sample': 'SAMN10221397'}
tr|A0A1M1NE58|A0A1M1NE58_ECOLX
{'Project': 'PRJNA349231', 'Sample': 'SAMN05928944'}
tr|S0XWD8|S0XWD8_ECOLX
{'Project': 'PRJNA157627', 'Sample': 'SAMN00847672'}
tr|A0A376LTK0|A0A376LTK0_ECOLX
{'Project': 'PRJEB6403', 'Sample': 'SAMEA2689763'}
tr|B7N7G0|B7N7G0_ECOLU
{'Project': 'PRJNA33415', 'Sample': 'SAMEA3138233'}
tr|L2VL28|L2VL28_ECOLX
{'Project': 'PRJNA157579', 'Sample': 'SAMN00847648'}
tr|A0A3K3H1D4|A0A3K3H1D4_ECOLX
{'Project': 'PRJNA292663', 'Sample': 'SAMN10221286'}
tr|D8A3G5|D8A3G5_ECOMS
{'Project': 'PRJNA47205', 'Sample': 'SAMN00189180'}
tr|A0A1X3LS50|A0A1X3LS50_ECOLX
{'Projec