# Domain analysis

In [3]:
import pickle
import sys, os
sys.path.append("/Users/chilpert/Work/pyproteinsExt/src")
sys.path.append("/Users/chilpert/Work/pyproteins/src")
import pyproteinsExt 
import pyproteins

##### Load full Pfam annotation

In [4]:
# Load the pickled Pfam annotation. The context manager closes the file handle
# even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("/Volumes/arwen/mobi/group/NOX_CH/pickle_saved/NOX_annotation_fullPFAM_20190411-174326.pickle","rb") as annotation_file:
    data=pickle.load(annotation_file)
print(len(data))

832


##### Parse data 

In [4]:
def parse_data(data):
    '''Invert the annotation: map each domain name to the proteins that carry it.

    data is indexed by protein; data[protein]['hmmr'] is iterated to obtain the
    domain names of that protein. Returns a dict {domain: [protein, ...]} in
    which proteins appear in the order they are met while iterating data.
    '''
    proteins_by_domain={}
    for protein_id in data:
        for domain_name in data[protein_id]['hmmr']:
            # setdefault creates the list the first time a domain is seen.
            proteins_by_domain.setdefault(domain_name,[]).append(protein_id)
    return proteins_by_domain

def simplify_data(data):
    '''Return a dict mapping each protein of data to the list of keys of its 'hmmr' entry.'''
    return {protein_id:list(data[protein_id]['hmmr'].keys()) for protein_id in data}

def assemble_domains_with_same_proteins(dic_domain):
    '''Merge domains that are carried by exactly the same set of proteins.

    dic_domain maps a domain name to a list of protein names. The result maps a
    comma-joined group of domain names to the sorted list of proteins shared by
    every domain of that group.

    The input lists are not modified, and the domains inside each key are
    sorted so that the keys are identical from one run to the next.
    '''
    domains_by_proteins={}
    for domain in dic_domain:
        # A sorted tuple is an order-independent, hashable signature of the
        # protein list; sorted() works on a copy and leaves the caller's list alone.
        signature=tuple(sorted(dic_domain[domain]))
        domains_by_proteins.setdefault(signature,set()).add(domain)
    new_dic_domain={}
    for signature,domains in domains_by_proteins.items():
        new_dic_domain[",".join(sorted(domains))]=list(signature)
    return new_dic_domain

In [9]:
from collections import OrderedDict

def _largest_first(mapping):
    '''Return an OrderedDict of mapping's items, ordered by decreasing length of the value.'''
    return OrderedDict(sorted(mapping.items(),key=lambda item: len(item[1]),reverse=True))

# domain -> proteins, and the same table ordered by number of proteins
dic_domain=parse_data(data)
ordered_dic_domain=_largest_first(dic_domain)
# protein -> domains, and the same table ordered by number of domains
dic_protein=simplify_data(data)
ordered_dic_protein=_largest_first(dic_protein)
# domains grouped when they are carried by the same proteins
assemble_dic_domain=assemble_domains_with_same_proteins(dic_domain)

##### Handle taxonomy with ete3

In [13]:
from ete3 import NCBITaxa
# Module-level NCBITaxa handle; the taxonomy helpers and cells below use it as the global `ncbi`.
ncbi = NCBITaxa()
# Left disabled; uncomment to call update_taxonomy_database() on the handle.
#ncbi.update_taxonomy_database()

In [23]:
def get_tax_level(taxid,tax_level):
    '''Return the (taxid, name) of the taxon of rank tax_level for taxid.

    taxid     -- NCBI taxid (int, or anything int() accepts)
    tax_level -- rank to look for, e.g. "species"

    If taxid itself has the requested rank it is returned; otherwise its
    lineage is searched for a taxon of that rank. Returns None when no taxon
    in the lineage has that rank.

    The taxid in the returned tuple is the one of the taxon that has the
    requested rank (not the input taxid), so every descendant of one species
    yields the same tuple.
    '''
    taxid=int(taxid)
    if ncbi.get_rank([taxid])[taxid]==tax_level:
        return(taxid,ncbi.get_taxid_translator([taxid])[taxid])
    ranks=ncbi.get_rank(ncbi.get_lineage(taxid))
    matches=[ancestor for ancestor in ranks if ranks[ancestor]==tax_level]
    if not matches:
        return None
    ancestor=matches[0]
    return(ancestor,ncbi.get_taxid_translator([ancestor])[ancestor])

In [322]:
# For every protein, record its taxid together with the name and rank ete3 reports for it.
dic_taxonomy={}
for p in data:
    taxid=data[p]['taxid']
    # the ete3 result dictionaries are indexed with int(taxid)
    taxname=ncbi.get_taxid_translator([taxid])[int(taxid)]
    taxrank=ncbi.get_rank([taxid])[int(taxid)]
    dic_taxonomy[p]=dict(taxid=taxid,taxname=taxname,taxrank=taxrank)

In [324]:
# Pool proteins and domains per species. Keys are the tuples returned by
# get_tax_level; proteins for which it returns nothing are left out.
dic_specie={}
for p in dic_taxonomy:
    specie=get_tax_level(dic_taxonomy[p]['taxid'],"species")
    if not specie:
        continue
    species_entry=dic_specie.setdefault(specie,{'proteins':set(),'domains':set()})
    species_entry['proteins'].add(p)
    species_entry['domains'].update(dic_protein[p])

# Same table ordered by decreasing number of proteins, then by decreasing number of domains.
ordered_prot_dic_species=OrderedDict(sorted(dic_specie.items(),key=lambda item: len(item[1]['proteins']),reverse=True))
ordered_domain_dic_species=OrderedDict(sorted(dic_specie.items(),key=lambda item: len(item[1]['domains']),reverse=True))

In [24]:
from statistics import mean
class DomainGroup:
    '''A group of domains together with the proteins that carry them.

    Attributes set by the constructor:
      domains  -- the domains argument, stored as given
      proteins -- the proteins argument, stored as given
      taxids   -- distinct data[p]["taxid"] values of the proteins (arbitrary order)

    Attributes set by the compute_* methods:
      upper_node -- first node of the topology built from taxids
      dists      -- pairwise tree distances between the taxids
      mean_dist  -- mean of dists (0 when there are fewer than two taxids)
      max_dist   -- max of dists (0 when there are fewer than two taxids)
    '''
    def __init__(self,domains,proteins,data):
        self.domains=domains
        self.proteins=proteins
        self.taxids=list({data[p]["taxid"] for p in self.proteins})

    def compute_upper_node(self):
        '''Store in self.upper_node the first node yielded when traversing the topology of self.taxids.'''
        tree=ncbi.get_topology(self.taxids)
        self.upper_node=next(tree.traverse())

    def compute_mean_max_distance(self):
        '''Compute self.dists, self.mean_dist and self.max_dist over every pair of taxids.

        With fewer than two taxids there is no pair to measure: both statistics
        are set to 0 and no topology is built.
        '''
        from itertools import combinations
        self.dists=[]
        if len(self.taxids)<=1:
            self.mean_dist=0
            self.max_dist=0
            return
        tree=ncbi.get_topology(self.taxids)
        self.dists=[tree.get_distance(first,second) for first,second in combinations(self.taxids,2)]
        self.mean_dist=mean(self.dists)
        self.max_dist=max(self.dists)

In [None]:
# Build one DomainGroup per assembled domain key and precompute its taxonomy statistics.
set_DomainGroup=set()
for k in assemble_dic_domain:
    # The keys are comma-joined domain names. The table is named
    # assemble_dic_domain; the former `assembled_dic_domain` was never defined.
    d=DomainGroup(k.split(","),assemble_dic_domain[k],data)
    d.compute_upper_node()
    d.compute_mean_max_distance()
    set_DomainGroup.add(d)

In [None]:
# Cache the computed domain groups on disk; the context manager flushes and closes the file.
pickle_saved="/Volumes/arwen/mobi/group/NOX_CH/pickle_saved/domain_groups.pickle"
with open(pickle_saved,"wb") as pickle_file:
    pickle.dump(set_DomainGroup,pickle_file)

##### Generate outputs

In [40]:
# Reload the cached domain groups; the context manager closes the file handle.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(pickle_saved,"rb") as pickle_file:
    setDomains=pickle.load(pickle_file)

In [27]:
# Destination paths of the TSV reports written by the cells below.
# NOTE(review): single_domains_output is not written by any visible cell.
single_domains_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/domains_stats.tsv"
# domain groups, ordered by number of domains
assemble_domains_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/assembled_domains_stats.tsv"
# domain groups, ordered by number of proteins
assemble_domains_by_prot_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/assembled_domains_sort_by_prot_stats.tsv"
# one line per protein
proteins_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/proteins_stats.tsv"
# one line per species, ordered by number of proteins / by number of domains
by_species_order_prot_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/by_species_prot_order.tsv"
by_species_order_dom_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/by_species_dom_order.tsv"

In [71]:
def write_domains_output(set_domain_groups,domains_output):
    '''Write one TSV line per domain group to the file domains_output.

    set_domain_groups -- iterable of objects exposing domains, proteins,
                         upper_node (with name, sci_name and rank), mean_dist
                         and max_dist; lines are written in iteration order
    domains_output    -- path of the file to create or overwrite

    The file is opened in a context manager so it is closed even if a group
    cannot be formatted.
    '''
    with open(domains_output,"w") as o:
        o.write("#Domain(s)\tNumber of associated proteins\tAssociated proteins\tUpper node(taxid)\tUpper node(taxname)\tUpper node(taxrank)\tMean distance\tMax distance\n")
        for d in set_domain_groups:
            o.write("%s\t%d\t%s\t%s\t%s\t%s\t%.5f\t%.5f\n" % (",".join(d.domains),len(d.proteins),",".join(d.proteins),d.upper_node.name,d.upper_node.sci_name,d.upper_node.rank,d.mean_dist,d.max_dist))

def write_by_specie_output(output,dic_species):
    '''Write one TSV line per species to the file output.

    output      -- path of the file to create or overwrite
    dic_species -- mapping whose keys are (taxid, name) tuples and whose values
                   hold a 'proteins' and a 'domains' collection of strings;
                   lines are written in the mapping's iteration order

    The first two header columns are separated by a tab (the header used to
    contain a stray backslash there). Proteins and domains are written sorted
    so that the file content does not depend on set iteration order.
    '''
    with open(output,"w") as o:
        o.write("#Species taxid\tSpecies name\tNumber of proteins\tNumber of domains\tProteins\tDomains\n")
        for sp in dic_species:
            proteins=dic_species[sp]['proteins']
            domains=dic_species[sp]['domains']
            fields=[str(sp[0]),sp[1],str(len(proteins)),str(len(domains)),",".join(sorted(proteins)),",".join(sorted(domains))]
            o.write("\t".join(fields)+"\n")

In [72]:
# Report the domain groups twice: largest number of domains first ...
ordered_domain_setDomains=sorted(setDomains,key=lambda group: len(group.domains),reverse=True)
write_domains_output(ordered_domain_setDomains,assemble_domains_output)

# ... then largest number of proteins first.
ordered_protein_setDomains=sorted(setDomains,key=lambda group: len(group.proteins),reverse=True)
write_domains_output(ordered_protein_setDomains,assemble_domains_by_prot_output)

In [330]:
# Write one TSV line per protein, in the order of ordered_dic_protein.
# The context manager closes the file even if a line cannot be built.
with open(proteins_output,"w") as o:
    o.write("#Protein\tTaxid\tTaxname\tTaxrank\tNumber of domains\tDomains\n")
    for p in ordered_dic_protein:
        taxonomy=dic_taxonomy[p]
        protein_domains=ordered_dic_protein[p]
        fields=[p,taxonomy['taxid'],taxonomy['taxname'],taxonomy['taxrank'],str(len(protein_domains)),",".join(protein_domains)]
        o.write("\t".join(fields)+"\n")

In [331]:
# Write the per-species report in both orderings (by proteins, then by domains).
for species_output,species_table in (
    (by_species_order_prot_output,ordered_prot_dic_species),
    (by_species_order_dom_output,ordered_domain_dic_species),
):
    write_by_specie_output(species_output,species_table)

In [213]:
# Domains listed here are not reported individually per taxid.
core_domain=["NAD_binding_1","FAD_binding_8","Ferric_reduct","FAD_binding_6","NAD_binding_6"]
all_taxids={data[p]['taxid'] for p in data}
dic_all_taxids={}
all_domains=set()
for p in data:
    protein_domains=data[p]['hmmr']
    # every domain, core or not, counts towards the global inventory
    all_domains.update(protein_domains)
    extra_domains={d for d in protein_domains if d not in core_domain}
    # a protein without any non-core domain is recorded under a placeholder label
    dic_all_taxids.setdefault(data[p]['taxid'],set()).update(extra_domains or {"Core domains"})

print(len(all_taxids))

644


In [79]:
# Number of distinct domain names collected over all proteins (core domains included).
print(len(all_domains))

250


In [214]:
# Build a single topology over every taxid of the dataset; later cells attach attributes to its nodes.
all_tree=ncbi.get_topology(list(all_taxids))

In [215]:
# Annotate every node with the union of the domains found at or below it.
# Postorder visits children before their parent, so child.domains is already
# complete when the parent is processed.
node_list=[]
for n in all_tree.traverse('postorder'):
    node_list.append(n)
    n.sameDomainNode=set()
    # Start from a copy: updating the set stored in dic_all_taxids itself would
    # silently add the children's domains to that dictionary.
    n.domains=set(dic_all_taxids.get(n.name,()))
    # A leaf has no children, so this loop simply does nothing for it.
    for child in n.children:
        n.domains.update(child.domains)

In [216]:
# Link together the nodes that have exactly the same domain set.
# Nodes are grouped by their (frozen) domain set instead of comparing every
# pair of nodes; each node then records the other members of its group.
nodes_by_domains={}
for n in node_list:
    nodes_by_domains.setdefault(frozenset(n.domains),[]).append(n)
for same_nodes in nodes_by_domains.values():
    for n in same_nodes:
        n.sameDomainNode.update(other for other in same_nodes if other is not n)

In [218]:
# Print every internal node whose descendants all have the same domain set as
# the node itself, skipping nodes that only carry the "Core domains" placeholder.
for n in all_tree.traverse():
    # computed once and reused for both the emptiness test and the subset test
    desc=set(n.get_descendants())
    if desc and desc.issubset(n.sameDomainNode) and n.domains != {'Core domains'}:
        print(n.name,n.sci_name,n.rank)
        print(len(n.sameDomainNode))
        print(len(desc))
        print(n.domains)

253239 Ethanoligenens harbinense species
1
1
{'RSN1_7TM'}
2053578 unclassified Microcoleaceae no rank
6
2
{'EF-hand_7', 'EF-hand_8', 'EF-hand_1', 'EF-hand_5', 'EF-hand_6'}
69541 Desulfuromonadales order
3
3
{'TctB', 'Polysacc_synt_C', 'DUF4405', 'DUF4079'}
80841 unclassified Burkholderiales (miscellaneous) no rank
2
2
{'RRM_1', 'YlaH', 'MIG-14_Wnt-bd'}
1873 Micromonospora genus
2
2
{'DUF2070', 'DUF4405', 'DUF5467'}
56 Sorangium cellulosum species
1
1
{'DUF2919'}
392333 Geoalkalibacter ferrihydriticus species
3
1
{'TctB', 'Polysacc_synt_C', 'DUF4405', 'DUF4079'}
85 Hyphomonas genus
2
2
{'DUF4405', 'DUF5336'}
201096 Alicycliphilus genus
13
4
{'MIG-14_Wnt-bd'}
1752733 Candidatus Yanofskybacteria phylum
2
2
{'Lipoprotein_9', 'Glyco_transf_28'}
1752735 Candidatus Wolfebacteria phylum
3
2
{'MtrA', 'SdpI'}
1752725 Candidatus Collierbacteria phylum
6
6
{'SdpI', 'DUF2070'}
1930 Streptomyces scabiei species
1
1
{'Selenoprotein_S', 'VKOR', 'DUF423'}
179636 Alicycliphilus denitrificans species
13


In [149]:
# Spot check: show the domain sets attached to two consecutive nodes of node_list.
print(node_list[2].domains,node_list[3].domains)

{'NAD_binding_1', 'FAD_binding_8', 'ABC2_membrane_2', 'NAD_binding_6', 'Ferric_reduct'} {'NAD_binding_1', 'FAD_binding_8', 'ABC2_membrane_2', 'NAD_binding_6', 'Ferric_reduct'}


In [114]:
# Preview the first 20 nodes met in preorder with the size of their domain set.
c=0
for c,n in enumerate(all_tree.traverse('preorder'),start=1):
    print(n.name,n.sci_name,len(n.domains))
    if c >= 20 :
        break

TypeError: object of type 'generator' has no len()

In [83]:
# Print every taxid whose pooled domain set contains DUF766.
# A leading `all_domains.difference(a)` statement was dropped: `a` is not
# defined in any visible cell and the result of that expression was discarded.
for t in dic_all_taxids:
    if 'DUF766' in dic_all_taxids[t]:
        print(t)

29497


In [36]:
# `leaf` is not defined in any visible cell. Presumably it held the names of the
# leaves of all_tree, which would make this the taxids sitting on internal nodes
# -- TODO confirm against the original session.
leaf=set(all_tree.get_leaf_names())
all_taxids.difference(leaf)

{'1076',
 '1311',
 '1421',
 '171437',
 '1930',
 '1971',
 '216495',
 '220697',
 '246167',
 '253239',
 '28031',
 '28901',
 '29497',
 '327160',
 '380021',
 '392333',
 '502049',
 '511',
 '56',
 '562',
 '666',
 '668',
 '669',
 '670',
 '672',
 '674',
 '696485'}