# Domain analysis

In [219]:
import pickle
import sys, os
sys.path.append("/Users/chilpert/Work/pyproteinsExt/src")
sys.path.append("/Users/chilpert/Work/pyproteins/src")
import pyproteinsExt 
import pyproteins
import time

##### Load full Pfam annotation

In [2]:
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
# The context manager closes the file handle as soon as loading is done
# (the original left the handle open until garbage collection).
with open("/Volumes/arwen/mobi/group/NOX_CH/pickle_saved/NOX_annotation_fullPFAM_20190411-174326.pickle","rb") as annotation_file:
    data=pickle.load(annotation_file)
# Number of entries in the loaded annotation.
print(len(data))

832


##### Parse data 

In [15]:
def parse_data(data):
    '''Invert the annotation: map every domain to the proteins that carry it.

    Parameters
    ----------
    data : mapping of protein identifier -> record, where record['hmmr']
        iterates over the domain names found in that protein.

    Returns
    -------
    dict
        {domain: [protein, ...]}, proteins listed in the iteration order of
        `data`.
    '''
    proteins_by_domain={}
    for protein in data:
        for domain in data[protein]['hmmr']:
            # setdefault creates the list the first time a domain is seen
            proteins_by_domain.setdefault(domain,[]).append(protein)
    return proteins_by_domain

def simplify_data(data):
    '''Reduce the annotation to {protein: [domain, ...]}.

    For each protein in `data`, keep only the keys of its 'hmmr' entry
    (the domain names), as a list.
    '''
    return {protein: list(data[protein]['hmmr'].keys()) for protein in data}

def assemble_domains_with_same_proteins(dic_domain):
    '''Merge domains that are carried by exactly the same proteins.

    Parameters
    ----------
    dic_domain : dict
        {domain: [protein, ...]}, as returned by parse_data.

    Returns
    -------
    dict
        {"dom1,dom2,...": [protein, ...]}. The key is the comma-joined,
        alphabetically sorted names of all domains sharing the same protein
        list; the value is that protein list, sorted.

    The input lists are not modified, and the domain order inside each key is
    deterministic (sorted) rather than dependent on set iteration order.
    '''
    domains_by_proteins={}
    for domain, proteins in dic_domain.items():
        # sorted() returns a new list, leaving the caller's list untouched;
        # a tuple key avoids a join/split round-trip on the protein names
        proteins_key=tuple(sorted(proteins))
        domains_by_proteins.setdefault(proteins_key,set()).add(domain)
    new_dic_domain={}
    for proteins_key, domains in domains_by_proteins.items():
        new_dic_domain[",".join(sorted(domains))]=list(proteins_key)
    return new_dic_domain

In [16]:
from collections import OrderedDict

# Two views of the annotation: domain -> proteins and protein -> domains.
dic_domain=parse_data(data)
dic_protein=simplify_data(data)

# Same mappings, ordered by decreasing number of associated items.
ordered_dic_domain=OrderedDict(
    sorted(dic_domain.items(),key=lambda item: len(item[1]),reverse=True)
)
ordered_dic_protein=OrderedDict(
    sorted(dic_protein.items(),key=lambda item: len(item[1]),reverse=True)
)

# Domains that occur in exactly the same proteins are merged into one entry.
assemble_dic_domain=assemble_domains_with_same_proteins(dic_domain)

#### Create, browse and modify ete3 tree 

In [211]:
# Single NCBITaxa instance shared by every taxonomy lookup below
# (get_topology, get_rank, get_lineage, get_taxid_translator).
# NOTE(review): presumably this requires ete3's local NCBI taxonomy database
# to be available — confirm before running on a fresh machine.
from ete3 import NCBITaxa
ncbi = NCBITaxa()

* Create tree from all taxids 

In [212]:
# Distinct taxids found across all annotated proteins.
all_taxids={data[protein]['taxid'] for protein in data}
# Taxonomy tree spanning those taxids.
all_tree=ncbi.get_topology(list(all_taxids))

* Create the dictionary used to annotate the ete3 tree  
`dic_all_taxids`:   
    Key: taxid   
    Value: dictionary {'domains': set of associated domains, 'proteins': set of associated proteins}

In [213]:
# Domains considered part of the core; they are not reported individually.
core_domain=["NAD_binding_1","FAD_binding_8","Ferric_reduct","FAD_binding_6","NAD_binding_6"]
dic_all_taxids={}
all_domains=set()
for p in data:
    entry=data[p]
    taxid=entry['taxid']
    domains=entry['hmmr']
    # every domain, core or not, is recorded in the global inventory
    all_domains.update(domains)
    # only non-core domains are attached to the taxid ...
    domains_to_add={d for d in domains if d not in core_domain}
    # ... and a protein with nothing but core domains gets a placeholder label
    if len(domains_to_add)==0:
        domains_to_add.add("Core domains")
    taxid_record=dic_all_taxids.setdefault(taxid,{'domains':set(),'proteins':set()})
    taxid_record['domains'].update(domains_to_add)
    taxid_record['proteins'].add(p)

* Complete Tree object with list of domains and proteins for each node

In [214]:
# Annotate every node of all_tree with:
#   - sameDomainNode: empty set, filled in by a later cell
#   - domains / proteins: those recorded for the node's own taxid plus those
#     of all its children
# Post-order traversal visits children before their parent, so each child's
# sets are complete when they are merged into the parent.
node_list=[]
for n in all_tree.traverse('postorder'):
    node_list.append(n)
    n.sameDomainNode=set()
    own_record=dic_all_taxids.get(n.name)
    if own_record is not None:
        # Copy the sets: the merge below must not modify dic_all_taxids
        # (the original aliased them, so internal nodes polluted the dictionary).
        n.domains=set(own_record['domains'])
        n.proteins=set(own_record['proteins'])
    else:
        n.domains=set()
        n.proteins=set()
    # Leaves have no children, so this loop is a no-op for them; no need to
    # enumerate all descendants just to test for a leaf.
    for child in n.children:
        n.domains.update(child.domains)
        n.proteins.update(child.proteins)

* For each node of the tree, record the other nodes that carry exactly the same set of domains

In [215]:
# Link together the nodes that carry exactly the same set of domains.
# Nodes are bucketed by their (frozen) domain set in a single pass instead of
# comparing every pair of nodes; the resulting sameDomainNode sets are the same
# as with the pairwise comparison.
nodes_by_domains={}
for node in node_list:
    nodes_by_domains.setdefault(frozenset(node.domains),[]).append(node)

for same_nodes in nodes_by_domains.values():
    if len(same_nodes)<2:
        # a domain set carried by one node only: nothing to link
        continue
    for node in same_nodes:
        # every other member of the bucket, never the node itself
        node.sameDomainNode.update(other for other in same_nodes if other is not node)

* Create DomainGroup objects, each holding a group of domains together with the proteins associated with them

In [234]:
from statistics import mean
class DomainGroup:
    '''Group of domains together with the proteins that carry them.

    Attributes set by __init__:
        domains  -- the domains of the group, as given
        proteins -- the proteins of the group, as given
        taxids   -- distinct taxids of those proteins (list, arbitrary order)
    Attributes set by compute_upper_node:
        upper_node -- tree node covering all taxids of the group
    Attributes set by compute_mean_max_distance:
        dists, mean_dist, max_dist -- pairwise tree distances between taxids

    Both compute_* methods rely on the module-level `ncbi` object.
    '''
    def __init__(self,domains,proteins,data):
        '''Store the domains/proteins and collect the proteins' distinct taxids.

        data -- mapping protein -> record with a "taxid" entry.
        '''
        self.domains=domains
        self.proteins=proteins
        self.taxids=list(set([data[p]["taxid"] for p in self.proteins]))

    def compute_upper_node(self,all_tree):
        '''Set self.upper_node.

        With a single taxid, the node of that name is looked up in all_tree.
        Otherwise a topology restricted to the group's taxids is built and its
        first node in traversal order is used.
        '''
        if len(self.taxids)==1:
            self.upper_node=all_tree.search_nodes(name=self.taxids[0])[0]
        else:
            tree=ncbi.get_topology(self.taxids)
            self.upper_node=next(tree.traverse())

    def compute_mean_max_distance(self):
        '''Set self.dists, self.mean_dist and self.max_dist.

        dists holds the tree distance for every unordered pair of taxids.
        With fewer than two taxids there is no pair: dists is empty and both
        statistics are 0.
        '''
        self.dists=[]
        if len(self.taxids)<2:
            # Checked before building the topology, so no taxonomy query is
            # made when there is nothing to measure.
            self.mean_dist=0
            self.max_dist=0
            return
        tree=ncbi.get_topology(self.taxids)
        for i, first in enumerate(self.taxids):
            for second in self.taxids[i+1:]:
                self.dists.append(tree.get_distance(first,second))
        self.mean_dist=mean(self.dists)
        self.max_dist=max(self.dists)

In [235]:
# One DomainGroup per entry of assemble_dic_domain: the key is a comma-joined
# list of domains, the value the proteins associated with them.
set_DomainGroup=set()
for domain_key, group_proteins in assemble_dic_domain.items():
    group=DomainGroup(domain_key.split(","),group_proteins,data)
    group.compute_upper_node(all_tree)
    group.compute_mean_max_distance()
    set_DomainGroup.add(group)

In [237]:
# percent_taxo: proteins of the group as a percentage of all proteins attached
# to the node of all_tree that has the same name as the group's upper node.
for group in set_DomainGroup:
    matching_nodes=all_tree.search_nodes(name=group.upper_node.name)
    reference_node=matching_nodes[0]
    group.percent_taxo=len(group.proteins)/len(reference_node.proteins)*100

* Save objects

In [238]:
# A single timestamp is shared by both file names, so the two pickles written
# by one run carry the same suffix.
strtime = time.strftime("%Y%m%d-%H%M%S")
domain_groups_saved="".join(["/Volumes/arwen/mobi/group/NOX_CH/pickle_saved/domain_groups_",strtime,".pickle"])
tree_saved="".join(["/Volumes/arwen/mobi/group/NOX_CH/pickle_saved/ete3_tree_",strtime,".pickle"])

In [239]:
# Write both objects through context managers so each file is flushed and
# closed before the cell finishes (the original never closed the handles).
with open(domain_groups_saved,"wb") as domain_groups_file:
    pickle.dump(set_DomainGroup,domain_groups_file)
with open(tree_saved,"wb") as tree_file:
    pickle.dump(all_tree,tree_file)

#### Generate outputs

* Load objects

In [243]:
# NOTE(review): these paths point at fixed timestamps, not at the
# domain_groups_saved / tree_saved files written above — update them when a new
# run should be analysed. Loading replaces the in-memory all_tree.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/Volumes/arwen/mobi/group/NOX_CH/pickle_saved/domain_groups_20190423-142117.pickle',"rb") as domain_groups_file:
    setDomains=pickle.load(domain_groups_file)
with open('/Volumes/arwen/mobi/group/NOX_CH/pickle_saved/ete3_tree_20190423-142117.pickle',"rb") as tree_file:
    all_tree=pickle.load(tree_file)

* Output names

In [244]:
# Destination files for the TSV reports written by the cells below.
# NOTE(review): single_domains_output is not used anywhere in the visible cells.
single_domains_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/domains_stats.tsv"
# Domain groups, ordered by number of domains in the group.
assemble_domains_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/assembled_domains_stats.tsv"
# Domain groups, ordered by number of associated proteins.
assemble_domains_by_prot_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/assembled_domains_sort_by_prot_stats.tsv"
# One line per protein with its taxonomy and domains.
proteins_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/proteins_stats.tsv"
# One line per species, ordered by number of proteins / by number of domains.
by_species_order_prot_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/by_species_prot_order.tsv"
by_species_order_dom_output="/Volumes/arwen/mobi/group/NOX_CH/Domain/by_species_dom_order.tsv"

In [256]:
def write_domains_output(set_domain_groups,domains_output):
    '''Write one tab-separated line per domain group to `domains_output`.

    Parameters
    ----------
    set_domain_groups : iterable of objects exposing domains, proteins,
        upper_node (with name, sci_name, rank), percent_taxo, mean_dist and
        max_dist. Lines are written in iteration order.
    domains_output : path of the file to create (overwritten if it exists).

    The file starts with a '#'-prefixed header line; the three numeric
    statistics are written with 5 decimals.
    '''
    header="#Domain(s)\tNumber of associated proteins\tAssociated proteins\tUpper node(taxid)\tUpper node(taxname)\tUpper node(taxrank)\t% taxon\tMean distance\tMax distance\n"
    row_format="%s\t%d\t%s\t%s\t%s\t%s\t%.5f\t%.5f\t%.5f\n"
    # The context manager closes the file even if formatting a row fails.
    with open(domains_output,"w") as o:
        o.write(header)
        for d in set_domain_groups:
            o.write(row_format % (",".join(d.domains),len(d.proteins),",".join(d.proteins),d.upper_node.name,d.upper_node.sci_name,d.upper_node.rank,d.percent_taxo,d.mean_dist,d.max_dist))
    
def write_by_specie_output(output,dic_species):
    '''Write one tab-separated line per species to `output`.

    Parameters
    ----------
    output : path of the file to create (overwritten if it exists).
    dic_species : mapping (taxid, name) -> {'proteins': ..., 'domains': ...},
        both values being collections of strings. Lines are written in the
        iteration order of the mapping.

    Each line holds: taxid, name, number of proteins, number of domains,
    comma-joined proteins, comma-joined domains.
    '''
    # The context manager closes the file even if writing a row fails.
    with open(output,"w") as o:
        o.write("#Species taxid\tSpecies name\tNumber of proteins\tNumber of domains\tProteins\tDomains\n")
        for sp in dic_species:
            proteins=dic_species[sp]['proteins']
            domains=dic_species[sp]['domains']
            o.write("\t".join([str(sp[0]),sp[1],str(len(proteins)),str(len(domains)),",".join(proteins),",".join(domains)])+"\n")

In [249]:
# Same groups, two orderings: largest number of domains first, and largest
# number of proteins first.
ordered_domain_setDomains=sorted(
    setDomains,key=lambda group: len(group.domains),reverse=True
)
ordered_protein_setDomains=sorted(
    setDomains,key=lambda group: len(group.proteins),reverse=True
)

# One report per ordering.
write_domains_output(ordered_domain_setDomains,assemble_domains_output)
write_domains_output(ordered_protein_setDomains,assemble_domains_by_prot_output)

In [250]:
# Per-protein report: taxonomy of the protein followed by its domains.
# The taxonomy fields are resolved here from `data` and `ncbi` instead of being
# read from dic_taxonomy, which is only built in a later cell: on a fresh
# kernel run top to bottom, dic_taxonomy does not exist yet at this point.
# The context manager closes the file even if a lookup fails.
with open(proteins_output,"w") as o:
    o.write("#Protein\tTaxid\tTaxname\tTaxrank\tNumber of domains\tDomains\n")
    for p in ordered_dic_protein:
        taxid=data[p]['taxid']
        # the ncbi lookups return dictionaries keyed by integer taxid
        taxname=ncbi.get_taxid_translator([taxid])[int(taxid)]
        taxrank=ncbi.get_rank([taxid])[int(taxid)]
        o.write("\t".join([p,taxid,taxname,taxrank,str(len(ordered_dic_protein[p])),",".join(ordered_dic_protein[p])])+"\n")

In [252]:
def get_tax_level(taxid,tax_level):
    '''Return the (taxid, name) of `taxid` itself or of its ancestor at rank `tax_level`.

    Parameters
    ----------
    taxid : int or value convertible with int()
    tax_level : rank name to look for (e.g. "species").

    Returns
    -------
    tuple (int taxid, str name)
        The taxon itself if its rank is `tax_level`; otherwise the first taxon
        of its lineage whose rank is `tax_level`.
    None
        If no taxon of the lineage has that rank.

    Relies on the module-level `ncbi` object.
    '''
    taxid=int(taxid)
    if ncbi.get_rank([taxid])[taxid]==tax_level:
        return (taxid,ncbi.get_taxid_translator([taxid])[taxid])
    ranks=ncbi.get_rank(ncbi.get_lineage(taxid))
    matching=[ancestor for ancestor in ranks if ranks[ancestor]==tax_level]
    if not matching:
        return None
    # Return the ancestor's own taxid together with its name. The original
    # returned the input taxid paired with the ancestor's name, so taxa below
    # the requested rank were never grouped under the same key.
    ancestor=matching[0]
    return (ancestor,ncbi.get_taxid_translator([ancestor])[ancestor])

In [253]:
# protein -> {'taxid', 'taxname', 'taxrank'} for the protein's own taxid.
# The ncbi lookups take the taxid as stored in `data` and return dictionaries
# indexed by its integer form.
dic_taxonomy={}
for protein in data:
    raw_taxid=data[protein]['taxid']
    names_by_id=ncbi.get_taxid_translator([raw_taxid])
    tax_name=names_by_id[int(raw_taxid)]
    ranks_by_id=ncbi.get_rank([raw_taxid])
    tax_rank=ranks_by_id[int(raw_taxid)]
    dic_taxonomy[protein]={'taxid':raw_taxid,'taxname':tax_name,'taxrank':tax_rank}

In [254]:
# Group proteins and their domains by the value get_tax_level returns for the
# "species" level; proteins for which it returns nothing are left out.
dic_specie={}
for p, taxonomy in dic_taxonomy.items():
    specie=get_tax_level(taxonomy['taxid'],"species")
    if not specie:
        continue
    species_record=dic_specie.setdefault(specie,{'proteins':set(),'domains':set()})
    species_record['proteins'].add(p)
    species_record['domains'].update(dic_protein[p])

# Same grouping, ordered by decreasing number of proteins / of domains.
ordered_prot_dic_species=OrderedDict(
    sorted(dic_specie.items(),key=lambda item: len(item[1]['proteins']),reverse=True)
)
ordered_domain_dic_species=OrderedDict(
    sorted(dic_specie.items(),key=lambda item: len(item[1]['domains']),reverse=True)
)

In [257]:
# Write the per-species report once per ordering (by proteins, then by domains).
for _report_path, _species_table in (
    (by_species_order_prot_output,ordered_prot_dic_species),
    (by_species_order_dom_output,ordered_domain_dic_species),
):
    write_by_specie_output(_report_path,_species_table)