# Domain analysis

In [8]:
import pickle
import sys, os
sys.path.append("/Users/chilpert/Work/pyproteinsExt/src")
sys.path.append("/Users/chilpert/Work/pyproteins/src")
import pyproteinsExt 
import pyproteins

##### Load full Pfam annotation

In [9]:
# Load the pickled annotation and report how many entries it holds.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
# The `with` block guarantees the file handle is closed even if unpickling fails.
with open("/Volumes/arwen/mobi/group/NOX_CH/pickle_saved/NOX_annotation_fullPFAM_20190411-174326.pickle","rb") as annotation_file:
    data=pickle.load(annotation_file)
print(len(data))

832


##### Parse data 

In [317]:
def parse_data(data):
    '''Invert the annotation: map each domain name to the list of proteins carrying it.

    `data` is indexed by protein identifier; iterating over the 'hmmr' entry of
    each record yields that protein's domain names. Within each list, proteins
    appear in the order in which `data` is iterated.
    '''
    proteins_by_domain={}
    for protein_id in data:
        for domain_name in data[protein_id]['hmmr']:
            # setdefault creates the list the first time a domain is seen.
            proteins_by_domain.setdefault(domain_name,[]).append(protein_id)
    return proteins_by_domain

def simplify_data(data):
    '''Map each protein identifier of `data` to the list of keys of its 'hmmr' entry (its domain names).'''
    return {protein_id:list(data[protein_id]['hmmr'].keys()) for protein_id in data}

def assemble_domains_with_same_proteins(dic_domain):
    '''Group together the domains that are carried by exactly the same proteins.

    dic_domain maps a domain name to a list of protein identifiers. The returned
    dictionary maps a comma-separated list of domain names to the sorted list of
    proteins those domains share.

    Domain names inside a key follow the iteration order of dic_domain, so the
    keys are reproducible from one run to the next (joining a set of strings,
    as was done previously, gives an order that depends on string hashing).

    Side effect: every protein list stored in dic_domain is sorted in place.
    This is kept on purpose, since other mappings built from dic_domain share
    these list objects.

    NOTE(review): protein identifiers and domain names are joined and split on
    ",", so an identifier containing a comma would be split incorrectly.
    '''
    domains_by_proteins={}
    for domain,proteins in dic_domain.items():
        # Sorting makes the joined string independent of the order in which
        # the proteins were collected.
        proteins.sort()
        domains_by_proteins.setdefault(",".join(proteins),[]).append(domain)
    return {",".join(domains):proteins_key.split(",") for proteins_key,domains in domains_by_proteins.items()}

In [318]:
from collections import OrderedDict

def _ordered_by(mapping,score):
    '''Return an OrderedDict of mapping's items sorted by decreasing score((key, value)).'''
    return OrderedDict(sorted(mapping.items(),key=score,reverse=True))

def _value_length(kv):
    '''Score a (key, value) pair by the number of elements in its value.'''
    return len(kv[1])

# domain -> proteins, most widespread domains first
dic_domain=parse_data(data)
ordered_dic_domain=_ordered_by(dic_domain,_value_length)
# protein -> domains, proteins with the most domains first
dic_protein=simplify_data(data)
ordered_dic_protein=_ordered_by(dic_protein,_value_length)
# domains grouped by identical protein lists
assembled_dic_domain=assemble_domains_with_same_proteins(dic_domain)
# ... ordered by the number of commas in the key (i.e. number of grouped domains)
ordered_assembled_dic_domain=_ordered_by(assembled_dic_domain,lambda kv: kv[0].count(","))
# ... ordered by the number of associated proteins
ordered_assembled_dic_domain_by_proteins=_ordered_by(assembled_dic_domain,_value_length)

##### Handle taxonomy with ete3

In [332]:
from ete3 import NCBITaxa
# Shared taxonomy handle: the helper functions and cells below use it for
# rank, name, lineage and topology lookups.
ncbi = NCBITaxa()
# NOTE(review): presumably refreshes the local taxonomy database when uncommented — confirm against the ete3 documentation.
#ncbi.update_taxonomy_database()

In [320]:
def get_upper_node(list_taxid):
    '''Return (name, sci_name, rank) of the first node yielded when traversing
    the topology that ncbi.get_topology builds for list_taxid.'''
    topology=ncbi.get_topology(list_taxid)
    # Only the first node of the traversal is needed.
    top_node=next(topology.traverse())
    return (top_node.name,top_node.sci_name,top_node.rank)

def get_tax_level(taxid,tax_level):
    '''Return (taxid, name) of the taxon of rank tax_level that taxid belongs to.

    taxid is converted with int(). If taxid itself has rank tax_level, it is
    returned with its own name. Otherwise the lineage returned by
    ncbi.get_lineage(taxid) is searched for a taxon whose rank is tax_level,
    and that taxon's own taxid and name are returned. Returns None when no
    taxon of the lineage has the requested rank.

    The taxid returned is always the one the name belongs to. Previously the
    queried taxid was paired with the ancestor's name, so two strains of one
    species produced two different (taxid, name) pairs.
    '''
    taxid=int(taxid)
    if ncbi.get_rank([taxid])[taxid]==tax_level:
        return (taxid,ncbi.get_taxid_translator([taxid])[taxid])
    ranks=ncbi.get_rank(ncbi.get_lineage(taxid))
    matches=[ancestor for ancestor in ranks if ranks[ancestor]==tax_level]
    if not matches:
        return None
    ancestor=matches[0]
    return (ancestor,ncbi.get_taxid_translator([ancestor])[ancestor])

In [322]:
# For every protein, record its taxid together with the name and rank that
# the taxonomy handle reports for that taxid.
dic_taxonomy={}
for p in data:
    taxid=data[p]['taxid']
    names=ncbi.get_taxid_translator([taxid])
    # The lookup results are indexed with the integer form of the taxid.
    taxname=names[int(taxid)]
    ranks=ncbi.get_rank([taxid])
    taxrank=ranks[int(taxid)]
    dic_taxonomy[p]=dict(taxid=taxid,taxname=taxname,taxrank=taxrank)

In [324]:
# Gather, per species, the proteins found in it and the union of their domains.
# NOTE(review): dic_protein must be the protein -> domains mapping produced by
# simplify_data; another cell of this notebook rebinds the same name.
dic_specie={}
for p in dic_taxonomy:
    specie=get_tax_level(dic_taxonomy[p]['taxid'],"species")
    if not specie:
        # No species-level taxon was found for this protein: leave it out.
        continue
    species_entry=dic_specie.setdefault(specie,{'proteins':set(),'domains':set()})
    species_entry['proteins'].add(p)
    species_entry['domains'].update(dic_protein[p])

def _species_ordered_by_count(field):
    '''Return dic_specie as an OrderedDict sorted by decreasing size of the given field.'''
    ranked=sorted(dic_specie,key=lambda sp: len(dic_specie[sp][field]),reverse=True)
    return OrderedDict((sp,dic_specie[sp]) for sp in ranked)

ordered_prot_dic_species=_species_ordered_by_count('proteins')
ordered_domain_dic_species=_species_ordered_by_count('domains')

##### Generate outputs

In [328]:
# Every statistics table is written to the same directory.
domain_output_dir="/Volumes/arwen/mobi/group/NOX_CH/Domain"
single_domains_output=domain_output_dir+"/domains_stats.tsv"
assemble_domains_output=domain_output_dir+"/assembled_domains_stats.tsv"
assemble_domains_by_prot_output=domain_output_dir+"/assembled_domains_sort_by_prot_stats.tsv"
proteins_output=domain_output_dir+"/proteins_stats.tsv"
by_species_order_prot_output=domain_output_dir+"/by_species_prot_order.tsv"
by_species_order_dom_output=domain_output_dir+"/by_species_dom_order.tsv"

In [325]:
def write_domains_output(domains_output,dic_domain):
    '''Write one tab-separated line per entry of dic_domain to domains_output.

    dic_domain maps a domain label (one domain or a comma-separated group) to
    the list of its proteins. Each line holds the label, the number of
    proteins, the comma-joined proteins, and the (taxid, name, rank) triple
    returned by get_upper_node for the taxids of those proteins. Taxids are
    read from the module-level dic_taxonomy.

    The file is opened in a `with` block so that it is closed even when a
    lookup fails part-way through.
    '''
    header="#Domain(s)\tNumber of associated proteins\tAssociated proteins\tUpper node(taxid)\tUpper node(taxname)\tUpper node(taxrank)\n"
    with open(domains_output,"w") as o:
        o.write(header)
        for d,proteins in dic_domain.items():
            taxids=[dic_taxonomy[p]['taxid'] for p in proteins]
            upper_node=get_upper_node(taxids)
            o.write("\t".join([d,str(len(proteins)),",".join(proteins),upper_node[0],upper_node[1],upper_node[2]])+"\n")

def write_by_specie_output(output,dic_species):
    '''Write one tab-separated line per species of dic_species to output.

    dic_species is keyed by (taxid, name) pairs; each value holds a 'proteins'
    and a 'domains' collection. A line contains the taxid, the name, both
    counts, then the comma-joined proteins and domains.

    Proteins and domains are written in sorted order so that the file content
    does not depend on set iteration order. The header separates its first two
    column names with a tab (it previously contained "\\S", which fused them).
    The file is opened in a `with` block so it is closed even on error.
    '''
    header="#Species taxid\tSpecies name\tNumber of proteins\tNumber of domains\tProteins\tDomains\n"
    with open(output,"w") as o:
        o.write(header)
        for sp in dic_species:
            proteins=dic_species[sp]['proteins']
            domains=dic_species[sp]['domains']
            o.write("\t".join([str(sp[0]),sp[1],str(len(proteins)),str(len(domains)),",".join(sorted(proteins)),",".join(sorted(domains))])+"\n")

In [329]:
# One table per domain mapping: single domains, then grouped domains in both orderings.
for domains_table_path,domain_mapping in (
    (single_domains_output,ordered_dic_domain),
    (assemble_domains_output,ordered_assembled_dic_domain),
    (assemble_domains_by_prot_output,ordered_assembled_dic_domain_by_proteins),
):
    write_domains_output(domains_table_path,domain_mapping)

In [330]:
# Write one line per protein: identifier, taxonomy (taxid, name, rank),
# number of domains and the comma-joined domain names.
# The `with` block closes the file even if a lookup or write fails part-way.
with open(proteins_output,"w") as o:
    o.write("#Protein\tTaxid\tTaxname\tTaxrank\tNumber of domains\tDomains\n")
    for p in ordered_dic_protein:
        taxonomy=dic_taxonomy[p]
        domains=ordered_dic_protein[p]
        o.write("\t".join([p,taxonomy['taxid'],taxonomy['taxname'],taxonomy['taxrank'],str(len(domains)),",".join(domains)])+"\n")

In [331]:
# Same per-species content, written once per ordering (by proteins, by domains).
for species_table_path,species_mapping in (
    (by_species_order_prot_output,ordered_prot_dic_species),
    (by_species_order_dom_output,ordered_domain_dic_species),
):
    write_by_specie_output(species_table_path,species_mapping)

In [32]:
class Domains:
    '''A group of domains together with the proteins associated with that group.'''
    def __init__(self,domains,proteins):
        # Both arguments are stored as given, without copying.
        self.domains,self.proteins=domains,proteins

* Search for "specific domains".

Each group of domains is associated with a list of proteins: the domains of a group are found only in the proteins associated with that group.

In [30]:
def compute_specific_domains(dic_domain):
    '''Group the domains of dic_domain by the exact list of proteins attached to them.

    Return a dictionary whose keys are comma-joined protein lists (joined in
    the order stored in dic_domain, without sorting) and whose values are the
    lists of domain names sharing that key.
    '''
    domains_by_proteins={}
    for domain in dic_domain:
        proteins_key=",".join(dic_domain[domain])
        # setdefault creates the list the first time a protein list is seen.
        domains_by_proteins.setdefault(proteins_key,[]).append(domain)
    return domains_by_proteins

In [33]:
# NOTE(review): this rebinds dic_protein to a "comma-joined proteins -> domains"
# mapping, whereas another cell binds the same name to "protein -> domains".
dic_protein=compute_specific_domains(dic_domain)
# One Domains object per distinct protein list; the joined key is split back into a set.
list_domains=[
    Domains(domain_names,set(proteins_key.split(",")))
    for proteins_key,domain_names in dic_protein.items()
]

{'tr|A0A1N6G2S4|A0A1N6G2S4_9LACT,tr|C5R8S4|C5R8S4_WEIPA,tr|A0A1Q8L5Q3|A0A1Q8L5Q3_9PSEU,tr|A0A3F2SH26|A0A3F2SH26_9PSED,tr|A0A3A4ZND6|A0A3A4ZND6_9BACT,tr|A0A0B1XSI4|A0A0B1XSI4_9BACI,tr|A0A2K7SPG0|A0A2K7SPG0_9VIBR,tr|A0A3A2I0H6|A0A3A2I0H6_9VIBR,tr|A0A0W7Y8U8|A0A0W7Y8U8_9BACI,tr|A0A136KU56|A0A136KU56_9CHLR,tr|A0A0P6XZE1|A0A0P6XZE1_9CHLR,tr|A0A2I0FGC7|A0A2I0FGC7_9GAMM,tr|A3TNH7|A3TNH7_9MICO,tr|A0A2G2GL45|A0A2G2GL45_9RHOB,tr|A0A1G5FSB5|A0A1G5FSB5_9RHOB,tr|A0A285KXT9|A0A285KXT9_9ACTN,tr|A0A1I4C3G4|A0A1I4C3G4_9RHOB,tr|A0A285ITD7|A0A285ITD7_9ACTN,tr|A0A368T5V7|A0A368T5V7_9ACTN,tr|A0A2G9PU25|A0A2G9PU25_9ARCH,tr|D6Z303|D6Z303_DESAT,tr|A0A1C5JKJ0|A0A1C5JKJ0_9ACTN,tr|A0A1U9JJ54|A0A1U9JJ54_9NEIS,tr|A0A1B9Q0I3|A0A1B9Q0I3_9VIBR,tr|A0A2N7ICW5|A0A2N7ICW5_9VIBR,tr|A0A2N7GWA1|A0A2N7GWA1_9VIBR,tr|A0A2T8L9H0|A0A2T8L9H0_SALET,tr|A0A2N7IHZ8|A0A2N7IHZ8_9VIBR,tr|A0A2J6UIY5|A0A2J6UIY5_9VIBR,tr|A0A0L1LGC9|A0A0L1LGC9_9VIBR,tr|A0A0G0I7W1|A0A0G0I7W1_9BACT,tr|A0A2N2GTA6|A0A2N2GTA6_9DELT,tr|A0A0M4Q5C5|A0A0M4Q5C5_9PSEU

In [53]:
# Order the domain groups from the most to the least widespread (by number of proteins).
sorted_domains=list(list_domains)
sorted_domains.sort(key=lambda group:len(group.proteins),reverse=True)
for d in sorted_domains:
    print(d.domains,len(d.proteins))

['NAD_binding_1'] 827
['FAD_binding_8'] 817
['Ferric_reduct'] 784
['FAD_binding_6'] 703
['NAD_binding_6'] 616
['SdpI'] 85
['DUF4405'] 68
['Glyco_tranf_2_3'] 31
['PrgI'] 29
['VKOR'] 26
['MIG-14_Wnt-bd'] 24
['DUF4079'] 23
['7TMR-DISM_7TM'] 22
['GH97_C'] 18
['DUF1516'] 15
['Polysacc_synt_C'] 14
['DUF2070'] 11
['Selenoprotein_S'] 10
['EF-hand_1', 'EF-hand_7', 'EF-hand_6', 'EF-hand_8', 'EF-hand_5'] 10
['DUF3397'] 10
['DUF4131'] 10
['Acyl_transf_3'] 9
['ABC2_membrane_2'] 9
['DUF3382'] 9
['Peptidase_U4'] 9
['DUF5467'] 8
['LapA_dom'] 8
['PgaD'] 7
['Phage_holin_3_6'] 7
['DUF2157'] 7
['Met_gamma_lyase'] 5
['NfeD'] 5
['YaeQ', 'Auxin_resp'] 5
['DUF5336'] 5
['UPF0242'] 5
['Rhamno_transf'] 5
['DUF5396'] 5
['FTSW_RODA_SPOVE'] 5
['DUF4381'] 5
['Herpes_UL49_5'] 5
['DUF3043'] 5
['DUF1282'] 5
['YlaH'] 5
['Wzy_C'] 4
['MerC'] 4
['Ni_hydr_CYTB'] 4
['DUF2919'] 4
['Trp_oprn_chp'] 4
['EamA'] 4
['B12-binding'] 4
['TrbC'] 4
['7TM-7TMR_HD'] 4
['Phage_holin_3_5'] 4
['PhoLip_ATPase_C'] 4
['SecD_SecF'] 4
['SIP'] 4
[

In [94]:
total_number_proteins=len(data)
# The five domain groups with the most proteins are treated as "major".
major_domains=sorted_domains[:5]
proteins_major_domains=set()
proteins_nad=set()
proteins_fad=set()
proteins_ferric=set()
# Label -> accumulator; the first label found in a domain name wins.
labelled_sets=(
    ("NAD_binding",proteins_nad),
    ("FAD_binding",proteins_fad),
    ("Ferric_reduct",proteins_ferric),
)

def _percentage(proteins):
    '''Return the share of all proteins represented by `proteins`, in percent, as text.'''
    return str(len(proteins)/total_number_proteins*100)

for d in major_domains:
    for domain in d.domains:
        for label,members in labelled_sets:
            if label in domain:
                members.update(d.proteins)
                break
    print(d.domains,len(d.proteins),len(d.proteins)/total_number_proteins*100)
    proteins_major_domains.update(d.proteins)

print(_percentage(proteins_major_domains)+"% of proteins contains at least 1 major domain.")
print(_percentage(proteins_nad)+'% of proteins contains a domain labelled "NAD_binding".')
print(_percentage(proteins_fad)+'% of proteins contains a domain labelled "FAD_binding".')
print(_percentage(proteins_ferric)+'% of proteins contains a domain labelled "Ferric_reduct"')

['NAD_binding_1'] 827 99.39903846153845
['FAD_binding_8'] 817 98.19711538461539
['Ferric_reduct'] 784 94.23076923076923
['FAD_binding_6'] 703 84.4951923076923
['NAD_binding_6'] 616 74.03846153846155
100.0% of proteins contains at least 1 major domain.
100.0% of proteins contains a domain labelled "NAD_binding".
99.51923076923077% of proteins contains a domain labelled "FAD_binding".
94.23076923076923% of proteins contains a domain labelled "Ferric_reduct"


In [104]:
# Proteins missing from the FAD_binding / Ferric_reduct sets computed above.
all_proteins=set(data)
print("Proteins that don't contain FAD_binding :", all_proteins-proteins_fad)
print("Proteins that don't contain Ferric_reduct :",all_proteins-proteins_ferric)

Proteins that don't contain FAD_binding : {'tr|A0A327VK97|A0A327VK97_9ACTN', 'tr|A0A0M8U3E5|A0A0M8U3E5_9ACTN', 'tr|A0A0N0SW46|A0A0N0SW46_9ACTN', 'tr|A0A0F4IKA1|A0A0F4IKA1_9ACTN'}
Proteins that don't contain Ferric_reduct : {'tr|A0A345HY22|A0A345HY22_9ACTN', 'tr|A0A383TAJ2|A0A383TAJ2_9LACT', 'tr|A0A2C9EM45|A0A2C9EM45_PSEPH', 'tr|C5R8S4|C5R8S4_WEIPA', 'tr|A0A1H8Q3U4|A0A1H8Q3U4_9GAMM', 'tr|A0A3D3GDH3|A0A3D3GDH3_9PROT', 'tr|A0A1M4XSG7|A0A1M4XSG7_9LACT', 'tr|A0A2K4JCS1|A0A2K4JCS1_9PSED', 'tr|A0A2T5IGN1|A0A2T5IGN1_9LACT', 'tr|A0A0J7HUJ9|A0A0J7HUJ9_9BACT', 'tr|A0A1N6G2S4|A0A1N6G2S4_9LACT', 'tr|A0A3F2SH26|A0A3F2SH26_9PSED', 'tr|G8PEX6|G8PEX6_PEDCP', 'tr|A0A143YUW5|A0A143YUW5_9LACT', 'tr|A0A3D3QU40|A0A3D3QU40_9FIRM', 'tr|A0A261FX35|A0A261FX35_9BIFI', 'tr|A0A0W0NTK0|A0A0W0NTK0_9PSED', 'tr|A0A352TU28|A0A352TU28_9MOLU', 'tr|Q04GH6|Q04GH6_OENOB', 'tr|A0A3G7ANK0|A0A3G7ANK0_9PSED', 'tr|A0A1G3QHY7|A0A1G3QHY7_9SPIR', 'tr|A0A143YPZ0|A0A143YPZ0_9LACT', 'tr|A0A2T0ZMP6|A0A2T0ZMP6_9PSED', 'tr|S4NJU4|S4NJU4_

In [106]:
# Show the domain names recorded for one of the proteins reported as lacking FAD_binding.
print(data['tr|A0A327VK97|A0A327VK97_9ACTN']['hmmr'].keys())

dict_keys(['NAD_binding_1', 'Ferric_reduct', 'NAD_binding_6'])
