In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns

In [3]:
import xml.etree.ElementTree as ET
tree = ET.parse('../data/features/interpro/interpro.xml')
root = tree.getroot()
root

<Element 'interprodb' at 0x1109c3f48>

In [4]:
item = root[1]
item.attrib

{'id': 'IPR000001',
 'protein_count': '3887',
 'short_name': 'Kringle',
 'type': 'Domain'}

In [5]:
def get_item_pfams(item):
    member_list = item.find('member_list')
    return list(set([xref.get('dbkey') for xref in member_list if xref.get('db') == 'PFAM']))

In [6]:
items = root.findall('interpro')

In [7]:
pfam_lists = [get_item_pfams(item) for item in items]

In [8]:
ids = [item.get('id') for item in items] 

In [9]:
pd.Series([len(p) for p in pfam_lists], ids).sort_values(ascending=False)[:10]

IPR017896    13
IPR000182     6
IPR002048     5
IPR012336     5
IPR025668     5
IPR024311     5
IPR001841     5
IPR002885     4
IPR009959     4
IPR001357     4
dtype: int64

In [22]:
items[2].find('class_list')[0].find('description').text

'metal ion binding'

In [23]:
def get_item_classes(item):
    classes = item.find('class_list') or []
    return list(set([c.find('description').text for c in classes]))

In [24]:
classes_lists = [get_item_classes(item) for item in items]

In [33]:
classes_merged = pd.Series([c for classes in classes_lists for c in classes])

In [56]:
top_classes = classes_merged.value_counts(ascending=False)[:30]
top_classes

integral component of membrane                                  1089
oxidation-reduction process                                      807
DNA binding                                                      652
membrane                                                         633
ATP binding                                                      585
nucleus                                                          495
regulation of transcription, DNA-templated                       483
cytoplasm                                                        441
extracellular region                                             399
G-protein coupled receptor signaling pathway                     287
RNA binding                                                      285
structural constituent of ribosome                               274
translation                                                      266
transcription factor activity, sequence-specific DNA binding     253
zinc ion binding                  

In [57]:
props = []
for classes, pfam_ids in zip(classes_lists, pfam_lists):
    for pfam_id in pfam_ids:
        prop = {
            'pfam_id': pfam_id, 
            'classes': ';'.join(classes)
        }
        for c in top_classes.index:
            prop[c] = int(c in classes) if classes else None
        props.append(prop)
props = pd.DataFrame(props).set_index('pfam_id').sort_index()
props = props[['classes']+list(top_classes.index)]
props.head()

Unnamed: 0_level_0,classes,integral component of membrane,oxidation-reduction process,DNA binding,membrane,ATP binding,nucleus,"regulation of transcription, DNA-templated",cytoplasm,extracellular region,...,pathogenesis,carbohydrate metabolic process,protein binding,structural molecule activity,DNA repair,catalytic activity,calcium ion binding,metal ion binding,integral component of plasma membrane,"transcription, DNA-templated"
pfam_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PF00001,integral component of membrane;G-protein coupl...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PF00002,integral component of membrane;G-protein coupl...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PF00003,integral component of membrane;G-protein coupl...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PF00004,ATP binding,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PF00005,ATPase activity;ATP binding,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
props.to_csv('../data/features/interpro/interpro_props.csv', index=True)