Expected CSV file column names:

ID <br>
InterPro <br>
Domain_architecture <br>

In [1]:
!pip install spyprot



In [1]:
import spyprot
import pandas as pd
from tqdm import tqdm



### Test UniProt search

In [2]:
# Smoke test: request the InterPro and Pfam cross-reference fields for one accession
# and print what comes back for it.
uni = spyprot.UniprotSearch(['xref_interpro', 'xref_pfam'], accessions=['A0A011VZ55'])
res = uni.get()
print(res['A0A011VZ55'])

['IPR029028;IPR003742;IPR029026;', 'PF02590;']


## Add InterPro and Pfam annotations to the CSV

In [3]:
# Input table to annotate and the file the annotated table is written to.
FILE_IN_NAME = 'length_normalized_v2.csv.gz'
FILE_OUT_NAME = 'families_added_v2.csv.gz'

# Number of accessions passed to a single UniprotSearch call.
# 100 looked like a reasonable value; larger batches were observed to behave erratically.
SIZE = 100

In [4]:
# Register Series.progress_apply so the pandas steps below show a progress bar.
tqdm.pandas()
df = pd.read_csv(FILE_IN_NAME)
# Keep only the second '-'-separated field of each ID
# (presumably the bare UniProt accession -- verify against the input file).
df['ID'] = df['ID'].apply(lambda x: x.split(sep='-')[1])

print(df.head())

accessions = df['ID'].tolist()
print(accessions[:10])

# accession -> result of the query; the test cell shows a two-item list
# in the order of the requested fields (InterPro ids, Pfam ids).
final_res = {}

# Query UniProt in batches of SIZE. Stepping all the way to len(accessions) lets the
# last (possibly shorter) slice cover the tail, so no accession is requested twice
# and no separate tail query is needed.
for i in tqdm(range(0, len(accessions), SIZE)):
    uni = spyprot.UniprotSearch(['xref_interpro', 'xref_pfam'], accessions=accessions[i:i + SIZE])
    res = uni.get()
    final_res.update(res)

# Fail with the offending accessions listed instead of a bare KeyError raised
# from inside the lambdas below.
missing = sorted(acc for acc in set(accessions) if acc not in final_res)
if missing:
    raise KeyError(
        f"{len(missing)} accession(s) missing from the UniProt results, e.g. {missing[:10]}"
    )

df['InterPro'] = df['ID'].progress_apply(lambda x: final_res[x][0])
df['Domain_architecture'] = df['ID'].progress_apply(lambda x: final_res[x][1])


           ID  latestVersion  globalMetricValue  uniprotStart  uniprotEnd  \
0  A0A7M7PLY5              4              86.25             1         343   
1      U3KBG8              4              72.88             1         727   
2  A0A6L8UGY5              4              95.88             1         333   
3  A0A1E4KET7              4              88.75             1         359   
4  A0A2D6DPQ0              4              86.19             1         317   

                                     uniprotSequence  Length  \
0  MAGASAGDWCLIESDPGVFTELIRGFGVGGMQVEEIWTLDDDTALE...   343.0   
1  MKATAIATFFGVFLTCTYTAKEATKKTKKAKLYVPQIDCDVKAGKI...   727.0   
2  MTISFRNRSFLKLLDFETEEIQYLLDLAASLKKAKRSGTEQQYLKG...   333.0   
3  MLYLTFAAGLVLLILGADVLVRGASKLALSWGISPLVVGLTVVAFG...   359.0   
4  MTFVYFGIFITSFFILAYASSRLISSLTDIAKFLGWKEFVVAFFTM...   317.0   

  Domain_architecture                                           InterPro  \
0    PF01088;PF18031;  IPR038765;IPR001578;IPR036959;IPR017390;IPR041...   


100%|██████████| 1987/1987 [07:11<00:00,  4.61it/s]
100%|██████████| 198786/198786 [00:00<00:00, 930178.16it/s]
100%|██████████| 198786/198786 [00:00<00:00, 1154886.17it/s]


In [10]:
# Show the first five rows of the annotated table.
print(df.head(n=5))

           ID  latestVersion  globalMetricValue  uniprotStart  uniprotEnd  \
0  A0A7M7PLY5              4              86.25             1         343   
1      U3KBG8              4              72.88             1         727   
2  A0A6L8UGY5              4              95.88             1         333   
3  A0A1E4KET7              4              88.75             1         359   
4  A0A2D6DPQ0              4              86.19             1         317   

                                     uniprotSequence  Length  \
0  MAGASAGDWCLIESDPGVFTELIRGFGVGGMQVEEIWTLDDDTALE...   343.0   
1  MKATAIATFFGVFLTCTYTAKEATKKTKKAKLYVPQIDCDVKAGKI...   727.0   
2  MTISFRNRSFLKLLDFETEEIQYLLDLAASLKKAKRSGTEQQYLKG...   333.0   
3  MLYLTFAAGLVLLILGADVLVRGASKLALSWGISPLVVGLTVVAFG...   359.0   
4  MTFVYFGIFITSFFILAYASSRLISSLTDIAKFLGWKEFVVAFFTM...   317.0   

  Domain_architecture                                           InterPro  \
0    PF01088;PF18031;  IPR038765;IPR001578;IPR036959;IPR017390;IPR041...   


## Map InterPro numbers to readable family names

In [6]:
# Lookup table: InterPro accession -> human-readable family label.
# Several accessions may share one label.
family_mapping = {
    'IPR029026': 'SPOUT',
    'IPR004837': 'membrane',
    'IPR036398': 'Carbonic anhydrase',
    'IPR036901': 'ATCase/OTCase',
    'IPR006131': 'ATCase/OTCase',
    'IPR032695': 'membrane',
    'IPR013649': 'membrane',
    'IPR004813': 'membrane',
    'IPR013694': 'VIT',
    'IPR018723': 'DUF',
    'IPR002035': 'VIT',
    'IPR022628': 'AdoMet synthase',
    'IPR002133': 'AdoMet synthase',
    'IPR022636': 'AdoMet synthase',
    'IPR008927': 'PGluconate dehydrogenase',
    'IPR013328': 'PGluconate dehydrogenase',
    'IPR013023': 'PGluconate dehydrogenase',
    'IPR003929': 'membrane',
    'IPR001578': 'UCH',
    'IPR036959': 'UCH',
    'IPR006827': 'biosynthesis of lantibiotics',
    'IPR005636': 'TDD',
    'IPR038459': 'SPOUT',
    'IPR007209': 'TDD',
    'IPR022968': 'TDD',
    'IPR007177': 'TDD',
    'IPR010793': 'ribosomal-mitochondrial',
    'IPR025349': 'DUF',
    'IPR027790': 'AdoMet synthase',
    'IPR042544': 'AdoMet synthase',
    'IPR002795': 'AdoMet synthase',
    'IPR018883': 'Carbonic anhydrase',
}

In [7]:
def map_family(ipro, mapping=None):
    """Translate a ';'-separated string of InterPro ids into family names.

    Parameters
    ----------
    ipro : str or missing value
        InterPro accessions separated by ';' (a trailing ';' is fine).
        A missing value (anything ``pd.isna`` reports as missing) yields ''.
    mapping : dict, optional
        InterPro accession -> family name. Defaults to the notebook-level
        ``family_mapping``.

    Returns
    -------
    str
        The distinct family names found, sorted alphabetically and joined
        with ','. Empty string when nothing matches.
    """
    if pd.isna(ipro):
        return ''
    if mapping is None:
        mapping = family_mapping
    families = {mapping[fam] for fam in ipro.split(';') if fam in mapping}
    # Sort before joining: set iteration order for strings can differ between
    # interpreter runs, which would make the written CSV non-reproducible.
    return ','.join(sorted(families))

In [8]:
# Derive a readable family label for every row from its InterPro ids,
# attach it as a new column, then print the resulting table.
family_names = df['InterPro'].progress_apply(map_family)
df['FamilyName'] = family_names
print(df)

100%|██████████| 198786/198786 [00:00<00:00, 554149.14it/s]

                ID  latestVersion  globalMetricValue  uniprotStart  \
0       A0A7M7PLY5              4              86.25             1   
1           U3KBG8              4              72.88             1   
2       A0A6L8UGY5              4              95.88             1   
3       A0A1E4KET7              4              88.75             1   
4       A0A2D6DPQ0              4              86.19             1   
...            ...            ...                ...           ...   
198781  A0A7W0S8Y7              4              83.31             1   
198782  A0A0F7ZYI8              4              90.31             1   
198783  A0A6J1SY01              4              95.06             1   
198784  A0A2I4F3K1              4              79.38             1   
198785  A0A4D6XUF1              4              96.25             1   

        uniprotEnd                                    uniprotSequence  Length  \
0              343  MAGASAGDWCLIESDPGVFTELIRGFGVGGMQVEEIWTLDDDTALE...   343.0 




In [9]:
# Write the annotated table to FILE_OUT_NAME without the row index.
df.to_csv(path_or_buf=FILE_OUT_NAME, index=False)