In [1]:
!pip install spyprot

Collecting spyprot
  Downloading spyprot-0.7.28-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting pysolr>=3.9.0
  Downloading pysolr-3.9.0.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting wget>=3.0
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting lxml>=4.5.0
  Downloading lxml-4.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hCollecting subprocess32>=3.5.0
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.4/97.4 kB[0m [31m7.4 MB/s

In [1]:
import spyprot
import pandas as pd
from tqdm import tqdm



### Test UniProt search

In [10]:
# Smoke test: query a single accession for the two cross-reference fields
# and print whatever the search returns for it.
test_accession = 'A0A011VZ55'
test_fields = ['xref_interpro', 'xref_pfam']

uni = spyprot.UniprotSearch(test_fields, accessions=[test_accession])
res = uni.get()
print(res[test_accession])

['IPR029028;IPR003742;IPR029026;', 'PF02590;']


## Add InterPro and Pfam annotations to the CSV

In [3]:
# Input table and the annotated copy written at the end of the notebook.
FILE_IN_NAME = "spout_all_unknotted.csv"
FILE_OUT_NAME = "spout_all_unknotted_ipro.csv"

# Number of accessions sent to UniprotSearch per request. 100 was found to be
# a reasonable batch size; larger batches were observed to behave unreliably.
SIZE = 100

In [4]:
# Register `progress_apply` on pandas objects so the column-building steps
# below display a progress bar.
tqdm.pandas()

df = pd.read_csv(FILE_IN_NAME)
# The raw ID is dash-separated; keep its second component, which is what gets
# sent to UniprotSearch as the accession.
# NOTE(review): presumably AlphaFold-style IDs such as "AF-<accession>-F1" — confirm.
df['ID'] = df['ID'].apply(lambda x: x.split(sep='-')[1])

print(df.head())

accessions = df['ID'].tolist()
print(accessions[:10])

# Fields requested for every batch. Defined once so all requests use the same
# list: the previous tail request was missing the comma between the two names,
# which silently concatenated them into one invalid field.
fields = ['xref_interpro', 'xref_pfam']

# accession -> list of field values, in the same order as `fields`.
final_res = {}

# Query in batches of SIZE. Stepping over the full range makes the final
# (possibly shorter) batch part of the loop, so every accession is requested
# exactly once and no separate tail request is needed.
for i in tqdm(range(0, len(accessions), SIZE)):
    uni = spyprot.UniprotSearch(fields, accessions=accessions[i:i + SIZE])
    res = uni.get()
    final_res.update(res)

# Fail early with a readable message instead of a bare KeyError from inside
# progress_apply when the search returned nothing for some accession.
missing = [acc for acc in accessions if acc not in final_res]
if missing:
    raise KeyError(
        f"No UniprotSearch result for {len(missing)} accession(s), e.g. {missing[:5]}"
    )

# Index 0 / 1 follow the order of `fields` above.
df['InterPro'] = df['ID'].progress_apply(lambda x: final_res[x][0])
df['Domain architecture'] = df['ID'].progress_apply(lambda x: final_res[x][1])


           ID  latestVersion  globalMetricValue  uniprotStart  uniprotEnd  \
0  A0A009NLX4              4              94.56             1         306   
1  A0A010QHM8              4              74.75             1         575   
2  A0A010R445              4              91.31             1         353   
3  A0A014C010              4              94.69             1         306   
4  A0A014CD17              4              94.38             1         306   

                                     uniprotSequence  Length  
0  MALRHFLTLRDLSTLELNRILERASELKKMQHDNKVYQPFVGKVLG...     306  
1  MSQLRDEKDNTTLNDSKESTNPKVVVDSVFDTSEKLFLGGIDDGSD...     575  
2  MKQTAFRVTRQALNGAQSRAYSQSTGPRHLMSIADLSPAEFATLVR...     353  
3  MALRHFLTLRDLSTLELNRVLQRASELKKMQQSNKVYQPFVGKVLG...     306  
4  MALRHFLTLRDLSTLELNRVLQRASELKKMQQSNKVYQPFVGKVLG...     306  
['A0A009NLX4', 'A0A010QHM8', 'A0A010R445', 'A0A014C010', 'A0A014CD17', 'A0A015X2G2', 'A0A015YLP7', 'A0A016URG5', 'A0A016WIA3', 'A0A017TC45']


100%|██████████| 3310/3310 [55:22<00:00,  1.00s/it]t]
100%|██████████| 331052/331052 [00:00<00:00, 1066553.13it/s]


In [6]:
print(df.head())

           ID  latestVersion  globalMetricValue  uniprotStart  uniprotEnd  \
0  A0A009NLX4              4              94.56             1         306   
1  A0A010QHM8              4              74.75             1         575   
2  A0A010R445              4              91.31             1         353   
3  A0A014C010              4              94.69             1         306   
4  A0A014CD17              4              94.38             1         306   

                                     uniprotSequence  Length  \
0  MALRHFLTLRDLSTLELNRILERASELKKMQHDNKVYQPFVGKVLG...     306   
1  MSQLRDEKDNTTLNDSKESTNPKVVVDSVFDTSEKLFLGGIDDGSD...     575   
2  MKQTAFRVTRQALNGAQSRAYSQSTGPRHLMSIADLSPAEFATLVR...     353   
3  MALRHFLTLRDLSTLELNRVLQRASELKKMQQSNKVYQPFVGKVLG...     306   
4  MALRHFLTLRDLSTLELNRVLQRASELKKMQQSNKVYQPFVGKVLG...     306   

                                            InterPro  
0  IPR006132;IPR006130;IPR036901;IPR006131;IPR002...  
1                               IPR004648;

In [7]:
df.to_csv(FILE_OUT_NAME, index=False)