In [1]:
import os
import wget
import re
import pickle

import pandas

import tqdm
import urllib.request

from Bio.PDB.Polypeptide import three_to_one
from Bio import SeqIO

#### Fetch Variants

In [2]:
# Make sure the target directory exists so the download has somewhere to land.
os.makedirs("./data", exist_ok=True)

# Drop any previous copy before downloading (presumably so wget.download does
# not pick a different, suffixed file name when the target already exists --
# TODO confirm against the wget package). A missing file is fine: that is simply
# the first run on a clean checkout.
try:
    os.remove("./data/humsavar.txt")
except FileNotFoundError:
    pass

# Last expression of the cell: the path returned by wget.download is displayed.
wget.download("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/variants/humsavar.txt", "./data/humsavar.txt")

'./data/humsavar.txt'

In [3]:
# Parse the raw humsavar listing into rows of
# [accession, reference residue, position, mutant residue, variant class].
data = []

with open("./data/humsavar.txt", "r") as handle:
    # skip header: consume lines up to and including the first one starting with "_"
    for record in handle:
        if record.startswith("_"):
            break

    # the same file iterator continues from the line after the header
    for record in handle:
        # a line holding a single character (just the newline) marks the end of the table
        if len(record) == 1:
            break

        # drop the first 10 characters of the line, collapse runs of spaces,
        # then split the remainder into whitespace-delimited fields
        fields = re.sub(" +", " ", record[10:]).split(" ")

        try:
            # strip the 2-character prefix (presumably "p.") and upper-case the
            # change so it reads <REF3><position><MUT3>, e.g. "HIS52ARG"
            var = fields[2][2:].upper()

            data.append([
                fields[0],
                three_to_one(var[:3]),
                var[3:-3],
                three_to_one(var[-3:]),
                fields[3].upper(),
            ])
        except (KeyError, IndexError):
            # KeyError: residue code that three_to_one cannot map
            #           (original note here: "unknown peptide (SER)")
            # IndexError: line with fewer fields than expected
            # Such rows are skipped on purpose; any other exception now surfaces
            # instead of being silently swallowed.
            continue

# preview the first parsed row
data[0]

['P04217', 'H', '52', 'R', 'POLYMORPHISM']

In [4]:
# Replace the contents of ./data/humsavar.txt with the parsed variants as a
# tab-separated table: one header line followed by one line per row of `data`.
columns = [ "id", "ref", "position", "mutant", "target" ]

with open("./data/humsavar.txt", "w") as out:
    out.write("\t".join(columns) + "\n")
    out.writelines("\t".join(row) + "\n" for row in data)

#### Fetch Protein Sequences

In [5]:
# Load the tab-separated variant table (its first line holds the column names)
# and show the first five rows.
df = pandas.read_csv(
    "./data/humsavar.txt",
    delimiter="\t",
    header=0,
)
df.head(n=5)

Unnamed: 0,id,ref,position,mutant,target
0,P04217,H,52,R,POLYMORPHISM
1,P04217,H,395,R,POLYMORPHISM
2,Q9NQ94,V,555,M,POLYMORPHISM
3,Q9NQ94,A,558,S,POLYMORPHISM
4,P01023,R,704,H,POLYMORPHISM


In [6]:
# Download the FASTA record of every distinct accession in df['id'] and
# concatenate the responses into ./data/humsavar.fa.
with open("./data/humsavar.fa", "w") as handle:
    # sorted() fixes the request order, so the output file is reproducible
    # across runs (iteration order of a set of strings is not).
    for accession in tqdm.tqdm(sorted(set(df['id']))):
        url = "https://www.uniprot.org/uniprot/{0}.fasta".format(accession)

        # the context manager closes the HTTP response after each request
        # instead of leaving one open connection per accession
        with urllib.request.urlopen(url) as response:
            handle.write(response.read().decode('utf-8'))

100%|██████████| 12845/12845 [2:00:37<00:00,  1.71it/s] 
