In [1]:
from Bio.SeqIO import parse
import pandas as pd

## Load Translation table

(Downloaded from ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.gz)

In [None]:
# Column layout of idmapping_selected.tab, written as "<position>. <name>".
numbered_headers = [
    '1. UniProtKB-AC', '2. UniProtKB-ID', '3. GeneID (EntrezGene)', '4. RefSeq',
    '5. GI', '6. PDB', '7. GO', '8. UniRef100', '9. UniRef90', '10. UniRef50',
    '11. UniParc', '12. PIR', '13. NCBI-taxon', '14. MIM', '15. UniGene',
    '16. PubMed', '17. EMBL', '18. EMBL-CDS', '19. Ensembl', '20. Ensembl_TRS',
    '21. Ensembl_PRO', '22. Additional PubMed',
]
# Keep only the second whitespace-separated token of each header,
# e.g. '3. GeneID (EntrezGene)' -> 'GeneID'.
cols = [header.split()[1] for header in numbered_headers]

# Read just the two columns of interest and collapse them straight into a
# {UniRef100: NCBI-taxon} dictionary. Dictionary keys are unique, so when a
# UniRef100 value occurs on several rows only one taxon is kept for it.
db = (
    pd.read_csv(
        '/LAB_DATA/DATABASES/UniRef100/idmapping_selected.tab.gz',
        sep='\t',
        names=cols,
        usecols=['UniRef100', 'NCBI-taxon'],
        dtype={'UniRef100': str, 'NCBI-taxon': int},
    )
    .set_index('UniRef100')['NCBI-taxon']
    .to_dict()
)


## Make a new .fasta for UniRef

(Original database downloaded from ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz)

In [None]:
new_loc = '/LAB_DATA/DATABASES/UniRef100/uniref100.translated.fasta'  # The location where you want the translated database to end up
ori_loc = '/LAB_DATA/DATABASES/UniRef100/uniref100.fasta'  # The location of the original database (link above)
translation_table = '/LAB_DATA/DATABASES/UniRef100/uniref100.ttable'  # The location where you want a translation table to be created

# Rewrite every record of the original FASTA under a new identifier and log
# the mapping from new identifier to original description line.
#
# Both outputs are opened in a single `with` statement so they are flushed
# and closed even if parsing or writing raises part-way through.
with open(new_loc, 'w') as fasta_out, open(translation_table, 'w') as table_out:
    for record in parse(ori_loc, 'fasta'):
        # taxID from the translation table; None when the record id is not a key of `db`.
        mapped_taxid = db.get(record.id)

        # UniRef ID: the part of the record id after the last underscore.
        accession = record.id.split("_")[-1]

        # taxID stated in the record description itself: the value of the
        # first space-separated token that starts with "TaxID=". Matching on
        # the prefix (rather than any token merely containing "TaxID") keeps
        # a stray token from hiding or replacing the real field.
        # None when the description has no such token.
        header_taxid = next(
            (
                token.split("=")[1]
                for token in record.description.split(" ")
                if token.startswith("TaxID=")
            ),
            None,
        )

        # New ID. Order is: UniRef100 ID; taxID from the translation table;
        # taxID from the UniRef100 description. A missing taxID is rendered
        # as the literal text "None".
        fasta_id = "{0}_{1}_{2}".format(accession, mapped_taxid, header_taxid)

        # Write the record under its new id, and the id -> description mapping.
        fasta_out.write(">{0}\n{1}\n".format(fasta_id, str(record.seq)))
        table_out.write("{0}\t{1}\n".format(fasta_id, record.description))