In [22]:
import glob
import os
import re
import sys
import urllib
import urllib.request

import Bio.PDB as bpdb
import numpy as np
import pandas as pd

In [23]:
# Load the kinetics table from a local pickle file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load
# 'kinetic_data.pkl' if it comes from a trusted source.
kinetic_data = pd.read_pickle('kinetic_data.pkl')
kinetic_data.head()

Unnamed: 0,name,pdb,kinetic_state,class,pdb_length,midpoint,log_kf,log_ku,deltaG,source
0,Colicin E7 immunity protein,1AYI,Two,α,85,1.22,3.13,1.0,-2.9,"Maxwell KL, Wildes D, Zarrine-Afsar A, De Los ..."
1,"Telomeric protein DNA-binding domain, human",1BA5,Two,α,49,0.69,2.56,0.52,-2.78,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."
2,Immunoglobulin binding B-domain,1BDD(2-59),Two,α,58,2.52,5.08,1.82,-4.44,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."
3,16th domain of brain α-spectrin,1CUN(7-112),Two,α,106,-0.87,2.08,-2.61,-6.4,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."
4,17th domain of brain α-spectrin,1CUN(113-219),Two,α,107,-1.48,1.48,-3.39,-6.64,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."


In [24]:
# Distinct four-character PDB IDs: a `pdb` entry may carry a suffix such as
# "(2-59)" or "(Chain A)", so only its leading four characters are kept.
unique_pdbs = {pdb_field[:4] for pdb_field in kinetic_data['pdb']}

In [25]:
def download_pdb(pdbcode, datadir, downloadurl="https://files.rcsb.org/download/"):
    """
    Downloads a PDB file from the Internet and saves it in a data directory.

    The data directory is created if it does not exist yet. The file is first
    written under a temporary ``.part`` name and only renamed to its final name
    once the transfer has completed, so an interrupted download never leaves a
    truncated ``*.pdb`` file behind.

    :param pdbcode: The standard PDB ID e.g. '3ICB' or '3icb'
    :param datadir: The directory where the downloaded file will be saved
    :param downloadurl: The base PDB download URL, cf.
        `https://www.rcsb.org/pages/download/http#structures` for details
    :return: the full path to the downloaded PDB file or None if something went wrong
    """
    pdbfn = pdbcode + ".pdb"
    url = downloadurl + pdbfn
    outfnm = os.path.join(datadir, pdbfn)
    partfnm = outfnm + ".part"
    try:
        os.makedirs(datadir, exist_ok=True)
        urllib.request.urlretrieve(url, partfnm)
        # Atomic rename: the final name only ever refers to a complete file.
        os.replace(partfnm, outfnm)
        return outfnm
    except Exception as err:
        # Deliberately best-effort: report which download failed and let the
        # caller decide what to do with the None return value.
        print(f"{pdbcode}: could not download {url}: {err}", file=sys.stderr)
        try:
            os.remove(partfnm)
        except OSError:
            # No partial file was written (or it is already gone).
            pass
        return None


In [26]:
# Download every raw structure that is not cached yet.
re_retrieve = False  # set to True to force a fresh download of every structure
raw_pdb_dir = 'pdb_files_raw'
os.makedirs(raw_pdb_dir, exist_ok=True)

# The cache check looks in the directory the files are downloaded into (and
# later read from), so an already downloaded structure is not fetched again.
existing = {os.path.basename(i).split('.')[0]
            for i in glob.glob(os.path.join(raw_pdb_dir, '*.pdb'))}

failed_downloads = []
for pdb_id in sorted(unique_pdbs):
    if not re_retrieve and pdb_id in existing:
        continue
    # download_pdb returns None on failure; remember those IDs for the summary.
    if download_pdb(pdb_id, raw_pdb_dir) is None:
        failed_downloads.append(pdb_id)

if failed_downloads:
    print(f"Could not download: {', '.join(failed_downloads)}", file=sys.stderr)

In [27]:
class ResSelect(bpdb.Select):
    """
    Selection filter that keeps the residues of a single chain whose residue
    number lies in the inclusive range [begin, end].

    :param begin: first residue number to keep (anything accepted by `int`)
    :param end: last residue number to keep (anything accepted by `int`)
    :param chain: ID of the chain the residues must belong to; defaults to 'A'
    """
    def __init__(self, begin, end, chain='A'):
        self.begin = int(begin)
        self.end = int(end)
        self.chain = chain

    def accept_residue(self, res):
        """Return True if `res` lies in the residue range and in the selected chain."""
        # res.id[1] is compared against the numeric bounds; res.parent is the
        # object whose id is compared against the requested chain ID.
        return self.begin <= res.id[1] <= self.end and res.parent.id == self.chain

class ChainSelect(bpdb.Select):
    """Selection filter that keeps only residues whose parent has the given chain ID."""

    def __init__(self, chain):
        # ID that a residue's parent must have for the residue to be accepted.
        self.chain = chain

    def accept_residue(self, res):
        """Return True when the parent of `res` carries the requested ID."""
        owner = res.parent
        return owner.id == self.chain

In [29]:
# For every `pdb` entry that carries a "(...)" suffix -- e.g. "1LMB(Chain 3)"
# or "1BDD(2-59)" -- write the selected chain or residue range of the raw
# structure to its own file named after the full entry.
raw_pdb_dir = 'pdb_files_raw'

# Raw strings: "\(" inside a normal string literal is an invalid escape sequence.
subset_regex = re.compile(r'....\(.*\)')
chain_regex = re.compile(r'(?P<pdb>....)\(Chain (?P<chain>.)\)')
# "+" instead of "*" so that an empty bound is never handed to int().
res_regex = re.compile(r'(?P<pdb>....)\((?P<begin>[0-9]+)-(?P<end>[0-9]+)\)')

io = bpdb.PDBIO()
parser = bpdb.PDBParser()
# Parsed raw structures keyed by PDB ID, so a file that is split into several
# ranges is only parsed once.
structures = {}

for pdb_spec in kinetic_data['pdb']:
    if not subset_regex.match(pdb_spec):
        continue
    print(pdb_spec)
    pdb = pdb_spec[:4]

    chain_match = chain_regex.match(pdb_spec)
    res_match = res_regex.match(pdb_spec)
    if chain_match:
        selector = ChainSelect(chain_match['chain'])
    elif res_match:
        selector = ResSelect(res_match['begin'], res_match['end'])
    else:
        # The suffix is neither "(Chain X)" nor "(begin-end)": nothing to extract.
        continue

    if pdb not in structures:
        structures[pdb] = parser.get_structure('tmp', os.path.join(raw_pdb_dir, f'{pdb}.pdb'))
    struct = structures[pdb]
    io.set_structure(struct)
    io.save(os.path.join(raw_pdb_dir, f'{pdb_spec}.pdb'), selector)


1BDD(2-59)
1CUN(7-112)
1CUN(113-219)




1E41(93-192)
1L8W(29-335)
1LMB(Chain 3)
1K85(559-644)




1K9Q(5-44)
1M9S(391-466)
1PGB(41-56)
1PIN(6-39)
1QTU(1-109)
1SHF(Chain A)
2VKN(Chain A)
1DIV(58-149)
1DIV(1-56)
1LOP(Chain A)




1PRS(91-173)
1PRS(1-90)
1SPR(Chain C)
1URN(Chain A)
2PTL(18-77)




1AU7(103-160)
2A5E(9-156)
1JQZ(Chain A)
1OPA(Chain A)




1RNB(Chain A)
1AON(191-345)




1IGS(27-248)
1PHP(176-394)
1PHP(1-175)
1QOP(Chain A)
1SCE(Chain C)




1V9E(Chain A)
3H08(Chain A)


