In [9]:
from pdfminer.high_level import extract_text
import pandas as pd
import numpy as np
import re

In [10]:
# Bibliography lookup: maps a bracketed citation number (e.g. '[14]') to the
# full citation text. Keys are generated from each entry's position in the
# tuple below, so the order of the entries is significant.
# NOTE(review): entries [12] and [22] carry the same citation — confirm that
# this duplication is intentional.
references = {
    f'[{number}]': citation
    for number, citation in enumerate((
        'Plaxco KW, Simons KT, Baker D (1998) Contact order, transition state placement and the refolding rates of single domain proteins. J. Mol. Biol. 277:985–94.',
        'Ivankov DN, Garbuzynskiy SO, Alm E, Plaxco KW, Baker D, Finkelstein A V (2003) Contact order revisited: influence of protein size on the folding rate. Protein Sci. 12:2057–62.',
        'Gromiha MM, Selvaraj S (2001) Comparison between long-range interactions and contact order in determining the folding rate of two-state proteins: application of long-range order to folding rate prediction. J. Mol. Biol. 310:27–32.',
        'Zou T, Ozkan SB (2011) Local and non-local native topologies reveal the underlying folding landscape of proteins. Phys. Biol. 8:066011.',
        'Zhou H, Zhou Y (2002) Folding rate prediction using total contact distance. Biophys. J. 82:458–63.',
        'Micheletti C (2003) Prediction of folding rates and transition-state placement from native-state geometry. Proteins 51:74–84.',
        'Ouyang Z, Liang J (2008) Predicting protein folding rates from geometric contact and amino acid sequence. Protein Sci. 17:1256–63.',
        'Gutin A, Abkevich V, Shakhnovich E (1996) Chain Length Scaling of Protein Folding Time. Phys. Rev. Lett. 77:5433–5436.',
        'Koga N, Takada S (2001) Roles of native topology and chain-length scaling in protein folding: a simulation study with a Go-like model. J. Mol. Biol. 313:171–80.',
        'Kouza M, Li MS, O’brien EP, Hu C-K, Thirumalai D (2006) Effect of finite size on cooperativity and rates of protein folding. J. Phys. Chem. A 110:671–6.',
        'Lane TJ, Pande VS (2013) Inferring the rate-length law of protein folding. PLoS One 8:e78606.',
        'Naganathan AN, Muñoz V (2005) Scaling of folding times with protein size. J. Am. Chem. Soc. 127:480–1.',
        'Wolynes PG (1997) Folding funnels and energy landscapes of larger proteins within the capillarity approximation. Proc. Natl. Acad. Sci. U. S. A. 94:6170–5.',
        'Maxwell KL, Wildes D, Zarrine-Afsar A, De Los Rios MA, Brown AG, Friel CT, Hedberg L, Horng J-C, Bona D, Miller EJ, et al. (2005) Protein folding: defining a “standard” set of experimental conditions and a preliminary kinetic data set of two-state proteins. Protein Sci. 14:602–16.',
        'Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Finkelstein A V (2013) Golden triangle for folding rates of globular proteins. Proc. Natl. Acad. Sci. U. S. A. 110:147–50.',
        'Bogatyreva NS, Osypov AA, Ivankov DN (2009) KineticDB: a database of protein folding kinetics. Nucleic Acids Res. 37:D342–6.',
        'Ferguson N, Sharpe TD, Schartau PJ, Sato S, Allen MD, Johnson CM, Rutherford TJ, Fersht AR (2005) Ultra-fast barrier-limited folding in the peripheral subunit-binding domain family. J. Mol. Biol. 353:427–46.',
        'Spector S, Raleigh DP (1999) Submillisecond folding of the peripheral subunit-binding domain. J. Mol. Biol. 293:763–8.',
        'Smith MTJ, Meissner J, Esmonde S, Wong HJ, Meiering EM (2010) Energetics and mechanisms of folding and flipping the myristoyl switch in the {beta}-trefoil protein, hisactophilin. Proc. Natl. Acad. Sci. U. S. A. 107:20952–7.',
        'Lee J, Blaber SI, Dubey VK, Blaber M (2011) A polypeptide “building block” for the β-trefoil fold identified by “top-down symmetric deconstruction”. J. Mol. Biol. 407:744–63.',
        'Kinetic data for ThreeFoil to be published separately',
        'Naganathan AN, Muñoz V (2005) Scaling of folding times with protein size. J. Am. Chem. Soc. 127:480–1.',
    ), start=1)
}

In [11]:
# Parse the text export of supplementary Table SII into a DataFrame, resolve
# each row's citation key through `references`, and cache the result as a
# pickle. Every non-blank line is expected to have the shape
#   <name> <pdb id, optionally followed by "(...)"> <Multi|Two> <class>
#   <pdb_length> <midpoint> <log_kf> <log_ku> <deltaG> [<reference number>]
# with single spaces between fields.
TABLE_COLUMNS = ['name', 'pdb', 'kinetic_state', 'class', 'pdb_length', 'midpoint', 'log_kf', 'log_ku', 'deltaG', 'source']

# Raw string so that `\(` and `\[` reach the regex engine as written instead of
# being treated as (invalid) Python string escapes.
ROW_PATTERN = re.compile(
    r'(?P<Name>.*) (?P<PDB>....(?:\(.*\))?) (?P<state>Multi|Two) (?P<data>.*) (?P<source>\[[0-9]*\])'
)

# The table contains non-ASCII text (e.g. the α/β class labels), so decode
# explicitly rather than relying on the platform default.
# NOTE(review): assumes the export is UTF-8 encoded — confirm.
with open('supinfo_Table_SII.txt', 'r', encoding='utf-8') as table_file:
    text = table_file.readlines()

# Collect plain dicts and build the frame once at the end: `DataFrame.append`
# copied the whole frame on every call and no longer exists in pandas >= 2.0.
records = []
for line_no, line in enumerate(text, start=1):
    if not line.strip():
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        continue
    r = ROW_PATTERN.search(line)
    if r is None:
        raise ValueError(
            f'Line {line_no} of supinfo_Table_SII.txt does not match the expected row format: {line!r}'
        )
    d = r['data'].split(' ')
    if len(d) < 6:
        raise ValueError(
            f'Line {line_no} of supinfo_Table_SII.txt has {len(d)} data fields, expected at least 6: {line!r}'
        )
    # Values are stored exactly as they appear in the text (i.e. as strings);
    # no numeric conversion is performed here.
    records.append({
        'name': r['Name'],
        'pdb': r['PDB'],
        'kinetic_state': r['state'],
        'class': d[0],
        'pdb_length': d[1],
        'midpoint': d[2],
        'log_kf': d[3],
        'log_ku': d[4],
        'deltaG': d[5],
        # A KeyError here means the row cites a number missing from `references`.
        'source': references[r['source']]
    })

# Passing `columns=` keeps the column order (and yields an empty, correctly
# labelled frame if the file contained no rows).
data = pd.DataFrame(records, columns=TABLE_COLUMNS)
data.to_pickle('kinetic_data.pkl')
data

Unnamed: 0,name,pdb,kinetic_state,class,pdb_length,midpoint,log_kf,log_ku,deltaG,source
0,Colicin E7 immunity protein,1AYI,Two,α,85,1.22,3.13,1.00,-2.90,"Maxwell KL, Wildes D, Zarrine-Afsar A, De Los ..."
1,"Telomeric protein DNA-binding domain, human",1BA5,Two,α,49,0.69,2.56,0.52,-2.78,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."
2,Immunoglobulin binding B-domain,1BDD(2-59),Two,α,58,2.52,5.08,1.82,-4.44,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."
3,16th domain of brain α-spectrin,1CUN(7-112),Two,α,106,-0.87,2.08,-2.61,-6.40,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."
4,17th domain of brain α-spectrin,1CUN(113-219),Two,α,107,-1.48,1.48,-3.39,-6.64,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."
...,...,...,...,...,...,...,...,...,...,...
103,Carbonic anhydrase,1V9E(Chain A),Multi,αβ,259,-4.60,-1.82,-10.42,-11.73,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."
104,"Ribonuclease H1, E. coli",2RN2,Multi,αβ,155,-2.00,0.04,-5.21,-7.17,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."
105,Villin 14T,2VIK,Multi,αβ,126,-0.69,2.17,-1.78,-5.39,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."
106,Chemotactic protein,3CHY,Multi,αβ,128,-0.56,0.43,-1.91,-3.20,"Garbuzynskiy SO, Ivankov DN, Bogatyreva NS, Fi..."


In [12]:
# Show the rows whose `name` contains the substring 'FGF'.
data.loc[data['name'].str.contains('FGF')]

Unnamed: 0,name,pdb,kinetic_state,class,pdb_length,midpoint,log_kf,log_ku,deltaG,source
85,FGF-1,1JQZ(Chain A),Multi,β,136,-2.87,0.56,-3.08,-4.98,"Lee J, Blaber SI, Dubey VK, Blaber M (2011) A ..."
