# SAMPLE/DEMO

In [1]:
import pandas as pd
import re

# Regex: a literal '+' followed by any (possibly empty) run of digits and dots.
# NOTE(review): not referenced in the visible cells - confirm it is still needed.
PLUS_PATTERN = r'\+[0-9.]*'

In [2]:
# Load the sample table and attach three bookkeeping columns:
#   source        - dataset of origin (every row here is 'serjeant')
#   type_of_entry - starts as 'in'; a later cell switches rejected rows to 'out'
#   unique_ID     - empty placeholder, filled in by a later cell
# An earlier variant built the frame from several sources instead:
#   pd.concat([perrin, perrin_supp, serjeant], ignore_index=True)
df = pd.read_csv("sample_done.csv").assign(
    source='serjeant',
    type_of_entry='in',
    unique_ID='',
)

### Clean up all entries

In [3]:
def fix_pka_type(entry, source):
    """Normalise the spelling of a pKa-type label.

    Null entries are returned unchanged. Otherwise a fixed list of literal
    substitutions is applied in order; for the 'perrin' and 'perrin_supp'
    sources any 'pK(' is additionally rewritten to 'pKAH('.
    """
    if pd.isnull(entry):
        return entry

    # Applied strictly in this order: each rule sees the output of the
    # previous one.
    substitutions = (
        ('pk', 'pK'),
        ('pka', 'pK1'),
        ('pKa', 'pK1'),
        ('cooh', 'COOH'),
        ('pKa(H2O)', 'pK1'),
        ('pK1(H2O)', 'pK1'),
        ('pK2(H2O)', 'pK2'),
        ('pK3(H2O)', 'pK3'),
        ('pKb', 'pKB'),
    )
    for old, new in substitutions:
        entry = entry.replace(old, new)

    if 'pK(' in entry:
        if source == 'perrin' or source == 'perrin_supp':
            entry = entry.replace('pK(', 'pKAH(')

    return entry

def fix_remarks(entry):
    """Tidy a free-text remark.

    Null entries are returned unchanged. Otherwise a fixed list of literal
    substitutions (spacing around '=' and '<', 'Cl' mis-read as 'CI',
    punctuation before 'mixed' / 'conc', ...) is applied in order, and then
    leading/trailing newlines, dots, commas and spaces are stripped.
    """
    if pd.isnull(entry):
        return entry

    # Applied strictly in this order: each rule sees the output of the
    # previous one, so the sequence must not be rearranged.
    substitutions = (
        ('\n. ', ''),
        ('( (', '('),
        ('( ', '('),
        (' )', ')'),
        ('c = ', 'C='),
        ('c =', 'C='),
        ('c= ', 'C='),
        ('c=', 'C='),
        ('c~', 'C~'),
        ('c<', 'C<'),
        ('c <', 'C<'),
        ('c < ', 'C<'),
        ('C = ', 'C='),
        ('c - ', 'C='),
        ('C= ', 'C='),
        ('C =', 'C='),
        ('concentration\nconstant', 'concentration constant'),
        ('concentration, constant', 'concentration constant'),
        ('I = ', 'I='),
        ('I =', 'I='),
        ('I= ', 'I='),
        ('m = ', 'm='),
        ('m= ', 'm='),
        ('m =', 'm='),
        ('C< ', 'C<'),
        ('C <', 'C<'),
        ('C < ', 'C<'),
        ('I< ', 'I<'),
        ('1m solution', '1M solution'),
        ('I <', 'I<'),
        ('I < ', 'I<'),
        ('S04', 'SO4'),
        ('KCI', 'KCl'),
        ('DCI', 'DCl'),
        ('c between', 'C between'),
        ('HCI', 'HCl'),
        ('NaCI', 'NaCl'),
        ('from, thermo', 'from thermo'),
        # The next three map a string to itself as written.
        (' (KCl)', ' (KCl)'),
        (' (HCl', ' (HCl'),
        (' (KBr)', ' (KBr)'),
        (' (K2SO4)', '(K2SO4)'),
        (' (NaClO4)', '(NaClO4)'),
        (' (KNO3)', '(KNO3)'),
        (' (NaCl)', '(NaCl)'),
        (',,', ','),
        ('()', ''),
        ('. mixed', ', mixed'),
        (': mixed', ', mixed'),
        (') mixed', '), mixed'),
        ('. conc', ', conc'),
        (') conc', '), conc'),
        ('\n\n\n', '\n'),
    )
    for old, new in substitutions:
        entry = entry.replace(old, new)

    return entry.strip('\n., ')

def fix_entry_remarks(entry):
    """Tidy an entry-level remark.

    Null entries are returned unchanged. Otherwise the boilerplate sentence
    about thermodynamic data is removed, surrounding dots, newlines and spaces
    are stripped, and a dash followed by a line break is turned into ' - '.
    """
    if pd.isnull(entry):
        return entry

    without_boilerplate = entry.replace(
        '- Thermodynamic data are derived from the results', ''
    )
    trimmed = without_boilerplate.strip('.\n ')
    return trimmed.replace('- \n', ' - ')

def fix_ref_remarks(entry):
    """Strip leading/trailing semicolons and spaces from a reference remark.

    Null entries are returned unchanged.
    """
    return entry if pd.isnull(entry) else entry.strip('; ')

def isolate_P(entry):
    """Extract the text following 'P=' from a remark.

    Searches `entry` for the module-level P_MATCH pattern and returns the
    first match with every 'P=' removed. Returns None for a null entry or
    when nothing matches.
    """
    if not pd.isnull(entry):
        found = re.search(P_MATCH, entry)
        if found:
            return found.group().replace('P=', '')
    return None

def standardize_refs(entry):
    """Strip surrounding whitespace from a reference code; nulls pass through."""
    return entry if pd.isnull(entry) else entry.strip()

def standardize_assessment(entry):
    """Expand abbreviated reliability assessments to their full wording.

    Null entries are returned unchanged. Otherwise dots are removed and the
    abbreviations 'Approx' / 'approx', 'Rel', 'Uncert' and the various
    spellings of "very uncertain" are expanded.

    The function is idempotent: values that are already expanded
    ('Approximate', 'Reliable', 'V Uncertain', ...) are not expanded a second
    time, so it is safe to run on data that has been cleaned before.
    """
    if pd.isnull(entry):
        return entry

    entry = entry.replace('.', '')

    # The negative lookaheads keep an already-expanded word from being expanded
    # again (a plain substring replace turns 'Approximate' into
    # 'Approximateimate' and 'Reliable' into 'Reliableiable').
    entry = re.sub(r'Approx(?!imate)', 'Approximate', entry)
    entry = re.sub(r'approx(?!imate)', 'approximate', entry)

    # A bare 'Uncert' is only expanded when it is the whole value.
    if entry == 'Uncert':
        entry = 'Uncertain'

    entry = re.sub(r'Rel(?!iable)', 'Reliable', entry)

    entry = entry.replace('Very Uncertain', 'Very uncertain')
    entry = entry.replace('Very Uncert', 'Very uncertain')

    # The optional 'ain' is consumed together with the abbreviation so that
    # e.g. 'V Uncertain' becomes 'Very uncertain', not 'Very uncertainain'.
    for abbreviation in ('V Uncert', 'VUncert', 'Vuncert'):
        entry = re.sub(re.escape(abbreviation) + r'(?:ain)?', 'Very uncertain', entry)

    if entry == 'Very uncert':
        entry = 'Very uncertain'

    return entry
    
def make_acidity_label(entry, source):
    """Derive an acidity label from a pKa-type string and its source.

    Returns None for a null entry, 'AH' when the entry contains 'pKAH',
    'B' when it contains 'pKB' or the source is 'perrin' / 'perrin_supp',
    and 'A' in every other case.
    """
    if pd.isnull(entry):
        return None
    if 'pKAH' in entry:
        return 'AH'

    labelled_b = 'pKB' in entry or source == 'perrin' or source == 'perrin_supp'
    return 'B' if labelled_b else 'A'
    
# Pattern used by isolate_P: 'P=' followed by optional digits and then any run
# of lowercase letters, spaces, parentheses or dots.
P_MATCH = r"P=[0-9]*[a-z ().]*"

# Cleaners that need only the cell value are passed to Series.apply directly.
df['remarks'] = df['remarks'].apply(fix_remarks)
df['entry_remarks'] = df['entry_remarks'].apply(fix_entry_remarks)
df['ref_remarks'] = df['ref_remarks'].apply(fix_ref_remarks)

# pka_type cleaning also needs the row's source, hence the row-wise apply.
df['pka_type'] = df.apply(
    lambda row: fix_pka_type(row.pka_type, row.source),
    axis=1,
)

# Pressure is extracted from the already-cleaned remarks.
df['pressure'] = df['remarks'].apply(isolate_P)
df['ref'] = df['ref'].apply(standardize_refs)
df['assessment'] = df['assessment'].apply(standardize_assessment)

# The acidity label is derived from the already-cleaned pka_type.
df['acidity_label'] = df.apply(
    lambda row: make_acidity_label(row.pka_type, row.source),
    axis=1,
)

df.to_csv("all_cat.csv", index=False)
df

Unnamed: 0,entry_#,SMILES,pka_type,pka_value,T,remarks,method,assessment,ref,ref_remarks,entry_remarks,original_IUPAC_names,original_IUPAC_nicknames,name_contributors,num_name_contributors,source,type_of_entry,unique_ID,pressure,acidity_label
0,2004,CO,pK1,15.5,25.0,,C3,Uncertain,B8,,,Methanol,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,in,,,A
1,2004,CO,pK1,15.09,25.0,,KIN,Uncertain,M126,,,Methanol,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,in,,,A
2,2005,COO,pK1,11.5,20.0,,O5,Uncertain,E27,,,Methyl hydroperoxide,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A
3,2006,CS,pK1,10.33,25.0,"1% ethanol, gas solubility method",,Uncertain,K57,,,Methanethiol,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,in,,,A
4,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.14,20.0,In aqueous HClO4 Ho scale,O6,Uncertain,T58a,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,in,,,A
5,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.06,25.5,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,in,,,A
6,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.05,9.6,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,in,,,A
7,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.23,5.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,in,,,A
8,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.17,20.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,in,,,A
9,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.11,40.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,in,,,A


### Flag unusable entries and split the data into accepted and rejected sets

In [4]:
# Give every row its unique ID and flag the rows that cannot be used.
#
# Only one rejection rule is active: a row whose SMILES is not a string
# (i.e. it is missing) gets type_of_entry = 'out'. Hydrates, salts and empty
# pKa values are NOT filtered here.
#
# This is done column-wise rather than with iterrows() and per-row df.loc
# writes, and without using a TypeError as the missing-value test.

# unique_ID is the source name followed by the entry number, e.g. 'serjeant2004'.
df['unique_ID'] = df['source'] + df['entry_#'].astype(str)

# True for exactly the rows on which `'.' in smiles` would raise TypeError
# for scalar cell values: anything that is not a string, NaN included.
smiles_missing = ~df['SMILES'].map(lambda smiles: isinstance(smiles, str)).astype(bool)
df.loc[smiles_missing, 'type_of_entry'] = 'out'

# One log line per rejected row.
for _ in range(int(smiles_missing.sum())):
    print(">> Dropping N/A smiles")

>> Dropping N/A smiles
>> Dropping N/A smiles


In [5]:
# Split the table on the type_of_entry flag. The flag column is dropped from
# every subset because it is constant within each one.
accepted = df['type_of_entry'] == 'in'
rejected = df['type_of_entry'] == 'out'
multi_contributor = df['num_name_contributors'] >= 2

# All accepted rows.
in_df = df.loc[accepted].drop(columns=['type_of_entry'])

# Accepted rows with at least two name contributors.
in_df_2 = df.loc[accepted & multi_contributor].drop(columns=['type_of_entry'])

# Rejected rows.
out_df = df.loc[rejected].drop(columns=['type_of_entry'])


In [6]:
# Display the accepted rows (type_of_entry == 'in').
in_df

Unnamed: 0,entry_#,SMILES,pka_type,pka_value,T,remarks,method,assessment,ref,ref_remarks,entry_remarks,original_IUPAC_names,original_IUPAC_nicknames,name_contributors,num_name_contributors,source,unique_ID,pressure,acidity_label
0,2004,CO,pK1,15.5,25.0,,C3,Uncertain,B8,,,Methanol,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2004,,A
1,2004,CO,pK1,15.09,25.0,,KIN,Uncertain,M126,,,Methanol,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2004,,A
2,2005,COO,pK1,11.5,20.0,,O5,Uncertain,E27,,,Methyl hydroperoxide,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2005,,A
3,2006,CS,pK1,10.33,25.0,"1% ethanol, gas solubility method",,Uncertain,K57,,,Methanethiol,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2006,,A
4,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.14,20.0,In aqueous HClO4 Ho scale,O6,Uncertain,T58a,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A
5,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.06,25.5,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A
6,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.05,9.6,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A
7,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.23,5.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A
8,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.17,20.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A
9,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.11,40.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A


In [7]:
# Display the accepted rows that have num_name_contributors >= 2.
in_df_2

Unnamed: 0,entry_#,SMILES,pka_type,pka_value,T,remarks,method,assessment,ref,ref_remarks,entry_remarks,original_IUPAC_names,original_IUPAC_nicknames,name_contributors,num_name_contributors,source,unique_ID,pressure,acidity_label
0,2004,CO,pK1,15.5,25.0,,C3,Uncertain,B8,,,Methanol,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2004,,A
1,2004,CO,pK1,15.09,25.0,,KIN,Uncertain,M126,,,Methanol,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2004,,A
2,2005,COO,pK1,11.5,20.0,,O5,Uncertain,E27,,,Methyl hydroperoxide,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2005,,A
3,2006,CS,pK1,10.33,25.0,"1% ethanol, gas solubility method",,Uncertain,K57,,,Methanethiol,,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2006,,A
4,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.14,20.0,In aqueous HClO4 Ho scale,O6,Uncertain,T58a,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A
5,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.06,25.5,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A
6,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.05,9.6,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A
7,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.23,5.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A
8,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.17,20.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A
9,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.11,40.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['OPSIN_name1', 'cirpy_name1', 'pubchem_name1'...",4.0,serjeant,serjeant2007,,A


In [8]:
# Display the rejected rows (type_of_entry == 'out').
out_df

Unnamed: 0,entry_#,SMILES,pka_type,pka_value,T,remarks,method,assessment,ref,ref_remarks,entry_remarks,original_IUPAC_names,original_IUPAC_nicknames,name_contributors,num_name_contributors,source,unique_ID,pressure,acidity_label
20,2010,,pK1,8.65,,I=0.2(NaCl),E3bg,Uncertain,C72,,,Methanohydroxamic acid,Formohydroxamic acid,,,serjeant,serjeant2010,,A
21,2011,,pK1,2.95,25.0,C=0.002-0.01,C2,Approximate,G4,,,"methanedithioic acid, amino-",Dithiocarbamic acid,,,serjeant,serjeant2011,,A


In [9]:
# Write each subset to its own CSV file, without the index column.
exports = {
    "SAMPLE_in.csv": in_df,
    "SAMPLE_in-high-confidence.csv": in_df_2,
    "SAMPLE_rejected.csv": out_df,
}
for csv_name, frame in exports.items():
    frame.to_csv(csv_name, index=False)
