# SAMPLE/DEMO

In [1]:
import pandas as pd
import re

In [2]:
# Load the sample table and add three bookkeeping columns with the same
# constant value on every row.
df = pd.read_csv("sample_done.csv").assign(
    source='serjeant',
    type_of_entry='in',
    unique_ID='',
)
PLUS_PATTERN = r'\+[0-9.]*'

df

Unnamed: 0,entry_#,SMILES,pka_type,pka_value,T,remarks,method,assessment,ref,ref_remarks,entry_remarks,original_IUPAC_names,original_IUPAC_nicknames,name_contributors,num_name_contributors,source,type_of_entry,unique_ID
0,2004,CO,pKa,15.5,25.0,,C3,Uncert.,B8,,,Methanol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,
1,2004,CO,pKa,15.09,25.0,,KIN,Uncert.,M126,,,Methanol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,
2,2005,COO,pKa,11.5,20.0,,O5,Uncert.,E27,,,Methyl hydroperoxide,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,
3,2006,CS,pKa,10.33,25.0,"1% ethanol, gas solubility method",,Uncert.,K57,,,Methanethiol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,
4,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pKa,0.14,20.0,In aqueous HClO4 Ho scale,O6,Uncert.,T58a,,- 0.02 Thermodynamic quantities are derived ...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,
5,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pKa,0.06,25.5,Mixed constant,O5,Uncert.,H5,,- 0.02 Thermodynamic quantities are derived ...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,
6,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pKa,0.05,9.6,Mixed constant,O5,Uncert.,H5,,- 0.02 Thermodynamic quantities are derived ...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,
7,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pKa,0.23,5.0,In aqueous HCl Ho scale,O6,Uncert.,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived ...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,
8,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pKa,0.17,20.0,In aqueous HCl Ho scale,O6,Uncert.,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived ...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,
9,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pKa,0.11,40.0,In aqueous HCl Ho scale,O6,Uncert.,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived ...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,


### Clean up all entries

In [3]:
P_MATCH = r"P=[0-9]*[a-z ().]*"

# Ordered (old, new) substring substitutions applied one after another by
# fix_pka_type. Order matters: each rule sees the output of the rules above it
# (e.g. 'pk' -> 'pK' runs first, so a lowercase 'pka' has already become 'pKa'
# by the time the 'pKa' rule is reached).
pka_replacements = [
    ('pk', 'pK'),
    ('pka', 'pK1'),
    ('pKa', 'pK1'),
    ('cooh', 'COOH'),
    ('pKa(H2O)', 'pK1'),
    ('pK1(H2O)', 'pK1'),
    ('pK2(H2O)', 'pK2'),
    ('pK3(H2O)', 'pK3'),
    ('pKb', 'pKB')
]

def fix_pka_type(entry, source):
    """
    Normalise the spelling of a pKa-type label.

    Null entries are returned untouched. Otherwise every (old, new) pair in
    ``pka_replacements`` is applied in order, and for the 'perrin' and
    'perrin_supp' sources any 'pK(' is additionally rewritten to 'pKAH('.
    Returns the reformatted label.
    """
    if pd.isnull(entry):
        return entry

    for old, new in pka_replacements:
        entry = entry.replace(old, new)

    # Perrin and Perrin Supp. are for basic pK's.
    if 'pK(' in entry and source in ('perrin', 'perrin_supp'):
        entry = entry.replace('pK(', 'pKAH(')

    return entry

# Ordered (old, new) substring substitutions applied one after another by
# fix_remarks. Each rule sees the output of the rules above it, so within a
# group that normalises spacing around an operator the longest pattern must
# come first and the "trailing space" pattern last; otherwise a shorter rule
# consumes part of the text and leaves a stray space behind
# (e.g. 'I < 0.1' ending up as 'I< 0.1').
remarks_replacements = [
    ('\n. ', ''),
    ('( (', '('),
    ('( ', '('),
    (' )', ')'),
    # lowercase "c" (concentration) -> "C", spacing around the operator removed
    ('c = ', 'C='),
    ('c =', 'C='),
    ('c= ', 'C='),
    ('c=', 'C='),
    ('c~', 'C~'),
    ('c<', 'C<'),
    ('c < ', 'C<'),
    ('c <', 'C<'),
    ('C = ', 'C='),
    ('c - ', 'C='),
    ('C= ', 'C='),
    ('C =', 'C='),
    ('concentration\nconstant', 'concentration constant'),
    ('concentration, constant', 'concentration constant'),
    ('I = ', 'I='),
    ('I =', 'I='),
    ('I= ', 'I='),
    ('m = ', 'm='),
    ('m= ', 'm='),
    ('m =', 'm='),
    # "<" groups: longest pattern first, trailing-space clean-up last
    ('C < ', 'C<'),
    ('C <', 'C<'),
    ('C< ', 'C<'),
    ('I < ', 'I<'),
    ('I <', 'I<'),
    ('I< ', 'I<'),
    ('1m solution', '1M solution'),
    # letter "I" / digit "0" typed where "l" / "O" belongs
    ('S04', 'SO4'),
    ('KCI', 'KCl'),
    ('DCI', 'DCl'),
    ('c between', 'C between'),
    ('HCI', 'HCl'),
    ('NaCI', 'NaCl'),
    ('from, thermo', 'from thermo'),
    # NOTE(review): the next three pairs replace a string with itself and so
    # change nothing; the four pairs after them drop the leading space.
    # Confirm whether these were also meant to drop the space.
    (' (KCl)', ' (KCl)'),
    (' (HCl', ' (HCl'),
    (' (KBr)', ' (KBr)'),
    (' (K2SO4)', '(K2SO4)'),
    (' (NaClO4)', '(NaClO4)'),
    (' (KNO3)', '(KNO3)'),
    (' (NaCl)', '(NaCl)'),
    (',,', ','),
    ('()', ''),
    ('. mixed', ', mixed'),
    (': mixed', ', mixed'),
    (') mixed', '), mixed'),
    ('. conc', ', conc'),
    (') conc', '), conc'),
    ('\n\n\n', '\n')
]

def fix_remarks(entry):
    """
    Fix common mistakes found in the remarks text and return the result.

    Null entries are returned untouched. Otherwise every (old, new) pair in
    ``remarks_replacements`` is applied, in order, as a plain substring
    replacement.
    """
    if pd.isnull(entry):
        return entry

    for old, new in remarks_replacements:
        entry = entry.replace(old, new)

    return entry

def fix_entry_remarks(entry):
    """
    Tidy an entry-level remark and return it.

    Null entries pass through unchanged. Otherwise a fixed boilerplate phrase
    is removed, leading/trailing periods, newlines and spaces are stripped, and
    a dash followed by a space and a line break is rejoined as ' - '.
    """
    if pd.isnull(entry):
        return entry

    boilerplate = '- Thermodynamic data are derived from the results'
    return (
        entry
        .replace(boilerplate, '')
        .strip('.\n ')
        .replace('- \n', ' - ')
    )

def fix_ref_remarks(entry):
    """
    Strip semicolons and spaces from both ends of a reference remark.

    Null entries are returned unchanged; anything else is returned stripped.
    """
    return entry if pd.isnull(entry) else entry.strip('; ')

def isolate_P(entry):
    """
    Pull the pressure annotation out of a remark.

    Searches the remark for the first match of ``P_MATCH`` and returns the
    matched text with 'P=' removed. Returns None for null remarks and for
    remarks with no match.
    """
    if pd.isnull(entry):
        return None

    found = re.search(P_MATCH, entry)
    return found.group().replace('P=', '') if found else None

def standardize_refs(entry):
    """
    Strip leading and trailing whitespace from a reference code.

    Null entries are returned unchanged.
    """
    return entry if pd.isnull(entry) else entry.strip()

def standardize_assessment(entry):
    """
    Expand abbreviated uncertainty assessments to their full wording.

    Null entries are returned unchanged. Periods are removed, then the
    abbreviations 'Approx'/'approx', 'Rel', 'Uncert' (whole value only) and the
    'Very Uncert' / 'V Uncert' / 'VUncert' / 'Vuncert' variants are expanded.

    Each pattern also consumes the rest of the long form when it is already
    present, so an already-expanded value is left as it is instead of being
    expanded again ('Reliable' stays 'Reliable', not 'Reliableiable'). This
    makes the function safe to apply more than once to the same column.
    """
    if pd.isnull(entry):
        return entry

    entry = entry.replace('.', '')

    # The optional suffix keeps long forms intact ('Approximate', 'Approximately').
    entry = re.sub(r'Approx(?:imate)?', 'Approximate', entry)
    entry = re.sub(r'approx(?:imate)?', 'approximate', entry)

    # Only a value that is exactly 'Uncert' is expanded here.
    if entry == 'Uncert':
        entry = 'Uncertain'

    entry = re.sub(r'Rel(?:iable)?', 'Reliable', entry)

    # All "very uncertain" spellings, abbreviated or already complete.
    entry = re.sub(
        r'(?:Very Uncert|V Uncert|VUncert|Vuncert)(?:ain)?',
        'Very uncertain',
        entry,
    )
    if entry == 'Very uncert':
        entry = 'Very uncertain'

    return entry
    
def make_acidity_label(entry, source) -> str:
    """
    Label a pKa-type entry as acidic or basic.

    Returns 'AH' (conjugate acid) when the entry contains 'pKAH', 'B' (base)
    when it contains 'pKB' or the source is 'perrin' / 'perrin_supp', and
    'A' (acid) otherwise. Null entries yield None.
    """
    if pd.isnull(entry):
        return None

    if 'pKAH' in entry:
        return 'AH'

    is_basic = 'pKB' in entry or source == 'perrin' or source == 'perrin_supp'
    return 'B' if is_basic else 'A'
    
# Single-value cleaners are handed to Series.apply directly.
df['remarks'] = df['remarks'].apply(fix_remarks)
df['entry_remarks'] = df['entry_remarks'].apply(fix_entry_remarks)
df['ref_remarks'] = df['ref_remarks'].apply(fix_ref_remarks)
# fix_pka_type needs two columns, so it is applied row by row.
df['pka_type'] = df.apply(lambda row: fix_pka_type(row.pka_type, row.source), axis=1)
# Pressure is read from the remarks cleaned above.
df['pressure'] = df['remarks'].apply(isolate_P)
df['ref'] = df['ref'].apply(standardize_refs)
df['assessment'] = df['assessment'].apply(standardize_assessment)
# The label is computed from the pka_type fixed above.
df['acidity_label'] = df.apply(lambda row: make_acidity_label(row.pka_type, row.source), axis=1)

df.to_csv("all_cat.csv", index=False)
df

Unnamed: 0,entry_#,SMILES,pka_type,pka_value,T,remarks,method,assessment,ref,ref_remarks,entry_remarks,original_IUPAC_names,original_IUPAC_nicknames,name_contributors,num_name_contributors,source,type_of_entry,unique_ID,pressure,acidity_label
0,2004,CO,pK1,15.5,25.0,,C3,Uncertain,B8,,,Methanol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A
1,2004,CO,pK1,15.09,25.0,,KIN,Uncertain,M126,,,Methanol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A
2,2005,COO,pK1,11.5,20.0,,O5,Uncertain,E27,,,Methyl hydroperoxide,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A
3,2006,CS,pK1,10.33,25.0,"1% ethanol, gas solubility method",,Uncertain,K57,,,Methanethiol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A
4,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.14,20.0,In aqueous HClO4 Ho scale,O6,Uncertain,T58a,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A
5,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.06,25.5,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A
6,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.05,9.6,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A
7,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.23,5.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A
8,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.17,20.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A
9,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.11,40.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,in,,,A


### Filter entries and split them into accepted and rejected subsets

In [4]:
# An example of filtering the dataframe.

# Every row gets an ID made of its source name followed by its entry number.
df['unique_ID'] = df['source'] + df['entry_#'].astype(str)

# An entry is filtered out when its SMILES is not a str (e.g. a missing value).
# Code that checks whether the SMILES is a salt (contains a period) can also be
# added here.
missing_smiles = df['SMILES'].map(lambda smiles: not isinstance(smiles, str)).astype(bool)
for _ in range(int(missing_smiles.sum())):
    print(">> Dropping N/A smiles")  # one message per dropped row
df.loc[missing_smiles, 'type_of_entry'] = 'out'
        


>> Dropping N/A smiles


In [5]:
# All entries still flagged 'in', i.e. with converged SMILES
# (high- and medium-confidence values).
in_df = df.loc[df['type_of_entry'].eq('in')].drop(columns=['type_of_entry'])

# Entries flagged 'in' that also have at least two name contributors,
# i.e. multiple translation sources that agree (high confidence).
in_df_2 = df.loc[
    df['type_of_entry'].eq('in') & df['num_name_contributors'].ge(2)
].drop(columns=['type_of_entry'])

# Entries flagged 'out', i.e. the ones that did NOT converge.
out_df = df.loc[df['type_of_entry'].eq('out')].drop(columns=['type_of_entry'])


In [6]:
# Display the entries flagged 'in' (flag column already dropped).
in_df

Unnamed: 0,entry_#,SMILES,pka_type,pka_value,T,remarks,method,assessment,ref,ref_remarks,entry_remarks,original_IUPAC_names,original_IUPAC_nicknames,name_contributors,num_name_contributors,source,unique_ID,pressure,acidity_label
0,2004,CO,pK1,15.5,25.0,,C3,Uncertain,B8,,,Methanol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2004,,A
1,2004,CO,pK1,15.09,25.0,,KIN,Uncertain,M126,,,Methanol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2004,,A
2,2005,COO,pK1,11.5,20.0,,O5,Uncertain,E27,,,Methyl hydroperoxide,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2005,,A
3,2006,CS,pK1,10.33,25.0,"1% ethanol, gas solubility method",,Uncertain,K57,,,Methanethiol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2006,,A
4,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.14,20.0,In aqueous HClO4 Ho scale,O6,Uncertain,T58a,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A
5,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.06,25.5,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A
6,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.05,9.6,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A
7,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.23,5.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A
8,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.17,20.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A
9,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.11,40.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A


In [7]:
# Display the entries flagged 'in' that have at least two name contributors.
in_df_2

Unnamed: 0,entry_#,SMILES,pka_type,pka_value,T,remarks,method,assessment,ref,ref_remarks,entry_remarks,original_IUPAC_names,original_IUPAC_nicknames,name_contributors,num_name_contributors,source,unique_ID,pressure,acidity_label
0,2004,CO,pK1,15.5,25.0,,C3,Uncertain,B8,,,Methanol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2004,,A
1,2004,CO,pK1,15.09,25.0,,KIN,Uncertain,M126,,,Methanol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2004,,A
2,2005,COO,pK1,11.5,20.0,,O5,Uncertain,E27,,,Methyl hydroperoxide,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2005,,A
3,2006,CS,pK1,10.33,25.0,"1% ethanol, gas solubility method",,Uncertain,K57,,,Methanethiol,,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2006,,A
4,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.14,20.0,In aqueous HClO4 Ho scale,O6,Uncertain,T58a,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A
5,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.06,25.5,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A
6,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.05,9.6,Mixed constant,O5,Uncertain,H5,,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A
7,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.23,5.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A
8,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.17,20.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A
9,2007,O=[N+]([O-])C([N+](=O)[O-])[N+](=O)[O-],pK1,0.11,40.0,In aqueous HCl Ho scale,O6,Uncertain,"N39, S82",Thermodynamic quantities are derived from the ...,- 0.02 Thermodynamic quantities are derived f...,"methane, trinitro-",,"['opsin_name1', 'cirpy_name1', 'pubchem_name1']",3.0,serjeant,serjeant2007,,A


In [8]:
# Display the entries flagged 'out'.
out_df

Unnamed: 0,entry_#,SMILES,pka_type,pka_value,T,remarks,method,assessment,ref,ref_remarks,entry_remarks,original_IUPAC_names,original_IUPAC_nicknames,name_contributors,num_name_contributors,source,unique_ID,pressure,acidity_label
20,2010,,pK1,8.65,,I=0.2(NaCl),E3bg,Uncertain,C72,,,Methanohydroxamic acid,Formohydroxamic acid,,,serjeant,serjeant2010,,A


In [9]:
# Write each subset to its own CSV file, without the index column.
for frame, path in (
    (in_df, "SAMPLE_in.csv"),
    (in_df_2, "SAMPLE_in-high-confidence.csv"),
    (out_df, "SAMPLE_rejected.csv"),
):
    frame.to_csv(path, index=False)
