# Define functions and classes - run these before using anything

In [None]:
import pandas as pd
import copy
import re

In [None]:
def my_isnull(datum):
    """Return True when *datum* should be treated as an empty table cell.

    A value counts as empty when pandas considers it null, when it is the
    empty string or the literal string 'nan', or when it is a string made up
    only of commas, dashes and spaces.

    Non-string values that are not null (e.g. numbers parsed by pandas) are
    real data and therefore return False.
    """
    if pd.isnull(datum) or datum == '' or datum == 'nan':
        return True
    try:
        return datum.strip(',- ') == ''
    except AttributeError:
        # Only strings have .strip(); a non-null, non-string scalar such as
        # 4.5 or 25 is an actual value, not a blank cell.
        return False
    
def clean_remarks_whitespace(text):
    """Normalise whitespace and a few notations in a remarks string.

    Blank values (as judged by ``my_isnull``) are returned unchanged.
    Otherwise runs of two or more spaces collapse to exactly two spaces,
    'Conditions-not-stated' is spelled out, and the 'C =', 'C <', 'c <',
    'I <' and 'c =' notations lose their inner space, with 'C=' finally
    lower-cased to 'c='.
    """
    if my_isnull(text):
        return text

    # Raw string: the previous '\ \ +' literal relied on an invalid escape
    # sequence, which newer Python versions warn about.
    text = re.sub(r' {2,}', '  ', text)

    # Applied strictly in this order; the last pair lower-cases every 'C='
    # produced by the earlier replacements.
    replacements = (
        ('Conditions-not-stated', 'Conditions not stated'),
        ("C =", "C="),
        ("C <", "C<"),
        ("c <", "c<"),
        ("I <", "I<"),
        ("c =", "c="),
        ("C=", "c="),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text

In [None]:
class DataEntryBlock(object):
    """A block of text partially parsed from one entry # to another.

    Holds the identifying information of one numbered entry (number, SMILES,
    names, nicknames, description, per-reference comments) together with the
    list of data entries that belong to it.

    Note to self: For numbers, account for repeats. There may be "repeats"
    blocks due to clerical errors.
    """

    def __init__(self, smiles=None, number=None, original_name=None, names=None, description=None, entries=None,
                 refs=None, nicknames=None):
        # Containers left as None get a fresh object per instance so that
        # blocks never share state.
        self.smiles = smiles
        self.number = number
        self.original_name = original_name
        self.refs = {} if refs is None else refs
        self.names = names
        self.description = description
        self.entries = [] if entries is None else entries
        self.nicknames = [] if nicknames is None else nicknames

    def __str__(self):
        template = "# {}: {} [{} | {}], descript. {}, refs {} | {} entries: {}\n"
        return template.format(
            self.number,
            self.smiles,
            self.names,
            self.nicknames,
            self.description,
            self.refs,
            len(self.entries),
            self.entries,
        )

    def export_to_dict(self):
        """Return the block as a nested dict; entries are keyed by position."""
        entries_dict = {position: item.to_dict() for position, item in enumerate(self.entries)}
        names_section = {
            'SMILES': self.smiles,
            'IUPAC': self.names,
            'nicknames': self.nicknames,
        }
        payload = {
            'names': names_section,
            'description': self.description,
            'refs': self.refs,
            'entries': entries_dict,
        }
        return {'data': payload}
    
class DataEntry(object):
    """One data row; DataEntryBlocks "hold" multiple DataEntry objects.

    Every list-like field defaults to a fresh empty list; ``remarks``
    defaults to None. The temperature argument ``t`` is stored as ``self.T``.
    """

    def __init__(self, pkas=None, t=None, remarks = None, method=None, assessment=None, ref=None, subtables=None):
        self.pkas = [] if pkas is None else pkas
        self.T = [] if t is None else t
        self.remarks = remarks
        self.method = [] if method is None else method
        self.assessment = [] if assessment is None else assessment
        self.ref = [] if ref is None else ref
        self.subtables = [] if subtables is None else subtables

    def __str__(self):
        template = "pKa: {} | T: {} | Remarks: {} | Method: {} | Assessment: {} | Ref: {} \n"
        fields = (self.pkas, self.T, self.remarks, self.method, self.assessment, self.ref)
        return template.format(*fields)


In [None]:
def _clean_iupac_name(raw_name):
    """Tidy the punctuation of a raw name and spell Greek letters as symbols."""
    # Order matters: longer dash/period runs must be collapsed before the
    # shorter ones they contain.
    punctuation_fixes = (
        (",---", ","),
        (",--", ","),
        (",-", ","),
        (".--", ","),
        (" .-", ","),
        (" ,", ","),
        (", .", ","),
        (". ", ","),
        (".-", "-"),
        ("---", "-"),
        ("--", "-"),
        ("’", "'"),
        ("”", '"'),
    )
    greek_letters = (
        ("alpha", "α"),
        ("beta", "β"),
        ("gamma", "γ"),
        ("Delta", "Δ"),
        ("delta", "δ"),
    )
    cleaned = raw_name.strip()
    for old, new in punctuation_fixes:
        cleaned = cleaned.replace(old, new)
    cleaned = cleaned.rstrip('.')
    for old, new in greek_letters:
        cleaned = cleaned.replace(old, new)
    return cleaned


def process_data(data):
    """Turn a DataFrame into a list of DataEntryBlock objects.

    Assumes that entry #s are always in the first column, the name in the
    second column and an optional description in the sixth column. A row
    with an entry # starts a new block; a row without one is appended (minus
    its 'Entry #' and 'IUPAC' columns) to the entries of the current block.

    Returns the blocks in the order they were encountered.
    """
    # Placeholder that collects any rows appearing before the first numbered
    # row; it is discarded at the end.
    block = DataEntryBlock(number = None, smiles=None, original_name = None, names = None, nicknames=[], description = None, entries=[],refs={})
    block_list = []
    for _, row in data.iterrows():
        if row.empty:
            continue
        entrynum = row.iloc[0]
        if pd.isnull(entrynum):
            # Continuation row: belongs to the block currently being built.
            block.entries.append(row.drop(['Entry #','IUPAC']))
            continue

        # Try to clean up the IUPAC name
        iupacname = _clean_iupac_name(row.iloc[1])

        try:
            entrynum = int(entrynum)
        except (ValueError, TypeError):
            # Non-numeric entry labels are kept as they are.
            pass
        block_list.append(block) # Add previous block because it's done parsing
        block = DataEntryBlock(number = entrynum, smiles=None, original_name = iupacname, names = None, nicknames = [], description = None, entries=[],refs={})
        description = row.iloc[5]
        if not pd.isnull(description):
            block.description = " - " + description
            print(description)

    block_list.append(block) # Add last block
    del block_list[0] # Remove first (placeholder) block
    return block_list

def sort_block_list(block_list):
    """Return the blocks ordered by the lower-cased string form of their number.

    Blocks are keyed by ``str(block.number)``; when several blocks share a
    number only the last one is kept. The ordering is lexicographic on that
    string, not numeric.
    """
    by_number = {str(item.number): item for item in block_list}
    return [by_number[label] for label in sorted(by_number, key=str.lower)]

### Load in .csvs and concatenate them into 1

In [None]:
# Load the sample table and register it under a string key; the frames in
# `dfs` are concatenated in sorted-key order further down.
df = pd.read_csv("test_sample_out.csv")
dfs = {"1": df}

In [None]:
# Sort through the keys and reorder them when concatenating
# (case-insensitive string order of the keys in `dfs`).
sortedkeys = sorted(dfs, key=str.lower)
#print(sortedkeys)
# Start from an empty frame and stack each table underneath the previous ones.
# The row index of each source frame is kept as is.
df_cat = pd.DataFrame()
for key in sortedkeys:
    df = dfs[key]
    df_cat = pd.concat([df_cat, df])

In [None]:
# Preview the concatenated table (first 999 rows at most).
df_cat.head(999)

In [None]:
# Save the concatenated table without the row index.
df_cat.to_csv("sample_all_concat.csv",index=False)

### Convert and sort blocklist from .csv data

In [None]:
# Parse the concatenated table into a list of DataEntryBlock objects.
blocklist = process_data(df_cat)
#for block in blocklist:
#    print(block)

In [None]:
# Order the blocks by the string form of their entry number.
sorted_blocklist = sort_block_list(blocklist)
#for block in sorted_blocklist:
#    print(block.names)

### Test if every number is represented


In [None]:
# Entry numbers actually present among the parsed blocks.
blocknums = [parsed.number for parsed in sorted_blocklist]

# Entry numbers expected in this sample: 2004 through 2011 inclusive.
allnums = list(range(2004,2012))

In [None]:
# Expected entry numbers for which no block was parsed (empty list = none missing).
list(set(allnums) - set(blocknums))

## Postprocess

### Remove entries that only have remarks. In those cases, add the remarks to the block description.

In [None]:
# Move remark-only rows into the block description.
# An entry whose pKa, pKa type, T, Method, Ref and Assessment fields are all
# blank but which carries Remarks is not a measurement: its remark is added
# as a " - " bullet to the block description and the entry is dropped.
for block in sorted_blocklist:
    kept_entries = []
    for entry in block.entries:
        other_fields_blank = all(
            my_isnull(entry[field])
            for field in ('pKa', 'pKa type', 'T', 'Method', 'Ref', 'Assessment')
        )
        if other_fields_blank and not my_isnull(entry['Remarks']):
            if block.description is None:
                block.description = " - " + entry['Remarks']
            else:
                block.description = block.description + "\n - " + entry['Remarks']
        else:
            kept_entries.append(entry)
    # Rebuild the list after the loop: popping from block.entries while
    # enumerating it made the loop skip the entry following every removal.
    block.entries[:] = kept_entries

### Replace many whitespaces with just two and clean up some stuff

In [None]:
# Run every entry's Remarks field through clean_remarks_whitespace, in place.
for block in sorted_blocklist:
    for entry in block.entries:
        entry['Remarks'] = clean_remarks_whitespace(entry['Remarks'])


### Deconvolute IUPAC names into ensemble of names

In [None]:
# Regex for the leading chemical-name word of an entry name, optionally
# followed by one of the listed suffix words (" acid", " chloride", ...).
# The capital letter that starts the core word may be anything except 'N'
# (character class [\.A-MO-Z]).
WORD_PATTERN = r"[a-z0-9\-,A-Z\(\)']*-?[\.A-MO-Z][0-9\-a-z']+[\[\]\.\'\"\’\”HNOa-z,\-0-9\(\)]*(?: acid| chloride| bromide| dichloride| fluoborate| acetate| iodide| oxime| dihydrogen phosphate| dihydrogen phosphorothioate| phosphoramidate| dioxime| thiocyanate)?\)?"
# Same idea, but the core word may also start with 'N' ([\.A-Z]); the trailing
# character class here does not include the curly quote characters.
WORD_PATTERN_WITH_N = r"[a-z0-9\-,A-Z\(\)']*-?[\.A-Z][0-9\-a-z']+[\[\]\.\'\"HNOa-z,\-0-9\(\)]*(?: acid| chloride| bromide| dichloride| fluoborate| acetate| iodide| oxime| dihydrogen phosphate| dihydrogen phosphorothioate| phosphoramidate| dioxime| thiocyanate)?\)?"
# Matches a parenthesised group at the very end of the name (preceded by a
# space, optionally followed by a single '-'); one nested "(...)" is allowed.
# NOTE(review): ",-_" inside the character classes is a range from ',' to '_',
# not three literal characters - confirm that this is intended.
NICKNAME_PATTERN = r"\ \([,-_\w\ 0-9']*\(?[,-_\w\ 0-9]*\)?[\"’,'-_\w0-9]*\)[\-]?$"
# Earlier variant, kept for reference (its last character class also allowed spaces):
#NICKNAME_PATTERN = r"\ \([,-_\w\ 0-9']*\(?[,-_\w\ 0-9]*\)?[\",'-_\w\ 0-9]*\)[\-]?$"
def lowercase_func(s, ignore_n = True):
    """Lower-case the first character of *s*, leaving the rest untouched.

    With ``ignore_n`` left at True a leading 'N' is kept as is; pass False to
    lower-case it as well. Afterwards every "o,O" is restored to "O,O" and
    every "n,N" to "N,N", wherever it occurs in the string.
    """
    head, tail = s[:1], s[1:]
    if head and (head != 'N' or ignore_n == False):
        head = head.lower()
    lowered = head + tail

    # special cases: "O,O" and "N,N" locant prefixes
    return lowered.replace("o,O","O,O").replace("n,N","N,N")

In [None]:
def find_nickname(name, debug=False):
    """Split a trailing parenthesised nickname off *name*.

    Returns ``(name, nicknames)``: the name with every NICKNAME_PATTERN match
    removed, and the list of matched nicknames with surrounding parentheses,
    spaces and dashes stripped and unbalanced parentheses repaired. When
    nothing matches, the name is returned unchanged with an empty list.
    With ``debug`` set, the raw regex matches are printed.
    """
    # special case: name starts with parentheses (restored after stripping)
    leading_paren = name[0] == '('

    found = re.findall(NICKNAME_PATTERN, name)
    trimmed = [hit.strip(' ') for hit in found]

    # remove the matching nickname from the full name
    for hit in trimmed:
        name = name.replace(hit, '').strip('()').rstrip(' ')
    if found and leading_paren:
        name = '(' + name

    nicknames = [fix_nickname_parentheses(hit.strip('( )-')) for hit in trimmed]

    if debug:
        print(found)
    return name, nicknames

In [None]:
def fix_nickname_parentheses(nickname):
    """Add at most one parenthesis so that *nickname* is better balanced.

    If there are more '(' than ')' a ')' is appended; if there are more ')'
    than '(' a '(' is prepended; otherwise the string is returned unchanged.
    """
    opening = nickname.count('(')
    closing = nickname.count(')')
    if opening > closing:
        return nickname + ')'
    if opening < closing:
        return '(' + nickname
    return nickname

In [None]:
# List out the manual names: names in this list are returned by process_name
# as-is (after nickname extraction) instead of being decomposed.
MANUAL_NAMES = ['Desferriferrioxamin B, N-acetyl-', 'Uridylic polynucleotide, 5-bromo-',
               ]

# Manual overrides consulted first by process_name: maps a raw name to a dict
# with the keys 'Names' and 'Nicknames'. Currently empty.
NICKNAMES_DICT = {}



In [None]:
def process_name(name, debug=False):
    """Derive candidate names and nicknames from a raw entry name.

    Returns ``(names, nicknames)``, both lists. Resolution order:

    1. A name found in NICKNAMES_DICT returns the stored lists directly.
    2. Up to three trailing nicknames are split off with find_nickname.
    3. A name listed in MANUAL_NAMES is returned unchanged.
    4. Single-word names, "For ... see ..." references and names matched by
       the "simple case" keyword test are kept unchanged.
    5. Otherwise the name is split into a leading word (WORD_PATTERN or
       WORD_PATTERN_WITH_N) and the remaining descriptor, and two spellings
       are produced: "leading, descriptor-" and "descriptor" joined with the
       leading word.

    With ``debug`` set, the leading word and descriptor are printed.
    """
    names = []

    # A few species are tricky to parse for both the nickname AND the full
    # name; those are looked up verbatim and returned without further parsing.
    if name in NICKNAMES_DICT.keys():
        names = NICKNAMES_DICT[name]['Names']
        nicknames = NICKNAMES_DICT[name]['Nicknames']
        return names, nicknames
    
    
    # First, just search for nicknames. e.g. Methanal (Formaldehyde) --> Formaldehyde
    # find_nickname only strips a group at the end of the string, so it is
    # called again (at most three times in total) to peel off further groups.
    name, nicknames = find_nickname(name)
    if len(nicknames) > 0:
        name, nicknames_append = find_nickname(name)
        if nicknames_append:
            nicknames.append(nicknames_append[0])
            name, nicknames_append = find_nickname(name)
            if nicknames_append:
                nicknames.append(nicknames_append[0])
            
    # check if it's in the Manual Names list
    if name in MANUAL_NAMES:
        names = [name]
        return names, nicknames

    split_words = name.split()
    # If we just have 1 entry, or it's a "reference" entry, then no need to postprocess.
    if len(split_words) == 1 or ("see" in split_words and "For" in split_words):
        names.append(name)
        #print(name)

        
    # simple cases e.g. "Sodium chloride" shouldn't be decomposed:
    # either several words with no comma and no dash at all, or a name that
    # contains one of the keywords below, has no ", " and does not end in ','.
    elif (len(split_words) >= 2 and (',' not in name and '-' not in name)) or (any([x in name for x in ['ketone','phosphate','acid','trioxime','phosphorodithioate',
                                                                                                      'cation','hydrobromide','ester','dioxide', 'Phenyl',
                                                                                                      'Benzyl', 'O-Butyl', 'O-Isopropyl', 'O-Isobutyl',
                                                                                                      'chloride','Red','Purple','deoxystreptamine', 
                                                                                                      'Methy', 'Ethyl', 'phosphonothioate', "Diethyl",
                                                                                                      'Hydroxyphenyl', 'Chlorobenzaldehyde',
                                                                                                      'hydroperoxide','dioxime', 
                                                                                                      'isobutylphosphonothioate']]) and (', ' not in name and name[-1] != ',')):
        names.append(name)
#        print(name)
    # Otherwise... Try to identify the name as a format of [Main chemical name,] + [descriptor-]
    else:
        # Names containing "Naphth" or "Nonane" need the pattern that accepts a
        # leading capital N, and their first letter is lower-cased even if it is 'N'.
        if "Naphth" in name or "Nonane" in name:
            match = re.search(WORD_PATTERN_WITH_N,name) 
            ignore_n = False
        else:
            match = re.search(WORD_PATTERN,name) 
            ignore_n = True
        if match is not None: # just take the first name as this can lead to multiple matches
            leading_name = match.group().strip(', -')
            # Whatever is left once the leading word is removed is the descriptor.
            substring = name.replace(leading_name,'').strip(', ')
            if len(substring.replace(' ','')) > 0: # basically if it's not just blanks
            #print(leading_name + ' | ' + substring + ' | ' + block.name.strip())
            
            # Permutation one: [descriptor-main_chemical_name]
            
                descriptor_name = substring.rstrip('-')+lowercase_func(leading_name, ignore_n = ignore_n).rstrip(',')
                descriptor_name_dash = substring.rstrip('-')+'-'+lowercase_func(leading_name, ignore_n = ignore_n).rstrip(',')
                # Index-style spelling "main name, descriptor-" is always listed first.
                original_name = lowercase_func(leading_name, ignore_n = ignore_n).rstrip(',') + ', ' + substring.rstrip('-')+'-'
                names.append(original_name)
                # A leading word that starts with a digit is joined to the
                # descriptor with a dash; otherwise the two are concatenated.
                if leading_name[0].isdigit():
                    names.append(descriptor_name_dash)
                else:
                    names.append(descriptor_name)
            else:
                names.append(leading_name.rstrip(','))
            if debug:
                print('Lead: '+ leading_name)
                print('Substring: '+substring)
        else:
            # No leading word recognised: keep the name as it is.
            names.append(name)
    
    return names, nicknames

In [None]:
# Derive the candidate names and nicknames of every block from its cleaned
# original name.
for block in sorted_blocklist:
    block.names, block.nicknames = process_name(block.original_name)

### Copy methods/other data if no reference for entry. Also, add "source-specific" comments to the reference description e.g. Thermodynamic quantities are derived from the results or Other measurements

In [None]:
# "Other <word>: ..." remarks, e.g. a remark starting "Other values: ...".
OTHER_PATTERNS = r'Other [a-z]*:[ A-Za-z0-9,]*[.]?'
# "Other <words> in ..." remarks.
OTHER_PATTERNS_INCLUSIVE = r'Other [a-z ]* in [ A-Za-z0-9,]*[.]?'
# "For ... see ..." cross-reference remarks.
FOR_PATTERNS = r'For [ \[\]a-z-A-Z0-9\'.\,]*see[ \[\]a-zA-Z0-9\'.]*([A-Z]*[0-9]*[a-z]*[, ]?)*[.]?'
# Remarks that describe the cited source rather than a single measurement.
# They are matched as substrings, in list order, so a longer phrase must come
# before any shorter phrase it contains.
REF_COMMENTS = ['Thermodynamic quantities for the keto and enol forms are derived from the results',
                'Thermodynamic quantities are derived from the results for both enol and keto forms',
                'Thermodynamic quantities are derived from the results', 
                'Thermodynamic quantities are also given',
                'Thermodynamic data are also given',
                'Thermodynamic quantities also given',
                # The comma after the next item was missing, which silently
                # fused it with the following string so neither could match.
                'Thermodynamic values are derived from the results',
                'Thermodynamic quantities are derived from these results',
                'Thermodynamic quantities are derived from the results for both enol and keto forms',
                'Thermodynamic quantities are given',
                'Values in other inert salt solutions are also given',
                'Values in mixed solvents are also given',
                'pK assignment discussed',
                'Value in mixed solvent is also given',
                'Thermodynamic quantities are derived from the results.2-Azapropane-1,3-diphosphonic acid, 2-ethyl- (Iminodi(methylphosphonic acid) N-ethyl-)'
                
                
                
               ]

In [None]:
# Pass 1: fill in fields that an entry inherits from the entry above it and
# collect source-level comments per reference in block.refs.
# NOTE(review): `block.entries[count-1]` refers to the LAST entry of the block
# when count == 0 - confirm that a first entry never has a blank Ref/Remarks.
for block in sorted_blocklist:
    for count, entry in enumerate(block.entries):
        
        ## ONLY TAG THE REFS
        if my_isnull(entry['Ref']):
            # No reference given: the row continues the previous entry, so it
            # inherits that entry's reference and any missing T/Method/Assessment.
            block.entries[count]['Ref'] = block.entries[count-1]['Ref']
            # Check for other missing stuff.
            if my_isnull(entry['T']) and not my_isnull(block.entries[count-1]['T']):
                # An inherited temperature is marked "(assumed)" exactly once.
                if '(assumed)' in str(block.entries[count-1]['T']):
                    add_string = ''
                else:
                    add_string = " (assumed)"
                entry['T'] = str(block.entries[count-1]['T']) + add_string
            if my_isnull(entry['Method']):
                entry['Method'] = block.entries[count-1]['Method']
            if my_isnull(entry['Assessment']):
                entry['Assessment'] = block.entries[count-1]['Assessment'] 

        else:
            # NOTE(review): this resets the comment list every time the same
            # reference appears again in the block - confirm that is intended.
            block.refs[entry['Ref']] = []
            
        if not my_isnull(entry['Remarks']):
            # Move known source-level comments from the remarks to the reference.
            for ref_comment in REF_COMMENTS:
                if ref_comment in entry['Remarks']:
                    try:
                        block.refs[entry['Ref']].append(ref_comment)
                    except KeyError:
                        block.refs[entry['Ref']] = [ref_comment]
                    entry['Remarks'] = entry['Remarks'].replace(ref_comment,'')
            # Other refs...
            for pattern in [OTHER_PATTERNS, OTHER_PATTERNS_INCLUSIVE, FOR_PATTERNS]:
                match = re.search(pattern,entry['Remarks'])
                if match is not None:
                    try:
                        block.refs[entry['Ref']].append(match.group())
                    except KeyError:
                        # Store a list, like every other path does; a bare
                        # string here would break later .append() calls.
                        block.refs[entry['Ref']] = [match.group()]
                    entry['Remarks'] = entry['Remarks'].replace(match.group(),'')
                    print(str(block.number) + " " + str(block.refs))            
                    print(str(block.refs))
                    
        # NOW MODIFY THE ENTRIES
        # Check for other missing stuff.
        # Remarks that are blank (or reduced to a lone period by the
        # replacements above) are inherited from the previous entry.
        if my_isnull(entry['Remarks']) or entry['Remarks'] == '.' or entry['Remarks'] == '. ':
            entry['Remarks'] = block.entries[count-1]['Remarks']
#        else:
#            block.refs[entry['Ref']] = []

# Iterate one more time. If the thermo remarks are in the remarks, then drop it
# If thermo is in the block.description, then move it to the block's LAST ref.
for block in sorted_blocklist:
    for count, entry in enumerate(block.entries):
        if not my_isnull(entry['Remarks']):
            for ref_comment in REF_COMMENTS:
                # check if in remarks (these were inherited from the previous
                # entry above, after that entry's comments had been recorded)
                if ref_comment in entry['Remarks']:
                    block.entries[count]['Remarks'] = entry['Remarks'].replace(ref_comment,'')
    # check if in description
    if block.description:
        for ref_comment in REF_COMMENTS:
            if ref_comment in block.description:
                # NOTE(review): raises IndexError when the block has no refs.
                ref_to_change = list(block.refs)[-1]
                block.refs[ref_to_change].append(ref_comment)
                block.description = block.description.replace(' - '+ref_comment, '')
            else:
                pass
            
    # Lastly, if there's "Other measurements" in a ref's comments, then move it to the block
    for ref in block.refs:
        if len(block.refs[ref]) > 0:
            for ct3, content in enumerate(block.refs[ref]):
                for pattern in [OTHER_PATTERNS, OTHER_PATTERNS_INCLUSIVE, FOR_PATTERNS]:
                    match = re.search(pattern,content)
                    if match is not None:
                        # The matched text leaves the reference comment and
                        # becomes a " - " bullet of the block description.
                        block.refs[ref][ct3] = content.replace(match.group(), '')
                        if block.description is None:
                            block.description = " - " + match.group()
                        else:
                            block.description = block.description + "\n - " + match.group()


### If many pKas for an entry, separate them into separate pKas. 

In [None]:
# Split entries that list several comma-separated pKa values into one entry
# per value, pairing each value with the pKa type at the same position.
for block in sorted_blocklist:
    for count, entry in enumerate(block.entries):
        try:
            pkas = str(entry['pKa']).split(',')
            pka_types = str(entry['pKa type']).split(',')
            # A type containing '__0' is a placeholder for a plain "pKa".
            for ct, pka_type in enumerate(pka_types):
                if '__0' in pka_type:
                    pka_types[ct] = 'pKa'
                    if len(pka_types) == 1:
                        block.entries[count]['pKa type'] = 'pKa'
            if len(pkas) != len(pka_types):
                print("Erroneous pKa tabulation in entry # {}".format(block.number))
            if len(pkas) > 1:
                entry_list = []
                for num, pka in enumerate(pkas):
                    duplicate = entry.copy()
                    duplicate['pKa type'] = pka_types[num].strip()
                    duplicate['pKa'] = pka.strip()
                    entry_list.append(duplicate)
                # Replace exactly the one original entry with its copies. The
                # previous slice (count:count+len(pkas)-1) also overwrote the
                # following entries whenever there were three or more pKas.
                # The copies are visited by the remaining iterations, where
                # they hold a single pKa and are left alone.
                block.entries[count:count+1] = entry_list
        except AttributeError:
            pass

### Clean entry comments

In [None]:
# Final tidy-up of the Remarks field: a remark that is only "." or "-" becomes
# empty; in every other remark a double space becomes ", ".
for block in sorted_blocklist:
    for entry in block.entries:
        raw_remarks = entry['Remarks']
        remarks = str(raw_remarks)
        print(remarks)
        if remarks.strip() in ('.', '-'):
            entry['Remarks'] = ''
            continue
        try:
            entry['Remarks'] = raw_remarks.replace("  ", ", ")
        except AttributeError:
            # Non-string remarks (no .replace method) are left untouched.
            pass

##  DONE: Tool for adding subtables

In [None]:
import ast

In [None]:
def assign_remarks(entry_duplicate, x, running_t, running_p, running_i):
    """Record the varied quantity *x* on a duplicated entry and return it.

    The first flag that is set decides where *x* goes:
    temperature (``running_t``) overwrites the 'T' field; pressure
    (``running_p``) adds a "P=<x> (<P_units>)" line to 'Remarks', using the
    module-level ``P_units``; ionic strength (``running_i``) adds an "I=<x>"
    line to 'Remarks'. With no flag set the entry is returned unchanged.
    """
    # Case 1: Temperature
    if running_t:
        entry_duplicate['T'] = x
        return entry_duplicate

    # Cases 2 and 3: pressure / ionic strength go into the remarks.
    if running_p:
        note = 'P=' + x + ' ({})'.format(P_units)
    elif running_i:
        note = 'I=' + str(x)
    else:
        return entry_duplicate

    if my_isnull(entry_duplicate['Remarks']):
        entry_duplicate['Remarks'] = note
    else:
        entry_duplicate['Remarks'] += '\n' + note
    return entry_duplicate

In [None]:
# Per-row lambda_0 values written into the remarks of entry 2163 when its
# subtable is expanded (indexed by the position of the x value).
lambdas = ['275.89', '305.32', '334.77', '364.06', '421.97', '450.21', '478.10', '505.48']

In [None]:
# Expand every "(Subtable)" cell into individual entries.
# A subtable is a Python-literal list: a header string naming the varied
# quantity, then rows that are either lists/strings of x values or dicts
# whose values are the pKa readings for those x values.
#block_numbers = []
processed_string='already_processed'

for block in sorted_blocklist:
    for count, entry in enumerate(block.entries):
        if not my_isnull(entry['(Subtable)']) and entry['(Subtable)'] != processed_string:
            running_t = False
            running_p = False
            running_i = False
            running_lambda = False

            # Normalise straight/curly single quotes to double quotes before
            # parsing. The backslash is written explicitly ('\\’') because
            # '\’' is an invalid escape sequence.
            z = entry['(Subtable)'].replace('\'','"').replace('\\’','"').replace('’','"').replace('‘','"')            
            subtable_as_list = ast.literal_eval(z)
#            z2 = json.loads(z) 
    
            print(block.number)
            variation_header = subtable_as_list[0]
            print(variation_header)

            # check headers
            if 'with temperature' in variation_header:
                running_t = True
            elif 'pressure' in variation_header:
                running_p = True
                # Module-level name read by assign_remarks. Strips backslashes
                # and parentheses from both ends (same character set as before).
                # NOTE(review): whitespace is not stripped, so a header like
                # "Variation with pressure (bar)" leaves " (bar" - confirm.
                P_units = variation_header.replace('Variation with pressure','').strip('\\()')
            elif 'with I' in variation_header or 'ionic strength' in variation_header:
                running_i = True
            else:
                raise ValueError("unknown x variable type")

            end_dict = {}
            x_var_list = []
            # iterate through the rows of the subtable
            
            for i in range(1,len(subtable_as_list)):
                row = subtable_as_list[i]
#                print(row)
                if isinstance(row, str):
                    # A whitespace-separated string is treated as a list of x values.
#                    if row != 'verbatim':
                    row = row.split()
                if isinstance(row, list):
                    # This case defines the x variable, e.g. pressure or temperature                
                    x_var_list.append(row)
                    
                elif isinstance(row, dict):
                    # These are the pkas: the first key names the series and
                    # all values of the dict are its readings.
                    t = list(row.items())
                    archetype = t[0][0]
                    list_to_add = []
                    for item in t:
                        list_to_add.append(item[1])
                    end_dict[archetype] = list_to_add
                    
                else:
#                    if row != 'verbatim':
                    raise ValueError("unknown row type: {}".format(row))
            
            # Now, loop everything together
            print(x_var_list)
            print(end_dict)
            end_list = list(end_dict.items())
            print()
            # y dict: {'pk1': [1, 2, 3], 'pk2': [4, 5, 6]}
            
            # Clean subtable tag. this needs to go first, so that the copies
            # appended below (and visited later by this same loop) are already
            # marked as processed.
            block.entries[count]['(Subtable)'] = processed_string
            # APPEND ENTRIES
            
            # Case A: just 1 temp list
            if len(x_var_list) == 1:
                for pka_list in end_list: # e.g. [(pK1, [3.1, 3.2, 3.3]), (pk2, [1.2, 2.2, 3.2])]                    
                    for ct_x, x in enumerate(x_var_list[0]): #e.g. [5, 10, 15]
                        entry_duplicate = copy.deepcopy(entry)
                        entry_duplicate = assign_remarks(entry_duplicate, x, running_t, running_p, running_i)
                        if pka_list[0] == 'lambda_0':
                            # lambda_0 series are not pKa values; no entry is added.
                            pass
                        else:
                            entry_duplicate['pKa type'] = pka_list[0]
                            entry_duplicate['pKa'] = pka_list[1][ct_x]
                            if block.number == 2163:
                                # Entry 2163 gets a row-specific lambda_0 in place
                                # of the single value in its remarks.
                                entry_duplicate['Remarks'] = entry_duplicate['Remarks'].replace('lambda_0=393.23','')
                                entry_duplicate['Remarks'] += '\nlambda_0={}'.format(lambdas[ct_x])
    #                        print(entry_duplicate)
                            block.entries.append(entry_duplicate)
            
            # Case B: multiple temp lists. Assume temp list 1 = first pKa, temp list 2 = second pKa
            elif len(x_var_list) > 1:
                for ct_xlist, x_list in enumerate(x_var_list): # e.g. [[5, 10, 15], [5, 15, 20, 25]]:
                    for ct_x, x in enumerate(x_list): #e.g. [5, 10, 15]
                        entry_duplicate = copy.deepcopy(entry)
                        entry_duplicate = assign_remarks(entry_duplicate, x, running_t, running_p, running_i)
                        entry_duplicate['pKa type'] = end_list[ct_xlist][0]
                        entry_duplicate['pKa'] = end_list[ct_xlist][1][ct_x]
#                        print(entry_duplicate)
                        block.entries.append(entry_duplicate)
            
            else:
                raise ValueError("x_var_list = 0??")


# Tool for alphabetizing names

In [None]:
# Build a table with one row per block: its number, up to two names and up to
# three nicknames. Missing slots are filled with ''.
block_nums = [blk.number for blk in sorted_blocklist]
names = [
    [blk.names[slot] if slot < len(blk.names) else '' for blk in sorted_blocklist]
    for slot in range(2)
]
nicknames = [
    [blk.nicknames[slot] if slot < len(blk.nicknames) else '' for blk in sorted_blocklist]
    for slot in range(3)
]

# 'Name 2' (names[1]) is collected but deliberately left out of the table.
table_columns = {
    'Entry #': block_nums,
    'Name 1': names[0],
    'Nickname 1': nicknames[0],
    'Nickname 2': nicknames[1],
    'Nickname 3': nicknames[2],
}
df = pd.DataFrame.from_dict(table_columns)

In [None]:
# Display the names table.
df

In [None]:
# Save the names table without the row index.
df.to_csv("sample_names.csv",index=False)

# Tool for assigning SMILES to DataEntryBlocks, need RDkit for this

## At this point in the workflow, I would manually create a spreadsheet ("names/sample_names_OUT.csv") that also has columns for OPSIN predictions. This is included in the demo for your convenience.

In [None]:
from rdkit import Chem
import time
import pandas as pd
import os
import pubchempy as pcp
import cirpy
from urllib.error import HTTPError

In [None]:
def get_from_pubchempy(name):
    """Look up *name* on PubChem and return its SMILES if it is unambiguous.

    Returns None for a null name, and also when the lookup yields no compound
    or compounds with differing isomeric SMILES. Each lookup is preceded by a
    0.5 s pause.
    """
    if pd.isnull(name):
        return None

    query = str(name)
    time.sleep(0.5)

    # Distinct isomeric SMILES over all compounds returned for this name.
    candidates = {hit.isomeric_smiles for hit in pcp.get_compounds(query, 'name')}
    if len(candidates) != 1:
        return None
    return next(iter(candidates))
    
def get_from_cirpy(name):
    """Resolve *name* to SMILES through CIRpy's 'name_by_cir' resolver.

    Returns None for a null name, '' when the request fails with an
    HTTPError, and otherwise whatever ``cirpy.resolve`` returns. Each lookup
    is preceded by a 0.5 s pause.
    """
    if pd.isnull(name):
        return None

    query = str(name)
    time.sleep(0.5)
    try:
        return cirpy.resolve(query, 'smiles', ['name_by_cir'])
    except HTTPError:
        return ''

In [None]:
# Load the curated names spreadsheet once. When this cell is re-run in the same
# session the already-loaded (and possibly extended) table is kept.
# The previous version assigned None first and then tested
# `'names_df' not in locals()`, which is never true, so nothing was ever loaded.
names_df = globals().get('names_df')
if names_df is None:
    names_df = pd.read_csv(os.path.join(os.getcwd(),"names","sample_names_OUT.csv"))
else:
    print("names_df already loaded")

In [None]:
# Display the loaded names spreadsheet.
names_df

### Create 'converged_smiles'

In [None]:
# Fill in the PubChem SMILES columns. A column that already exists in the
# spreadsheet is left alone, so the lookups are not repeated.
for target_column, source_column, banner in (
    ('pubchem_name1', 'Name 1', "Resolving Name 1..."),
    ('pubchem_name2', 'Name 2', "Resolving Name 2..."),
    ('pubchem_nickname1', 'Nickname 1', "Resolving nickname 1..."),
    ('pubchem_nickname2', 'Nickname 2', "Resolving nickname 2..."),
):
    if target_column in names_df:
        continue
    print(banner)
    names_df[target_column] = names_df[source_column].apply(lambda x: get_from_pubchempy(x))
    
    

In [None]:
# Fill in the CIRpy SMILES columns. A column that already exists in the
# spreadsheet is left alone, so the lookups are not repeated.
# The progress message for 'Nickname 2' previously said "Nickname1".
for target_column, source_column, banner in (
    ('cirpy_name1', 'Name 1', "Resolving Name1..."),
    ('cirpy_name2', 'Name 2', "Resolving Name2..."),
    ('cirpy_nickname1', 'Nickname 1', "Resolving Nickname1..."),
    ('cirpy_nickname2', 'Nickname 2', "Resolving Nickname2..."),
):
    if target_column not in names_df:
        print(banner)
        names_df[target_column] = names_df[source_column].apply(get_from_cirpy)


In [None]:
# Write the resolved columns back to the spreadsheet. index=False keeps the
# row index out of the file; otherwise every load/save cycle of this same
# file would add another "Unnamed: 0" column.
names_df.to_csv(os.path.join(os.getcwd(),"names","sample_names_OUT.csv"),index=False)


In [None]:
# Columns of names_df that each hold one SMILES prediction for the entry.
headers = ['OPSIN_name1', 
#           'OPSIN_name2', 
           'OPSIN_nickname1', 
           'OPSIN_nickname2',
           'cirpy_name1',
#           'cirpy_name2',
           'cirpy_nickname1',
           'cirpy_nickname2',
           'pubchem_name1',
#           'pubchem_name2',
           'pubchem_nickname1',
           'pubchem_nickname2',
           'chemaxon_name1',
           'chemaxon_nickname1'
          ]
inconsistent_ct = 0
missing_ct = 0
converged_ct = 0

# For every row, canonicalise all available predictions and decide whether
# they agree. Writes the columns 'type', 'converged_smiles', 'contributors'
# and 'num_contributors'.
# NOTE(review): rows are addressed with the enumerate counter via .loc, which
# assumes names_df has the default 0..n-1 index - confirm.
for ct, row in enumerate(names_df.iterrows()):
    contributors = []
    smiles_list = []
    for header in headers:
        smiles = row[1][header]
        if pd.isnull(smiles) or smiles == '':
            continue
        # MolFromSmiles returns None for a SMILES it cannot parse; such a
        # prediction is skipped and does not count as a contributor.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        smiles_list.append(Chem.MolToSmiles(mol,isomericSmiles=True))                
        contributors.append(header)
            
    # compare with manual SMILES also
    smiles_set = set(smiles_list)
    

    if len(smiles_set) > 1:
        # Algorithm: Let's say there are only 2 entries. And one of them is the isomeric version.
        # In that case, just append the isomeric version.
        if len(smiles_set) == 2:
            smiles_set_unwrapped = list(smiles_set)
            mol0 = Chem.MolFromSmiles(smiles_set_unwrapped[0])
            mol1 = Chem.MolFromSmiles(smiles_set_unwrapped[1])
            # Candidate 0 with its stereo information removed equals candidate 1:
            # candidate 0 is the more specific (isomeric) form.
            if Chem.MolToSmiles(mol0, isomericSmiles=False) == Chem.MolToSmiles(mol1):
                converged_smiles = smiles_set_unwrapped[0]
                print("Isomeric: {} for molecule {}".format(converged_smiles, row[1]['Name 1']))
                names_df.loc[ct, 'type'] = 'isomeric'
                converged_ct += 1
            # Same test the other way round.
            elif Chem.MolToSmiles(mol1, isomericSmiles=False) == Chem.MolToSmiles(mol0):
                converged_smiles = smiles_set_unwrapped[1]
                print("Isomeric: {} for molecule {}".format(converged_smiles, row[1]['Name 1']))
                names_df.loc[ct, 'type'] = 'isomeric'
                converged_ct += 1
            else:
                converged_smiles = ''
                inconsistent_ct += 1
                print("error with SMILES set: {} | molecule {}".format(smiles_set, row[1]['Name 1']))
                names_df.loc[ct, 'type'] = 'inconsistent'
        else:        
            converged_smiles = ''
            names_df.loc[ct, 'type'] = 'inconsistent'
            print("error with SMILES set: {} | molecule {}".format(smiles_set, row[1]['Name 1']))
            inconsistent_ct += 1
    elif len(smiles_set) == 0:
        names_df.loc[ct, 'type'] = 'missing'
        converged_smiles = ''
        missing_ct += 1
    else:
        names_df.loc[ct, 'type'] = 'converged'
        converged_smiles = smiles_set.pop()
        converged_ct += 1
    names_df.loc[ct,'converged_smiles'] = converged_smiles
    # Contributors are only listed when a SMILES was actually agreed on.
    if len(converged_smiles) > 0:
        names_df.loc[ct, 'contributors'] = str(contributors)
    else:
        names_df.loc[ct, 'contributors'] = ''
    names_df.loc[ct, 'num_contributors'] = int(len(contributors))


In [None]:
# Report the raw counts, then each count as a fraction of all rows.
print(inconsistent_ct)
print(missing_ct)
print(converged_ct)
for template, tally in (
    ("Inconsistent total: {}", inconsistent_ct),
    ("Missing total: {}", missing_ct),
    ("Converged total: {}", converged_ct),
):
    print(template.format(tally / (converged_ct + missing_ct + inconsistent_ct)))
names_df.head(99)

### print out csv

In [None]:
# Map entry number -> [SMILES, contributors, number of contributors] for every
# row that ended up with a non-blank converged SMILES.
entry_dict = {
    record['Entry #']: [
        record['converged_smiles'],
        record['contributors'],
        record['num_contributors'],
    ]
    for _, record in names_df.iterrows()
    if not my_isnull(record['converged_smiles'])
}

In [None]:
# Display the entry-number -> SMILES mapping.
entry_dict

In [None]:
# Attach the converged SMILES and its provenance to each block; blocks whose
# number has no converged SMILES get None for all three attributes.
for block in sorted_blocklist:
    record = entry_dict.get(block.number)
    if record is None:
        block.smiles = None
        block.contributors = None
        block.num_contributors = None
        continue
    block.smiles = record[0]
    block.contributors = record[1]
    block.num_contributors = record[2]
    print(block.smiles)
    

In [None]:
# Print the one-line summary (DataEntryBlock.__str__) of every block.
for block in sorted_blocklist:
    print(block)

### Convert to csv

In [None]:
def ref_remarks_wrapper(refs, entryref):
    """Return the comments stored in *refs* for *entryref*, or "" if the reference is blank."""
    return "" if my_isnull(entryref) else refs[entryref]

def unpack(strlist):
    """Unpack a list into a string separated by semicolons.

    Only exact ``list`` instances are joined (with "; "); an empty list gives
    ''. Any other value is returned unchanged.
    """
    if type(strlist) != list:
        return strlist
    if not strlist:
        return ''

    pieces = iter(strlist)
    joined = next(pieces)
    for piece in pieces:
        joined += "; " + piece
    return joined

In [None]:
# Flatten every entry of every block into one output row.
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call before that).
output_columns = ['entry_#','SMILES','pka_type','pka_value','T','remarks','method','assessment','ref','ref_remarks','entry_remarks','original_IUPAC_names']

records = []
for block in sorted_blocklist:
    for entry in block.entries:
        # Entry fields are read by position (.iloc) rather than with the
        # deprecated integer fallback of Series[...]: 0-6 are, in order, the
        # values written to pka_type, pka_value, T, remarks, method,
        # assessment and ref.
        records.append({
            "entry_#": block.number,
            "SMILES": block.smiles,
            "pka_type": entry.iloc[0],
            "pka_value": entry.iloc[1],
            "T": entry.iloc[2],
            "remarks": entry.iloc[3],
            "method": entry.iloc[4],
            "assessment": entry.iloc[5],
            "ref": entry.iloc[6],
            "ref_remarks": unpack(ref_remarks_wrapper(block.refs, entry.iloc[6])),
            "entry_remarks": block.description,
#            "original_IUPAC_names": unpack(block.names),
            "original_IUPAC_names": block.names[0],
            "original_IUPAC_nicknames": unpack(block.nicknames),
            # These two attributes are only set by the SMILES-assignment step;
            # fall back to None if that step was skipped.
            "name_contributors": getattr(block, 'contributors', None),
            "num_name_contributors": getattr(block, 'num_contributors', None),
        })

if records:
    # Base columns first, then the three extra keys, as row-by-row appending produced.
    df = pd.DataFrame(records, columns=output_columns + ['original_IUPAC_nicknames', 'name_contributors', 'num_name_contributors'])
else:
    df = pd.DataFrame(columns=output_columns)

In [None]:
# Display the final flattened table.
df

In [None]:
# Save the final table without the row index.
df.to_csv("sample_done.csv",index=False)