# Pre-processing of data
The first thing to do is to extract the data we are interested in from the XML files. The XML files come from a dataset of US patent applications from 2001 to 2016 (the dataset can be found here: https://figshare.com/articles/dataset/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873). We are interested in the title, the general experimental procedure and the reaction SMILES, but most importantly in the product SMILES.
Therefore, in the following we use the ElementTree API from xml.etree to iterate through the child elements of the root of each file.


In [1]:
# First we import the necessary libraries

import xml.etree.cElementTree as et     # for parsing the XML file
import pandas as pd
import numpy as np
import os

Now we define a function that extracts the data from one parsed XML file; it is applied afterwards to all the XML files of the applications data, starting with 2001. For every reaction we collect the title, the paragraphText (which mostly contains the experimental procedure), the reaction SMILES and the product SMILES.

In [18]:
def extract_data(root_file):
    """Collect the fields of interest for every reaction found below ``root_file``.

    For each ``<reaction>`` element one dictionary is built. A key is only
    present when the corresponding element was found in that reaction:

    * ``'title'``: text of the first ``headingText`` element
    * ``'paragraphText'``: text of the first ``paragraphText`` element
    * ``'reactionSmiles'``: text of the first ``reactionSmiles`` element
    * ``'productSmiles'``: list with the ``value`` attribute of the first
      SMILES identifier of every ``<product>`` (omitted when the list is empty)

    Args:
        root_file: root of the parsed XML file

    Returns:
        reaction_list: list with one dictionary per reaction, in document order
    """
    # Prefixes used in the search paths below
    namespaces = {'cml': 'http://www.xml-cml.org/schema', 'dl': 'http://bitbucket.org/dan2097'}

    # (dictionary key, search path) of the plain-text fields; the order of the
    # tuple fixes the order in which the keys are inserted.
    text_fields = (
        ('title', './/dl:headingText'),
        ('paragraphText', './/dl:paragraphText'),
        ('reactionSmiles', './/dl:reactionSmiles'),
    )
    # Only identifiers flagged as SMILES are of interest inside a product
    smiles_path = './/cml:identifier[@dictRef="cml:smiles"]'

    reaction_list = []
    for reaction in root_file.findall('.//cml:reaction', namespaces):
        record = {}

        # Text fields: keep the text of the first matching element, if any
        for key, path in text_fields:
            node = reaction.find(path, namespaces)
            if node is not None:
                record[key] = node.text

        # Product SMILES: first SMILES identifier of each product
        identifiers = [
            product.find(smiles_path, namespaces)
            for product in reaction.findall('.//cml:product', namespaces)
        ]
        smiles_values = [
            identifier.get('value')
            for identifier in identifiers
            if identifier is not None and identifier.get('value') is not None
        ]
        if smiles_values:
            record['productSmiles'] = smiles_values

        # A reaction without any of the fields still yields an (empty) entry
        reaction_list.append(record)

    return reaction_list


In [45]:
Applications_list = []

# Folder that holds one sub-folder of XML files per application year.
# Insert the path to the folder containing the XML files here (single place to edit).
applications_dir = r'C:\Users\milen\git\ppChem\PPChem_TLC\data\applications'

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# Applications_list reproducible, which matters when it is indexed by position later.
for folder in sorted(os.listdir(applications_dir)):
    folder = os.path.join(applications_dir, folder)
    # Skip stray files lying next to the year folders: calling os.listdir()
    # on a non-directory would raise and abort the whole run.
    if not os.path.isdir(folder):
        continue
    print(folder)
    for file in sorted(os.listdir(folder)):
        if file.endswith('.xml'):
            file = os.path.join(folder, file)
            tree = et.parse(file)
            # define root of the XML file to iterate through the file
            root = tree.getroot()
            # one list of reaction dictionaries per XML file
            Applications_list.append(extract_data(root))
                  

C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2001
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2002
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2003
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2004
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2005
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2006
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2007
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2008
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2009
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2010
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2011
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2012
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2013
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2014
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2015
C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2016


In [30]:
# Applications_list = []
# # define path to access first XML file in the folder 2001 of applications
# for path in os.listdir(r'C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2001'):
#     if path.endswith('.xml'):
#         path = os.path.join(r'C:\Users\milen\git\ppChem\PPChem_TLC\data\applications\2001', path)
#         tree = et.parse(path)
        
#         # define root of the XML file to iterate through the file
#         root = tree.getroot()
#         Applications_list.append(extract_data(root))

In [29]:
# # Get more familiar with the data and with how you can iterate through an XML file.
# count = 0
# for title in root:
#     print(title[0][0].tag)
#     count += 1
# print(count)

## Collect first values
Now that we are more familiar with the XML file and how we can iterate or access the different children of the root, we try to create a list with the values of interest.

In [None]:
# Collect the heading text (title) of every reaction below the current root
Title = []
for title in root.iter('{http://bitbucket.org/dan2097}headingText'):
    Title.append(title.text)
    print(Title[-1])   # show each title to get a feeling for what they look like
print(len(Title))

Oops! We already encounter the first problem here: the list of titles only contains 32 entries instead of the 38 reactions we could previously extract from this first XML file! Therefore, it will not be possible to zip the different lists we create in the following, as we would lose the information about which entries belong together. Stay tuned for the solution to this problem!

In [None]:
# Collect the text of every paragraphText element (the experimental procedures)
ExpProcedure = [
    paragraph.text
    for paragraph in root.iter('{http://bitbucket.org/dan2097}paragraphText')
]
print(len(ExpProcedure))

In [None]:
# Collect the text of every reactionSmiles element below the current root
RxnSmiles = [
    rxn.text
    for rxn in root.iter('{http://bitbucket.org/dan2097}reactionSmiles')
]
print(len(RxnSmiles))

Here we had a problem:
while in the other tags, the value we wanted to extract was directly linked, in the product tag there are different values and we only want to extract the product smiles identifier. Here we need to be very specific to prevent extracting other identifier smiles from the reactants or spectators tags. 

In [None]:
# Namespace prefix used in the search paths
ns = {'cml': 'http://www.xml-cml.org/schema'}

# All <reaction> elements below the current root
reaction_elements = root.findall('.//cml:reaction', ns)

# Walk reaction -> product -> SMILES identifier and keep the 'value' attribute.
# Restricting the search to <product> elements keeps the SMILES identifiers of
# reactants and spectators out of the list.
PrdSmiles = [
    identifier.attrib.get('value')
    for reaction in reaction_elements
    for product in reaction.findall('.//cml:product', ns)
    for identifier in product.findall('.//cml:identifier[@dictRef="cml:smiles"]', ns)
    if identifier.attrib.get('value') is not None
]

# Check if any values were extracted
print("Product SMILES:", PrdSmiles)

In [None]:
# Number of product SMILES collected, to compare with the lengths of the other lists
print(len(PrdSmiles))

Now, to prevent mismatches caused by handling lists of different lengths, the best approach is to create a dictionary for every reaction, with one key per value of interest. After that, we can build a dataframe in which every dictionary is one entry (row) of the df.

In [17]:
# Inspect one extracted record: the first reaction dictionary of the
# 24th entry (index 23) of Applications_list.
#reaction_list = extract_data(root)
print(Applications_list[23][0])

{'title': 'HDDA—1,6-hexanediol diacrylate (available from UCB Chemicals Corp., Augustana, S.C.)', 'paragraphText': 'HPA—an isometric mixture of 2-hydroxypropyl acrylate and 3-hydroxypropyl acrylate (available as ROCRYL 430 from Rohm and Haas, Philadelphia, Pa.)', 'reactionSmiles': '[C:1]([O:5][CH2:6][CH:7](O)[CH3:8])(=[O:4])[CH:2]=[CH2:3].[C:10]([O:14][CH2:15][CH2:16][CH2:17]O)(=[O:13])[CH:11]=[CH2:12]>>[CH2:12]=[CH:11][C:10]([O:14][CH2:15][CH2:16][CH2:17][CH2:8][CH2:7][CH2:6][O:5][C:1]([CH:2]=[CH2:3])=[O:4])=[O:13].[C:10]([O:14][CH2:15][CH2:16][CH2:17][CH2:8][CH2:7][CH2:6][O:5][C:1](=[O:4])[CH:2]=[CH2:3])(=[O:13])[CH:11]=[CH2:12] |f:2.3|', 'productSmiles': ['C=CC(=O)OCCCCCCOC(=O)C=C.C(C=C)(=O)OCCCCCCOC(C=C)=O']}


# Put extracted data into a dataframe.
Now that we have created a list of lists, each containing one dictionary for every reaction that was extracted from an XML file, we put the lists into a dataframe.

In [46]:
# Create one DataFrame per XML file (each entry of Applications_list is the
# list of reaction dictionaries extracted from one file) ...
frames = [pd.DataFrame(reactions) for reactions in Applications_list]

# ... and concatenate them in a single call. Concatenating inside a loop copies
# the ever-growing result on every iteration, which gets very slow for many files.
# The guard keeps the cell usable when nothing was extracted (pd.concat raises
# on an empty list).
df_extracts = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(df_extracts.shape)
df_extracts.head()


(1939253, 4)


Unnamed: 0,paragraphText,reactionSmiles,productSmiles,title
0,"PL 137,526 describes the hydrogenation of p-te...",[C:1]([C:5]1[CH:10]=[CH:9][C:8]([OH:11])=[CH:7...,[C(C)(C)(C)C1CCC(CC1)O],
1,"Slurry aluminum chloride (140.9 g, 1.075 mol) ...",[Cl-].[Al+3].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][...,[ClCCCC(=O)C1=CC=C(C=C1)C(C)C],Step h: 4-Chloro-1-(4-isopropyl-phenyl)-butan-...
2,"Suspend anhydrous AlCl3 (156 g, 1.15 mol) in t...",[Al+3].[Cl-].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][...,[ClCCCC(=O)C1=CC=C(C=C1)C],Step d: 4-Chloro-1-(4-methyl-phenyl)-butan-1-one
3,Dissolve 4-chloro-1-(4-isopropyl-phenyl)-butan...,[Cl:1][CH2:2][CH2:3][CH2:4][C:5]([C:7]1[CH:12]...,[BrC(C)(C)C1=CC=C(C=C1)C(CCCCl)=O],1-[4-(1-Bromo-1-methyl-ethyl)-phenyl]-4-chloro...
4,Dissolve 4-chloro-1-(4-isopropyl-phenyl)-butan...,[Cl:1][CH2:2][CH2:3][CH2:4][C:5]([C:7]1[CH:12]...,[BrC(C)(C)C1=CC=C(C=C1)C(CCCCl)=O],1-[4-(1-Bromo-1-methyl-ethyl)-phenyl]-4-chloro...


In [52]:
# Save the raw extracted reactions as CSV; index=False leaves the row index out of the file.
# NOTE(review): the productSmiles column holds Python lists, which presumably end up
# as their string representation in the CSV - confirm before reading the file back.
df_extracts.to_csv(r'C:\Users\milen\git\ppChem\PPChem_TLC\extracted_data_raw_applications.csv', index=False)

In [60]:
import re

def extract_rf_eluent(text):
    """Search a paragraph text for Rf values, eluent ratios and their nature.

    Three regular expressions are applied to ``text`` with ``re.findall``.
    The patterns are simple heuristics, so the eluent and nature matches in
    particular can contain unrelated text fragments.

    Args:
        text: paragraphText extracted from the XML file. Anything that is not
            a string (e.g. ``None`` or NaN for reactions without paragraph
            text) is treated as "nothing found".

    Returns:
        dict: ``'Rf_values'``, ``'Eluent_ratios'`` and ``'Nature'``, each
        mapped to the list of matches found (empty lists if there are none).
    """
    # Missing paragraph texts show up as None/NaN; re.findall would raise a
    # TypeError on them, so report "no matches" instead.
    if not isinstance(text, str):
        return {'Rf_values': [], 'Eluent_ratios': [], 'Nature': []}

    # "Rf", an optional ':' or '=', then a decimal number (captured)
    rf_pattern = r'Rf\s*[:=]?\s*(\d+\.\d+)'
    # two number/word parts separated by ':', '/' or '-'
    eluent_pattern = r'(?:\d+\.\d+|[\w\s]+)\s*(?:[:/-])\s*(?:\d+\.\d+|[\w\s]+)'
    # run of letters/whitespace directly in front of an opening parenthesis
    nature_pattern = r'[A-Za-z\s]+(?=\()'

    # Return the extracted information as a dictionary
    return {
        'Rf_values': re.findall(rf_pattern, text),
        'Eluent_ratios': re.findall(eluent_pattern, text),
        'Nature': re.findall(nature_pattern, text),
    }

# The regex patterns are not yet very useful: they do not reliably extract the desired information from the text.


# # Test the function with example text
# example_text = """
# Thin layer chromatography on silica gel with 4:1 ethyl acetate/hexane gave an Rf value of 0.59.
# Thin layer chromatography (TLC) showed a major spot at Rf=0.3 in 10% methylene chloride/hexane (with benzyl bromide at Rf=0.4).
# The product was purified by column chromatography on silica gel (100 g) eluting with CHCl3-MeOH (3:1 ,v/v). Selected fractions, based on TLC (Rf 0.65) analysis, were combined and concentrated to give a 1.64 g (54.1%) of a tan-brown solid.
# """

# result = extract_rf_eluent(example_text)
# print("Rf values:", result['Rf_values'])
# print("Eluent ratios:", result['Eluent_ratios'])
# print("Nature:", result['Nature'])


In [59]:
# for i in df_extracts.index[:20]:
#     rf_eluent_info = extract_rf_eluent(df_extracts.loc[i, 'paragraphText'])
#     print("Length of Rf_values:", len(rf_eluent_info['Rf_values']))
#     print("Length of Eluent_ratios:", len(rf_eluent_info['Eluent_ratios']), rf_eluent_info['Eluent_ratios'])
#     print("Length of Nature:", len(rf_eluent_info['Nature']))
#     df_extracts.loc[i, 'Rf_values'] = rf_eluent_info['Rf_values']
#     df_extracts.loc[i, 'Eluent_ratios'] = rf_eluent_info['Eluent_ratios']
#     df_extracts.loc[i, 'Nature'] = rf_eluent_info['Nature']