# Pre-processing of data
The first thing to do is extracting the data we are interested in from the XML files (XML files are used from a dataset of US patent applications from 2001 to 2016. This dataset can be found here: https://figshare.com/articles/dataset/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873). We are interested in the title, the general experimental procedure, the reaction SMILES, but most importantly the product SMILES.
Therefore, using xml.Etree in the following we iterate through the child elements of the root of the file. 


In [3]:
# First we import the necessary libraries

import xml.etree.cElementTree as et     # for parsing the XML file
import pandas as pd
import numpy as np
import os
import re
from groq import Groq # for the LLM Groq queries (needs to be installed via pip)
import json
from tqdm import tqdm


Now we use a function to iterate over all the XML files in the applications data from 2001. We collect the title, the paragraphText, mostly including the experimental procedures, the reaction SMILES and the product SMILES from it.

In [2]:
def extract_data(root_file):
    """function that extracts data from an XML file and returns a list of dictionaries containing the extracted information.
        Information to be extracted includes the title of the reaction, the experimental procedure, the reaction SMILES, and the product SMILES.

    Args:
        root_file: root of the parsed XML file

    Returns:
        reaction_list: list of dictionaries containing the extracted information
    """
    
    # Define the namespace that prevents mismatching of tags in the XML file
    ns = {'cml': 'http://www.xml-cml.org/schema', 'dl': 'http://bitbucket.org/dan2097'}

    # Create lists to store extracted information
    reaction_list = []

    # Find all <reaction> elements
    reaction_elements = root_file.findall('.//cml:reaction', ns)

    # Iterate over each <reaction> element
    for reaction_element in reaction_elements:
        # Create a dictionary to store information about the reaction
        reaction_dict = {}

        # Extract title
        title = reaction_element.find('.//dl:headingText', ns)
        if title is not None:
            reaction_dict['title'] = title.text

        # Extract paragraph text
        paragraph_text = reaction_element.find('.//dl:paragraphText', ns)
        if paragraph_text is not None:
            reaction_dict['paragraphText'] = paragraph_text.text

        # Extract reaction SMILES
        reaction_smiles = reaction_element.find('.//dl:reactionSmiles', ns)
        if reaction_smiles is not None:
            reaction_dict['reactionSmiles'] = reaction_smiles.text

        # Extract product SMILES
        product_elements = reaction_element.findall('.//cml:product', ns)
        product_smiles = []
        for product_element in product_elements:
            identifier_element = product_element.find('.//cml:identifier[@dictRef="cml:smiles"]', ns)
            if identifier_element is not None:
                smiles_value = identifier_element.get('value')
                if smiles_value is not None:
                    product_smiles.append(smiles_value)
        if product_smiles:
            reaction_dict['productSmiles'] = product_smiles

        # Append the reaction dictionary to the reaction list
        reaction_list.append(reaction_dict)

    # Check if any values were extracted
    #print("Reaction List:", reaction_list)
    return reaction_list


In a second step, we iterate through all files in every folder of the application data of the dataset and extract the data with the function extract_data. This takes quite a bit, but you can see the progress whenever a new folder is treated. 

In [69]:
Applications_list = []

# define path to access first XML file in the folder 2001 of applications
# Insert the path to the folder containing the XML files
for folder in os.listdir(r'C:\Users\milen\git\ppChem\PPChem_TLC\data\applications'):
    folder = os.path.join(r'C:\Users\milen\git\ppChem\PPChem_TLC\data\applications', folder)
    print(folder) 
    for file in os.listdir(folder):
        if file.endswith('.xml'):
            file = os.path.join(folder, file)
            tree = et.parse(file)
         # define root of the XML file to iterate through the file
            root = tree.getroot()
            Applications_list.append(extract_data(root))    
                  

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\milen\\git\\ppChem\\PPChem_TLC\\data\\applications'

# Put extracted data into a dataframe.
Now that we created a list of lists containing a dictionnary for every reaction that was extracted from the XML files, we put the lists into a dataframe.

In [None]:
# Create a DataFrame from the extracted data
df_extracts = pd.DataFrame(Applications_list[0])

# Iterate through the list of dictionaries and create a DataFrame
for i in range(1, len(Applications_list)):
    df = pd.DataFrame(Applications_list[i])
    
    # Concatenate the DataFrames
    df_extracts = pd.concat([df_extracts, df], ignore_index=True)

print(df_extracts.shape)
df_extracts.head()


(1939253, 4)


Unnamed: 0,paragraphText,reactionSmiles,productSmiles,title
0,"PL 137,526 describes the hydrogenation of p-te...",[C:1]([C:5]1[CH:10]=[CH:9][C:8]([OH:11])=[CH:7...,[C(C)(C)(C)C1CCC(CC1)O],
1,"Slurry aluminum chloride (140.9 g, 1.075 mol) ...",[Cl-].[Al+3].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][...,[ClCCCC(=O)C1=CC=C(C=C1)C(C)C],Step h: 4-Chloro-1-(4-isopropyl-phenyl)-butan-...
2,"Suspend anhydrous AlCl3 (156 g, 1.15 mol) in t...",[Al+3].[Cl-].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][...,[ClCCCC(=O)C1=CC=C(C=C1)C],Step d: 4-Chloro-1-(4-methyl-phenyl)-butan-1-one
3,Dissolve 4-chloro-1-(4-isopropyl-phenyl)-butan...,[Cl:1][CH2:2][CH2:3][CH2:4][C:5]([C:7]1[CH:12]...,[BrC(C)(C)C1=CC=C(C=C1)C(CCCCl)=O],1-[4-(1-Bromo-1-methyl-ethyl)-phenyl]-4-chloro...
4,Dissolve 4-chloro-1-(4-isopropyl-phenyl)-butan...,[Cl:1][CH2:2][CH2:3][CH2:4][C:5]([C:7]1[CH:12]...,[BrC(C)(C)C1=CC=C(C=C1)C(CCCCl)=O],1-[4-(1-Bromo-1-methyl-ethyl)-phenyl]-4-chloro...


That Dataframe is huge! Finally, save all the extracted data into a csv file on the local device.

In [None]:
df_extracts.to_csv(r'C:\Users\milen\git\ppChem\PPChem_TLC\extracted_data_raw_applications.csv', index=False)

# Further Processing of the Data using Regex and LLM
Of course, not all entries in the data frame can be used for our model. Many of the experimental procedures do not include any information about the Rf value. Thus, we need to find the entries with Rf values. As all experimental procedures are written differently, we will try to find the value of interest by using Regex (Regular Expressions).

First, we load the extracted data into a new dataframe.

In [4]:
#df_new = pd.read_csv(r'C:\Users\milen\git\ppChem\PPChem_TLC\extracted_data_raw_applications.csv')

df_new = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/data/extracted_data_raw_applications.csv')
            

And we create another dataframe, which only includes the first 1000 rows of the dataframe to reduce the cost of executing the test code below.

In [5]:
df_new.shape
#df_new = df_new.iloc[0:1000]
#df_new.head()

(1939253, 4)

Seems to have worked! Now let's take a closer look at how we will try to extract the Rf values. 
Throughout different testing until now, we found several criteria that need to be defined in the Regex pattern for the Rf value: it should be a number following the general pattern 0.XY with Y not being obligatory and X being some digit between 2 and 8 (this is to exclude as many other matches as possible and as Rf values should ideally be around 0.5, we thought this would be optimal). Furthermore, the pattern should not be followed by other digits (e.g. 0.45005) nor include special signs (0.5:0.4) or temperature values. The last remaining problem is, to distinguish between information about quantity and the Rf value (e.g. 0.56 vs. 0.56 mg).

In [3]:
def extract_rf_eluent(Dataframe: pd.DataFrame):
    """Function that applies defined regex patterns to data in a dataframe and creates 
    the following new columns: 
    Rf value, solvent A, solvent B, % solvent A, % solvent B. If no Rf value can be found, all columns
    are filled with NaN. 

    Args:
        Dataframe (_type_): Dataframe containing the extracted data from the US patents
    """
    # copy the dataframe to leave old dataframe unchanged
    df = Dataframe.copy()
    
    # Define the regex patterns
    Rf_check = r'( ?R[fF]?[ :=(]?)'
    Rf_pattern = r'(0\.(?!0|9)\d{1,2})\b(?! *mg\b| *mL\b| *g\b)' # exclude decimals that start by 0.0x, this yet seems not to work though, try to specifiy that digit after . cannot start with 0.
    #Rf_pattern = r'[ =:]?(0\.[^0][0-9])[^\dmglL](?! mg)' # exclude decimals that start by 0.0x, this yet seems not to work though, try to specifiy that digit after . cannot start with 0.
    
    #set a count to see how many multiple Rf values are found in the dataframe, how many NaN values are found
    count = 0
    count_nan = 0
    
    # Extract the Rf values from the paragraphText and put them into a new column
    for index, row in df.iterrows():
        checkRf = re.findall(Rf_check, row['paragraphText'])
        
        #check if Rf value can be found in the paragraphText column
        if checkRf:
             #try to find the Rf value in the paragraphText column
            match = re.findall(Rf_pattern, row['paragraphText'])
            if match:
                df.at[index, 'Rf_value'] = match[0] # df['paragraphText'].str.extract(Rf_pattern)
            
                # Check if multiple Rf values were found (potential error source)
                if len(match) > 1:
                   # print('Multiple Rf values found in paragraphText:', match, 'at index:', index)
                    count += 1
       
        else:
            df.at[index, 'Rf_value'] = np.nan
            count_nan += 1
            
    print("Number of entries with multiple Rf values:", count)
    print("Number of entries with no Rf values found:", count_nan)
    print("Number of entries with Rf values found:", df['Rf_value'].count())
    return df
    

Turns out it would be very painful to do all this with regex. Not the best idea. Instead, we will try to use a LLM in the following. To still reduce the cost of computation, we will pre-filter the dataframe with the subsequent function to only get back data where an Rf value can be found in the experimental procedure. 

In [4]:
def extract_rows_with_rf(Dataframe: pd.DataFrame):
    """Function that extracts rows with Rf values from a dataframe and returns a new dataframe containing only these rows. 

    Args:
        Dataframe (_type_): Dataframe containing the extracted data from the US patents
    """
    # copy the dataframe to leave old dataframe unchanged
    df = Dataframe.copy()
    
    # Define the regex patterns
    Rf_check = r'( ?R[fF][ :=(]?)'
    
    # List to store indices of rows without Rf values
    rows_to_drop = []
    
    # Search for rows with Rf values in the paragraphText column
    for index, row in df.iterrows():
        checkRf = re.findall(Rf_check, row['paragraphText'])

        if not checkRf:
            rows_to_drop.append(index)
               
    # Drop rows without Rf values
    df = df.drop(rows_to_drop)
            
    return df
    

That is still applying the first function and one can clearly see the mess. Won't be possible to extract the exact Rf value in the multiple detected rows.

In [8]:
#df_new.head()
df_processed_first_try = extract_rf_eluent(df_new)
df_processed_first_try.shape
#df_processed_first_try.to_csv(r'C:\Users\milen\git\ppChem\PPChem_TLC\extracted_data_first_processing_rf_values.csv', index=False)

  df.at[index, 'Rf_value'] = match[0] # df['paragraphText'].str.extract(Rf_pattern)


KeyboardInterrupt: 

Now the second function: already the dataframe size cut down from 1000 entries to 246 which we can hopefully treat with a LLM.

In [183]:
df_processed_second_try = extract_rows_with_rf(df_new)
df_processed_second_try.head()
df_processed_second_try.shape

(36579, 4)

In [187]:
df_processed_second_try.to_csv(r'/Users/matthiasgalka/git/ppchem_project/data/AfterRFfilter.csv', index=False)

In [46]:
df_filtered = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/data/AfterRFfilter.csv')

In [47]:
df_filtered.head()
df_filtered.shape

(36579, 4)

In [42]:
df_short = df_filtered.loc[:1000]

For the LLM we use the API of the open source model offered by Groq Clouds (https://console.groq.com/docs/quickstart). Different models can be tested out.

In [4]:
#Access token croq: 
import json

# Create a Groq client (it is recommended to use the following Quickstart: https://console.groq.com/docs/quickstart)
# However, this did not work in our case and we had to use the following code to create a client 

with open('../API_KEY.txt', 'r') as file:
    API_KEY = file.read().strip()


client = Groq(
    api_key=API_KEY, # insert your API key here
)
user_prompt =  "Give me the Rf value, the solvent mixture and their ratio of the following procedure in the following format of only: Rf= , solvent A= , solvent B= , percent A= , percent B= . If there is a third solvent, please provide the information in the same format, call it additive C = and percent C = . Only give the information requested as output, no additional notes or information!!!" # "Give me the Rf value, the solvent mixture and their ratio of the following procedure in the following format of a dictionary only: Rf= , solvent A= , solvent B= , percent A= , percent B= . If there is a third solvent, please provide the information in the same format, call it additive C and percent additive C = . Only give the dictionnary as output, no additional notes or information!!!"
procedure = "An Rf value of 0.22 was found using DCM/EtOAc 20:1 and 0.4% Hydroxylammonium. "#"Rf(Hex/EtOAc 1:20):0.22"
user_prompt_procedure = user_prompt + procedure
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": user_prompt_procedure
        }
    ],
    model="mixtral-8x7b-32768", # other models: LlaMA3 70 b (llama3-70b-8192) can be found here: https://console.groq.com/docs/models
)

response_str = chat_completion.choices[0].message.content

print(response_str)
type(response_str)

# response_dict = json.loads(response_str)
#type(response_dict)
#print(response_dict["Rf"])


Rf = 0.22,
Solvent A (DCM) =,
Solvent B (EtOAc) =,
Percent A = 95%,
Percent B = 5%,
Additive C (Hydroxylammonium) = 0.4%.


str

In [55]:
def parse_response(response: str):
    """Function that extracts the Rf value, solvent A, solvent B, % solvent A, and % solvent B from a LLM response.

    Args:
        response (str): response from the LLM model 

    Returns:
        rf_value (str)
        solvent_a (str) 
        solvent_b (str) 
        percent_a (str)
        percent_b (str)
        additive_c (str)
        percent_c (str)
    """
    try: 
        
        # Convert the response string to a dictionary
        # response_dict = json.loads(response)
        
        #extracts values from the response dictionary
        
        #rf_value = response_dict["Rf"]
        #solvent_a = response_dict["solvent A"]
        #solvent_b = response_dict["solvent B"]
        #percent_a = response_dict["percent A"]
        #percent_b = response_dict["percent B"]
        #additive_c = response_dict["additive C"]
        #percent_c = response_dict["percent C"]
        
        
    # search for Rf value in the response using regex
        rf_value_match = re.search(r"Rf\s*=\s*(\d+\.\d+)", response)
        solvent_a_match = re.search(r"solvent\s+A\s*=\s*([^,]+)", response)
        solvent_b_match = re.search(r"solvent\s+B\s*=\s*([^,]+)", response)
        additive_c_match = re.search(r"additive\s+C\s*=\s*([^,]+)", response)
        percent_a_match = re.search(r"percent\s+A\s*=\s*(\d+)", response)
        percent_b_match = re.search(r"percent\s+B\s*=\s*(\d+)", response)
        percent_c_match = re.search(r"percent\s+C\s*=\s*(\d+\.\d+|\d+)", response)
        
    except KeyError:
        raise KeyError(f"Error extracting values from the response: {response_str}")
    
    
    
    # Extract values from regex matches
    rf_value = rf_value_match.group(1) if rf_value_match else None
    solvent_a = solvent_a_match.group(1) if solvent_a_match else None
    solvent_b = solvent_b_match.group(1) if solvent_b_match else None
    percent_a = percent_a_match.group(1) if percent_a_match else None
    percent_b = percent_b_match.group(1) if percent_b_match else None
    additive_c = additive_c_match.group(1) if additive_c_match else None
    percent_c = percent_c_match.group(1) if percent_c_match else None

    # Return the extracted values
    return rf_value, solvent_a, solvent_b, percent_a, percent_b, additive_c, percent_c

In [6]:
def get_values(Dataframe: pd.DataFrame):
    """Function that extracts the Rf value from a row in a Dataframe using LLM and returns it.

    Args:
        DataFrame (_type_): DataFrame containing the extracted data from the US patents
    """

    with open('../API_KEY.txt', 'r') as file:
        API_KEY = file.read().strip()

    
    client = Groq(
        api_key=API_KEY 
    ) # insert your API key here # we have to add a file containing the API key to upload it to github.
    
    
    
    for index, row in tqdm(Dataframe.iterrows(),total = len(Dataframe), leave = True):
        user_prompt = f"Give me the Rf value, the solvent mixture and the ratio of the solvents in percent for each solvent of this procedure: {row['paragraphText']} in the following format only: Rf= , solvent A= , solvent B= , percent A= , percent B= . If there is a third solvent, please provide the information in the same format, call it additive C = and percent C = . If no information is given for one of the categories, put None for this category. Only give the information requested as output, no additional notes or information!!!" # f"Please only provide the Rf value, solvents A and B, additives C (if applicable), and their ratios in percent as a Python dictionary only (without 'Here is the python dictionary') with keys 'Rf', 'solvent A', 'solvent B', 'additive C', 'percent A', 'percent B', and 'percent C' of this procedure: {row['paragraphText']}. If there is no information for one category, put 'Nan'. Do not provide any additional notes or information except for the dictionary!!! Always use this format!!" # "Give me the Rf value, the solvent A, B and additive C if applied and their ratios in percent as a python dictionary calling the keys Rf (should be a number), solvent A (should be a solvent only), solvent B (solvent only), percent A (number), percent B (number), additive C (solvent), percent C (number) of this procedure:" + row['paragraphText'] + "If there is no information for one category of solvent, put Nan. Do not put any other notes or information except for the dictionary!!! Always use this format!!"  # "Give me the Rf value, the solvent mixture and their ratio of the following procedure" + row['paragraphText'] +  "in the following format of a dictionary only: Rf= , solvent A= , solvent B= , percent A= , percent B= . If there is a third solvent, please provide the information in the same format, call it additive C and percent C = . Only give the values requested as output, no additional notes or information!!!"        
        
        try:             
            # Call the LLM model to extract the Rf value
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": user_prompt,
                    }
                ],
                model="mixtral-8x7b-32768", # other models: LlaMA3 70 b (llama3-70b-8192), mixtral (mixtral-8x7b-32768) can be found here: https://console.groq.com/docs/models
            )
            
            # Extract the Rf value from the response
            response = chat_completion.choices[0].message.content
            #print(response)
            
            # Parse the response to extract the Rf value, solvent A, solvent B, % solvent A, and % solvent B using the parse_response function
            rf_value, solvent_a, solvent_b, percent_a, percent_b, additive_c, percent_c = parse_response(response)
            
            # Add extracted values to new columns in the dataframe row
            Dataframe.at[index, 'Rf'] = rf_value
            Dataframe.at[index, 'Solvent_A'] = solvent_a
            Dataframe.at[index, 'Solvent_B'] = solvent_b
            Dataframe.at[index, 'Percent_A'] = percent_a
            Dataframe.at[index, 'Percent_B'] = percent_b
            Dataframe.at[index, 'Additive_C'] = additive_c
            Dataframe.at[index, 'Percent_C'] = percent_c
            
        except Exception as e:
            # Print the error message and the index of the row where the error occurred
            #print(e)
            #print("Error at index:", index)
            continue
        
    return Dataframe
        

        
    

In [None]:
df_short_LLM = get_values(df_short)

In [31]:
df_short_LLM.head(20)

NameError: name 'df_processed_second_try' is not defined

In [76]:
def canonicalise_smiles(Smiles: str):
    '''
       Converts Smile to a Mol file and back to a Smiles again to create
       a consistent Smiles string.

       Args: Smiles string
    '''
    mol = Chem.MolFromSmiles(Smiles)
    can_Smiles = Chem.MolToSmiles(mol)

    return can_Smiles

In [77]:
def clean_up(Dataframe: pd.DataFrame):
    """Delets all row whitout a Rf value or Rf value over 1.0 and Solvent information.
       Converts 
       Converts Rf values and percentage of solvents in a Dataframe from str to float.
       Strips productSMILES str to a usable SMILES str.

    Args: 
        Dataframe (_type_): Dataframe containing the extracted data from the US patents.
        Dataframe needs to have following columns: 'productSmiles,' 'Rf', 'Percent_A', 'Percent_B' !!!

    """
    entries_raw = len(Dataframe)
    
    before_Rf_drop = len(Dataframe)
    Dataframe.dropna(subset=['Rf'], inplace = True) #drops rows without Rf
    after_Rf_drop_1 = len(Dataframe)
    print(f"Amount of Rows dropped because of missing Rf value: {before_Rf_drop-after_Rf_drop_1}")
    
    Dataframe['Rf'] = Dataframe['Rf'].apply(lambda x: float(x)) # converts Percentage to float
    
    indices_false_Rf = Dataframe[Dataframe["Rf"] > 1].index #finds indicies with Rf values over 1.0
    Dataframe.drop(indices_false_Rf,inplace = True)
    after_Rf_drop_2 = len(Dataframe)
    print(f'Amount of Rows dropped because of invalid Rf value {after_Rf_drop_1-after_Rf_drop_2}')
         
    Dataframe['Percent_A'] = Dataframe['Percent_A'].apply(lambda x: 0 if x is None else x)#converts 'None' entry to 0
    
    Dataframe['Percent_B'] = Dataframe['Percent_B'].apply(lambda x: 0 if x is None else x)#converts 'None' entry to 0
    
    Dataframe['Percent_A'] = Dataframe['Percent_A'].apply(lambda x: float(x)) # converts Percentage to float
    
    Dataframe['Percent_B'] = Dataframe['Percent_B'].apply(lambda x: float(x)) #converts Percantage to float
    
    Dataframe['productSmiles'] = Dataframe['productSmiles'].apply(lambda x: x[2:-2]) # removes [' and '] from the productSimles

    Dataframe['productSmiles'] = Dataframe['productSmiles'].apply(lambda x: canonicalise_smiles(x)) # cannonicalises Smiles
    
    Dataframe.reset_index(drop=True, inplace=True) # sets the index new from 1 to end

    entries_after = len(Dataframe)
    
    print(f'After the clean up {entries_after} entries are left and {entries_raw-entries_after} are removed')

    return Dataframe


In [36]:
df1 = clean_up(df_processed_second_try)


NameError: name 'df_processed_second_try' is not defined

In [33]:
df1.head(20)

NameError: name 'df1' is not defined

In [32]:
Solvent_A = set(df1['Solvent_A'].tolist())
Solvent_B = set(df1['Solvent_B'].tolist())

NameError: name 'df1' is not defined

In [49]:
def split_dataframe(df):
    # Anzahl der Zeilen im DataFrame
    num_rows = len(df)

    # Berechne die Anzahl der Zeilen für jeden Teil
    chunk_size = num_rows // 4

    # Teile den DataFrame in vier Teile auf
    parts = []
    for i in range(0, num_rows, chunk_size):
        part = df.iloc[i:i+chunk_size]
        parts.append(part)

    return parts

In [53]:
for i, teil in enumerate(teile):
    print(f'Teil {i+1}:')
    print(teil)

(36579, 4)

In [87]:
df1 = df_filtered.iloc[:9144]
df2 = df_filtered.iloc[9144:18288]
df3 = df_filtered.iloc[18288:27432]
df4 = df_filtered.iloc[27432:36579]

In [86]:
df_filtered.iloc[-1]

paragraphText     Under nitrogen a dry 1,000 mL flask is charged...
reactionSmiles    C1C=C2C=CC(O)=C(C3C4C(=CC=CC=4)C=CC=3O)C2=CC=1...
productSmiles     ['C(C)OC(CC(C[C@@H](\\C=C\\C=1C(=NC2=CC=CC=C2C...
title                                                           NaN
Name: 36578, dtype: object

In [88]:
df1.to_csv(r'/Users/matthiasgalka/git/ppchem_project/data/AfterRFfilter(1).csv', index=False)
df2.to_csv(r'/Users/matthiasgalka/git/ppchem_project/data/AfterRFfilter(2).csv', index=False)
df3.to_csv(r'/Users/matthiasgalka/git/ppchem_project/data/AfterRFfilter(3).csv', index=False)
df4.to_csv(r'/Users/matthiasgalka/git/ppchem_project/data/AfterRFfilter(4).csv', index=False)

In [69]:
df2.iloc[0]

paragraphText     To a stirring solution of (2S,3R)-1-azido-3-be...
reactionSmiles    [N:1]([CH2:4][C@H:5]([OH:17])[C@H:6]([O:9][CH2...
productSmiles     ['N(=[N+]=[N-])C[C@@H]([C@@H](C=C)OCC1=CC=CC=C...
title                                                           NaN
Name: 9144, dtype: object

In [66]:
df3.iloc[0]

paragraphText     About 15 ml of a solution of diazomethane (abo...
reactionSmiles    [N+](=[CH2:3])=[N-].[CH3:4][O:5][CH2:6][CH2:7]...
productSmiles     ['COCCOCC1=CC=C(C=C1)[C@H]1C[C@H](N(C[C@@H]1OC...
title             b) Methyl 3-[(2S,4R,5R)-4-[4-(2-methoxyethoxym...
Name: 18288, dtype: object

In [70]:
df_filtered.iloc[9142:9146]

Unnamed: 0,paragraphText,reactionSmiles,productSmiles,title
9142,A mixture of ethyl 2-chlorobenzo[d]thiazole-6-...,Cl[C:2]1[S:3][C:4]2[CH:10]=[C:9]([C:11]([O:13]...,['N1(CCCCC1)[C@H]1CN(CC1)C=1SC2=C(N1)C=CC(=C2)...,(R)-ethyl 2-(3-(piperidin-1-yl)pyrrolidin-1-yl...
9143,A solution of (R)-ethyl 2-(3-(piperidin-1-yl)p...,[N:1]1([C@@H:7]2[CH2:11][CH2:10][N:9]([C:12]3[...,['N1(CCCCC1)[C@H]1CN(CC1)C=1SC2=C(N1)C=CC(=C2)...,(R)-2-(3-(piperidin-1-yl)pyrrolidin-1-yl)benzo...
9144,"To a stirring solution of (2S,3R)-1-azido-3-be...",[N:1]([CH2:4][C@H:5]([OH:17])[C@H:6]([O:9][CH2...,['N(=[N+]=[N-])C[C@@H]([C@@H](C=C)OCC1=CC=CC=C...,
9145,"To a solution of 2,5-dihydro-1H-pyrrole (30 g,...",[NH:1]1[CH2:5][CH:4]=[CH:3][CH2:2]1.[C:6](ON1C...,['N1(CC=CC1)C(=O)OCC1=CC=CC=C1'],


In [81]:
df1 = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/data/LLM_processed(1).csv')
len(df1)

9144

In [80]:
clean_up(df1)

Amount of Rows dropped because of missing Rf value: 0
Amount of Rows dropped because of invalid Rf value 0


NameError: name 'Chem' is not defined

In [58]:
df32_LLM = get_values(df4)

100%|██████████| 9147/9147 [50:17:04<00:00, 19.79s/it]      


In [59]:
df4_LLM.to_csv(r'/Users/matthiasgalka/git/ppchem_project/data/LLM_processed(4).csv')

In [62]:
df4_LLM.shape

(9147, 11)

In [23]:
df1.loc[9142, "paragraphText"]

'A mixture of ethyl 2-chlorobenzo[d]thiazole-6-carboxylate (1.62 g, 6.72 mmol), (R)-1-(pyrrolidin-3-yl)piperidine dihydrochloride (Reference Example 5c, 1.80 g, 7.94 mmol), and triethylamine (5.11 mL, 36.66 mmol) in N,N-dimethylformamide (19 mL) was stirred at ambient temperature for 17 hours. According to TLC (100% ethyl acetate) the spot for starting chloride was replaced by a much lower Rf, ninhydrin-positive spot. The reaction mixture was diluted with water (400 mL) and extracted with ethyl acetate (3×75 mL). The combined organic extracts were washed with brine (2×200 mL) then dried (MgSO4) and filtered. The filtrate was concentrated under reduced pressure. The residue was purified by column chromatography on an Analogix IF-280 (Analogix SF40-120 g, 99:1 ethyl acetate/methanol). Fractions containing product were combined and concentrated under reduced pressure to provide the title compound. 1H NMR (300 MHz, CD3OD) δ ppm 8.34 (d, J=1.7 Hz, 1H), 7.96 (dd, J=1.7 Hz, 8.5, 1H), 7.49 (d,

In [24]:
df1.to_csv(r'/Users/matthiasgalka/git/ppchem_project/Data/LLM_processed(1)')

In [10]:
df1_cleaned = clean_up(df1)

In [12]:
df1_cleaned.shape

(4239, 12)

In [36]:
df1_cleaned.head()

Unnamed: 0,paragraphText,reactionSmiles,productSmiles,title,Rf,Solvent_A,Solvent_B,Percent_A,Percent_B,Additive_C,Percent_C
0,[A] Synthesis of 1-benzyl-piperidin-4-ylidene)...,C(CC([O-])=O)#N.[CH2:7]([N:14]1[CH2:19][CH2:18...,C(C1=CC=CC=C1)N1CCC(CC1)=C(C(=O)OCC)C#N,,0.53,,,0.0,0.0,,
1,A solution of (S)-(+)-3-hydroxytetrahydrofuran...,O[C@H]1CCOC1.CC([O-])(C)C.[K+].[CH3:13][O:14][...,COC(=O)C=1SC=CC1,,0.3,dichloromethane,ethyl acetate,100.0,20.0,,1.0
2,A solution of 4-{[(phenylmethoxy)carbonylamino...,[C:1]1([CH2:7][O:8][C:9]([NH:11][CH2:12][C:13]...,CON(C(=O)C1=CC=C(C=C1)CNC(=O)OCC1=CC=CC=C1)C,Preparation of N-methoxy-N-methyl(4-{[(phenylm...,0.3,pentane,EtOAc,50.0,50.0,,
3,Putative nitrilase up-mutants were assayed in ...,[CH3:1][C:2]1(C)S[C@@H]2[C@H](NC([C@H](N)C3C=C...,O[C@@H](CC(=O)O)CC#N,,0.5,EtOAc,Hexanes,50.0,50.0,,
4,3.0 g N6-Benzoyl-5′-O-tert-butyldimethylsilyl-...,[C:1]([NH:9][C:10]1[C:11]2[N:12]=[CH:13][N:14]...,C(C1=CC=CC=C1)(=O)NC=1C=2N=CN([C@H]3C[C@H](OCS...,,0.6,,,0.0,0.0,,


In [23]:
indices_Additive = df1_cleaned[df1_cleaned["Additive_C"].notna()].index 
   # df1_cleaned.drop(indices_Additive,inplace = True)

In [41]:
indices_noSolvent = df1_cleaned[df1_cleaned['Solvent_A'].isna() | df1_cleaned['Solvent_B'].isna()].index
indices_noSolvent [0:10]

Index([0, 4, 9, 15, 18, 20, 21, 22, 24, 28], dtype='int64')

In [45]:
len(indices_Additive)
#indices_Additive[0:10]
df1_cleaned.loc[15, 'paragraphText']

'NBS (1.926 mmol) was added to a mixture of 5-(3-chloro-2-fluoro-phenyl)-6-(4-chloro-2-methyl-phenyl)-1-isopropyl-5,6-dihydro-1H-pyrrolo[3,4-b]pyrrol-4-one (Step A1) (1.965 mmol) in CCl4 (65 mL). After 1.5 h, the reaction mixture was diluted with EtOAc and successively washed with a saturated aqueous solution of NaHCO3, water and brine, dried (Na2SO4), filtered and concentrated. The residue was purified using a RediSep® silica gel column to afford the title compound as a white solid. tR: 5.69 min (HPLC 1); ESI-MS: tR=1.39 min, [M+H]+ 495/497/499 (LC-MS 1); TLC: Rf=0.12 (CH2Cl2).'

In [8]:
df32 = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/data/AfterRFfilter(3.2).csv')

In [9]:
df = get_values(df32)

  9%|▊         | 398/4572 [1:36:24<24:41:47, 21.30s/it]

In [7]:
df32.shape

(4572, 4)