In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw

In [5]:
df_LLM1 = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/data/LLM_processed/LLM_processed(1).csv')
df_LLM2 = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/data/LLM_processed/LLM_processed(2).csv')
df_LLM31 = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/data/LLM_processed/LLM_processed(3.1).csv')
df_LLM32 = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/data/LLM_processed/LLM_processed(3.2).csv')
df_LLM4 = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/data/LLM_processed/LLM_processed(4).csv')

In [8]:
df_LLM1.head() 
df_LLM2.head() 
#df_LLM31.shape 
df_LLM32.shape #dataframe is too small just 4116 instead of 4572!!
#df_LLM4.shape 

(4572, 11)

In [11]:
def clean_up(Dataframe: pd.DataFrame, tolerance: float = 1):
    """1. Delets all row whitout Rf value or Rf value over 1.0 and without solvent information.
       2. Converts percentage of solvents in a Dataframe from str to float and check if they add up to 100, else drop them.
       3. Strips productSMILES str to a usable SMILES str.

    Args: 
        Dataframe (_type_): Dataframe containing the processed data from the get_value function.
        Dataframe needs to have following columns: 'productSmiles,' 'Rf', 'Percent_A', 'Percent_B'
        tolerance (_type_): float, default = 1, tolerance for the sum of the percentages of the solvents (default is 100% +- 1%)

    """
    size_pre_cleaning = len(Dataframe)  # get the size of the dataframe
    # Drop rows without Rf values
    Dataframe.dropna(subset=['Rf'], inplace = True) 
    size_after_nan = len(Dataframe)  # get the size of the dataframe after dropping rows with NaN values
    print(f"Number of rows dropped due to NaN values in Rf: {size_pre_cleaning - size_after_nan}, {round((size_pre_cleaning - size_after_nan) / size_pre_cleaning * 100, 2)}%")

    # Convert Rf value to float
    Dataframe['Rf'] = Dataframe['Rf'].astype(float)
    
    #finds indicies with Rf values over 1.0
    indices_false_Rf = Dataframe[Dataframe["Rf"] > 1].index 
    Dataframe.drop(indices_false_Rf,inplace = True) #drops rows with false Rf values
    size_after_false_Rf = len(Dataframe)  # get the size of the dataframe after dropping rows with Rf values over 1.0
    print(f"Number of rows dropped due to Rf values over 1.0: {len(indices_false_Rf)}, {round(len(indices_false_Rf) / size_pre_cleaning * 100, 2)}%")
    
    #check if at least one solvent (either solvent A or solvent B) is given (so check that solvent A and solvent B are not None)
    Dataframe = Dataframe[Dataframe['Solvent_A'].notnull() & Dataframe['Solvent_B'].notnull()].copy()
    size_after_solvent_drop = len(Dataframe)  # get the size of the dataframe after dropping rows without solvent information
    print(f"Number of rows dropped due to missing solvent information: {size_after_false_Rf - size_after_solvent_drop}, {round((size_after_false_Rf - size_after_solvent_drop) / size_pre_cleaning * 100, 2)}%")

    #convert 'None' entry to 0
    Dataframe.loc[:, 'Percent_A'] = Dataframe['Percent_A'].apply(lambda x: 0 if x is None else x)
    
    #convert 'None' entry to 0
    Dataframe.loc[:, 'Percent_B'] = Dataframe['Percent_B'].apply(lambda x: 0 if x is None else x)
    
    # convert Percentage to float
    Dataframe.loc[:, 'Percent_A'] = Dataframe['Percent_A'].apply(lambda x: float(x)) 
    
    #convert Percantage to float
    Dataframe.loc[:, 'Percent_B'] = Dataframe['Percent_B'].apply(lambda x: float(x)) 
    
    # Drop rows where 'additive_C' is not None
    Dataframe = Dataframe[Dataframe['Additive_C'].isnull()].copy()
    size_after_additive_drop = len(Dataframe)  # get the size of the dataframe after dropping rows with additive C
    print(f"Number of rows dropped due to additive C: {size_after_solvent_drop - size_after_additive_drop}, {round((size_after_solvent_drop - size_after_additive_drop) / size_pre_cleaning * 100, 2)}%")

    # remove [' and '] from the productSimles
    Dataframe.loc[:, 'productSmiles'] = Dataframe['productSmiles'].apply(lambda x: x[2:-2]) 
    
    # check if the sum of the percentages is 100, this at the same time kicks out entries with additives C (+ consider limitations of floating-point arithmetic)
    Dataframe.loc[:, 'sum'] = Dataframe['Percent_A'] + Dataframe['Percent_B']
    Dataframe = Dataframe[(Dataframe['sum'] >= 100 - tolerance) & (Dataframe['sum'] <= 100 + tolerance)].copy()
    size_after_percentage_check = len(Dataframe)  # get the size of the dataframe after dropping rows with wrong percentages
    print(f"Number of rows dropped due to wrong percentages: {size_after_additive_drop - size_after_percentage_check}, {round((size_after_additive_drop - size_after_percentage_check) / size_pre_cleaning * 100, 2)}%")

    Dataframe.reset_index(drop=True, inplace=True) # sets the index new from 1 to end

    size_post_cleaning = Dataframe.shape[0]  # get the size of the dataframe after cleaning
    print(f"Size of the dataframe before cleaning: {size_pre_cleaning}")
    print(f"Size of the dataframe after cleaning: {size_post_cleaning}")
    print(f"Number of rows dropped: {size_pre_cleaning - size_post_cleaning}")
    print(f"Percentage of rows dropped: {(size_pre_cleaning - size_post_cleaning) / size_pre_cleaning * 100}%")
    return Dataframe


In [9]:
# connecting all the Datarfames to one and make a csv file

df_LLM = pd.concat([df_LLM1, df_LLM2, df_LLM31, df_LLM32, df_LLM4], axis=0, ignore_index=True)

#needs to be repeated with df_LLM32 with original size

In [6]:
df_LLM.head()

Unnamed: 0,paragraphText,reactionSmiles,productSmiles,title,Rf,Solvent_A,Solvent_B,Percent_A,Percent_B,Additive_C,Percent_C
0,[A] Synthesis of 1-benzyl-piperidin-4-ylidene)...,C(CC([O-])=O)#N.[CH2:7]([N:14]1[CH2:19][CH2:18...,['C(C1=CC=CC=C1)N1CCC(CC1)=C(C(=O)OCC)C#N'],,0.53,,,,,,
1,A solution of (S)-(+)-3-hydroxytetrahydrofuran...,O[C@H]1CCOC1.CC([O-])(C)C.[K+].[CH3:13][O:14][...,['COC(=O)C=1SC=CC1'],,0.3,dichloromethane,ethyl acetate,100.0,20.0,,1.0
2,A solution of 4-{[(phenylmethoxy)carbonylamino...,[C:1]1([CH2:7][O:8][C:9]([NH:11][CH2:12][C:13]...,['CON(C(=O)C1=CC=C(C=C1)CNC(=O)OCC1=CC=CC=C1)C'],Preparation of N-methoxy-N-methyl(4-{[(phenylm...,0.3,pentane,EtOAc,50.0,50.0,,
3,Putative nitrilase up-mutants were assayed in ...,[CH3:1][C:2]1(C)S[C@@H]2[C@H](NC([C@H](N)C3C=C...,['O[C@@H](CC(=O)O)CC#N'],,0.5,EtOAc,Hexanes,50.0,50.0,,
4,3.0 g N6-Benzoyl-5′-O-tert-butyldimethylsilyl-...,[C:1]([NH:9][C:10]1[C:11]2[N:12]=[CH:13][N:14]...,['C(C1=CC=CC=C1)(=O)NC=1C=2N=CN([C@H]3C[C@H](O...,,0.6,,,,,,


In [7]:
df_LLM.to_csv(r'/Users/matthiasgalka/git/ppchem_project/data/After_LLM.csv')

In [13]:
df_LLM_clean = clean_up(df_LLM)

Number of rows dropped due to NaN values in Rf: 12017, 33.27%
Number of rows dropped due to Rf values over 1.0: 473, 1.31%
Number of rows dropped due to missing solvent information: 3851, 10.66%
Number of rows dropped due to additive C: 2582, 7.15%
Number of rows dropped due to wrong percentages: 583, 1.61%
Size of the dataframe before cleaning: 36123
Size of the dataframe after cleaning: 16617
Number of rows dropped: 19506
Percentage of rows dropped: 53.998837305871604%


In [14]:

def remove_salts(Dataframe: pd.DataFrame):
    """Removes salts from a Dataframe containing SMILES strings.

    Args:   

    """
    size_with_salts = len(Dataframe)  # get the size of the dataframe

    indices_false_Rf = Dataframe[Dataframe["productSmiles"].str.contains('\.')].index 
    Dataframe.drop(indices_false_Rf,inplace = True)
    size_without_salts = len(Dataframe)  # get the size of the dataframe after dropping rows with Rf values over 1.0
    print(f"number of rows dropped due to salts: {size_with_salts - size_without_salts}, {round((size_with_salts - size_without_salts) / size_with_salts * 100, 2)}%")

    return Dataframe

In [15]:
df_no_salt = remove_salts(df_LLM_clean)

number of rows dropped due to salts: 289, 1.74%


Now I will try some things to solve the Smiles problem (two product Smiles, Salts, Enatiomers)

In [None]:

def find_rows_with_dot(Dataframe: pd.DataFrame):
    
    ''' Finds all entries with product Smiles which are salts, complexes or Molecuels sperated by a '.' and puts them into a
        new Dataframe.

    Args: 
        Datafarme which contains a column called 'productSmiles' with Smiles.
    '''
    df_salts = pd.DataFrame(columns=df.columns)
    
    # Iteriere über jede Zeile im DataFrame
    for index, row in Dataframe.iterrows():
        # Überprüfe, ob der Wert in der Spalte "productSmiles" einen Punkt enthält
        if '.' in row['productSmiles']:
            # Füge die Zeile zum DataFrame df_salts hinzu
            df_salts = pd.concat([df_salts,pd.DataFrame([row])], ignore_index =False)
    
    return df_salts

In [None]:
df_salt = find_rows_with_dot(df_LLM_clean)

  df_salts = pd.concat([df_salts,pd.DataFrame([row])], ignore_index =False)


In [67]:
def canonicalize_smiles(Dataframe: pd.DataFrame, column_name: str):
    """Function that canonicalizes the SMILES strings in the dataframe.

    Args:
        Dataframe (pd.DataFrame): Dataframe containing the extracted data from the US patents, 
                                preprocessed with the get_values, clean_up, and convert_solvents function.
                                
        column_name (str): name of the column that needs to be canonicalized e.g. productSmiles or Solvent_A_Smiles etc.
    """
    for index, row in Dataframe.iterrows():
        try: 
            smiles_to_canon = row[column_name]
            if smiles_to_canon is not None:
                p_mol = Chem.MolFromSmiles(smiles_to_canon)
                if p_mol is not None:
                    smiles_to_canon = Chem.MolToSmiles(p_mol)
                    Dataframe.at[index, 'productSmiles'] = smiles_to_canon
                else:
                    print(f"Could not canonicalize SMILES for product at index {index}, value is {smiles_to_canon}")
        except Exception as e:
            print(e)
            print(f"Error at index {index}, smiles value is {smiles_to_canon}")
        
    return Dataframe

In [54]:
# Example Strigs (Smiles)

S_a_k = "COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',"
a_S_k_a = "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',"
a_S_a_k_a = "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC','"
a_S_a_k_w_a = "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC', '"
S_k_a_w_a_S = "C(=O)(C(F)(F)F)O', 'O=C(CN1N=C(C2=CC=CC=C12)C(=O)N)N1[C@@H]2C[C@@H]2C[C@H]1C(NC1=NN(C=C1)CC(F)(F)F)=O"
En_ = "[Si](C)(C)(C(C)(C)C)O[C@H]1C[C@H](C[C@H]([C@@H]1O[Si](C)(C)C(C)(C)C)C)C1=C(C=NC=C1)N', '[Si](C)(C)(C(C)(C)C)O[C@@H]1C[C@@H](C[C@@H]([C@H]1O[Si](C)(C)C(C)(C)C)C)C1=C(C=NC=C1)N"

test_1 = [S_a_k, a_S_k_a, a_S_a_k_a, a_S_a_k_w_a]

test_2 = [S_a_k, a_S_k_a, a_S_a_k_a, a_S_a_k_w_a, S_k_a_w_a_S, En_]

dic_test_2 = {'productSmiles': test_2}

df_test = pd.DataFrame(dic_test_2)


Lets find a function which removes all komma, apostroph or whitespace from a Smiles. If there is a second Smiles it should seperate them and put it in another column

In [55]:
df_test.head(6)

Unnamed: 0,productSmiles
0,"COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',"
1,"'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',"
2,"'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC','"
3,"'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC', '"
4,"C(=O)(C(F)(F)F)O', 'O=C(CN1N=C(C2=CC=CC=C12)C(..."
5,[Si](C)(C)(C(C)(C)C)O[C@H]1C[C@H](C[C@H]([C@@H...


In [40]:

special_characters = [',', ' ', "'"]

for Smiles in test:

    for char in special_characters:
        Smiles = Smiles.replace(char, '')
    print(Smiles)
    

COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC
["COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC','", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC', '"]
COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC
["COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC','", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC', '"]
COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC
["COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC','", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC', '"]
COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC
["COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC',", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC','", "'COC1=CC=C(C=C1)[C@@H]1C[C@H](C1)C(=O)OC', '"]


In [1]:
def clean_Smiles(productSmiles: str, Dataframe: pd.DataFrame):

    '''


    '''

    Smiles_list = productSmiles.split("', '")

    special_characters = [',', ' ', "'"]

    for Smiles in Smiles_list:

        for char in special_characters:
            Smiles = Smiles.replace(char, '')
    
    if len(Smiles_list) == 2:
        Dataframe.loc[df['productSmiles'] == productSmiles, 'productSmiles'] = Smiles_list[0]
        Dataframe['productSmiles_2'] = Smiles_list[1]
    
    else:
        Dataframe.loc[df['productSmiles'] == productSmiles, 'productSmiles'] = Smiles_list[0]
        Dataframe['productSmiles_2'] = np.NaN


    return Dataframe


NameError: name 'pd' is not defined

In [10]:
df_LLM.to_csv(r'/Users/matthiasgalka/git/ppchem_project/data/LLM_processed/After_LLM.csv', index=False)

In [11]:
df = pd.read_csv(r'/Users/matthiasgalka/git/ppchem_project/data/LLM_processed/After_LLM.csv')

In [13]:
df.shape

(36579, 11)