In [91]:
from rdkit import Chem
import re
import datamol as dm
import pandas as pd

In [92]:
master_amines = pd.read_csv("./data/master_Amines.csv")

In [93]:
smarts_str = "[#6]-[#7H2]"
smarts = dm.from_smarts(smarts_str)

In [94]:
master_amines.head(2)

Unnamed: 0,ID,Box,Type,Building Block ID,Smiles,CAS,Old Fluics,Mass (mg),Conc (M),Volume (uL),merge_timestamp
0,Amine_1,Amine_Box_1,Amine,IMA-XXXX,CC(C)N,,,2.5,0.5,,2025-04-16T16:00:40.988684
1,Amine_2,Amine_Box_1,Amine,IMA-XXXX,CC(N)C1CCCCC1,,,3.4,0.5,,2025-04-16T16:00:40.988684


In [101]:
def add_volume(df):
    """
    Calculate and add missing volume values using mass and concentration data.
    Uses SMILES from the dataframe to calculate molar mass using RDKit.
    
    Parameters:
    df (DataFrame): DataFrame containing Mass, Concentration, and SMILES columns
    
    Returns:
    DataFrame: DataFrame with calculated volumes
    """
    import re
    import numpy as np
    from rdkit import Chem
    from rdkit.Chem import Descriptors
    
    regex_pattern = r'\(([^)]+)\)'
    regex = re.compile(regex_pattern)
    
    # Extract column names and units
    mass_column = df.columns[df.columns.str.contains("Mass")][0]
    mass_unit = regex.search(mass_column).group(1)
    
    concentration_column = df.columns[df.columns.str.contains("Conc")][0]
    concentration_unit = regex.search(concentration_column).group(1)
    
    volume_column = df.columns[df.columns.str.contains("Volume")][0]
    volume_unit = regex.search(volume_column).group(1)
    
    print(f"Units detected: Mass: {mass_unit}, Concentration: {concentration_unit}, Volume: {volume_unit}")
    
    # Create a copy to avoid modifying the original dataframe
    df_result = df.copy()
    
    # Unit conversion factors
    mass_to_g = 0.001 if mass_unit == 'mg' else 1  # Convert mg to g
    conc_to_mol_L = 1 if concentration_unit == 'M' else 1  # M is already mol/L
    L_to_target_vol = 1000000 if volume_unit == 'uL' else 1000 if volume_unit == 'mL' else 1  # Convert L to target volume unit
    
    # Calculate volume for rows with NaN in volume column
    volume_mask = df_result[volume_column].isna()
    
    if volume_mask.any():
        # Get rows with missing volume
        rows_to_calculate = df_result.loc[volume_mask]
        
        # Calculate molar mass for each compound using SMILES
        molar_masses = []
        for smiles in rows_to_calculate['Smiles']:
            try:
                # Convert SMILES to RDKit molecule
                mol = Chem.MolFromSmiles(smiles)
                if mol is not None:
                    # Calculate exact molecular weight in g/mol
                    molar_mass = Descriptors.ExactMolWt(mol)
                    molar_masses.append(molar_mass)
                else:
                    # If SMILES parsing fails, append NaN
                    molar_masses.append(np.nan)
                    print(f"Warning: Failed to parse SMILES: {smiles}")
            except Exception as e:
                molar_masses.append(np.nan)
                print(f"Error calculating molar mass for {smiles}: {str(e)}")
        
        # Get mass and concentration values for rows with missing volume
        masses = rows_to_calculate[mass_column].values
        concentrations = rows_to_calculate[concentration_column].values
        
        # Calculate volume in liters: Mass (g) / (Concentration (mol/L) * Molar Mass (g/mol))
        volumes_L = []
        for mass, conc, mol_mass in zip(masses, concentrations, molar_masses):
            if np.isnan(mol_mass):
                volumes_L.append(np.nan)
            else:
                # Convert mass to grams
                mass_g = mass * mass_to_g
                # Calculate volume in liters
                volume_L = mass_g / (conc * mol_mass)
                volumes_L.append(volume_L)
        
        # Convert to target volume unit
        volumes_target_unit = [vol * L_to_target_vol if not np.isnan(vol) else np.nan for vol in volumes_L]
        
        # Round values to three decimal places
        volumes_target_unit = [round(vol, 3) if not np.isnan(vol) else np.nan for vol in volumes_target_unit]
        
        # Add calculated volumes to dataframe
        df_result.loc[volume_mask, volume_column] = volumes_target_unit
        
        # Count successful calculations
        successful_calcs = sum(~np.isnan(volumes_target_unit))
        print(f"Calculated {successful_calcs} out of {volume_mask.sum()} missing volume values")
    else:
        print("No missing volume values found")
    
    return df_result

In [102]:
add_volume(master_amines)

Units detected: Mass: mg, Concentration: M, Volume: uL
Calculated 14 out of 14 missing volume values


Unnamed: 0,ID,Box,Type,Building Block ID,Smiles,CAS,Old Fluics,Mass (mg),Conc (M),Volume (uL),merge_timestamp
0,Amine_1,Amine_Box_1,Amine,IMA-XXXX,CC(C)N,,,2.5,0.5,84.64,2025-04-16T16:00:40.988684
1,Amine_2,Amine_Box_1,Amine,IMA-XXXX,CC(N)C1CCCCC1,,,3.4,0.5,53.486,2025-04-16T16:00:40.988684
2,Amine_3,Amine_Box_1,Amine,,CC(N)C1CCCCC1,Placeholder,Placeholder,6.1,0.5,95.96,2025-04-16T16:00:40.988684
3,Amine_4,Amine_Box_1,Amine,IMA-XXXX,CC(N)C1=CCCC(C)C1,,,8.3,0.5,119.308,2025-04-16T16:00:40.988684
4,Amine_5,Amine_Box_1,Amine,IMA-XXXX,CC(N)C1=CCCC(C)C1,,,8.3,0.5,119.308,2025-04-16T16:00:40.988684
5,Amine_6,Amine_Box_1,Amine,,NC1=C(O)C(F)=CC=C1,Placeholder,Placeholder,5.5,0.5,86.585,2025-04-16T16:00:40.988684
6,Amine_7,Amine_Box_1,Amine,,NC1=NC(Cl)=C(NC=O)C(Cl)=N1,Placeholder,Placeholder,4.2,0.5,40.781,2025-04-16T16:00:40.988684
7,Amine_8,Amine_Box_1,Amine,,NC1=C(C=O)C=C(Cl)C=C1,Placeholder,Placeholder,0.4,0.5,5.161,2025-04-16T16:00:40.988684
8,Amine_9,Amine_Box_1,Amine,,O=CC1=C(N)N=CC=C1,Placeholder,Placeholder,4.5,0.5,73.741,2025-04-16T16:00:40.988684
9,Amine_10,Amine_Box_1,Amine,,C(=O)C1=CC(Br)=CC(Br)=C1N,Placeholder,Placeholder,4.2,0.5,30.339,2025-04-16T16:00:40.988684


In [52]:
def filter_on_smarts(df, smiles_col, smarts: Chem.rdchem.Mol):
    
    filtered_ndx = []
    for ndx, row in df.iterrows():
        smiles = row[smiles_col]
        
        mol = dm.to_mol(smiles)
        
        if mol.HasSubstructMatch(smarts):
            filtered_ndx.append(ndx)
            
    return df.iloc[filtered_ndx]

filtered_df = filter_on_smarts(master_amines, "Smiles", smarts)

In [53]:
filtered_df

Unnamed: 0,ID,Box,Type,Building Block ID,Smiles,CAS,Old Fluics,Mass (mg),Conc (M),Volume (uL),merge_timestamp
0,Amine_1,Amine_Box_1,Amine,IMA-XXXX,CC(C)N,,,2.5,0.5,,2025-04-16T16:00:40.988684
1,Amine_2,Amine_Box_1,Amine,IMA-XXXX,CC(N)C1CCCCC1,,,3.4,0.5,,2025-04-16T16:00:40.988684
2,Amine_3,Amine_Box_1,Amine,,CC(N)C1CCCCC1,Placeholder,Placeholder,6.1,0.5,,2025-04-16T16:00:40.988684
3,Amine_4,Amine_Box_1,Amine,IMA-XXXX,CC(N)C1=CCCC(C)C1,,,8.3,0.5,,2025-04-16T16:00:40.988684
4,Amine_5,Amine_Box_1,Amine,IMA-XXXX,CC(N)C1=CCCC(C)C1,,,8.3,0.5,,2025-04-16T16:00:40.988684
5,Amine_6,Amine_Box_1,Amine,,NC1=C(O)C(F)=CC=C1,Placeholder,Placeholder,5.5,0.5,,2025-04-16T16:00:40.988684
6,Amine_7,Amine_Box_1,Amine,,NC1=NC(Cl)=C(NC=O)C(Cl)=N1,Placeholder,Placeholder,4.2,0.5,,2025-04-16T16:00:40.988684
7,Amine_8,Amine_Box_1,Amine,,NC1=C(C=O)C=C(Cl)C=C1,Placeholder,Placeholder,0.4,0.5,,2025-04-16T16:00:40.988684
8,Amine_9,Amine_Box_1,Amine,,O=CC1=C(N)N=CC=C1,Placeholder,Placeholder,4.5,0.5,,2025-04-16T16:00:40.988684
9,Amine_10,Amine_Box_1,Amine,,C(=O)C1=CC(Br)=CC(Br)=C1N,Placeholder,Placeholder,4.2,0.5,,2025-04-16T16:00:40.988684
