In [None]:
#Import
import pandas as pd
import numpy as np

#Data
import pickle

#Utilities
from rdkit import Chem
from rdkit.Chem import Descriptors

# Model
from xgboost import XGBRegressor

In [5]:
def generate_smarts_descriptors(df, smiles_column='SMILES'):
    """
    Append SMARTS-based substructure counts and molecular weight to a copy
    of a DataFrame.

    Each SMILES string is parsed with RDKit and explicit hydrogens are added.
    The number of matches of every pattern in the fixed SMARTS list below is
    stored in columns 'X1' ... 'X39' (same order as the list), followed by a
    'MolWeight' column. The input DataFrame itself is not modified.

    Parameters:
    - df: pandas DataFrame with a column of SMILES strings.
    - smiles_column: name of the column containing SMILES (default: 'SMILES').

    Returns:
    - A copy of df with the 'X1'..'X39' and 'MolWeight' columns appended.
      Rows for which no molecule could be built get 0 for every count and a
      missing MolWeight.
    """

    smarts_list = [
        '[H]', '[C,c]', '[O]', '[C;R]', '[c;R]', '[O;R]', '[*R]', '[OX2]', '[OX2H]',
        '[c][OX2H]', '[OX2H0]', '[CX3][OX1]', '[#6][CX3H0](=O)[#6]', '[CX3H1](=O)',
        '[CX3](=O)[OX2H]', '[CX3](=O)O[CX3](=O)', '[CX3H0](=O)[OX2H0][#6]', '[CX4H0]',
        '[CX4H1]', '[CX4H2]', '[CX4H3]', '[CX3]=[CX3]', '[#8].[#8]', '[OX2H].[OX2H]',
        '[OX2H].[OX2H].[OX2H]', '[CX4][OX2H]', '[CX4H0][OX2H]', '[CX4H1][OX2H]',
        '[CX4H2][OX2H]', '[CX3H2]=[CX3H1]', '[CX3H2]=[CX3H0]', '[CX3H1]=[CX3H1]',
        '[OX2H0].[OX2H0]', '[OX2H0].[OX2H0].[OX2H0]', '[c][OX2][c]', '[c][OX2][C]',
        '[#8][#6][#6][#8]', '[#8][#6][#6][#6][#8]', '[#6;R]-O-[#6;R]'
    ]

    df = df.copy()

    # Keep the RDKit Mol objects in a local Series rather than in a temporary
    # 'mol' column: a caller-supplied column that happens to be named 'mol'
    # is then neither overwritten nor dropped.
    mols = df[smiles_column].apply(Chem.MolFromSmiles)
    # Explicit hydrogens are required for the '[H]' and H-count patterns.
    mols = mols.apply(lambda mol: Chem.AddHs(mol) if mol else None)

    # One count column per SMARTS pattern, labelled X1..Xn in list order.
    # NOTE(review): GetSubstructMatches is called with RDKit's default
    # arguments; the counts must stay consistent with however the model's
    # training features were generated -- confirm before changing them.
    for position, smarts in enumerate(smarts_list, start=1):
        pattern = Chem.MolFromSmarts(smarts)
        df[f'X{position}'] = mols.apply(
            lambda mol: len(mol.GetSubstructMatches(pattern)) if mol else 0
        )

    # Molecular weight (computed on the hydrogen-added molecule).
    df['MolWeight'] = mols.apply(lambda mol: Descriptors.MolWt(mol) if mol else None)

    return df

In [13]:
# Load XGBoost model
MODEL_PATH = 'xgb_model.pkl'  # Replace with the path of the pickled model

# SECURITY: pickle.load can execute arbitrary code contained in the file.
# Only load model files that come from a trusted source.
# NOTE(review): a pickled model is presumably tied to the library versions
# that wrote it -- confirm the installed xgboost version matches.
with open(MODEL_PATH, 'rb') as model_file:
    model = pickle.load(model_file)

PREDICT MIXTURES

In [39]:
# Input table for mixtures. Required columns:
#   SMILES_A, SMILES_B : SMILES of the two mixture components
#   T                  : temperature
#   mA, mB             : weight fractions of components A and B
mixture_df = pd.read_excel('mixture_placeholder.xlsx', sheet_name=0)  # Replace with file path of mixtures' components SMILES + temperatures + weight fractions
output_mixture = "mixture_predictions.xlsx"  # Replace with desired output file path

# Fail here with a clear message instead of with a KeyError in a later cell.
missing_mixture_cols = {'SMILES_A', 'SMILES_B', 'T', 'mA', 'mB'} - set(mixture_df.columns)
if missing_mixture_cols:
    raise ValueError(
        f"mixture_df is missing required columns: {sorted(missing_mixture_cols)}"
    )

In [28]:
# Build one (SMILES, T) table per component, with the component-specific
# SMILES column renamed to the common name 'SMILES'
unique_a, unique_b = (
    mixture_df[[component_col, 'T']].rename(columns={component_col: 'SMILES'})
    for component_col in ('SMILES_A', 'SMILES_B')
)

# Stack both tables and keep each (SMILES, T) pair only once
unique_df = pd.concat([unique_a, unique_b]).drop_duplicates(ignore_index=True)

# Compute the SMARTS descriptors and molecular weight for every unique pair
unique_with_descriptors = generate_smarts_descriptors(unique_df)

# Predict K from every column except the SMILES string itself
unique_with_descriptors['K'] = model.predict(
    unique_with_descriptors.drop(columns='SMILES')
)

In [29]:
# Prepare mapping DataFrame: one predicted K per unique (SMILES, T) pair
k_map = unique_with_descriptors[['SMILES', 'T', 'K']]

# Make the cell safe to re-run: if KA/KB are already present from a previous
# execution, the merges below would create suffixed KA_x/KA_y (KB_x/KB_y)
# columns and the WHM cell would then fail on the missing 'KA'/'KB'.
mixture_df = mixture_df.drop(columns=['KA', 'KB'], errors='ignore')

# Map to SMILES_A (each mixture row must match at most one k_map row)
mixture_df = mixture_df.merge(
    k_map.rename(columns={'SMILES': 'SMILES_A', 'K': 'KA'}),
    on=['SMILES_A', 'T'],
    how='left',
    validate='many_to_one'
)

# Map to SMILES_B
mixture_df = mixture_df.merge(
    k_map.rename(columns={'SMILES': 'SMILES_B', 'K': 'KB'}),
    on=['SMILES_B', 'T'],
    how='left',
    validate='many_to_one'
)

In [None]:
# Weighted Harmonic Mean (WHM) of the two component predictions:
#   WHM = 1 / (mA / KA + mB / KB)
mixture_df['WHM'] = (
    mixture_df['mA'].div(mixture_df['KA'])
    .add(mixture_df['mB'].div(mixture_df['KB']))
    .rdiv(1)
)
mixture_df

Unnamed: 0,SMILES_A,SMILES_B,mA,mB,T,KA,KB,WHM
0,CC(C)[C@@H]1CC[C@@H](C)C[C@H]1O,CCCCCCCCCC(O)=O,0.783947,0.216053,304.01,0.132136,0.147828,0.135238
1,CC(C)[C@@H]1CC[C@@H](C)C[C@H]1O,CCCCCCCCCC(O)=O,0.783947,0.216053,314.00,0.130615,0.144901,0.133458
2,CC(C)[C@@H]1CC[C@@H](C)C[C@H]1O,CCCCCCCCCC(O)=O,0.783947,0.216053,324.01,0.129336,0.143133,0.132087
3,CC(C)[C@@H]1CC[C@@H](C)C[C@H]1O,CCCCCCCCCC(O)=O,0.783947,0.216053,333.98,0.128770,0.142100,0.131434
4,CC(C)[C@@H]1CC[C@@H](C)C[C@H]1O,CCCCCCCCCC(O)=O,0.783947,0.216053,344.15,0.125668,0.139309,0.128384
...,...,...,...,...,...,...,...,...
3048,CCCCCCCCCCC,CCCCCCCCCCCCCC(=O)OC,0.217467,0.782533,322.02,0.123445,0.142102,0.137580
3049,CCCCCCCCCCC,CCCCCCCCCCCCCC(=O)OC,0.217467,0.782533,331.75,0.120640,0.139657,0.135028
3050,CCCCCCCCCCC,CCCCCCCCCCCCCC(=O)OC,0.217467,0.782533,341.92,0.117905,0.138230,0.133235
3051,CCCCCCCCCCC,CCCCCCCCCCCCCC(=O)OC,0.217467,0.782533,351.90,0.115874,0.140397,0.134220


In [None]:
# Write the mixture table (inputs, KA, KB and WHM) to the output workbook,
# without the DataFrame index
mixture_df.to_excel(excel_writer=output_mixture, index=False)

PREDICT PURE COMPOUNDS ONLY

In [40]:
# Input table for pure compounds. Required columns:
#   SMILES : SMILES of the compound
#   T      : temperature
# sheet_name=1 reads the second sheet of the placeholder workbook; use
# sheet_name=0 when the compounds are on the first sheet of their own file.
single_df = pd.read_excel('mixture_placeholder.xlsx', sheet_name=1)  # Replace with file path of unique components SMILES + temperatures
output_single = "pure_placeholder.xlsx"  # Replace with desired output file path

# Fail here with a clear message instead of with a KeyError in a later cell.
missing_single_cols = {'SMILES', 'T'} - set(single_df.columns)
if missing_single_cols:
    raise ValueError(
        f"single_df is missing required columns: {sorted(missing_single_cols)}"
    )

In [None]:
# Generate descriptors and get predictions
single_df = generate_smarts_descriptors(single_df)

# Exclude 'K' as well as 'SMILES' from the feature matrix: when this cell is
# re-run, single_df already carries the previous predictions, and passing them
# to the model would add an extra input column. On the first run 'K' does not
# exist yet, hence errors='ignore'.
single_df['K'] = model.predict(
    single_df.drop(columns=['SMILES', 'K'], errors='ignore')
)
single_df

Unnamed: 0,SMILES,T,X1,X2,X3,X4,X5,X6,X7,X8,...,X32,X33,X34,X35,X36,X37,X38,X39,MolWeight,K
0,CC(=O)OCCCCCCCCCC,308.15,24,12,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,200.322,0.140081
1,CC(=O)OCCCCCCCCCC,310.95,24,12,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,200.322,0.140081
2,CC(=O)OCCCCCCCCCC,328.15,24,12,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,200.322,0.134833
3,CC(=O)OCCCCCCCCCC,349.15,24,12,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,200.322,0.130339
4,CC(=O)OCCCCCCCC,313.15,20,10,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,172.268,0.136847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1654,OCCOCCO,353.15,10,4,3,0,0,0,0,3,...,0,0,0,0,0,2,0,0,106.121,0.208826
1655,OCCOCCO,363.15,10,4,3,0,0,0,0,3,...,0,0,0,0,0,2,0,0,106.121,0.208384
1656,OCCOCCO,373.15,10,4,3,0,0,0,0,3,...,0,0,0,0,0,2,0,0,106.121,0.209597
1657,OCCOCCO,393.15,10,4,3,0,0,0,0,3,...,0,0,0,0,0,2,0,0,106.121,0.209471


In [None]:
# Write the pure-compound table (inputs, descriptors and K) to the output
# workbook, without the DataFrame index
single_df.to_excel(excel_writer=output_single, index=False)