In [1]:
import pandas as pd
import numpy as np
import pubchempy as pcp

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

In [2]:
# Location of the seed workbook.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs on a machine where this exact file exists — consider a configurable data directory.
Seed = r"C:\Users\Dillo\OneDrive\Desktop\LCCCSeed-ML-2-2-25-RemovedCommasFromSolventsList.xlsx"
# Load the workbook into a DataFrame; later cells read its 'Solvents' column.
SeedData = pd.read_excel(Seed)


In [3]:
## Small PubChemPy example: look up a compound by chemical name and print its SMILES.
## Unrelated to our data — it only demonstrates the lookup used later in the notebook.

# get_compounds(..., 'name') returns a list of matching Compound objects.
results = pcp.get_compounds('Triethylamine', 'name')
print(results)

# Print the isomeric SMILES of every match.
for compound in results:
    print(compound.isomeric_smiles)

[Compound(8471)]
CCN(CC)CC


In [4]:
# Sanity check: `Seed` is the path string; the loaded table lives in `SeedData`.
print(type(Seed))

<class 'str'>


In [5]:
# Normalise the raw ';'-separated solvent strings, then collect the distinct solvent names.

# Step 1: collapse "; " (semicolon followed by a space and any further whitespace) to ";".
semicolon_trimmed = SeedData['Solvents'].str.replace(r'; \s*', ';', regex=True)
# Step 2: remove the " (near crit)" annotation so annotated and plain names compare equal.
near_crit_removed = semicolon_trimmed.str.replace(r' \s*\(near crit\)', '', regex=True)

# Keep both stages as columns so each cleaning step can be inspected.
FullSolvents = pd.DataFrame({
    'Removed_Spaces': semicolon_trimmed,
    'Cleaned_Solvents': near_crit_removed,
})

# One column per solvent position, then flatten and de-duplicate.
FullSolvent_split = FullSolvents['Cleaned_Solvents'].str.split(';', expand=True)
all_solvent_entries = FullSolvent_split.stack()
UniqueSolvents = pd.unique(all_solvent_entries)
print(type(UniqueSolvents))



<class 'numpy.ndarray'>


In [6]:
def get_cid(UniqueSolvents):
    """Return the PubChem CID of the first compound matching a name.

    Parameters
    ----------
    UniqueSolvents : str
        A single compound name. (The parameter name is historical; the value is
        passed to PubChem as one name, not as the whole solvent array.)

    Returns
    -------
    int or None
        CID of the first match, or None when the name is blank / not a string,
        nothing matches, or the lookup fails for any reason.
    """
    # Blank cells and NaN placeholders can never match; skip the network round trip.
    if not isinstance(UniqueSolvents, str) or not UniqueSolvents.strip():
        return None

    try:
        matches = pcp.get_compounds(UniqueSolvents.strip(), 'name')
        if not matches:
            return None
        return int(matches[0].cid)
    except Exception:
        # Best-effort lookup: network or parsing problems are reported as "not found".
        return None

def get_smiles(compound_name):
    """Return the isomeric SMILES of the first PubChem compound matching a name.

    Parameters
    ----------
    compound_name : str
        Compound name to search for.

    Returns
    -------
    str or None
        Isomeric SMILES of the first match, or None when the name is blank /
        not a string, nothing matches, or the lookup fails for any reason.
    """
    # Blank cells and NaN placeholders can never match; skip the network round trip.
    if not isinstance(compound_name, str) or not compound_name.strip():
        return None

    try:
        matches = pcp.get_compounds(compound_name.strip(), 'name')
        if not matches:
            return None
        return matches[0].isomeric_smiles
    except Exception:
        # Best-effort lookup: network or parsing problems are reported as "not found".
        return None


In [7]:
# Property Arrays
UniqueSolvents_series = pd.Series(UniqueSolvents)
UniqueSolventsDataFrame = pd.DataFrame({'Solvents': UniqueSolvents_series})

# One PubChem name lookup per solvent; get_smiles gives None when nothing usable comes back.
UniqueSolventsDataFrame['Smiles'] = UniqueSolventsDataFrame['Solvents'].apply(get_smiles)

# Parse every SMILES that was found into an RDKit molecule.
mol_list = []
for smile in UniqueSolventsDataFrame['Smiles']:
    if not smile:  # None or empty string: nothing to parse
        continue
    mol = Chem.MolFromSmiles(smile)
    if not mol:  # RDKit returns None when it cannot parse the string
        print(f"Warning: Invalid SMILES string - {smile}")  # Debugging info
        continue
    mol_list.append(mol)

# Quick summary for debugging: row count, first rows, number of parsed molecules
print(len(UniqueSolventsDataFrame['Smiles']))
print(UniqueSolventsDataFrame.head())
print(f"Number of valid molecules: {len(mol_list)}")

## No SMILES could be generated for deuterated acetone - it will need to be created manually or looked up in the HSP software.

50
           Solvents             Smiles
0           Toluene       CC1=CC=CC=C1
1   Tetrahydrofuran            C1CCOC1
2            Hexane             CCCCCC
3           Dioxane           C1COCCO1
4  Diethyl Malonate  CCOC(=O)CC(=O)OCC
Number of valid molecules: 49


In [8]:
# Display the solvent-name / SMILES table built in the previous cell.
UniqueSolventsDataFrame

Unnamed: 0,Solvents,Smiles
0,Toluene,CC1=CC=CC=C1
1,Tetrahydrofuran,C1CCOC1
2,Hexane,CCCCCC
3,Dioxane,C1COCCO1
4,Diethyl Malonate,CCOC(=O)CC(=O)OCC
5,Acetonitrile,CC#N
6,Dimethyl Formamide,CN(C)C=O
7,Methyl Ethyl Ketone,CCC(=O)C
8,Cyclohexane,C1CCCCC1
9,2-Propanol,CC(C)O


In [9]:
# Keep only the solvents that actually have a SMILES string.
#
# The missing-value test is done on the original column: once None/NaN has been
# converted with astype(str) it becomes ordinary text and dropna() no longer sees it.
smiles_as_text = UniqueSolventsDataFrame['Smiles'].astype(str)
smiles_missing = (
    UniqueSolventsDataFrame['Smiles'].isna()
    | smiles_as_text.isin(['None', '', 'nan', 'NaN'])
)

# reset_index gives the filtered table a gap-free 0..n-1 index, so `solvents_list`
# lines up row-for-row with frames that are built from `smiles_list` later on.
data = (
    UniqueSolventsDataFrame.loc[~smiles_missing, ['Solvents', 'Smiles']]
    .astype(str)
    .reset_index(drop=True)
)

smiles_list = data['Smiles'].values
solvents_list = data['Solvents']


data

Unnamed: 0,Solvents,Smiles
0,Toluene,CC1=CC=CC=C1
1,Tetrahydrofuran,C1CCOC1
2,Hexane,CCCCCC
3,Dioxane,C1COCCO1
4,Diethyl Malonate,CCOC(=O)CC(=O)OCC
5,Acetonitrile,CC#N
6,Dimethyl Formamide,CN(C)C=O
7,Methyl Ethyl Ketone,CCC(=O)C
8,Cyclohexane,C1CCCCC1
9,2-Propanol,CC(C)O


In [10]:
def compute_descriptors(smiles):
    """Compute the selected RDKit descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    tuple
        ``(desc, headers, mol)`` where ``desc`` is the tuple of descriptor values,
        ``headers`` the matching descriptor names, and ``mol`` the parsed RDKit
        molecule. For a SMILES that RDKit cannot parse the result is
        ``(None, None, None)``, so callers can always unpack three values and
        test ``desc is None``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Invalid SMILES: return a three-item placeholder so tuple unpacking in the caller still works.
        return None, None, None

    ## These are the descriptors the Ethier paper utilized - they cut out the descriptors they decided
    ## were repetitive (212 descriptors available, cut to 196 I believe).
    ## The names must match RDKit's spelling exactly: the calculator does not raise on an unknown
    ## name, it fills that column with a constant placeholder instead. Three names were misspelled
    ## before and are corrected here: "Ipc" (was "lpc"), "NumAliphaticRings" (was "NumAlphaticRings")
    ## and "fr_allylic_oxid" (was "fr_allytic_oxid").
    custom_descriptors = [
        "BalabanJ", "BertzCT", "Chi0", "Chi1", "Chi0v", "Chi1v", "Chi2v", "Chi3v", "Chi4v",
        "Chi0n", "Chi1n", "Chi2n", "Chi3n", "Chi4n",
        "EState_VSA1", "EState_VSA2", "EState_VSA3", "EState_VSA4", "EState_VSA5",
        "EState_VSA6", "EState_VSA7", "EState_VSA8", "EState_VSA9", "EState_VSA10",
        "ExactMolWt", "FractionCSP3", "HallKierAlpha", "HeavyAtomCount", "HeavyAtomMolWt",
        "Ipc", "Kappa1", "Kappa2", "Kappa3",
        "LabuteASA", "MolLogP", "MolMR", "MolWt", "NHOHCount", "NOCount",
        "NumAliphaticCarbocycles", "NumAliphaticHeterocycles", "NumAliphaticRings",
        "NumAromaticRings",  # Upon review I accidentally left this descriptor out - Ethier used it.
        "NumHAcceptors", "NumHDonors", "NumHeteroatoms", "NumRotatableBonds",
        "NumSaturatedCarbocycles", "NumSaturatedHeterocycles", "NumValenceElectrons",
        "PEOE_VSA1", "PEOE_VSA2", "PEOE_VSA3", "PEOE_VSA4", "PEOE_VSA5", "PEOE_VSA6",
        "PEOE_VSA7", "PEOE_VSA8", "PEOE_VSA9", "PEOE_VSA10", "PEOE_VSA12", "PEOE_VSA14",
        "RingCount",
        "SMR_VSA1", "SMR_VSA2", "SMR_VSA4", "SMR_VSA5", "SMR_VSA6", "SMR_VSA7", "SMR_VSA9", "SMR_VSA10",
        "SlogP_VSA1", "SlogP_VSA2", "SlogP_VSA3", "SlogP_VSA4", "SlogP_VSA5", "SlogP_VSA6",
        "SlogP_VSA7", "SlogP_VSA11",
        "TPSA",
        "VSA_EState1", "VSA_EState2", "VSA_EState3", "VSA_EState4", "VSA_EState5",
        "VSA_EState6", "VSA_EState7", "VSA_EState8", "VSA_EState9",
        "fr_Al_OH", "fr_Al_OH_noTert", "fr_C_O", "fr_C_O_noCOO", "fr_NH0",
        "fr_aldehyde", "fr_allylic_oxid", "fr_aryl_methyl", "fr_benzene", "fr_bicyclic",
        "fr_epoxide", "fr_ester", "fr_ether", "fr_ketone", "fr_ketone_Topliss",
        "fr_methoxy", "fr_nitrile", "fr_nitro", "fr_para_hydroxylation", "fr_sulfone",
        "fr_unbrch_alkane",
        "MaxAbsEStateIndex", "MaxAbsPartialCharge", "MaxEStateIndex", "MaxPartialCharge",
        "MinAbsEStateIndex", "MinAbsPartialCharge", "MinEStateIndex", "MinPartialCharge",
    ]

    descriptor_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(custom_descriptors)
    desc = descriptor_calculator.CalcDescriptors(mol)
    headers = descriptor_calculator.GetDescriptorNames()  # Column names, in the same order as desc

    return desc, headers, mol

In [11]:

# Compute descriptors for all molecules

X_list = []          # one tuple of descriptor values per valid molecule
valid_smiles = []    # SMILES of the rows that produced descriptors
valid_solvents = []  # solvent names for the same rows, kept in the same order
mol_list = []        # parsed RDKit molecules, reused for the fingerprints
headers = None       # descriptor column names (identical for every molecule)

# Walk solvent names and SMILES together so a skipped molecule drops BOTH values
# and the two columns can never drift apart.
for solvent, s in zip(solvents_list, smiles_list):
    result = compute_descriptors(s)
    # An unparsable SMILES is signalled by None (or by a None descriptor tuple).
    if result is None or result[0] is None:
        print(f"Warning: no descriptors computed for {solvent} ({s})")
        continue

    desc, headers, mol = result
    X_list.append(desc)
    valid_smiles.append(s)
    valid_solvents.append(solvent)
    mol_list.append(mol)

df_descriptors = pd.DataFrame(X_list, columns=headers)
df_descriptors.insert(0, "Smiles", valid_smiles)
# Insert a plain list (positional), not the `solvents_list` Series: inserting a Series
# aligns on index labels and mislabels rows whenever an earlier row was filtered out.
df_descriptors.insert(0, "Solvents", valid_solvents)

df_descriptors

Unnamed: 0,Solvents,Smiles,BalabanJ,BertzCT,Chi0,Chi1,Chi0v,Chi1v,Chi2v,Chi3v,...,fr_sulfone,fr_unbrch_alkane,MaxAbsEStateIndex,MaxAbsPartialCharge,MaxEStateIndex,MaxPartialCharge,MinAbsEStateIndex,MinAbsPartialCharge,MinEStateIndex,MinPartialCharge
0,Toluene,CC1=CC=CC=C1,3.021465,129.96566,5.112884,3.393847,4.386751,2.410684,1.654701,0.940456,...,0,0,2.083333,0.062249,2.083333,-0.039775,1.321759,0.039775,1.321759,-0.062249
1,Tetrahydrofuran,C1CCOC1,2.083333,11.60964,3.535534,2.5,3.236675,2.07735,1.319479,0.82735,...,0,0,4.944444,0.381449,4.944444,0.046638,1.0,0.046638,1.0,-0.381449
2,Hexane,CCCCCC,2.339092,12.0,4.828427,2.914214,4.828427,2.914214,1.707107,0.957107,...,0,3,2.231806,0.065382,2.231806,-0.053579,1.355,0.053579,1.355,-0.065382
3,Dioxane,C1COCCO1,2.0,15.509775,4.242641,3.0,3.644924,2.154701,1.224745,0.744017,...,0,0,4.944444,0.376668,4.944444,0.070114,0.777778,0.070114,0.777778,-0.376668
4,Diethyl Malonate,CCOC(=O)CC(=O)OCC,3.489962,114.973737,8.690234,5.201907,6.754314,3.515167,1.786819,0.899326,...,0,0,10.605235,0.465545,10.605235,0.316635,0.289919,0.316635,-0.536157,-0.465545
5,Acetonitrile,CC#N,2.475534,21.509775,2.707107,1.414214,1.947214,0.723607,0.223607,0.0,...,0,0,7.319444,0.198658,7.319444,0.058715,1.430556,0.058715,1.430556,-0.198658
6,Dimethyl Formamide,CN(C)C=O,2.825749,27.01955,4.284457,2.270056,3.432812,1.388328,1.069021,0.210819,...,0,0,9.430556,0.351387,9.430556,0.208685,0.75,0.208685,0.75,-0.351387
7,Methyl Ethyl Ketone,CCC(=O)C,2.847379,35.302969,4.284457,2.270056,3.615355,1.764784,1.055568,0.497891,...,0,0,9.8125,0.300042,9.8125,0.129065,0.25463,0.129065,0.25463,-0.300042
8,Cyclohexane,C1CCCCC1,2.0,15.509775,4.242641,3.0,4.242641,3.0,2.12132,1.5,...,0,0,1.5,0.053306,1.5,-0.053306,1.5,0.053306,1.5,-0.053306
9,2-Propanol,CC(C)O,2.32379,4.754888,3.57735,1.732051,3.024564,1.412899,1.093748,0.0,...,0,0,8.055556,0.393707,8.055556,0.048348,0.166667,0.048348,-0.166667,-0.393707


In [12]:


# Morgan fingerprint settings. FP_SIZE is used for the generator, the bit vectors
# and the column names, so it is defined once to keep all three in agreement.
FP_SIZE = 1024
morgan_gen = GetMorganGenerator(radius=2, fpSize=FP_SIZE)


fingerprints = []  #  Morgan fingerprints as 0/1 lists, one per molecule

for mol in mol_list:
    if mol:
        # A set gives constant-time membership tests when expanding to the full bit vector.
        on_bits = set(morgan_gen.GetFingerprint(mol).GetOnBits())
    else:
        on_bits = set()  # missing molecule -> all-zero fingerprint

    fingerprints.append([1 if i in on_bits else 0 for i in range(FP_SIZE)])


fingerprints_df = pd.DataFrame(fingerprints, columns=[f"FP_bit_{i}" for i in range(FP_SIZE)])


fingerprints_df



Unnamed: 0,FP_bit_0,FP_bit_1,FP_bit_2,FP_bit_3,FP_bit_4,FP_bit_5,FP_bit_6,FP_bit_7,FP_bit_8,FP_bit_9,...,FP_bit_1014,FP_bit_1015,FP_bit_1016,FP_bit_1017,FP_bit_1018,FP_bit_1019,FP_bit_1020,FP_bit_1021,FP_bit_1022,FP_bit_1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Put the descriptor columns and the fingerprint bit columns side by side.
# pd.concat(axis=1) matches rows by index label, so both frames must share the same row index.
Final_df = pd.concat([df_descriptors, fingerprints_df], axis=1)

Final_df

Unnamed: 0,Solvents,Smiles,BalabanJ,BertzCT,Chi0,Chi1,Chi0v,Chi1v,Chi2v,Chi3v,...,FP_bit_1014,FP_bit_1015,FP_bit_1016,FP_bit_1017,FP_bit_1018,FP_bit_1019,FP_bit_1020,FP_bit_1021,FP_bit_1022,FP_bit_1023
0,Toluene,CC1=CC=CC=C1,3.021465,129.96566,5.112884,3.393847,4.386751,2.410684,1.654701,0.940456,...,0,0,0,0,0,0,0,0,0,0
1,Tetrahydrofuran,C1CCOC1,2.083333,11.60964,3.535534,2.5,3.236675,2.07735,1.319479,0.82735,...,0,0,0,0,0,0,0,0,0,0
2,Hexane,CCCCCC,2.339092,12.0,4.828427,2.914214,4.828427,2.914214,1.707107,0.957107,...,0,0,0,0,0,0,0,0,0,0
3,Dioxane,C1COCCO1,2.0,15.509775,4.242641,3.0,3.644924,2.154701,1.224745,0.744017,...,0,0,0,0,0,0,0,0,0,0
4,Diethyl Malonate,CCOC(=O)CC(=O)OCC,3.489962,114.973737,8.690234,5.201907,6.754314,3.515167,1.786819,0.899326,...,0,0,0,0,0,0,0,0,0,0
5,Acetonitrile,CC#N,2.475534,21.509775,2.707107,1.414214,1.947214,0.723607,0.223607,0.0,...,0,0,0,0,0,0,0,0,0,0
6,Dimethyl Formamide,CN(C)C=O,2.825749,27.01955,4.284457,2.270056,3.432812,1.388328,1.069021,0.210819,...,0,0,0,0,0,0,0,0,0,0
7,Methyl Ethyl Ketone,CCC(=O)C,2.847379,35.302969,4.284457,2.270056,3.615355,1.764784,1.055568,0.497891,...,0,0,0,1,0,0,0,0,0,0
8,Cyclohexane,C1CCCCC1,2.0,15.509775,4.242641,3.0,4.242641,3.0,2.12132,1.5,...,0,0,0,0,0,0,0,0,0,0
9,2-Propanol,CC(C)O,2.32379,4.754888,3.57735,1.732051,3.024564,1.412899,1.093748,0.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#Final_df.to_excel('SolventDescriptors-UPDATED-2-23-25.xlsx', index=False)

In [None]:
#df_descriptors.to_excel('Solvent-UPDATED-Molecular-Descriptors-Only2-23-25.xlsx', index=False)

In [1]:
def get_custom_descriptors(smiles):
    """Compute the selected RDKit descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    dict
        Descriptor name -> computed value, in the order of the descriptor list below.

    Raises
    ------
    ValueError
        If RDKit cannot parse the SMILES string.
    """
    molecule = Chem.MolFromSmiles(smiles)

    if molecule is None:
        raise ValueError("Invalid SMILES string provided.")

    # Names must match RDKit's spelling exactly: the calculator does not raise on an unknown
    # name, it fills that entry with a constant placeholder instead. Corrected here:
    # "Ipc" (was "lpc"), "NumAliphaticRings" (was "NumAlphaticRings"),
    # "fr_allylic_oxid" (was "fr_allytic_oxid").
    custom_descriptors = [
        "BalabanJ", "BertzCT", "Chi0", "Chi1", "Chi0v", "Chi1v", "Chi2v", "Chi3v", "Chi4v",
        "Chi0n", "Chi1n", "Chi2n", "Chi3n", "Chi4n",
        "EState_VSA1", "EState_VSA2", "EState_VSA3", "EState_VSA4", "EState_VSA5",
        "EState_VSA6", "EState_VSA7", "EState_VSA8", "EState_VSA9", "EState_VSA10",
        "ExactMolWt", "FractionCSP3", "HallKierAlpha", "HeavyAtomCount", "HeavyAtomMolWt",
        "Ipc", "Kappa1", "Kappa2", "Kappa3",
        "LabuteASA", "MolLogP", "MolMR", "MolWt", "NHOHCount", "NOCount",
        "NumAliphaticCarbocycles", "NumAliphaticHeterocycles", "NumAliphaticRings",
        "NumAromaticRings",  # Upon review I accidentally left this descriptor out - Ethier used it.
        "NumHAcceptors", "NumHDonors", "NumHeteroatoms", "NumRotatableBonds",
        "NumSaturatedCarbocycles", "NumSaturatedHeterocycles", "NumValenceElectrons",
        "PEOE_VSA1", "PEOE_VSA2", "PEOE_VSA3", "PEOE_VSA4", "PEOE_VSA5", "PEOE_VSA6",
        "PEOE_VSA7", "PEOE_VSA8", "PEOE_VSA9", "PEOE_VSA10", "PEOE_VSA12", "PEOE_VSA14",
        "RingCount",
        "SMR_VSA1", "SMR_VSA2", "SMR_VSA4", "SMR_VSA5", "SMR_VSA6", "SMR_VSA7", "SMR_VSA9", "SMR_VSA10",
        "SlogP_VSA1", "SlogP_VSA2", "SlogP_VSA3", "SlogP_VSA4", "SlogP_VSA5", "SlogP_VSA6",
        "SlogP_VSA7", "SlogP_VSA11",
        "TPSA",
        "VSA_EState1", "VSA_EState2", "VSA_EState3", "VSA_EState4", "VSA_EState5",
        "VSA_EState6", "VSA_EState7", "VSA_EState8", "VSA_EState9",
        "fr_Al_OH", "fr_Al_OH_noTert", "fr_C_O", "fr_C_O_noCOO", "fr_NH0",
        "fr_aldehyde", "fr_allylic_oxid", "fr_aryl_methyl", "fr_benzene", "fr_bicyclic",
        "fr_epoxide", "fr_ester", "fr_ether", "fr_ketone", "fr_ketone_Topliss",
        "fr_methoxy", "fr_nitrile", "fr_nitro", "fr_para_hydroxylation", "fr_sulfone",
        "fr_unbrch_alkane",
        "MaxAbsEStateIndex", "MaxAbsPartialCharge", "MaxEStateIndex", "MaxPartialCharge",
        "MinAbsEStateIndex", "MinAbsPartialCharge", "MinEStateIndex", "MinPartialCharge",
    ]

    # Actually evaluate the descriptors and pair each value with its name, so the
    # caller can iterate over (name, value) items.
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(custom_descriptors)
    values = calculator.CalcDescriptors(molecule)
    return dict(zip(calculator.GetDescriptorNames(), values))


if __name__ == "__main__":
    smiles_input = input("Enter a SMILES string: ")
    descriptors = get_custom_descriptors(smiles_input)

    print("\nCustom Molecular Descriptors:")
    for key, value in descriptors.items():
        print(f"{key}: {value}")

NameError: name 'Chem' is not defined