In [123]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors

Here I created a collection of solvent data that we can use for the ML model. The Rf value depends on the compound itself, but also on the solvents and their ratio. Two main properties of the solvent mixture are important: 1. The ability to dissolve the compound. 2. The disruption of the polar interactions between the compound and the silica. These properties can be adjusted by trying different solvent ratios. The solvent properties are roughly described by some of the parameters in the collection. Here are definitions of the important parameters:

Elutropic_series: The eluotropic series ranks solvents by their elution strength, i.e. the ability of the solvent to carry a substance along with it. The ranking is empirical and depends on the stationary phase used. However, the eluotropic series usually correlates with the permittivities of the solvents.

Solubility_water: Solubility is the ability of a substance (the solute) to form a solution with another substance (the solvent). The solubility in water gives information about the hydrophilicity of the solvent.





In [106]:


# Hand-entered solvent property table used to build df_solvents.
# Every list holds one entry per solvent, in the same order as 'Solvent';
# np.nan marks a value that has not been filled in yet.
Solvent_data = {
    # Display name and SMILES string of each solvent.
    'Solvent': ['n-Hexane', 'n-Pentane', 'Cyclohexane', 'Cyclopentane', 'Toluene', 'Benzene', 'Diethyl ether', 'Dichloromethane', 'Chloroform', 'Acetone', '1,4-Dioxane', 'Ethyl acetate', 'Tetrahydrofuran', 'Methyl tert-butyl ether', 'Dimethyl sulfoxide', 'Acetonitrile', 'Pyridine', 'Isopropyl alcohol', 'Ethanol', 'Methanol'],
    'Smiles': ['CCCCCC', 'CCCCC', 'C1CCCCC1', 'C1CCCC1', 'Cc1ccccc1', 'c1ccccc1', 'CCOCC', 'ClCCl', 'ClC(Cl)Cl', 'CC(=O)C', 'O1CCOCC1', 'O=C(OCC)C', 'C1CCOC1', 'O(C(C)(C)C)C', 'CS(=O)C', 'CC#N', 'c1ccncc1', 'CC(O)C', 'OCC', 'CO'],
    # The rows are listed in ascending order of this value.
    'Elutropic_series': [0.00, 0.00, 0.03, 0.04, 0.22, 0.25, 0.29, 0.30, 0.31, 0.43, 0.43, 0.45, 0.48, 0.48, 0.48, 0.50, 0.55, 0.60, 0.68, 0.73],
    # Units and reference temperature of the measured properties are part of the column names.
    'Density(20°C, g/ml)': [0.672, 0.626, 0.778, 0.751, 0.865, 0.879, 0.713, 1.324, 1.489, 0.79, 1.034 , 0.902, 0.888, 0.741, 1.10, 0.781, 0.982, 0.785, 0.789, 0.791],
    'Solubility_water (20°C, mg/L)': [9.5, 40.0, 60.0, 156.0, 520.0, 1790.0, 60500.0, 20000.0, 8090.0, np.nan, np.nan, 83000.0, np.nan, 26000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
    'log P': [3.764, 3.255, np.nan, np.nan, 2.73, 2.13, 0.98, 1.19, np.nan, -0.24, np.nan, 0.71, np.nan, np.nan, np.nan, -0.334, 0.65, -0.16, -0.18, -0.69],
    'Viscosity (20°C, mPa*s)': [0.31, 0.23, 1.00, np.nan, 0.59, 0.65, 0.23, 0.43, 0.57, 0.32, 1.54, 0.45, 0.55, 0.27, 2.24, 0.36, 0.95, 2.3, 1.20, 0.55]
}

In [107]:
# Turn the column-oriented dict into a table: one row per solvent, one column per key.
df_solvents = pd.DataFrame(data=Solvent_data)

In [108]:
df_solvents

Unnamed: 0,Solvent,Smiles,Elutropic_series,"Density(20°C, g/ml)","Solubility_water (20°C, mg/L)",log P,"Viscosity (20°C, mPa*s)"
0,n-Hexane,CCCCCC,0.0,0.672,9.5,3.764,0.31
1,n-Pentane,CCCCC,0.0,0.626,40.0,3.255,0.23
2,Cyclohexane,C1CCCCC1,0.03,0.778,60.0,,1.0
3,Cyclopentane,C1CCCC1,0.04,0.751,156.0,,
4,Toluene,Cc1ccccc1,0.22,0.865,520.0,2.73,0.59
5,Benzene,c1ccccc1,0.25,0.879,1790.0,2.13,0.65
6,Diethyl ether,CCOCC,0.29,0.713,60500.0,0.98,0.23
7,Dichloromethane,ClCCl,0.3,1.324,20000.0,1.19,0.43
8,Chloroform,ClC(Cl)Cl,0.31,1.489,8090.0,,0.57
9,Acetone,CC(=O)C,0.43,0.79,,-0.24,0.32


Solvent info missing:

log P: Cyclohexane, Cyclopentane, Chloroform, 1,4-Dioxane, Tetrahydrofuran, MTBE, DMSO

Solubility: Benzene at 15°C, Acetone, 1,4-Dioxane, Tetrahydrofuran, DMSO, MeCN, Pyridine

Viscosity: Cyclopentane




In [109]:
def canonilze_smiles(Smiles: str) -> str:
    '''
       Converts a SMILES string to an RDKit Mol object and back to a SMILES
       string again, so that every molecule ends up with one consistent
       SMILES representation.

       Args:
           Smiles: SMILES string to normalise.

       Returns:
           The SMILES string that Chem.MolToSmiles writes for the parsed molecule.

       Raises:
           ValueError: if RDKit cannot parse the given SMILES string.
    '''
    mol = Chem.MolFromSmiles(Smiles)
    # MolFromSmiles reports a parse failure by returning None rather than raising;
    # fail here with a readable message instead of an opaque error in MolToSmiles.
    if mol is None:
        raise ValueError(f'Could not parse SMILES string: {Smiles!r}')

    return Chem.MolToSmiles(mol)


In [110]:
# Normalise the SMILES first so the stored strings and the Mol objects are built from the same text.
df_solvents['Smiles'] = df_solvents['Smiles'].apply(canonilze_smiles)
df_solvents['Mol'] = df_solvents['Smiles'].apply(Chem.MolFromSmiles)

# RDKit-computed descriptors, one value per solvent. The functions are passed
# directly (no lambda wrapper needed) and taken from the modules imported at the
# top of the notebook instead of being reached through `Chem.`.
df_solvents['Mw (g/mol)'] = df_solvents['Mol'].apply(Descriptors.MolWt)
df_solvents['log P (calc)'] = df_solvents['Mol'].apply(Descriptors.MolLogP)
df_solvents['H_Bond_Donors'] = df_solvents['Mol'].apply(rdMolDescriptors.CalcNumHBD)
df_solvents['H_Bond_Acceptors'] = df_solvents['Mol'].apply(rdMolDescriptors.CalcNumHBA)


In [111]:
# Additional hand-entered polarity parameters. Each list has one value per solvent,
# in the same row order as df_solvents; np.nan marks a missing value (row 3, cyclopentane).
Polarity_Index_list = [0.1, 0.0, 0.2, 0.1, 2.4, 2.7, 2.8, 3.1, 4.1, 5.1, 4.8, 4.4, 4.0, 2.5, 7.2, 5.8, 5.3, 3.9, 4.3, 5.1]
ET30_values = [31.0, 31.0, 30.09, np.nan, 33.9, 34.3, 34.5, 40.7, 39.1, 42.2, 36.0, 38.1, 37.4, 34.7, 45.1, 45.6, 40.5, 48.4, 51.9, 55.4]
ET30N_values = [0.009, 0.009, 0.006, np.nan, 0.099, 0.111, 0.117, 0.309, 0.259, 0.355, 0.164, 0.228, 0.207, 0.124, 0.444, 0.46, 0.302, 0.546, 0.654, 0.762]

# Check all lengths before touching the table, so a miscounted list cannot leave
# df_solvents with only some of the new columns added.
assert len(Polarity_Index_list) == len(ET30_values) == len(ET30N_values) == len(df_solvents), \
    'Polarity lists must contain exactly one value per solvent row'

df_solvents['Polarity_index'] = Polarity_Index_list
df_solvents['ET30 (kcal/mol)'] = ET30_values
df_solvents['ET30N'] = ET30N_values

In [112]:
df_solvents

Unnamed: 0,Solvent,Smiles,Elutropic_series,"Density(20°C, g/ml)","Solubility_water (20°C, mg/L)",log P,"Viscosity (20°C, mPa*s)",Mol,Mw (g/mol),log P (calc),H_Bond_Donors,H_Bond_Acceptors,Polarity_index,ET30 (kcal/mol),ET30N
0,n-Hexane,CCCCCC,0.0,0.672,9.5,3.764,0.31,<rdkit.Chem.rdchem.Mol object at 0x125780580>,86.178,2.5866,0,0,0.1,31.0,0.009
1,n-Pentane,CCCCC,0.0,0.626,40.0,3.255,0.23,<rdkit.Chem.rdchem.Mol object at 0x1257804a0>,72.151,2.1965,0,0,0.0,31.0,0.009
2,Cyclohexane,C1CCCCC1,0.03,0.778,60.0,,1.0,<rdkit.Chem.rdchem.Mol object at 0x1257805f0>,84.162,2.3406,0,0,0.2,30.09,0.006
3,Cyclopentane,C1CCCC1,0.04,0.751,156.0,,,<rdkit.Chem.rdchem.Mol object at 0x125780660>,70.135,1.9505,0,0,0.1,,
4,Toluene,Cc1ccccc1,0.22,0.865,520.0,2.73,0.59,<rdkit.Chem.rdchem.Mol object at 0x1257806d0>,92.141,1.99502,0,0,2.4,33.9,0.099
5,Benzene,c1ccccc1,0.25,0.879,1790.0,2.13,0.65,<rdkit.Chem.rdchem.Mol object at 0x125780740>,78.114,1.6866,0,0,2.7,34.3,0.111
6,Diethyl ether,CCOCC,0.29,0.713,60500.0,0.98,0.23,<rdkit.Chem.rdchem.Mol object at 0x1257807b0>,74.123,1.0428,0,1,2.8,34.5,0.117
7,Dichloromethane,ClCCl,0.3,1.324,20000.0,1.19,0.43,<rdkit.Chem.rdchem.Mol object at 0x125780820>,84.933,1.4215,0,0,3.1,40.7,0.309
8,Chloroform,ClC(Cl)Cl,0.31,1.489,8090.0,,0.57,<rdkit.Chem.rdchem.Mol object at 0x125780890>,119.378,1.9864,0,0,4.1,39.1,0.259
9,Acetone,CC(C)=O,0.43,0.79,,-0.24,0.32,<rdkit.Chem.rdchem.Mol object at 0x125780900>,58.08,0.5953,0,1,5.1,42.2,0.355


In [113]:
# The 'Mol' column holds live RDKit objects. Written to CSV they turn into
# '<rdkit.Chem.rdchem.Mol object at 0x...>' strings that cannot be read back and
# differ on every run, so the column is left out of the file; rebuild it from
# 'Smiles' after loading. errors='ignore' keeps the cell re-runnable if 'Mol' is absent.
# NOTE(review): absolute, user-specific path, and this cell writes to 'Data' while
# the cells below use 'data' — confirm the intended directory.
df_solvents.drop(columns=['Mol'], errors='ignore').to_csv(r'/Users/matthiasgalka/git/ppchem_project/Data/Solvents.csv')

In [115]:
# Load the Rf-filtered dataset that is split into two files further down.
df = pd.read_csv(filepath_or_buffer=r'/Users/matthiasgalka/git/ppchem_project/data/AfterRFfilter(3).csv')

In [117]:
# Row position at which the loaded dataset is cut into two consecutive parts.
# NOTE(review): the reason for choosing 4572 is not recorded in this notebook — TODO document.
SPLIT_ROW = 4572

df31 = df.iloc[:SPLIT_ROW]   # rows before the cut
df32 = df.iloc[SPLIT_ROW:]   # rows from the cut onwards

In [120]:
# Write each part of the split dataset to its own CSV file, without the index column.
_split_outputs = {
    r'/Users/matthiasgalka/git/ppchem_project/data/AfterRFfilter(3.1).csv': df31,
    r'/Users/matthiasgalka/git/ppchem_project/data/AfterRFfilter(3.2).csv': df32,
}
for _csv_path, _part in _split_outputs.items():
    _part.to_csv(_csv_path, index=False)

In [121]:
len(df_solvents)

20