In [1]:
%%capture
!pip install mordred
!pip install rdkit

## Importing Libraries

In [3]:
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, AllChem

import mordred
from mordred import Calculator, descriptors

In [4]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [6]:
df_final = pd.read_csv('delaney_8_des.csv')
df_final.head()

Unnamed: 0,FilterItLogS,Lipinski,SIC0,RNCG,ATS0Z,RPCG,AETA_eta,AATS0i,measured log(solubility:mol/L)
0,-2.790326,1,0.5,0.330754,1230.0,0.551252,0.941684,162.007716,-2.18
1,-2.129911,1,0.520426,0.31169,942.0,0.695189,0.91344,164.102494,-2.0
2,-2.433986,1,0.5,0.25,1230.0,0.335426,0.899987,162.007716,-1.74
3,-2.147371,1,0.520426,0.372917,942.0,0.36405,0.776328,164.102494,-1.48
4,-2.6646,1,0.520426,0.277752,1182.0,0.533222,0.875353,208.591109,-3.04


In [7]:
y = df_final['measured log(solubility:mol/L)']
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(df_final.iloc[:, :-1]), columns = df_final.iloc[:, :-1].columns)

In [8]:
X.head()

Unnamed: 0,FilterItLogS,Lipinski,SIC0,RNCG,ATS0Z,RPCG,AETA_eta,AATS0i
0,0.021772,0.332685,1.784579,0.03375,0.534794,3.187876,0.64755,-0.07758
1,0.376467,0.332685,1.993944,-0.070165,0.121242,4.534268,0.553108,0.215509
2,0.213155,0.332685,1.784579,-0.406439,0.534794,1.169029,0.508126,-0.07758
3,0.367089,0.332685,1.993944,0.263581,0.121242,1.436776,0.09464,0.215509
4,0.089296,0.332685,1.993944,-0.255165,0.465869,3.019216,0.425756,6.440094


In [9]:
y

0      -2.180
1      -2.000
2      -1.740
3      -1.480
4      -3.040
        ...  
1139    1.144
1140   -4.925
1141   -3.893
1142   -3.790
1143   -2.581
Name: measured log(solubility:mol/L), Length: 1144, dtype: float64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=45)

In [11]:
rf = RandomForestRegressor(bootstrap= True,
 max_depth= 45,
 max_features= 'log2',
 min_samples_leaf= 2,
 min_samples_split= 5,
 n_estimators= 800,
random_state=0)
rf.fit(X_train, y_train)
print(f'The r2 score for train set is : {rf.score(X_train, y_train)}')
print(f'The r2 score for test set is : {rf.score(X_test, y_test)}')

The r2 score for train set is : 0.9664879058908096
The r2 score for test set is : 0.8563149115031692


In [12]:
import pickle
with open('scaler.pkl','wb') as f:
    pickle.dump(scaler, f)
with open('model_rf', 'wb') as f:
    pickle.dump(rf, f)

# Predict Solubility from SMILES

In [14]:
X.columns

Index(['FilterItLogS', 'Lipinski', 'SIC0', 'RNCG', 'ATS0Z', 'RPCG', 'AETA_eta',
       'AATS0i'],
      dtype='object')

## Prediction for a single SMILES

In [29]:
def predict_sol(smile):
  mol = Chem.MolFromSmiles(smile)
  mol = Chem.AddHs(mol)
  AllChem.EmbedMolecule(mol)

  df_mol = pd.DataFrame(data = [mol], columns=(['mol']), dtype='object')
  calc = Calculator(descriptors, ignore_3D=False)
  desc = calc.pandas(df_mol['mol'])
  desc_8 = desc[['FilterItLogS', 'Lipinski', 'SIC0', 'RNCG', 'RPCG', 'ATS0Z',
        'PEOE_VSA6', 'AATS0i']]
  X_test = scaler.transform(desc_8)
  predict = rf.predict(X_test)
  print(f'The Predicted Solubility is {predict[0]}')

In [31]:
predict_sol('OCC1=C(O)C=C(O)C=C1')

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.33it/s]


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- PEOE_VSA6
Feature names seen at fit time, yet now missing:
- AETA_eta


# Predictions for a list of SMILES

In [25]:
def predict_sol_smiles(smiles):
  mols = []
  for smile in smiles:
    mol = Chem.MolFromSmiles(smile)
    mol = Chem.AddHs(mol)
    AllChem.EmbedMolecule(mol)
    mols.append(mol)
  df_mol = pd.DataFrame(data = mols, columns=(['mol']), dtype='object')
  calc = Calculator(descriptors, ignore_3D=False)
  desc = calc.pandas(df_mol['mol'])
  desc_8 = desc[['FilterItLogS', 'Lipinski', 'SIC0', 'RNCG', 'RPCG', 'ATS0Z',
        'PEOE_VSA6', 'AATS0i']]
  X_test = scaler.transform(desc_8)
  predict = rf.predict(X_test)
  print(f'The Predicted Solubilities is {predict}')

In [27]:
smiles = ['OCC1=C(O)C=C(O)C=C1', 'NCC1=C(N)C=C(N)C=C1', 'CCC1=C(C)C=C(C)C=C1']
predict_sol_smiles(smiles)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.22it/s]


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- PEOE_VSA6
Feature names seen at fit time, yet now missing:
- AETA_eta
