In [23]:
!pip install rdkit



#Section 1. - Importing and preprocessing the data

In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Loading the dataset
url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv"
df = pd.read_csv(url)

# Converting SMILES to RDKit Mol objects
df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)

# Calculating RDKit descriptors
def calculate_descriptors(mol):
    descriptors = {
        'MolWt': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol)
    }
    return pd.Series(descriptors)

df = df.join(df['mol'].apply(calculate_descriptors))

# Dropping rows with missing values (where molecules couldn't be processed)
df = df.dropna()

# Defining features (X) and target (y)
X = df[['MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'Number of Rings', 'Number of Rotatable Bonds', 'Polar Surface Area']]
y = df['measured log solubility in mols per litre']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Section 2 - Feeding ML algorithms with data

##Section 2.1 - Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression

linear = LinearRegression()
linear.fit(X_train, y_train)

# Making predictions
y_pred = linear.predict(X_test)

# Evaluating the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R² Score: {r2}')

RMSE: 1.0728493823684075
R² Score: 0.7564938335582525




##Section 2.2 - Random Forest Regression

In [5]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=None)
forest.fit(X_train, y_train)

# Making predictions
y_pred = forest.predict(X_test)

# Evaluating the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R² Score: {r2}')

RMSE: 0.8057146590092263
R² Score: 0.8626606514981415




##Section 2.3 - Support Vector Machines

In [8]:
from sklearn.svm import SVR

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the SVR model with RBF kernel
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# Train the model
svr.fit(X_train_scaled, y_train)

# Make predictions
y_pred = svr.predict(X_test_scaled)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R² Score: {r2}')

RMSE: 0.8780779264809296
R² Score: 0.8368832453712077




##Section 2.4 - Multi Layer Perceotron

In [7]:
# Scaling the data (important for MLPs)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Defining and train the MLPRegressor model
mlp = MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_train)

# Making predictions
y_pred = mlp.predict(X_test_scaled)

# Evaluating the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R² Score: {r2}')

RMSE: 0.7289858385642586
R² Score: 0.8875729964215217




##Section 2.5 - Convolutional Neural Networks

In [16]:
from rdkit.Chem import rdFingerprintGenerator
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam

# Load the dataset
url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv"
df = pd.read_csv(url)

# Convert SMILES to RDKit Mol objects
df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)

# Convert molecules to molecular fingerprints
def mol_to_fingerprint(mol, radius=2, fpSize=2048):
    morgan_fp = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=fpSize)
    return morgan_fp.GetFingerprint(mol)

# Calculate fingerprints and convert them to numpy arrays
df['fingerprint'] = df['mol'].apply(mol_to_fingerprint)
X = pd.DataFrame(df['fingerprint'].apply(lambda x: list(x)).tolist())
y = df['measured log solubility in mols per litre']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the data to numpy arrays and reshape for 1D convolution (CNN expects 3D input: samples, timesteps, features)
X_train = X_train.values.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.values.reshape(X_test.shape[0], X_test.shape[1], 1)

# Define the CNN model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=8, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dropout(0.2))  # Prevent overfitting
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1))  # Regression task, output single value (solubility)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Make predictions
y_pred = model.predict(X_test).flatten()

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R² Score: {r2}')


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 224ms/step - loss: 6.6726 - val_loss: 2.9113
Epoch 2/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 1.8384 - val_loss: 1.8846
Epoch 3/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 1.2714 - val_loss: 1.7139
Epoch 4/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.8292 - val_loss: 1.6941
Epoch 5/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.6992 - val_loss: 1.5404
Epoch 6/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.4638 - val_loss: 1.6616
Epoch 7/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.5702 - val_loss: 1.6870
Epoch 8/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.4841 - val_loss: 1.6224
Epoch 9/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 



#Section 3 - Predictions based on SMILES format

In [17]:
from sklearn.preprocessing import StandardScaler

# Calculating descriptors for molecules in SMILE format
def prepare_raw_data(dataFrame):

    dataFrame['mol'] = dataFrame['smiles'].apply(Chem.MolFromSmiles)
    dataFrame = dataFrame.join(dataFrame['mol'].apply(calculate_descriptors))
    dataFrame = dataFrame.dropna()
    '''
    for molecule in dataFrame['mol']
        ring_info = molecule.GetRingInfo()
        num_rings = ring_info.NumRings()

    '''
    return dataFrame

# Normalizng the data via StandardScaler
def normalize_data(dataframe):
    scaler = StandardScaler()
    normalized = scaler.fit_transform(dataframe)

    return dataframe

SECTION XX: Testing the ML model on data generated by using rdkit


In [18]:
smilesList = [
    'C', 'CC', 'CCC', 'CCCC', 'CCCCC', 'CCCCCC',
    'CCCCCCC', 'CCN', 'CC(=O)O', 'CCOCC', 'CC(=O)CC',
    'CC(=O)OCC', 'CCCCO', 'C(C(=O)O)C(N)C', 'CCCCCO'
]

# The solubities in a form of log([mol of substance]/[litre of H2O]). Experimental measurements based on the literature.
log_solubilities = [-2.82, -2.70,	-2.82,	-2.98,	-2.65,	-3.96,	-4.53,	-1.57,	1.00,	-0.12,	0.44,	-0.10,	-0.01,	0.79,	-0.61]
# For refernece - "Yalkowsky, S.H., He, Yan, Jain, P. Handbook of Aqueous Solubility Data Second Edition. CRC Press, Boca Raton, FL 2010"


In [19]:
smilesDF = pd.DataFrame(smilesList, columns=['smiles'])

testDFfromSMILES = prepare_raw_data(smilesDF)

scaler = StandardScaler()

norm_testDFfromSMILES = normalize_data(testDFfromSMILES[['MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors']])


In [20]:
predictions_MLP = mlp.predict(norm_testDFfromSMILES)
predictionsDF = norm_testDFfromSMILES.join(pd.DataFrame(predictions_MLP, columns=['MLP predictions']))



ValueError: X has 4 features, but MLPRegressor is expecting 7 features as input.

END
    

In [22]:
X_train

array([[[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       ...,

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[0],
        [1],
        [0],
        ...,
        [0],
        [0],
        [0]]])