<a href="https://colab.research.google.com/github/JoozG/ml-case-studies/blob/main/SolubilityPredictor-fingerprints.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import tensorflow
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv1D, Dense, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Load the Delaney solubility dataset (CSV hosted in the DeepChem S3 bucket)
url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv"
df = pd.read_csv(url)

# Convert SMILES to RDKit Mol objects. MolFromSmiles returns None (it does not
# raise) for a string it cannot parse, so drop those rows here instead of
# letting the fingerprinting step fail later on a None molecule.
df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)
unparsed = df['mol'].isna()
if unparsed.any():
    print(f'Dropping {int(unparsed.sum())} rows with unparsable SMILES')
    df = df[~unparsed].reset_index(drop=True)

# Convert molecules to molecular fingerprints

# Morgan generators keyed by (radius, fpSize). Building a generator is
# independent of the molecule, so it is created once per parameter pair and
# reused instead of being rebuilt for every row of the dataset.
_MORGAN_GENERATORS = {}


def mol_to_fingerprint(mol, radius=2, fpSize=2048):
    """Return the Morgan fingerprint of an RDKit molecule.

    Args:
        mol: RDKit Mol object to fingerprint.
        radius: Morgan radius passed to the fingerprint generator.
        fpSize: Number of bits in the resulting fingerprint.

    Returns:
        The fingerprint produced by the generator's ``GetFingerprint``.

    Raises:
        TypeError: If ``mol`` is None, which is what ``Chem.MolFromSmiles``
            yields for a SMILES string it cannot parse.
    """
    if mol is None:
        raise TypeError(
            "mol is None; expected an RDKit Mol "
            "(Chem.MolFromSmiles returns None for unparsable SMILES)"
        )

    key = (radius, fpSize)
    generator = _MORGAN_GENERATORS.get(key)
    if generator is None:
        generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=fpSize)
        _MORGAN_GENERATORS[key] = generator
    return generator.GetFingerprint(mol)

# Fingerprint every molecule, then expand each fingerprint into one row of bits
df['fingerprint'] = df['mol'].apply(mol_to_fingerprint)
X = pd.DataFrame([list(fp) for fp in df['fingerprint']])
y = df['measured log solubility in mols per litre']

# Hold out 20% of the samples for testing (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Conv1D expects 3D input (samples, steps, channels), so turn each frame into
# a numpy array and append a trailing channel axis of length 1
X_train = X_train.to_numpy()[:, :, None]
X_test = X_test.to_numpy()[:, :, None]

# CNN regressor: two 1D convolutions, each followed by dropout to limit
# overfitting, then a dense layer and a single linear output (solubility)
model = Sequential([
    Conv1D(filters=64, kernel_size=8, activation='relu', input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    Conv1D(filters=32, kernel_size=8, activation='relu'),
    Dropout(0.2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1),
])

# Regression objective: minimise mean squared error with Adam
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))

# Fit for 50 epochs, holding back 10% of the training data for validation
model.fit(X_train, y_train, validation_split=0.1, batch_size=32, epochs=50)

# Make predictions; the model ends in Dense(1), so flatten the column of
# outputs to a 1D array that lines up with y_test
y_pred = model.predict(X_test).flatten()

# Evaluate the model. mean_squared_error returns the MSE, so take the square
# root to obtain the RMSE that the report below is labelled with.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R² Score: {r2}')
