In [None]:
# # NOTE(review): every import in this cell is commented out, yet the live cells
# # further down call r2_score, mean_absolute_error, np.sqrt and plt.figure
# # before any live import of those names appears. Re-enable this cell before
# # running the notebook from a fresh kernel.
# import pandas as pd
# import numpy as np
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, InputLayer
# from tensorflow.keras.optimizers import Adam
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import r2_score, mean_absolute_error
# import matplotlib.pyplot as plt

In [None]:
# # 1. Load the data (this cell only reads the CSV; selection, splitting and
# # scaling happen in the following cells).
# # NOTE(review): the path is relative, so the file must sit in the notebook's
# # working directory.
# df = pd.read_csv('delaney-processed.csv')

In [None]:
# # Column names used as model inputs, and the column used as regression target.
# # NOTE(review): six feature names are listed here, which matches the
# # hard-coded input_shape=(6,) in the model-definition cell; change both together.
# features = ['Minimum Degree', 'Molecular Weight', 'Number of H-Bond Donors', 'Number of Rings', 'Number of Rotatable Bonds', 'Polar Surface Area']
# target = 'measured log solubility in mols per litre'

In [None]:
# # Pull the feature matrix X and the target column y out of the loaded frame.
# X = df[features]
# y = df[target]

In [None]:
# # Split the data: 70% train first, then the held-out 30% is halved into
# # validation and test (15% each of the full set). Both calls use random_state=42.
# # NOTE(review): y_test produced here is read by the live metrics and plotting
# # cells further down, which therefore cannot run while this cell is disabled.
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# # Create the feature scaler. StandardScaler standardizes each feature with
# # z = (x - mean) / std; it is not the per-sample Normalizer described in the
# # comparison table below.
# scaler = StandardScaler()

| Scaler | Equation | When to Use | Preserves Zero Mean? | Sensitivity to Outliers | Range After Scaling |
|--------|----------|-------------|----------------------|-------------------------|---------------------|
| StandardScaler | $ z = \frac{(x - \mu)}{\sigma} $ | When features have different units or different variances. Most suitable for algorithms that assume zero-centered data. | Yes | Yes (outliers shift both the mean and the standard deviation) | Varies |
| MinMaxScaler | $ z = \frac{(x - \text{min})}{\text{max} - \text{min}} $ | When you need values in a bounded interval. Scales the data to a specific range, typically [0, 1] or [-1, 1]. | No | Yes | [0, 1] or Custom |
| MaxAbsScaler | $ z = \frac{x}{\max(\lvert x \rvert)} $ | Useful for zero-centered or sparse data. | Yes, if data is zero-centered | Yes | [-1, 1] |
| RobustScaler | $ z = \frac{(x - \text{median})}{\text{IQR}} $ | When the data contains many outliers. Uses median and the interquartile range for scaling. | No | No | Varies |
| QuantileTransformer (uniform output) | Transforms features to follow a uniform distribution | When you want to transform features to follow a uniform distribution. Useful for non-linear data. | No | No | [0, 1] |
| QuantileTransformer (normal output) | Transforms features to follow a normal distribution | When you want to transform features to follow a normal distribution. Useful for non-linear data. | No | No | Varies, but closer to normal distribution |
| PowerTransformer | Applies a power transformation to each feature | Useful for stabilizing variance and making the data more Gaussian-like. | Yes | No | Varies, but closer to normal distribution |
| Normalizer | $ z = \frac{x}{\sqrt{x_1^2 + x_2^2 + \dots + x_n^2}} $ | Each sample (row) vector $x$ is rescaled individually to unit norm. Used when only the angle between sample vectors matters. Not really a scaler, but a normalizer. | No | No | Norm 1 |

### Notes:
- "Varies" in the "Range After Scaling" column means that the range isn't fixed and depends on the data.
- "Norm 1" means that the Euclidean norm of each data vector will be 1.
- $ \mu $ is the mean of the feature.
- $ \sigma $ is the standard deviation of the feature.
- $ \text{min} $ and $ \text{max} $ are the minimum and maximum values of the feature, respectively.
- $ \text{IQR} $ is the interquartile range, $ \text{Q3} - \text{Q1} $, where $ \text{Q3} $ and $ \text{Q1} $ are the third and first quartiles, respectively.

In [None]:
# # Fit the scaler on the training split only, then apply that same fitted
# # transform to the validation and test splits.
# # NOTE(review): X_test_scaled assigned here is consumed by the live
# # prediction cell further down.
# X_train_scaled = scaler.fit_transform(X_train)
# X_val_scaled = scaler.transform(X_val)
# X_test_scaled = scaler.transform(X_test)

In [None]:
# # # Add dropout and regularization
# # NOTE(review): Dropout and regularizers are imported here, but the model
# # definition visible in this notebook uses neither of them.
# from tensorflow.keras.layers import Dropout
# from tensorflow.keras import regularizers

In [None]:
# # Early-stopping callback configured with monitor='val_loss', patience=15
# # and restore_best_weights=True.
# # NOTE(review): the active fit call in the training cell does not pass this
# # callback; only the doubly-commented alternative call does.
# from tensorflow.keras.callbacks import EarlyStopping
# early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

In [None]:
# # 2. Define the model
# # Stack of fully connected layers: 6 inputs -> Dense 128 -> 128 -> 128 -> 256
# # (all ReLU) -> Dense 1 with no activation specified, i.e. a single output.
# # NOTE(review): `model` built here is the object the live prediction cell
# # calls .predict on.
# model = Sequential()
# model.add(InputLayer(input_shape=(6,)))  # Explicit input layer with 6 features
# model.add(Dense(128, activation='relu'))
# model.add(Dense(128, activation='relu'))
# model.add(Dense(128, activation='relu'))
# model.add(Dense(256, activation='relu'))
# model.add(Dense(1))  # Output layer

In [None]:
# # Compile the model
# # Adam optimizer with learning_rate=0.0001 and mean-squared-error loss.
# # 'mse' is listed again under metrics; the evaluation cell unpacks a loss
# # value and an MSE value accordingly.
# opt = Adam(learning_rate=0.0001)

# model.compile(optimizer=opt, loss='mse', metrics=['mse'])

In [None]:
# # 3. Train the model
# # Two variants are kept: the first (doubly commented) passes the
# # early_stopping callback, the second trains for the full 200 epochs.
# # NOTE(review): `history` returned by the call is what the live loss-plot
# # cell reads via history.history['loss'] / ['val_loss'].
# # history = model.fit(X_train_scaled, y_train, epochs=200, validation_data=(X_val_scaled, y_val), verbose=1, callbacks=[early_stopping])
# history = model.fit(X_train_scaled, y_train, epochs=200, validation_data=(X_val_scaled, y_val), verbose=1)

In [None]:
# # 4. Evaluate the model
# # NOTE(review): test_mse assigned here is printed by the live metrics cell
# # (directly, and as its square root for the RMSE line), so that cell cannot
# # run while this one is disabled.
# test_loss, test_mse = model.evaluate(X_test_scaled, y_test, verbose=1)

In [None]:
# Make predictions on the scaled test features.
# NOTE(review): as far as this notebook shows, `model` and `X_test_scaled` are
# only assigned in cells that are currently commented out, so this line fails
# with NameError on a fresh kernel until those cells are re-enabled.
y_pred = model.predict(X_test_scaled)

In [None]:
# Score the test-set predictions with R^2 and MAE, then report them next to
# the MSE from the evaluation step and its square root (RMSE).
r2, mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)

print(
    f"Test MSE: {test_mse}",
    f"Test RMSE: {np.sqrt(test_mse)}",
    f"Test R^2: {r2}",
    f"Test MAE: {mae}",
    sep="\n",
)

In [None]:
# 5. Training vs. validation loss, one point per epoch
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
# Legend entries follow the order the two curves were drawn in.
ax.legend(['Train', 'Validation'], loc='upper right')
plt.show()

## Plot metrics

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Flatten the predictions to 1-D so they can sit next to y_test as a column
# (a no-op if they are already flat, so re-running this cell is safe).
y_pred = y_pred.reshape(-1)

# Create a dataframe to compare true and predicted values
comparison_df = pd.DataFrame({'True_Values': y_test, 'Predicted_Values': y_pred})

# Model error on each test sample (true - predicted); used by both the
# residuals plot and the residuals histogram below.
residuals = comparison_df['True_Values'] - comparison_df['Predicted_Values']

# Display the comparison dataframe
print(comparison_df.head())

# Plotting true vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='True_Values', y='Predicted_Values', data=comparison_df, ax=ax)
ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title('True vs Predicted Values')
plt.show()

# Residuals plot
# sns.residplot(x, y) fits its own regression of y on x and plots the residuals
# of THAT fit, so feeding it (true, predicted) does not show the model's errors.
# Plot the actual residuals instead, keeping the LOWESS trend line (needs
# statsmodels, as the previous residplot(lowess=True) call did) and a zero
# reference line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(x=comparison_df['True_Values'], y=residuals, lowess=True, ax=ax)
ax.axhline(0, ls=':', c='.2')
ax.set_xlabel('True Values')
ax.set_ylabel('Residuals')
ax.set_title('Residuals Plot')
plt.show()

# Histogram of residuals
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Residuals')
plt.show()