In [1]:
import json
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import optuna
import xgboost as xgb

# Load the JSON data
with open('descriptors_WA.json', 'r') as file:
    data = json.load(file)

# Convert JSON data to a pandas DataFrame
# Each row will have 'formula', 'descriptors', and 'e_f' columns
formulas = []
descriptors = []
e_f_values = []


for formula, values in data.items():
    if values[1] < 10 and values[1] > -10:
        descriptors.append(values[0])  # Descriptor values
        e_f_values.append(values[1])    # e_f value
        formulas.append(formula)
    else:
        pass

# Create DataFrame
df = pd.DataFrame(descriptors)
df['e_f'] = e_f_values
df['formula'] = formulas

# Split the data into features (X) and target (y)
X = df.drop(columns=['e_f', 'formula'])  # All descriptor columns
y = df['e_f']                            # Target column

# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [2]:
# Initialize and train the Random Forest Regressor
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def objective(trial):
    # Suggest values for the hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    # Initialize and train the Random Forest Regressor with the suggested hyperparameters
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(X_train_scaled, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_scaled)

    r2 = r2_score(y_test, y_pred)
    
    # We want to maximize R-squared, so we return it directly
    return r2

# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

# Get the best hyperparameters
best_params = study.best_params
print("Best hyperparameters:", best_params)

[32m[I 2024-11-07 01:01:24,110][0m A new study created in memory with name: no-name-86d3a541-0d50-4c2f-b805-73a1ecde4310[0m
[32m[I 2024-11-07 01:01:28,772][0m Trial 0 finished with value: 0.318695025905976 and parameters: {'n_estimators': 299, 'max_depth': 41, 'min_samples_split': 17, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.318695025905976.[0m
[32m[I 2024-11-07 01:01:36,812][0m Trial 1 finished with value: 0.3347677396086213 and parameters: {'n_estimators': 486, 'max_depth': 50, 'min_samples_split': 13, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.3347677396086213.[0m
[32m[I 2024-11-07 01:01:42,665][0m Trial 2 finished with value: 0.32028407100448186 and parameters: {'n_estimators': 335, 'max_depth': 25, 'min_samples_split': 16, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.3347677396086213.[0m
[32m[I 2024-11-07 01:01:43,802][0m Trial 3 finished with value: 0.34948392499795866 and parameters: {'n_estimators': 64, 'max_depth': 40, 'min_sample

Best hyperparameters: {'n_estimators': 84, 'max_depth': 50, 'min_samples_split': 3, 'min_samples_leaf': 3}


In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import keras.backend as K

# Custom R-squared metric
def r_squared(y_true, y_pred):
    ss_res = K.sum(K.square(y_true - y_pred))  # Sum of squares of residuals
    ss_tot = K.sum(K.square(y_true - K.mean(y_true)))  # Total sum of squares
    return 1 - ss_res / (ss_tot + K.epsilon())  # R-squared formula

# Define the neural network model
def create_model():
    model = Sequential()

    # Input Layer + Hidden Layer 1
    model.add(Dense(64, input_dim=9, activation='relu'))

    # Hidden Layer 2
    model.add(Dense(32, activation='relu'))

    # Hidden Layer 3 (optional for capturing non-linearity)
    model.add(Dense(16, activation='relu'))

    # Output Layer
    model.add(Dense(1, activation='linear'))  # Linear activation for regression output

    # Compile the model with MSE loss and R-squared as a metric
    model.compile(optimizer=Adam(learning_rate=0.001), 
                  loss='mse',  # MSE loss is typical for regression tasks
                  metrics=[r_squared])

    return model

# Create and summarize the model
model = create_model()
model.summary()

# Fit the model (assuming X_train, y_train, X_test, and y_test are defined)
history = model.fit(X_train, y_train, 
                    validation_data=(X_test, y_test),  # Optional validation data
                    epochs=100, 
                    batch_size=16, 
                    verbose=1)


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 64)                640       
                                                                 
 dense_13 (Dense)            (None, 32)                2080      
                                                                 
 dense_14 (Dense)            (None, 16)                528       
                                                                 
 dense_15 (Dense)            (None, 1)                 17        
                                                                 
Total params: 3265 (12.75 KB)
Trainable params: 3265 (12.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78