## Importing Library 

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy import stats
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import optuna
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

##  Load Dataset 

In [None]:

Dataset = pd.read_csv("Dataset for PMV Prediction.csv")

In [31]:
print (Dataset)

       Season            Climate  Building type   Clo  Met  \
0      Summer  Humid subtropical         Office  0.57  1.0   
1      Summer  Humid subtropical         Office  0.57  1.1   
2      Summer  Humid subtropical         Office  0.57  1.1   
3      Summer  Humid subtropical         Office  0.57  1.0   
4      Summer  Humid subtropical         Office  0.57  1.0   
...       ...                ...            ...   ...  ...   
47331  Winter  Humid subtropical  Senior center  0.94  1.0   
47332  Winter  Humid subtropical  Senior center  0.66  1.0   
47333  Winter  Humid subtropical  Senior center  0.69  1.0   
47334  Winter  Humid subtropical  Senior center  0.82  1.0   
47335  Winter  Humid subtropical  Senior center  0.86  1.2   

       Air temperature (C)  Relative humidity (%)  Air velocity (m/s)  \
0                     24.3                   36.8                0.27   
1                     25.7                   33.1                0.09   
2                     24.6          

## Input and Output Features

In [None]:

X = Dataset[['Air temperature (C)','Outdoor monthly air temperature (C)', 'Relative humidity (%)', 'Air velocity (m/s)', 'Clo', 'Met','Season', 'Building type','Climate']]   # Input Features
y = Dataset[['PMV']]                     # Output Features

### Identify feature types

In [None]:

categorical_features = ["Season", "Building type","Climate"]
numeric_features = [
    "Clo", "Met", "Air temperature (C)", "Relative humidity (%)",
    "Air velocity (m/s)","Outdoor monthly air temperature (C)"
]

### Preprocessor: Scale numeric + One-hot encode categorical

In [None]:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

#### 1.6 Split the Dataset: Train, Validation, and Test

In [None]:
from sklearn.model_selection import train_test_split
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=0)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(X_temp_raw, y_temp, test_size=0.5, random_state=0)

In [36]:
# Fit preprocessor only on training set
preprocessor.fit(X_train_raw)

In [37]:
# Transform all splits
X_train = preprocessor.transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X_test = preprocessor.transform(X_test_raw)

###  Optuna For Deep Neural Network

In [None]:
# --- Features & target 
X = Dataset[['Air temperature (C)', 'Outdoor monthly air temperature (C)',
             'Relative humidity (%)', 'Air velocity (m/s)', 'Clo', 'Met',
             'Season', 'Building type', 'Climate']]
y = Dataset['PMV']  # 1D is nicer for Keras/sklearn

categorical_features = ["Season", "Building type", "Climate"]
numeric_features = ["Clo", "Met", "Air temperature (C)", "Relative humidity (%)",
                    "Air velocity (m/s)", "Outdoor monthly air temperature (C)"]

# --- Optuna Objective (with in-trial preprocessing + proper Keras input) ---
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

def objective(trial):
    # Trial-specific split
    train_ratio = trial.suggest_float("train_ratio", 0.6, 0.9)
    X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
        X, y, test_size=1 - train_ratio, random_state=0
    )
    X_val_raw, X_test_raw, y_val, y_test = train_test_split(
        X_temp_raw, y_temp, test_size=0.5, random_state=0
    )

    # Trial-specific preprocessor (fit only on train split!)
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # for older scikit-learn versions
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

    preprocessor = ColumnTransformer([
        ("num", StandardScaler(), numeric_features),
        ("cat", ohe, categorical_features),
    ], remainder="drop")

    preprocessor.fit(X_train_raw)

    X_train = preprocessor.transform(X_train_raw)
    X_val   = preprocessor.transform(X_val_raw)
    X_test  = preprocessor.transform(X_test_raw)

    # DNN hyperparameters
    n_layers      = trial.suggest_int("n_layers", 1, 3)
    units         = trial.suggest_int("units", 16, 128)
    activation    = trial.suggest_categorical("activation", ["relu", "tanh"])
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    batch_size    = trial.suggest_categorical("batch_size", [32, 64, 128])
    epochs        = 100

    # Build the DNN (use keras.Input to avoid the warning)
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))
    model.add(Dense(units, activation=activation))
    for _ in range(n_layers - 1):
        model.add(Dense(units, activation=activation))
    model.add(Dense(1))  # regression output

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss="mse")

    # Ensure y is 1D numpy
    y_train_np = np.asarray(y_train).ravel()
    y_val_np   = np.asarray(y_val).ravel()
    y_test_np  = np.asarray(y_test).ravel()

    # Train with early stopping
    model.fit(
        X_train, y_train_np,
        validation_data=(X_val, y_val_np),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)]
    )

    # Final evaluation on test set
    y_pred = model.predict(X_test, verbose=0).ravel()
    mse = mean_squared_error(y_test_np, y_pred)
    return mse

# --- Run Optuna Study (unchanged) ---
import optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
print("Best value:", study.best_value)
print("Best params:", study.best_trial.params)


[I 2025-09-12 14:43:59,493] A new study created in memory with name: no-name-af50b35b-85c5-4c05-8810-c5aa5d4026d9
[I 2025-09-12 14:46:47,914] Trial 0 finished with value: 0.021041100451548046 and parameters: {'train_ratio': 0.663613415045792, 'n_layers': 2, 'units': 46, 'activation': 'relu', 'learning_rate': 0.0003699423079213829, 'batch_size': 32}. Best is trial 0 with value: 0.021041100451548046.
[I 2025-09-12 14:47:42,558] Trial 1 finished with value: 0.02436797915656098 and parameters: {'train_ratio': 0.7546019035965632, 'n_layers': 2, 'units': 56, 'activation': 'relu', 'learning_rate': 0.0007116758592917981, 'batch_size': 128}. Best is trial 0 with value: 0.021041100451548046.
[I 2025-09-12 14:48:59,229] Trial 2 finished with value: 0.024253809735489656 and parameters: {'train_ratio': 0.714611743271883, 'n_layers': 3, 'units': 122, 'activation': 'tanh', 'learning_rate': 0.0012645487779427464, 'batch_size': 32}. Best is trial 0 with value: 0.021041100451548046.
[I 2025-09-12 14:49:

Best value: 0.0198513317711582
Best params: {'train_ratio': 0.6814496492656757, 'n_layers': 2, 'units': 123, 'activation': 'relu', 'learning_rate': 0.0004726224563288523, 'batch_size': 128}


### Best Results For Deep Neural Networks

In [42]:
# ------------------ Best Results ------------------
print("Best Parameters Found:")
print(f"Train Ratio:       {study.best_params['train_ratio']:.2f}")
print(f"Hidden Layers:     {study.best_params['n_layers']}")
print(f"Units per Layer:   {study.best_params['units']}")
print(f"Activation:        {study.best_params['activation']}")
print(f"Learning Rate:     {study.best_params['learning_rate']:.5f}")
print(f"Best Test MSE:     {study.best_value:.5f}")

Best Parameters Found:
Train Ratio:       0.68
Hidden Layers:     2
Units per Layer:   123
Activation:        relu
Learning Rate:     0.00047
Best Test MSE:     0.01985


### Optuna For Random Forest

In [None]:

RANDOM_STATE = 0

def rf_objective(trial):
    # --- trial-specific split ---
    train_ratio = trial.suggest_float("train_ratio", 0.6, 0.9)
    X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
        X, y, test_size=1 - train_ratio, random_state=RANDOM_STATE
    )
    X_val_raw, X_test_raw, y_val, y_test = train_test_split(
        X_temp_raw, y_temp, test_size=0.5, random_state=RANDOM_STATE
    )

    # --- trial-specific preprocessor (fit ONLY on train) ---
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)  # sklearn >=1.2
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)         # sklearn <1.2

    preproc = ColumnTransformer(
        transformers=[
            ("cat", ohe, categorical_features),
            ("num", "passthrough", numeric_features),
        ],
        remainder="drop",
    )

    # --- RF hyperparameters ---
    n_estimators       = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth          = trial.suggest_int("max_depth", 5, 40)
    max_features       = trial.suggest_categorical("max_features", ["sqrt", "log2", None])
    min_samples_split  = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf   = trial.suggest_int("min_samples_leaf", 1, 10)

    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    # --- pipeline: preprocessing + model ---
    pipe = Pipeline([
        ("prep", preproc),
        ("model", rf),
    ])

    # Ensure 1D target
    y_train_1d = np.asarray(y_train).ravel()
    y_test_1d  = np.asarray(y_test).ravel()

    # Fit on train; evaluate on test (your existing structure)
    pipe.fit(X_train_raw, y_train_1d)
    y_pred = pipe.predict(X_test_raw)

    mse = mean_squared_error(y_test_1d, y_pred)
    return mse

# --- run Optuna study ---
study_rf = optuna.create_study(direction="minimize")
study_rf.optimize(rf_objective, n_trials=20)

print("Best RF Parameters:")
print(study_rf.best_params)
print(f"Best RF Test MSE: {study_rf.best_value:.5f}")


[I 2025-09-12 15:27:23,255] A new study created in memory with name: no-name-c63ada51-7fa5-4f6e-8782-a69996a9398e
[I 2025-09-12 15:27:27,354] Trial 0 finished with value: 0.048977533088183436 and parameters: {'train_ratio': 0.7833101258103793, 'n_estimators': 500, 'max_depth': 25, 'max_features': 'log2', 'min_samples_split': 15, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.048977533088183436.
[I 2025-09-12 15:27:32,024] Trial 1 finished with value: 0.07330641401522386 and parameters: {'train_ratio': 0.8914014250256461, 'n_estimators': 600, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_split': 12, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.048977533088183436.
[I 2025-09-12 15:27:36,201] Trial 2 finished with value: 0.09625103435979815 and parameters: {'train_ratio': 0.8566225698510836, 'n_estimators': 200, 'max_depth': 7, 'max_features': None, 'min_samples_split': 20, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.048977533088183436.
[I 2025-09-12 15:2

Best RF Parameters:
{'train_ratio': 0.675420920217388, 'n_estimators': 700, 'max_depth': 35, 'max_features': None, 'min_samples_split': 12, 'min_samples_leaf': 5}
Best RF Test MSE: 0.03057


### Optuna for Gradient Boosting Regression

In [None]:

RANDOM_STATE = 0  # keep consistent with your other objectives

def gb_objective(trial):
    # --- trial-specific split ---
    train_ratio = trial.suggest_float("train_ratio", 0.6, 0.9)
    X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
        X, y, test_size=1 - train_ratio, random_state=RANDOM_STATE
    )
    X_val_raw, X_test_raw, y_val, y_test = train_test_split(
        X_temp_raw, y_temp, test_size=0.5, random_state=RANDOM_STATE
    )

    # --- trial-specific preprocessor (fit ONLY on train) ---
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)  # sklearn >= 1.2
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)         # sklearn < 1.2

    preproc = ColumnTransformer(
        transformers=[
            ("cat", ohe, categorical_features),
            ("num", "passthrough", numeric_features),
        ],
        remainder="drop",
    )

    # --- GB hyperparameters ---
    n_estimators      = trial.suggest_int("n_estimators", 100, 1200, step=100)
    learning_rate     = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    max_depth         = trial.suggest_int("max_depth", 2, 6)
    subsample         = trial.suggest_float("subsample", 0.6, 1.0)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf  = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features      = trial.suggest_categorical("max_features", [None, "sqrt", "log2"])

    gb = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=RANDOM_STATE,
    )

    # --- pipeline: preprocessing + model ---
    pipe = Pipeline([
        ("prep", preproc),
        ("model", gb),
    ])

    # Ensure 1D target
    y_train_1d = np.asarray(y_train).ravel()
    y_test_1d  = np.asarray(y_test).ravel()

    # Fit on train; evaluate on test
    pipe.fit(X_train_raw, y_train_1d)
    y_pred = pipe.predict(X_test_raw)
    mse = mean_squared_error(y_test_1d, y_pred)
    return mse

# --- run Optuna study ---
study_gb = optuna.create_study(direction="minimize")
study_gb.optimize(gb_objective, n_trials=20)

print("Best GBR Parameters:")
print(study_gb.best_params)
print(f"Best GBR Test MSE: {study_gb.best_value:.5f}")


[I 2025-09-12 15:35:00,007] A new study created in memory with name: no-name-9589d43e-9f6f-4aee-85e9-58b54ed2d091
[I 2025-09-12 15:35:10,958] Trial 0 finished with value: 0.024298529696875966 and parameters: {'train_ratio': 0.6917129294724884, 'n_estimators': 800, 'learning_rate': 0.043731027718313276, 'max_depth': 6, 'subsample': 0.749371463929578, 'min_samples_split': 16, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.024298529696875966.
[I 2025-09-12 15:35:14,213] Trial 1 finished with value: 0.07005819551966296 and parameters: {'train_ratio': 0.8773200280777704, 'n_estimators': 200, 'learning_rate': 0.030664601577693292, 'max_depth': 5, 'subsample': 0.6514174778711466, 'min_samples_split': 17, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.024298529696875966.
[I 2025-09-12 15:35:20,135] Trial 2 finished with value: 0.023258224913042653 and parameters: {'train_ratio': 0.6720362895693771, 'n_estimators': 400, 'learning_rat

Best GB Parameters:
{'train_ratio': 0.6088092433763583, 'n_estimators': 700, 'learning_rate': 0.047621128887587426, 'max_depth': 6, 'subsample': 0.8182979876303812, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': None}
Best GB Test MSE: 0.01938
