Import Modules

In [48]:
import optuna
import os
import pandas as pd
import plotly
import sklearn
from sklearn import preprocessing, svm, ensemble
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor, kernels
from sklearn.cross_decomposition import PLSRegression
from tqdm import tqdm

Define objectives of hyperparamter optimization

In [49]:
def objective(trial):
    # Load data file
    prefix = "TSOA_PIP"
    training_data=f"{prefix}_train.csv"
    Dataset=pd.read_csv(training_data)
    Descs = Dataset.columns[1:-1]
    y_data = Dataset['Ea']
    x_data = Dataset[Descs]
    # Scale and transform values
    scaler = preprocessing.StandardScaler().fit(x_data)
    X = scaler.transform(x_data)
    # Define methods to be tested
    classifier_name = trial.suggest_categorical("classifier",
                                                ["MLR", "GPR", "ANN", "SVM", "PLS", "RF", "ExtraTrees", "Bagging"])
    # Define parameters to optimize and initialise regressor instances
    if classifier_name == "MLR":
        classifier_obj = LinearRegression(n_jobs=-1)
    elif classifier_name == "GPR":
        gpr_kernel = kernels.Matern()
        classifier_obj = GaussianProcessRegressor(kernel=gpr_kernel)
    elif classifier_name == "ANN":
        ann_layer = trial.suggest_int("ann_layer", 10, 5000)
        ann_layer2 = trial.suggest_int("ann_layer2", 8, 2000)
        classifier_obj = MLPRegressor(hidden_layer_sizes=(ann_layer,ann_layer2), max_iter=800)
    elif classifier_name == "SVM":
        svm_c = trial.suggest_float("svm_c", 1e-10, 1e10, log=True)
        svm_gamma = trial.suggest_float("svm_gamma", 0.01, 0.1)
        svm_epsilon = trial.suggest_float("svm_epsilon", 0.0001, 1000, log=True)
        classifier_obj = svm.SVR(C=svm_c, gamma=svm_gamma, epsilon=svm_epsilon)
    elif classifier_name == "PLS":
        pls_components = trial.suggest_int("pls_components", 1, 10)
        classifier_obj = PLSRegression(n_components=pls_components)
    elif classifier_name == "RF":
        rf_trees = trial.suggest_int("rf_trees", 10, 1000)
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        classifier_obj = ensemble.RandomForestRegressor(max_depth=rf_max_depth, n_estimators=rf_trees, n_jobs=-1)
    elif classifier_name == "ExtraTrees":
        et_trees = trial.suggest_int("et_trees", 10, 1000)
        et_max_depth = trial.suggest_int("et_max_depth", 2, 32, log=True)
        classifier_obj = ensemble.ExtraTreesRegressor(n_estimators=et_trees, max_depth=et_max_depth, n_jobs=-1)
    elif classifier_name == "Bagging":
        bag_trees = trial.suggest_int("bag_trees", 10, 1000)
        bag_max_depth = trial.suggest_int("bag_max_depth", 2, 32, log=True)
        classifier_obj = ensemble.ExtraTreesRegressor(n_estimators=bag_trees, max_depth=bag_max_depth, n_jobs=-1)
    else:
        pass
    score = cross_val_score(classifier_obj,X,y_data,n_jobs=-1,cv=KFold(n_splits=10,shuffle=True, random_state=1))
    accuracy = score.mean()
    
    return accuracy

In [50]:
def main():

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10)
    
    print(f"The best trial is: \n {study.best_trial}")
    print(f"The best value is : \n{study.best_value}")
    print(f"The best parameters are : \n{study.best_params}")

In [51]:
main()

[32m[I 2022-04-11 15:16:33,920][0m A new study created in memory with name: no-name-34a792a1-5f54-4b24-9d57-66d52f3f9aaa[0m
[32m[I 2022-04-11 15:16:35,161][0m Trial 0 finished with value: 0.16132093497195044 and parameters: {'classifier': 'MLR'}. Best is trial 0 with value: 0.16132093497195044.[0m
[32m[I 2022-04-11 15:16:35,967][0m Trial 1 finished with value: 0.17584472603337437 and parameters: {'classifier': 'PLS', 'pls_components': 4}. Best is trial 1 with value: 0.17584472603337437.[0m
[32m[I 2022-04-11 15:16:38,130][0m Trial 2 finished with value: 0.20189411773252197 and parameters: {'classifier': 'Bagging', 'bag_trees': 986, 'bag_max_depth': 3}. Best is trial 2 with value: 0.20189411773252197.[0m
[32m[I 2022-04-11 15:16:44,479][0m Trial 3 finished with value: 0.2651174347003118 and parameters: {'classifier': 'RF', 'rf_trees': 854, 'rf_max_depth': 3}. Best is trial 3 with value: 0.2651174347003118.[0m
[32m[I 2022-04-11 15:16:44,833][0m Trial 4 finished with value:

The best trial is: 
 FrozenTrial(number=7, values=[0.4075420007335703], datetime_start=datetime.datetime(2022, 4, 11, 15, 16, 46, 249755), datetime_complete=datetime.datetime(2022, 4, 11, 15, 16, 50, 933802), params={'classifier': 'RF', 'rf_trees': 238, 'rf_max_depth': 9}, distributions={'classifier': CategoricalDistribution(choices=('MLR', 'GPR', 'ANN', 'SVM', 'PLS', 'RF', 'ExtraTrees', 'Bagging')), 'rf_trees': IntUniformDistribution(high=1000, low=10, step=1), 'rf_max_depth': IntLogUniformDistribution(high=32, low=2, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=7, state=TrialState.COMPLETE, value=None)
The best value is : 
0.4075420007335703
The best parameters are : 
{'classifier': 'RF', 'rf_trees': 238, 'rf_max_depth': 9}


Plot results

In [52]:
plot_optimization_history(study).show()
optuna.visualization.plot_parallel_coordinate(study, params=['rf_trees', 'rf_max_depth'])

NameError: name 'plot_optimization_history' is not defined