In [1]:
# Import H2O and its AutoML interface.
import h2o
from h2o.automl import H2OAutoML
# Connect to an H2O instance. Per the recorded output of this cell, none was running at
# http://localhost:54321, so a local server was started and then connected to.
h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "11.0.22" 2024-01-16 LTS; Java(TM) SE Runtime Environment 18.9 (build 11.0.22+9-LTS-219); Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.22+9-LTS-219, mixed mode)
  Starting server from /Users/yvankammelu/anaconda3/envs/causalml-py38/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/8y/9ttyks_n5f7_6c4hyq6_g50r0000gn/T/tmpbpu_c_8f
  JVM stdout: /var/folders/8y/9ttyks_n5f7_6c4hyq6_g50r0000gn/T/tmpbpu_c_8f/h2o_yvankammelu_started_from_python.out
  JVM stderr: /var/folders/8y/9ttyks_n5f7_6c4hyq6_g50r0000gn/T/tmpbpu_c_8f/h2o_yvankammelu_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Toronto
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 10 days
H2O_cluster_name:,H2O_from_python_yvankammelu_7at2b8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [3]:
# First, I need to import the necessary libraries for the task
from dataclasses import dataclass
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

# Container for the inputs of an AutoML run
@dataclass
class H2OAutoMLParams:
    """Parameters for an H2O AutoML run.

    Attributes:
        df: The Pandas DataFrame containing the dataset.
        y: The name of the target variable.
        path: The path where the model should be stored (defaults to "./").
    """

    df: pd.DataFrame
    y: str
    path: str = "./"

# Connect to the H2O server on the given IP and port. In the recorded run this attached to the
# cluster that was already running on localhost:54321 instead of starting a new one.
h2o.init(ip="localhost", port=54321)

# Load the dataset into a Pandas DataFrame.
# 'final_df.csv' is a relative path, so it is resolved against the current working directory.
final_df = pd.read_csv('final_df.csv')

# Bundle the AutoML inputs; the target variable is 'Conversion'.
params = H2OAutoMLParams(df=final_df, y='Conversion')

# Convert the Pandas DataFrame to an H2OFrame, which is what H2O trains on.
h2o_df = h2o.H2OFrame(params.df)

# Specify the target variable (y) and the features (x): every column except the target.
target = params.y
if target not in h2o_df.columns:
    # Fail early with a readable message instead of list.remove()'s generic
    # "x not in list" ValueError.
    raise ValueError(f"Target column {target!r} not found in the dataset columns")
features = [column for column in h2o_df.columns if column != target]

# NOTE(review): the recorded leaderboard reports regression metrics (rmse, mse, mae, rmsle).
# If 'Conversion' is meant to be a class label, it presumably needs to be converted first with
# h2o_df[target] = h2o_df[target].asfactor() -- TODO confirm the intended problem type.

# Create the AutoML instance.
# max_models is limited to 10 for quick results; raise it for a more thorough search.
# seed=1 is set explicitly; nfolds=5 requests 5-fold cross-validation.
aml = H2OAutoML(max_models=10, seed=1, nfolds=5)

# Train the models
aml.train(x=features, y=target, training_frame=h2o_df)

# After training, H2O provides a leaderboard of models. Print all of its rows to check performance.
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))  # Show the leaderboard

# Download the leading model as a MOJO to the configured path.
best_model = aml.leader
best_model_path = best_model.download_mojo(path=params.path)
print(f"The best model is saved to {best_model_path}")

# Shut the cluster down now that training and export are done.
# h2o.shutdown() is deprecated (the recorded run emitted a warning on that line);
# the cluster object's shutdown() is the supported replacement.
h2o.cluster().shutdown(prompt=False)


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,34 secs
H2O_cluster_timezone:,America/Toronto
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 10 days
H2O_cluster_name:,H2O_from_python_yvankammelu_7at2b8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


  exec(code_obj, self.user_global_ns, self.user_ns)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
18:40:37.245: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                      rmse         mse         mae        rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_1_AutoML_1_20240423_184037  0.063563   0.00404026  0.00432536    0.0169845                0.00404026
StackedEnsemble_AllModels_1_AutoML_1_20240423_184037     0.0636618  0.00405282  0.00366319    0.0170515                0.00405282
GBM_1_AutoML_1_20240423_184037                           0.064153   0.00411561  0.00297235    0.0180058                0.00411561
DRF_1_AutoML_1_20240423_184037                           0.0644931  0.00415936  0.00248796    0.017146                 0.00415936
GBM_2_AutoML_1_20240423_184037                           0.0645101  0.00416156  0.0029853     0.0

  h2o.shutdown(prompt=False)
