In [7]:
import h2o
from h2o.automl import H2OAutoML
# Connect to an H2O instance at the default local address, starting a new local
# server with default settings if none is already running.
h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "11.0.22" 2024-01-16 LTS; Java(TM) SE Runtime Environment 18.9 (build 11.0.22+9-LTS-219); Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.22+9-LTS-219, mixed mode)
  Starting server from /Users/yvankammelu/anaconda3/envs/causalml-py38/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/8y/9ttyks_n5f7_6c4hyq6_g50r0000gn/T/tmp9uw3fhjx
  JVM stdout: /var/folders/8y/9ttyks_n5f7_6c4hyq6_g50r0000gn/T/tmp9uw3fhjx/h2o_yvankammelu_started_from_python.out
  JVM stderr: /var/folders/8y/9ttyks_n5f7_6c4hyq6_g50r0000gn/T/tmp9uw3fhjx/h2o_yvankammelu_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Toronto
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 12 days
H2O_cluster_name:,H2O_from_python_yvankammelu_8qleq0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [6]:
# Import necessary libraries for data handling and machine learning
from dataclasses import dataclass
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

# Create a data class to store my AutoML parameters
@dataclass
class H2OAutoMLParams:
    """Bundle of inputs for a single H2O AutoML run.

    Attributes:
        df: pandas DataFrame holding the training data.
        y: name of the target column to predict.
        path: location where the trained model is saved; defaults to "./".
    """

    df: pd.DataFrame
    y: str
    path: str = "./"

# Start my H2O session to initialize the H2O machine learning environment.
# NOTE: max_mem_size is a setting for launching a new local server; when an H2O
# instance is already running at the default address, init() connects to it instead.
h2o.init(max_mem_size="4G")

# Location of the training CSV on my local machine (edit this to match your setup)
TRAIN_CSV_PATH = "/Users/yvankammelu/Documents/Winter '24 Semester/INSY695 Part 2 Entreprise Data Science and ML in Production/train_df.csv"

# Load my dataset from the CSV file into a Pandas DataFrame
train_df = pd.read_csv(TRAIN_CSV_PATH)

# Now I'll set up my AutoML parameters (the model save path keeps its default, "./")
# I already know that my target variable for prediction is 'at_least_one_conversion'
params = H2OAutoMLParams(df=train_df, y='at_least_one_conversion')

# Convert my pandas DataFrame into an H2OFrame which is necessary for H2O to process it
h2o_df = h2o.H2OFrame(params.df)

# H2O AutoML picks regression vs. classification from the response column type:
# a numeric 0/1 target would be modeled as regression. When the target has at most
# two distinct values, mark it as categorical so AutoML trains classifiers.
if params.df[params.y].nunique(dropna=True) <= 2:
    h2o_df[params.y] = h2o_df[params.y].asfactor()

# Identify my features: every column except the target
x = [col for col in h2o_df.columns if col != params.y]

# Now I initialize AutoML, setting a limit on the number of models to control runtime
# I choose a seed for reproducibility
aml = H2OAutoML(max_models=20, seed=1, exclude_algos=["DeepLearning"], verbosity="info")

# Train the model on my dataset
aml.train(x=x, y=params.y, training_frame=h2o_df)

# After training, I can view the leaderboard of models trained by AutoML
# This shows me the best performing models on my data
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))  # Print the full leaderboard

# I want to save the best performing model to use it later for predictions
best_model = aml.leader
best_model_path = best_model.download_mojo(path=params.path)
print(f"My best model is saved to: {best_model_path}")

# It's important to shut the H2O cluster down after I'm done.
# h2o.shutdown() is deprecated; the cluster object exposes the supported call.
# Any cell run after this one must call h2o.init() again before using H2O.
h2o.cluster().shutdown(prompt=False)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 day 6 hours 16 mins
H2O_cluster_timezone:,America/Toronto
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 12 days
H2O_cluster_name:,H2O_from_python_yvankammelu_1ku70n
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.947 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


  exec(code_obj, self.user_global_ns, self.user_ns)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
17:39:01.10: Project: AutoML_1_20240425_173900
17:39:01.11: 5-fold cross-validation will be used.
17:39:01.11: Setting stopping tolerance adaptively based on the training frame: 0.0013274583189194754
17:39:01.11: Build control seed: 1
17:39:01.11: training frame: Frame key: AutoML_1_20240425_173900_training_Key_Frame__upload_ba5a5d408c1a36cd20595b82dad84866.hex    cols: 29    rows: 567490  chunks: 25    size: 28389269  checksum: -4111174395153498966
17:39:01.11: validation frame: NULL
17:39:01.11: leaderboard frame: NULL
17:39:01.11: blending frame: NULL
17:39:01.11: response column: at_least_one_conversion
17:39:01.11: fold column: null
17:39:01.11: weights column: null
17:39:01.17: AutoML: XGBoost is not available; skipping it.
17:39:01.20: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (7g, 30w)]}, {GL

  h2o.shutdown(prompt=False)


In [None]:
# Re-run AutoML training with the same feature list and target.
# NOTE(review): `train` is not defined in any cell visible here; the earlier training
# call used `h2o_df` as the training frame — confirm which frame is intended.
# NOTE(review): the previous cell shuts the H2O cluster down, so this cell presumably
# needs h2o.init() and a freshly uploaded frame before it can run — verify or remove.
aml.train(x=x, y=params.y, training_frame=train)
