In [11]:
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
import numpy as np

In [12]:
# Initialize H2O
h2o.init(max_mem_size="8G",nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 24.0.2+12-54, mixed mode, sharing)
  Starting server from C:\Users\Liuji\AppData\Roaming\Python\Python313\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Liuji\AppData\Local\Temp\tmp8sxgqkfx
  JVM stdout: C:\Users\Liuji\AppData\Local\Temp\tmp8sxgqkfx\h2o_Liuji_started_from_python.out
  JVM stderr: C:\Users\Liuji\AppData\Local\Temp\tmp8sxgqkfx\h2o_Liuji_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Vancouver
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,4 months and 21 days
H2O_cluster_name:,H2O_from_python_Liuji_ye3s05
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.983 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [13]:
# Load preprocessed data
train_h2o = h2o.import_file("train_processed.csv")
test_h2o = h2o.import_file("test_processed.csv")

# Define features and target
y = "SalePrice"  # Log-transformed target
x = train_h2o.columns
x.remove(y)
# x.remove("Id")  # Remove ID if present

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [14]:
# Configure AutoML
# no need to split train and vaild data due to nfolds CV
aml = H2OAutoML(
    project_name = "house_price_prediction",
    #max_models = 20,  # Limit the number of models
    max_runtime_secs = 21600,  # 6 hours for better model exploration
    nfolds = 5,  # 5-fold cross-validation
    seed = 42,  # Reproducible results
    sort_metric = "RMSE",  # Optimize for RMSE
    preprocessing = ["target_encoding"],  # Enhance categorical features
    #exclude_algos = ["DeepLearning"],  # Exclude DeepLearning for faster runs
    #include_algos = ["XGBoost", "GBM"],  # Primary algorithms to include
    #stopping_rounds = 5,  # Stop early if no improvement
    verbosity = "info"  # Detailed logging
)

In [15]:
# Train AutoML
aml.train(x=x, y=y, training_frame=train_h2o)

AutoML progress: |
02:01:00.95: Project: house_price_prediction
02:01:00.96: Setting stopping tolerance adaptively based on the training frame: 0.026207120918047958
02:01:00.96: Build control seed: 42
02:01:00.97: training frame: Frame key: AutoML_1_20250818_20100_training_train_processed.hex    cols: 94    rows: 1456  chunks: 64    size: 877285  checksum: 8725142839886435116
02:01:00.98: validation frame: NULL
02:01:00.98: leaderboard frame: NULL
02:01:00.98: blending frame: NULL
02:01:00.98: response column: SalePrice
02:01:00.98: fold column: null
02:01:00.98: weights column: null
02:01:00.114: AutoML: XGBoost is not available; skipping it.
02:01:00.124: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (6g, 10w)]}

key,value
Stacking strategy,cross_validation
Number of base models (used / total),5/5
# GBM base models (used / total),1/1
# DeepLearning base models (used / total),1/1
# GLM base models (used / total),1/1
# DRF base models (used / total),2/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,-420.8141,25.945621,-411.8525,-407.25064,-391.40094,-457.13943,-436.4269
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,0.0788419,0.0032721,0.083214,0.0800155,0.0795458,0.0746424,0.0767917
mean_residual_deviance,0.0131983,0.0013958,0.0134146,0.0135611,0.0151582,0.0114042,0.0124537
mse,0.0131983,0.0013958,0.0134146,0.0135611,0.0151582,0.0114042,0.0124537
null_deviance,45.763325,3.2791312,50.28016,43.15814,45.993694,47.28797,42.09666
r2,0.9150522,0.0112777,0.922191,0.9086307,0.9009841,0.9297748,0.9136807
residual_deviance,3.8476558,0.4640048,3.8768055,3.9055848,4.5474606,3.2843952,3.6240327
rmse,0.1147557,0.0060701,0.1158212,0.116452,0.1231186,0.1067902,0.1115962
rmsle,0.0089581,0.0005305,0.0089435,0.0092016,0.0096929,0.0083007,0.0086516


In [16]:
# View leaderboard
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))

model_id                                                    rmse        mse        mae       rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_6_AutoML_1_20250818_20100  0.115416  0.0133209  0.0789315  0.00901196                 0.0133209
StackedEnsemble_AllModels_6_AutoML_1_20250818_20100     0.116033  0.0134637  0.0785353  0.00906463                 0.0134637
StackedEnsemble_AllModels_5_AutoML_1_20250818_20100     0.118132  0.0139551  0.0805898  0.0092156                  0.0139551
GBM_grid_1_AutoML_1_20250818_20100_model_313            0.118826  0.0141197  0.0816144  0.00926819                 0.0141197
GBM_grid_1_AutoML_1_20250818_20100_model_116            0.11925   0.0142206  0.081212   0.00932233                 0.0142206
GBM_grid_1_AutoML_1_20250818_20100_model_212            0.119264  0.0142239  0.0814517  0.00931461                 0.0142239
GBM_grid_1_AutoML_1_20250818_20100_model_132            0.119853  0.0143647  0.0814329  0.00936746                 0.0143647


In [17]:
# Inspect leader model performance
best_model = aml.leader
print(best_model.model_performance())
print(best_model.get_params())

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 0.008414301354516441
RMSE: 0.0917295010044012
MAE: 0.06367197478069492
RMSLE: 0.00716129618287674
Mean Residual Deviance: 0.008414301354516441
R^2: 0.9463269044888276
Null degrees of freedom: 1455
Residual degrees of freedom: 1450
Null deviance: 228.25631082945284
Residual deviance: 12.25122277217594
AIC: -2810.5605201650014
{'model_id': None, 'training_frame': None, 'response_column': None, 'validation_frame': None, 'blending_frame': None, 'base_models': [], 'metalearner_algorithm': 'auto', 'metalearner_nfolds': 0, 'metalearner_fold_assignment': None, 'metalearner_fold_column': None, 'metalearner_params': None, 'metalearner_transform': 'none', 'max_runtime_secs': 0.0, 'weights_column': None, 'offset_column': None, 'custom_metric_func': None, 'seed': -1, 'score_training_samples': 10000, 'keep_levelone_frame': False, 'export_checkpoints_dir': None, 'auc_type': 'auto', 'gainslift_bins': -1}


In [18]:
# Predict on test set
preds_h2o = best_model.predict(test_h2o)
final_preds = np.expm1(preds_h2o.as_data_frame()["predict"].values)
test = pd.read_csv("test.csv")

# Create submission file
submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": final_preds
})
submission.to_csv("submission_h2o.csv", index=False)



stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%





In [19]:
# Cleanup
h2o.cluster().shutdown()

H2O session _sid_8c80 closed.
