In [1]:
# импортируем необходимые библиотеки, классы и функции
import h2o
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split, 
                                     GridSearchCV)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import CountEncoder
from h2o.sklearn import (H2OAutoMLClassifier, 
                         H2OAutoMLRegressor)
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score,
                             mean_squared_error)

  import pandas.util.testing as tm


In [2]:
# отключаем предупреждения
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Start (or attach to) a local H2O cluster. nthreads=-1 lets H2O use all
# available cores; the integer max_mem_size is reported by the cluster
# as 8 Gb of memory.
h2o.init(
    max_mem_size=8,
    nthreads=-1,
)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.15" 2022-04-19; OpenJDK Runtime Environment (build 11.0.15+10-Ubuntu-0ubuntu0.18.04.1); OpenJDK 64-Bit Server VM (build 11.0.15+10-Ubuntu-0ubuntu0.18.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpbci2cj48
  JVM stdout: /tmp/tmpbci2cj48/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpbci2cj48/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.2
H2O_cluster_version_age:,1 month and 3 days
H2O_cluster_name:,H2O_from_python_unknownUser_v67wdh
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


## Пример использования `H2OAutoMLClassifier`

In [4]:
# Read the classification dataset (semicolon-delimited CSV)
# and preview its first rows
data = pd.read_csv('Data/StateFarm_missing.csv', delimiter=';')
data.head()

Unnamed: 0,Customer Lifetime Value,Coverage,Education,EmploymentStatus,Gender,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Response
0,2763.519279,Basic,Bachelor,Employed,F,56274.0,,32.0,5.0,,1.0,No
1,,,Bachelor,Unemployed,F,0.0,,13.0,42.0,,,No
2,,,,Employed,F,48767.0,108.0,,38.0,0.0,,No
3,7645.861827,Basic,Bachelor,,,0.0,106.0,18.0,,,7.0,No
4,2813.692575,Basic,Bachelor,,M,43836.0,73.0,12.0,,,1.0,No


In [5]:
# Stratified 70/30 train/test split on 'Response'. Every piece returned
# by train_test_split (train features, test features, train labels,
# test labels) is converted to a NumPy array on the fly.
X_train, X_test, y_train, y_test = (
    part.values
    for part in train_test_split(
        data.drop(columns='Response'),
        data['Response'],
        stratify=data['Response'],
        test_size=0.3,
        random_state=42)
)

# Two-step pipeline: constant-value imputation followed by H2O AutoML
# (max_models=10, fixed seed, preprocessing=None).
# NOTE(review): the features are a mixed-type array here, so the
# 'constant' strategy presumably fills numeric gaps with the same
# placeholder as string gaps -- confirm this is intended.
pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='constant')),
    ('classifier', H2OAutoMLClassifier(
        preprocessing=None, max_models=10, seed=2022)),
])

# Run AutoML (the trailing ';' hides the repr of the returned object)
pipe.fit(X_train, y_train);

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


In [6]:
# Predict labels for the held-out test set
predictions = pipe.predict(X_test)
# Accuracy on the test set
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print("Правильность на тестовой выборке: %.3f" % (acc,))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Правильность на тестовой выборке: 0.941


In [7]:
# Pull the underlying H2O AutoML object out of the fitted pipeline
# and display its leaderboard
automl = pipe.named_steps['classifier'].estimator
automl.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_4_AutoML_1_20220629_182612,0.913029,0.17163,0.629657,0.0897083,0.216536,0.046888
StackedEnsemble_BestOfFamily_1_AutoML_1_20220629_182612,0.912744,0.169933,0.628493,0.0919027,0.217101,0.0471331
StackedEnsemble_AllModels_1_AutoML_1_20220629_182612,0.912073,0.169591,0.629174,0.0909478,0.216839,0.047019
GBM_3_AutoML_1_20220629_182612,0.910665,0.174784,0.630274,0.0927643,0.219404,0.0481382
GBM_2_AutoML_1_20220629_182612,0.909894,0.180035,0.618754,0.115197,0.222876,0.0496736
XGBoost_2_AutoML_1_20220629_182612,0.908942,0.192712,0.592121,0.164934,0.233423,0.0544862
GBM_1_AutoML_1_20220629_182612,0.906138,0.192051,0.592964,0.134694,0.233383,0.0544677
XGBoost_3_AutoML_1_20220629_182612,0.904502,0.193615,0.579166,0.117902,0.232535,0.0540726
XRT_1_AutoML_1_20220629_182612,0.901894,0.218027,0.624927,0.115005,0.226119,0.0511296
DRF_1_AutoML_1_20220629_182612,0.898166,0.30724,0.616114,0.10327,0.226227,0.0511785




In [8]:
# Same pipeline as before, but 'DRF' is passed in exclude_algos
# so AutoML skips that algorithm
pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='constant')),
    ('classifier', H2OAutoMLClassifier(
        preprocessing=None,
        exclude_algos=['DRF'],
        max_models=10,
        seed=2022)),
])

# Run AutoML (the trailing ';' hides the repr of the returned object)
pipe.fit(X_train, y_train);

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


In [9]:
# Predict labels for the held-out test set
predictions = pipe.predict(X_test)
# Accuracy on the test set
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print("Правильность на тестовой выборке: %.3f" % (acc,))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Правильность на тестовой выборке: 0.942


In [10]:
# Pull the underlying H2O AutoML object out of the fitted pipeline
# and display its leaderboard
automl = pipe.named_steps['classifier'].estimator
automl.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_3_AutoML_2_20220629_182857,0.91451,0.174031,0.628115,0.0897157,0.219013,0.0479669
StackedEnsemble_BestOfFamily_1_AutoML_2_20220629_182857,0.913688,0.173071,0.627176,0.0899072,0.219735,0.0482833
GBM_4_AutoML_2_20220629_182857,0.9135,0.171677,0.635448,0.0928575,0.216805,0.0470044
StackedEnsemble_AllModels_1_AutoML_2_20220629_182857,0.913416,0.169593,0.632087,0.0929533,0.216986,0.0470829
GBM_2_AutoML_2_20220629_182857,0.909623,0.177571,0.628567,0.0968757,0.221213,0.0489353
XGBoost_2_AutoML_2_20220629_182857,0.908942,0.192712,0.592121,0.164934,0.233423,0.0544862
GBM_5_AutoML_2_20220629_182857,0.907962,0.18445,0.615031,0.123695,0.225956,0.0510559
XGBoost_3_AutoML_2_20220629_182857,0.907085,0.189672,0.600344,0.141068,0.228614,0.0522642
GBM_1_AutoML_2_20220629_182857,0.906138,0.192051,0.592964,0.134694,0.233383,0.0544677
XGBoost_1_AutoML_2_20220629_182857,0.892651,0.208908,0.536815,0.185764,0.24532,0.0601819




In [11]:
# Redo the stratified 70/30 split, this time keeping the feature sets
# as DataFrames so that columns can be selected by dtype
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='Response'),
    data['Response'],
    stratify=data['Response'],
    test_size=0.3,
    random_state=42)

# Column names split by dtype: object -> categorical, the rest -> numeric
cat_columns = list(X_train.select_dtypes(include='object').columns)
num_columns = list(X_train.select_dtypes(exclude='object').columns)

# Label Series become NumPy arrays
y_train, y_test = y_train.values, y_test.values

# Numeric branch: fill gaps with the column median
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical branch: fill gaps with the most frequent value,
# then apply count encoding (returning an array, not a DataFrame)
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('count', CountEncoder(return_df=False)),
])

# (name, transformer, columns) triples: each sub-pipeline is applied
# to its own group of feature columns
transformers = [
    ('num', num_pipe, num_columns),
    ('cat', cat_pipe, cat_columns),
]

# Full pipeline: per-type preprocessing followed by H2O AutoML
pipe = Pipeline(steps=[
    ('tr', ColumnTransformer(transformers=transformers)),
    ('classifier', H2OAutoMLClassifier(
        preprocessing=None, max_models=10, seed=2022)),
])

# Run AutoML (the trailing ';' hides the repr of the returned object)
pipe.fit(X_train, y_train);

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


In [12]:
# Predict labels for the held-out test set
predictions = pipe.predict(X_test)
# Accuracy on the test set
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print("Правильность на тестовой выборке: %.3f" % (acc,))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Правильность на тестовой выборке: 0.938


In [13]:
# Pull the underlying H2O AutoML object out of the fitted pipeline
# and display its leaderboard
automl = pipe.named_steps['classifier'].estimator
automl.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_BestOfFamily_1_AutoML_3_20220629_183108,0.913198,0.170277,0.628174,0.0936234,0.217896,0.0474785
StackedEnsemble_AllModels_1_AutoML_3_20220629_183108,0.912461,0.170575,0.625438,0.0912375,0.218012,0.0475291
GBM_4_AutoML_3_20220629_183108,0.912366,0.172513,0.627018,0.0901869,0.217613,0.0473553
XGBoost_2_AutoML_3_20220629_183108,0.910819,0.189462,0.601831,0.136795,0.231553,0.0536167
GBM_2_AutoML_3_20220629_183108,0.909529,0.179718,0.621232,0.0988836,0.222935,0.0496999
GBM_3_AutoML_3_20220629_183108,0.908902,0.176946,0.62083,0.0928649,0.220594,0.0486618
GBM_1_AutoML_3_20220629_183108,0.904372,0.195252,0.581743,0.141662,0.235526,0.0554724
DRF_1_AutoML_3_20220629_183108,0.902045,0.302361,0.61531,0.131507,0.225393,0.0508019
XGBoost_3_AutoML_3_20220629_183108,0.901969,0.194533,0.587154,0.128295,0.231816,0.0537385
XRT_1_AutoML_3_20220629_183108,0.901799,0.260677,0.624442,0.104895,0.225108,0.0506736




In [14]:
# Rebuild the final pipeline. Both imputers are created with default
# settings here; the grid search that follows sets their strategies.

# Numeric branch
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer()),
])

# Categorical branch
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer()),
])

# (name, transformer, columns) triples: each sub-pipeline is applied
# to its own group of feature columns
transformers = [
    ('num', num_pipe, num_columns),
    ('cat', cat_pipe, cat_columns),
]

# Full pipeline: per-type preprocessing followed by H2O AutoML
pipe = Pipeline(steps=[
    ('tr', ColumnTransformer(transformers=transformers)),
    ('classifier', H2OAutoMLClassifier(
        preprocessing=None, max_models=10, seed=2022)),
])

In [15]:
# Imputation strategies to try for the numeric and categorical branches
hyperparams_dct = {
    'tr__num__imputer__strategy': ['mean', 'median', 'constant'],
    'tr__cat__imputer__strategy': ['most_frequent', 'constant'],
}
# Grid search over the 3 x 2 combinations with 3-fold cross-validation
grid = GridSearchCV(
    estimator=pipe,
    param_grid=hyperparams_dct,
    cv=3)
grid.fit(X_train, y_train);

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |█████████████████████████████████

In [16]:
# Predict labels for the held-out test set with the fitted grid search
predictions = grid.predict(X_test)
# Accuracy on the test set
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print("Правильность на тестовой выборке: %.3f" % (acc,))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Правильность на тестовой выборке: 0.941


In [17]:
# Alternative route to the same predictions

# Take the best pipeline found by the grid search
best = grid.best_estimator_
# Predict labels for the held-out test set
predictions = best.predict(X_test)
# Accuracy on the test set
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print("Правильность на тестовой выборке: %.3f" % (acc,))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Правильность на тестовой выборке: 0.941


In [18]:
# Show the hyperparameter combination selected by the grid search
print("Наилучшие значения гиперпараметров:\n{}".format(
    grid.best_params_))

Наилучшие значения гиперпараметров:
{'tr__cat__imputer__strategy': 'constant', 'tr__num__imputer__strategy': 'constant'}


In [19]:
# Collect the grid-search cross-validation results into a DataFrame
results = pd.DataFrame(grid.cv_results_)
# Pivot: mean test score for every
# (categorical strategy, numeric strategy) pair
table = pd.pivot_table(
    results,
    index=['param_tr__cat__imputer__strategy',
           'param_tr__num__imputer__strategy'],
    values=['mean_test_score'])
table

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score
param_tr__cat__imputer__strategy,param_tr__num__imputer__strategy,Unnamed: 2_level_1
constant,constant,0.933678
constant,mean,0.929716
constant,median,0.9323
most_frequent,constant,0.931611
most_frequent,mean,0.9323
most_frequent,median,0.9323


In [20]:
# Take the classifier step of the best pipeline and display
# the leaderboard of its H2O AutoML run
best = grid.best_estimator_.named_steps['classifier']
automl = best.estimator
automl.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_BestOfFamily_1_AutoML_22_20220629_185309,0.912305,0.169704,0.632853,0.0920009,0.217249,0.0471972
GBM_4_AutoML_22_20220629_185309,0.911861,0.172413,0.630323,0.0917137,0.21719,0.0471715
StackedEnsemble_AllModels_1_AutoML_22_20220629_185309,0.910989,0.170257,0.629147,0.0926685,0.217529,0.0473187
XGBoost_2_AutoML_22_20220629_185309,0.910851,0.185772,0.625292,0.122281,0.227859,0.0519196
GBM_3_AutoML_22_20220629_185309,0.910332,0.175707,0.629146,0.0921948,0.219893,0.0483529
GBM_1_AutoML_22_20220629_185309,0.909179,0.18993,0.597023,0.144038,0.23173,0.0536986
GBM_2_AutoML_22_20220629_185309,0.908555,0.179801,0.621628,0.115005,0.223249,0.0498399
XGBoost_3_AutoML_22_20220629_185309,0.902631,0.193709,0.587397,0.140597,0.23112,0.0534165
XRT_1_AutoML_22_20220629_185309,0.898481,0.23301,0.623064,0.105565,0.22602,0.051085
DRF_1_AutoML_22_20220629_185309,0.898156,0.387124,0.617498,0.116338,0.225769,0.0509716




## Пример использования `H2OAutoMLRegressor`

In [21]:
# Read the regression dataset: semicolon-delimited CSV that uses
# a comma as the decimal mark; then preview its first rows
data = pd.read_csv(
    'Data/Flats_missing.csv',
    delimiter=';',
    decimal=',')
data.head()

Unnamed: 0,Rooms_Number,District,Stor,Storeys,Space_Total,Space_Living,Space_Kitchen,Balcon_Num,Lodgee_Num,lat,Long,Cost_KV
0,1,Заельцовский,13,17.0,54.1,18.0,21.2,0.0,1,55.0725,82.9069,50831.79298
1,1,Заельцовский,10,17.0,54.5,18.0,21.1,0.0,1,55.0725,82.9069,52000.0
2,1,Центральный,8,17.0,37.0,0.0,0.0,0.0,0,55.0725,82.9068,87837.83784
3,1,Центральный,2,17.0,42.0,0.0,0.0,0.0,0,55.0725,82.9068,90238.09524
4,1,Центральный,13,17.0,28.0,0.0,0.0,0.0,0,55.0725,82.9068,110714.2857


In [22]:
# 70/30 train/test split with 'Cost_KV' as the target. Every piece
# returned by train_test_split (train features, test features, train
# targets, test targets) is converted to a NumPy array on the fly.
X_train, X_test, y_train, y_test = (
    part.values
    for part in train_test_split(
        data.drop(columns='Cost_KV'),
        data['Cost_KV'],
        test_size=0.3,
        random_state=42)
)

# Two-step pipeline: constant-value imputation followed by H2O AutoML
# for regression (max_models=10, fixed seed, preprocessing=None)
pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='constant')),
    ('regressor', H2OAutoMLRegressor(
        preprocessing=None, max_models=10, seed=2022)),
])

# Run AutoML (the trailing ';' hides the repr of the returned object)
pipe.fit(X_train, y_train);

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


In [23]:
# Predict target values for the held-out test set
predictions = pipe.predict(X_test)
# RMSE on the test set, computed as the square root of the MSE.
# The root is taken explicitly instead of passing squared=False:
# that argument is deprecated and rejected by newer scikit-learn
# releases, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE на тестовой выборке: %.3f" % rmse)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
RMSE на тестовой выборке: 9483.515


In [24]:
# Pull the underlying H2O AutoML object out of the fitted pipeline
# and display its leaderboard
automl = pipe.named_steps['regressor'].estimator
automl.leaderboard

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_AllModels_1_AutoML_23_20220629_185527,11109.4,123418000.0,6734.8,0.181944,123418000.0
StackedEnsemble_BestOfFamily_1_AutoML_23_20220629_185527,11160.9,124565000.0,6771.15,0.182826,124565000.0
GBM_4_AutoML_23_20220629_185527,11227.1,126049000.0,6851.11,0.184222,126049000.0
GBM_1_AutoML_23_20220629_185527,11304.2,127785000.0,6924.0,0.184864,127785000.0
GBM_3_AutoML_23_20220629_185527,11348.7,128793000.0,6961.28,0.186158,128793000.0
DRF_1_AutoML_23_20220629_185527,11358.3,129010000.0,6921.17,0.185917,129010000.0
XRT_1_AutoML_23_20220629_185527,11360.2,129053000.0,6908.06,0.185547,129053000.0
GBM_2_AutoML_23_20220629_185527,11395.0,129846000.0,7012.8,0.186889,129846000.0
XGBoost_3_AutoML_23_20220629_185527,11622.4,135079000.0,7163.02,0.190612,135079000.0
XGBoost_2_AutoML_23_20220629_185527,11628.4,135220000.0,7110.53,0.190115,135220000.0




In [25]:
# Fetch the DRF model of this AutoML run with h2o.get_model.
# Model ids embed the AutoML run counter and a timestamp, so a literal
# id only exists in the session that produced it and breaks on any
# re-run. The id is therefore looked up in the leaderboard by its
# 'DRF_' prefix; the first match is the highest-ranked one.
leaderboard_ids = automl.leaderboard.as_data_frame()['model_id']
drf_ids = [model_id for model_id in leaderboard_ids
           if model_id.startswith('DRF_')]
if not drf_ids:
    raise LookupError("No DRF model found in the AutoML leaderboard")
h2o.get_model(drf_ids[0])

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_1_AutoML_23_20220629_185527


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,47.0,47.0,9672250.0,20.0,20.0,20.0,14165.0,19153.0,16379.915




ModelMetricsRegression: drf
** Reported on train data. **

MSE: 133451062.30450475
RMSE: 11552.102072978092
MAE: 7012.203089158242
RMSLE: 0.1881363850778693
Mean Residual Deviance: 133451062.30450475

ModelMetricsRegression: drf
** Reported on cross-validation data. **

MSE: 129010212.29656787
RMSE: 11358.266253991755
MAE: 6921.1746908948635
RMSLE: 0.18591658290574456
Mean Residual Deviance: 129010212.29656787

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,6921.175,105.2039,7100.097,6859.382,6833.76,6892.952,6919.682
1,mean_residual_deviance,129010200.0,37263030.0,183524600.0,95562020.0,92376830.0,136371400.0,137216200.0
2,mse,129010200.0,37263030.0,183524600.0,95562020.0,92376830.0,136371400.0,137216200.0
3,r2,0.4918372,0.06632507,0.4079469,0.5590534,0.5598539,0.4730992,0.4592325
4,residual_deviance,129010200.0,37263030.0,183524600.0,95562020.0,92376830.0,136371400.0,137216200.0
5,rmse,11265.15,1622.744,13547.13,9775.583,9611.287,11677.82,11713.93
6,rmsle,0.1858941,0.003230138,0.1908681,0.187098,0.1828036,0.1835666,0.1851344



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2022-06-29 18:58:49,1 min 9.987 sec,0.0,,,
1,,2022-06-29 18:58:51,1 min 11.211 sec,5.0,13695.991626,8473.332086,187580200.0
2,,2022-06-29 18:58:52,1 min 12.500 sec,10.0,12417.934281,7771.040961,154205100.0
3,,2022-06-29 18:58:53,1 min 13.760 sec,15.0,12020.315361,7429.648955,144488000.0
4,,2022-06-29 18:58:55,1 min 15.183 sec,20.0,11824.621057,7267.979248,139821700.0
5,,2022-06-29 18:58:56,1 min 16.374 sec,25.0,11778.795829,7192.458548,138740000.0
6,,2022-06-29 18:58:57,1 min 17.619 sec,30.0,11693.074419,7134.751883,136728000.0
7,,2022-06-29 18:58:58,1 min 18.862 sec,35.0,11671.958821,7092.658614,136234600.0
8,,2022-06-29 18:59:00,1 min 20.172 sec,40.0,11631.391712,7050.51368,135289300.0
9,,2022-06-29 18:59:01,1 min 21.409 sec,45.0,11572.89249,7023.526991,133931800.0



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,C10,61052340000000.0,1.0,0.178531
1,C5,50175260000000.0,0.82184,0.146724
2,C4,44397730000000.0,0.727208,0.129829
3,C11,44318210000000.0,0.725905,0.129597
4,C2,42620790000000.0,0.698102,0.124633
5,C6,28538120000000.0,0.467437,0.083452
6,C3,22795770000000.0,0.373381,0.06666
7,C7,22795010000000.0,0.373368,0.066658
8,C1,12733420000000.0,0.208566,0.037236
9,C9,6796561000000.0,0.111324,0.019875




In [26]:
# Ask the AutoML run for its best model of the GBM family
# and display it
gbm = automl.get_best_model(algorithm='gbm')
gbm

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_4_AutoML_23_20220629_185527


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,76.0,76.0,247659.0,10.0,10.0,10.0,28.0,570.0,254.92105




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 91071507.0223175
RMSE: 9543.139264535414
MAE: 6043.618145537464
RMSLE: 0.1634354774450186
Mean Residual Deviance: 91071507.0223175

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 126048644.99227272
RMSE: 11227.138771399983
MAE: 6851.108989652716
RMSLE: 0.18422174508207162
Mean Residual Deviance: 126048644.99227272

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,6852.157,77.98803,6987.18,6842.608,6795.463,6803.161,6832.372
1,mean_residual_deviance,126120400.0,35471530.0,177956500.0,94485080.0,91009480.0,132799300.0,134351700.0
2,mse,126120400.0,35471530.0,177956500.0,94485080.0,91009480.0,132799300.0,134351700.0
3,r2,0.5027447,0.06122913,0.4259097,0.5640227,0.5663689,0.4869006,0.4705215
4,residual_deviance,126120400.0,35471530.0,177956500.0,94485080.0,91009480.0,132799300.0,134351700.0
5,rmse,11143.03,1562.582,13340.03,9720.344,9539.889,11523.86,11591.02
6,rmsle,0.1842326,0.002544359,0.187482,0.1860914,0.1816803,0.1819298,0.1839797



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2022-06-29 19:00:34,28.043 sec,0.0,15806.495979,11372.87438,249845300.0
1,,2022-06-29 19:00:34,28.415 sec,5.0,13387.457468,9212.778927,179224000.0
2,,2022-06-29 19:00:34,28.816 sec,10.0,12029.020153,7950.570357,144697300.0
3,,2022-06-29 19:00:35,29.196 sec,15.0,11274.768178,7258.041721,127120400.0
4,,2022-06-29 19:00:35,29.569 sec,20.0,10813.847852,6880.898604,116939300.0
5,,2022-06-29 19:00:35,29.910 sec,25.0,10528.95838,6659.966946,110859000.0
6,,2022-06-29 19:00:36,30.243 sec,30.0,10327.773275,6503.082956,106662900.0
7,,2022-06-29 19:00:36,30.596 sec,35.0,10152.357011,6393.695494,103070400.0
8,,2022-06-29 19:00:36,30.932 sec,40.0,10049.88366,6325.451384,101000200.0
9,,2022-06-29 19:00:37,31.272 sec,45.0,9978.449029,6292.059113,99569450.0



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,C2,6972071000000.0,1.0,0.187413
1,C10,6815666000000.0,0.977567,0.183209
2,C11,6032062000000.0,0.865175,0.162145
3,C5,5251109000000.0,0.753163,0.141153
4,C4,4944036000000.0,0.70912,0.132899
5,C6,2110102000000.0,0.302651,0.056721
6,C7,1760673000000.0,0.252532,0.047328
7,C3,1519134000000.0,0.217888,0.040835
8,C1,878929500000.0,0.126064,0.023626
9,C9,593104200000.0,0.085069,0.015943




In [27]:
# Inspect the parameters of the extracted model; each entry lists
# its 'actual', 'default' and 'input' values
gbm.params

{'auc_type': {'actual': 'AUTO', 'default': 'AUTO', 'input': 'AUTO'},
 'balance_classes': {'actual': False, 'default': False, 'input': False},
 'build_tree_one_node': {'actual': False, 'default': False, 'input': False},
 'calibrate_model': {'actual': False, 'default': False, 'input': False},
 'calibration_frame': {'actual': None, 'default': None, 'input': None},
 'categorical_encoding': {'actual': 'Enum',
  'default': 'AUTO',
  'input': 'AUTO'},
 'check_constant_response': {'actual': True, 'default': True, 'input': True},
 'checkpoint': {'actual': None, 'default': None, 'input': None},
 'class_sampling_factors': {'actual': None, 'default': None, 'input': None},
 'col_sample_rate': {'actual': 0.8, 'default': 1.0, 'input': 0.8},
 'col_sample_rate_change_per_level': {'actual': 1.0,
  'default': 1.0,
  'input': 1.0},
 'col_sample_rate_per_tree': {'actual': 0.8, 'default': 1.0, 'input': 0.8},
 'custom_distribution_func': {'actual': None, 'default': None, 'input': None},
 'custom_metric_func'