In [1]:
import h2o
from h2o.automl import H2OAutoML
import warnings
import pandas as pd
import numpy as np

### Get Data

In [2]:
train_df = pd.read_csv('../data/to2ml/train_round1.csv')

In [3]:
test_df = pd.read_csv('../data/to2ml/test_round1.csv')

In [4]:
features = list(train_df.columns.difference(['visitors']))

In [5]:
test = test_df[features]

In [6]:
train = train_df[features + ['visitors']]

In [30]:
train.visitors = np.log1p(train.visitors)

### Stacked Ensemble

- Learning the weights for weighed averaging.
- Linear combination of base model predictions.
- Base learner predictions treated as features.
- Hyper parameter tuning for all base level and second stage regressor. Time consuming.
- Second model learns **“How predictions from the individual regressors contribute the ground truth?”**


![title](images/stack1.png)

## H2O: Open Source, Distributed Machine Learning for Everyone

![title](images/h2o.png)

In [7]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /home/dorukhan/anaconda2/envs/richmansworld/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpqby2pznj
  JVM stdout: /tmp/tmpqby2pznj/h2o_dorukhan_started_from_python.out
  JVM stderr: /tmp/tmpqby2pznj/h2o_dorukhan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Europe/Istanbul
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.0.3
H2O cluster version age:,11 months and 6 days !!!
H2O cluster name:,H2O_from_python_dorukhan_avm2va
H2O cluster total nodes:,1
H2O cluster free memory:,13.98 Gb
H2O cluster total cores:,48
H2O cluster allowed cores:,48


In [31]:
train_data = h2o.H2OFrame(train)
test_data = h2o.H2OFrame(test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [32]:
y = 'visitors'

In [34]:
aml = H2OAutoML(max_runtime_secs = 60, seed = 42,stopping_metric='RMSLE' ,project_name = "restaurant_visitor_prediction",stopping_rounds=10,nfolds=5)

In [35]:
aml.train(y=y,training_frame=train_data)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [36]:
aml.leaderboard

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
XRT_1_AutoML_20191128_200506,0.281579,0.53064,0.281579,0.393632,0.163407
DRF_1_AutoML_20191128_200506,0.281934,0.530975,0.281934,0.393898,0.163507
GLM_grid_1_AutoML_20191128_200506_model_1,0.313381,0.559805,0.313381,0.425876,0.173442
GLM_grid_1_AutoML_20191128_190030_model_1,123.136,11.0967,123.136,7.11983,0.539211
StackedEnsemble_BestOfFamily_AutoML_20191128_190030,123.561,11.1158,123.561,7.11894,0.540074
StackedEnsemble_AllModels_AutoML_20191128_190030,123.655,11.12,123.655,7.11912,0.538872
DeepLearning_1_AutoML_20191128_190030,124.097,11.1399,124.097,7.10986,0.537997
XGBoost_grid_1_AutoML_20191128_190030_model_6,124.561,11.1607,124.561,7.12977,0.539703
XGBoost_grid_1_AutoML_20191128_190030_model_7,124.633,11.1639,124.633,7.13297,0.539948
DeepLearning_grid_1_AutoML_20191128_190030_model_1,124.698,11.1668,124.698,7.12085,




In [37]:
aml_pred = aml.predict(test_data)

drf prediction progress: |████████████████████████████████████████████████| 100%


In [38]:
sub = pd.read_csv('../data/sample_submission.csv.gz')

In [42]:
sub['id'] = test_df['id']
sub['visitors'] = np.expm1(aml_pred.as_data_frame()['predict'].values)
sub.to_csv('../data/tokaggle/aml_reg.csv',index=False)

In [43]:
! kaggle competitions submit -c recruit-restaurant-visitor-forecasting -f ../data/tokaggle/aml_reg.csv -m "AutoML"

100%|███████████████████████████████████████| 1.54M/1.54M [00:03<00:00, 435kB/s]
Successfully submitted to Recruit Restaurant Visitor Forecasting

In [44]:
! kaggle competitions submissions -c recruit-restaurant-visitor-forecasting 

fileName               date                 description                status    publicScore  privateScore  
---------------------  -------------------  -------------------------  --------  -----------  ------------  
aml_reg.csv            2019-11-28 17:07:47  AutoML                     complete  0.52062      0.55843       
aml_reg.csv            2019-11-28 17:07:19  AutoML                     complete  1.64144      1.65908       
aml_reg.csv            2019-11-28 17:04:04  AutoML                     complete  0.54418      0.57952       
optimized_xgb_reg.csv  2019-11-28 15:26:14  Optimized XGB              complete  0.52154      0.55867       
lgb_reg.csv            2019-11-28 14:27:19  LGBM                       complete  0.52335      0.56034       
xgb_reg.csv            2019-11-28 14:22:30  XGB                        complete  0.52216      0.55939       
lgb_reg.csv            2019-11-23 14:44:20  DataCamp Sess2 LGBM        complete  0.71537      0.74051       
lgb_reg.csv        

In [45]:
h2o.cluster().shutdown()

H2O session _sid_bc8a closed.
