## Model Calibration and Selection

#### Install the h2oEnsemble package if it is not already installed

##library(devtools)
##install_github("h2oai/h2o-3/h2o-r/ensemble/h2oEnsemble-package")

In [25]:
library(h2oEnsemble) 

### Start h2o server
`max_mem_size` sets the maximum amount of memory allocated to H2O.

In [85]:
# Connect to (or start) an H2O cluster. nthreads = -1 uses all cores on the
# machine; max_mem_size caps the memory allocated to H2O.
h2o.init(
  nthreads = -1,
  max_mem_size = "5g"
)
# Remove every object currently held by the cluster before loading new data.
h2o.removeAll()


 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         32 minutes 25 seconds 
    H2O cluster version:        3.8.1.3 
    H2O cluster name:           H2O_started_from_R_jiehuachen_yrq021 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   4.72 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    R Version:                  R version 3.2.2 (2015-08-14) 

[1] 0


In [86]:
# Load the two CSV files into H2O as the training and test frames.
# mustWork = TRUE makes a missing file stop here with a clear R error; the
# default (mustWork = NA) only emits a warning from normalizePath().
train <- h2o.importFile(
  path = normalizePath("./EthM3_Cal.csv", mustWork = TRUE)
)
test <- h2o.importFile(
  path = normalizePath("./EthM3_Val.csv", mustWork = TRUE)
)



In [87]:
# Covariates are every column of `train` from position 32 onwards.
# tail(x, -31) drops the first 31 names and returns character(0) when the
# frame has fewer than 32 columns, whereas 32:length(...) would count
# backwards and index past the end of the name vector.
cov_names <- tail(names(train), -31)
if (length(cov_names) == 0) {
  stop("`train` has fewer than 32 columns; no covariates found.",
       call. = FALSE)
}

#### `y` is the name of the column to predict; `x` is the vector of covariate names

In [88]:
# Predictors: the covariate column names collected in cov_names.
x <- cov_names
# Response: name of the column to predict.
y <- "V0"

## h2o Ensemble Learning

### Define learner and meta learner. 
* There are many options for setting up the learners and the metalearner, so use this script to try different options and keep the best one;
* As shown below, there is a lot of flexibility in how these learners can be defined;

In [95]:
# Random forest base learner with defaults ntrees = 50, nbins = 100, seed = 1.
# Anything passed through `...` is forwarded to h2o.randomForest.wrapper.
h2o.randomForest.1 <- function(..., ntrees = 50, nbins = 100, seed = 1) {
  h2o.randomForest.wrapper(
    ...,
    ntrees = ntrees,
    nbins = nbins,
    seed = seed
  )
}

# Random forest base learner with defaults ntrees = 20, nbins = 20, seed = 1.
# Anything passed through `...` is forwarded to h2o.randomForest.wrapper.
h2o.randomForest.2 <- function(..., ntrees = 20, nbins = 20, seed = 1) {
  h2o.randomForest.wrapper(
    ...,
    ntrees = ntrees,
    nbins = nbins,
    seed = seed
  )
}

# Deep learning base learner with defaults hidden = c(100, 100),
# activation = "Rectifier", seed = 1.
# Anything passed through `...` is forwarded to h2o.deeplearning.wrapper.
h2o.deeplearning.1 <- function(..., hidden = c(100, 100),
                               activation = "Rectifier", seed = 1) {
  h2o.deeplearning.wrapper(
    ...,
    hidden = hidden,
    activation = activation,
    seed = seed
  )
}

# Deep learning base learner with defaults hidden = c(200, 200),
# activation = "Rectifier", seed = 1.
# Anything passed through `...` is forwarded to h2o.deeplearning.wrapper.
h2o.deeplearning.2 <- function(..., hidden = c(200, 200),
                               activation = "Rectifier", seed = 1) {
  h2o.deeplearning.wrapper(
    ...,
    hidden = hidden,
    activation = activation,
    seed = seed
  )
}

# GBM base learner with defaults ntrees = 50, col_sample_rate = 0.8, seed = 1.
# Anything passed through `...` is forwarded to h2o.gbm.wrapper.
h2o.gbm.1 <- function(..., ntrees = 50, col_sample_rate = 0.8, seed = 1) {
  h2o.gbm.wrapper(
    ...,
    ntrees = ntrees,
    col_sample_rate = col_sample_rate,
    seed = seed
  )
}
# GBM base learner with defaults ntrees = 50, col_sample_rate = 0.7, seed = 1.
# Anything passed through `...` is forwarded to h2o.gbm.wrapper.
h2o.gbm.2 <- function(..., ntrees = 50, col_sample_rate = 0.7, seed = 1) {
  h2o.gbm.wrapper(
    ...,
    ntrees = ntrees,
    col_sample_rate = col_sample_rate,
    seed = seed
  )
}

# Base learner library: names of the functions used as base learners, i.e.
# the three stock wrappers followed by the six customised variants.
learner <- c(
  "h2o.randomForest.wrapper",
  "h2o.gbm.wrapper",
  "h2o.deeplearning.wrapper",
  "h2o.randomForest.1",
  "h2o.randomForest.2",
  "h2o.deeplearning.1",
  "h2o.deeplearning.2",
  "h2o.gbm.1",
  "h2o.gbm.2"
)
# GLM metalearner that forwards to h2o.glm.wrapper with non_negative = TRUE
# and lambda_search = TRUE by default.
# lambda_search is exposed as a parameter (same default as the previously
# hard-coded value) so that a caller supplying it no longer triggers a
# "formal argument matched by multiple actual arguments" error.
h2o.glm_nn <- function(..., non_negative = TRUE, lambda_search = TRUE) {
  h2o.glm.wrapper(
    ...,
    non_negative = non_negative,
    lambda_search = lambda_search
  )
}
# The metalearner is referenced by function name.
metalearner <- "h2o.glm_nn"

In [96]:
# Display the vector of base learner names.
learner

### Ensemble learning wrapper that implements the procedure shown in the diagram
#### See ensembling_onelevel.pdf for the diagram

In [50]:
#' Fit an H2O ensemble on `train` and report its performance on `test`.
#'
#' The model family is chosen from the response column: "binomial" when
#' `train[, y]` is a factor, "gaussian" otherwise.
#'
#' @param train Frame used to cross-validate and fit the ensemble.
#' @param test Frame the fitted ensemble is evaluated on.
#' @param y Response column, passed to `h2o.ensemble()` as `y`.
#' @param x Predictor columns, passed to `h2o.ensemble()` as `x`.
#' @param learner Base learners, passed to `h2o.ensemble()` as `learner`.
#' @param metalearner Metalearner, passed to `h2o.ensemble()` as `metalearner`.
#' @param nfolds Value used for `cvControl$V`; defaults to 5, the value that
#'   was previously fixed.
#' @return The result of `h2o.ensemble_performance()` on `test`.
h2o_ensemble_predictionerror <- function(train, test, y, x, learner,
                                         metalearner, nfolds = 5) {
  # Check named columns up front so that a frame missing a column fails
  # before the ensemble is trained rather than afterwards. Non-character
  # `x`/`y` values are passed through unchecked.
  wanted <- c(if (is.character(y)) y, if (is.character(x)) x)
  frames <- list(train = train, test = test)
  for (label in names(frames)) {
    absent <- setdiff(wanted, names(frames[[label]]))
    if (length(absent) > 0) {
      stop("Columns missing from `", label, "`: ",
           paste(absent, collapse = ", "), call. = FALSE)
    }
  }

  family <- if (is.factor(train[, y])) "binomial" else "gaussian"

  fit <- h2o.ensemble(
    x = x,
    y = y,
    training_frame = train,
    family = family,
    learner = learner,
    metalearner = metalearner,
    cvControl = list(V = nfolds)
  )

  h2o.ensemble_performance(fit, newdata = test)
}


In [97]:
# Train the ensemble on `train` and report its performance on `test`.
h2o_ensemble_predictionerror(
  train = train,
  test = test,
  y = y,
  x = x,
  learner = learner,
  metalearner = metalearner
)

[1] "Cross-validating and training base learner 1: h2o.randomForest.wrapper"
[1] "Cross-validating and training base learner 2: h2o.gbm.wrapper"
[1] "Cross-validating and training base learner 3: h2o.deeplearning.wrapper"
[1] "Cross-validating and training base learner 4: h2o.randomForest.1"
[1] "Cross-validating and training base learner 5: h2o.randomForest.2"
[1] "Cross-validating and training base learner 6: h2o.deeplearning.1"
[1] "Cross-validating and training base learner 7: h2o.deeplearning.2"
[1] "Cross-validating and training base learner 8: h2o.gbm.1"
[1] "Cross-validating and training base learner 9: h2o.gbm.2"
[1] "Metalearning"



Base learner performance, sorted by specified metric:
                   learner       MSE
3 h2o.deeplearning.wrapper 0.3156998
7       h2o.deeplearning.2 0.3020509
6       h2o.deeplearning.1 0.2985562
5       h2o.randomForest.2 0.2951719
9                h2o.gbm.2 0.2945313
8                h2o.gbm.1 0.2915467
2          h2o.gbm.wrapper 0.2914784
1 h2o.randomForest.wrapper 0.2859065
4       h2o.randomForest.1 0.2849589


H2O Ensemble Performance on <newdata>:
----------------
Family: gaussian

Ensemble performance (MSE): 0.279357631444789


### Shut down h2o server when the computation is finished

In [98]:
# prompt = FALSE skips the interactive "Are you sure ... (Y/N)?" confirmation,
# which would otherwise leave a notebook or non-interactive run waiting for
# input.
h2o.shutdown(prompt = FALSE)

Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)? 
