# Spark + H2O

In [1]:
import pickle
import pandas as pd
import numpy as np
from pysparkling import *
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
from utils import SEED

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("best_one").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "512m").\
        getOrCreate()

In [2]:
with open('train_set_sru.pickle', mode='rb') as f:
    train_set = pickle.load(f)

In [3]:
with open('test_set.pickle', mode='rb') as f:
    test_set = pickle.load(f)

In [4]:
df_train = pd.DataFrame(train_set['x'], columns=train_set['x'].columns)
df_train['Results'] = train_set['y']

In [5]:
# в Spark DF
spark_df_train = spark.createDataFrame(df_train)

In [6]:
# H2O инит
hc = H2OContext.getOrCreate(spark)

Connecting to H2O server at http://ec2-34-253-225-131.eu-west-1.compute.amazonaws.com:54323... successful.


0,1
H2O cluster uptime:,2 mins 11 secs
H2O cluster version:,3.14.0.2
H2O cluster version age:,"21 days, 6 hours and 30 minutes"
H2O cluster name:,sparkling-water-root_app-20170912044422-0017
H2O cluster total nodes:,2
H2O cluster free memory:,3.845 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://ec2-34-253-225-131.eu-west-1.compute.amazonaws.com:54323


In [7]:
# в h2o DF
df_train_h2o = hc.as_h2o_frame(spark_df_train, "phishing_websites_train")

In [8]:
for col in list(df_train.columns):
    df_train_h2o[col] = df_train_h2o[col].asfactor()

In [9]:
df_test = pd.DataFrame(test_set['x'], columns=test_set['x'].columns)
df_test['Results'] = test_set['y']

spark_df_test = spark.createDataFrame(df_test)

df_test_h2o = hc.as_h2o_frame(spark_df_test, "phishing_websites_test")

for col in list(df_test.columns):
    df_test_h2o[col] = df_test_h2o[col].asfactor()

## Grandient Boosting

In [10]:
# выбор модели
gbm_model = H2OGradientBoostingEstimator(
                                         distribution = "bernoulli",
                                         nfolds=5,
                                         seed=SEED
                                        )

tuned_parameters = {'ntrees': [10, 50, 70, 80, 90, 100
                              ],
                    'max_depth': [4, 8, 15
                                 ],
                    'learn_rate': [0.1, 0.2, 0.5
                                  ],
                    'min_split_improvement': [0.002, 0.005, 0.01
                                             ],
                    'col_sample_rate_per_tree': [5/len(train_set['x'].columns), 
                                                8/len(train_set['x'].columns), 
                                                10/len(train_set['x'].columns)
                                               ],
                    'col_sample_rate_change_per_level': [1, 0.5, 0.2
                                                        ]}


search_criteria = {
  "strategy": "RandomDiscrete", 
  "max_runtime_secs": 2700,
  "stopping_metric": "AUC",
  "stopping_tolerance": 0.00001, 
  "seed": SEED 
}

# тестируем с GRIDSEARCH
gs = H2OGridSearch(gbm_model, tuned_parameters, search_criteria=search_criteria)

gs.train(x=list(range(0, (len(df_train.columns)-1) ))
         ,y=29, training_frame=df_train_h2o)

grid = gs.get_grid(sort_by='Precision', decreasing=True)

best_model_id = grid.model_ids[0]
print(gs.get_hyperparams(best_model_id))
best_model_gbm = h2o.get_model(best_model_id)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%
Hyperparameters: [col_sample_rate_per_tree, col_sample_rate_change_per_level, min_split_improvement, learn_rate, ntrees, max_depth]
[0.3333333333333333, 1.0, 0.005, 0.1, 80, 15]


In [11]:
print(gs.summary())


Grid Summary:



0,1,2,3,4,5,6,7,8,9
Model Id,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
Grid_GBM_py_30_sid_ac21_model_python_1505191582243_1_model_6,80.0,80.0,42607.0,0.0,12.0,5.9875,1.0,197.0,37.4
Grid_GBM_py_30_sid_ac21_model_python_1505191582243_1_model_3,90.0,90.0,22266.0,0.0,9.0,5.3444443,1.0,34.0,14.688889
Grid_GBM_py_30_sid_ac21_model_python_1505191582243_1_model_11,100.0,100.0,20236.0,0.0,10.0,2.22,1.0,112.0,10.93
Grid_GBM_py_30_sid_ac21_model_python_1505191582243_1_model_16,100.0,100.0,10972.0,0.0,4.0,1.52,1.0,13.0,3.59
Grid_GBM_py_30_sid_ac21_model_python_1505191582243_1_model_15,80.0,80.0,13714.0,2.0,8.0,4.5,1.0,22.0,8.6875
Grid_GBM_py_30_sid_ac21_model_python_1505191582243_1_model_10,50.0,50.0,5018.0,0.0,8.0,1.3,1.0,13.0,2.82
Grid_GBM_py_30_sid_ac21_model_python_1505191582243_1_model_8,70.0,70.0,7276.0,0.0,4.0,1.5285715,1.0,9.0,3.142857
Grid_GBM_py_30_sid_ac21_model_python_1505191582243_1_model_1,50.0,50.0,4728.0,0.0,5.0,0.92,1.0,12.0,2.36
Grid_GBM_py_30_sid_ac21_model_python_1505191582243_1_model_12,90.0,90.0,9333.0,0.0,4.0,1.0444444,1.0,15.0,3.0666666


None


In [12]:
print(gs.sorted_metric_table())

     col_sample_rate_change_per_level col_sample_rate_per_tree learn_rate  \
0                                 1.0       0.3333333333333333        0.1   
1                                 0.5       0.3333333333333333        0.1   
2                                 1.0      0.26666666666666666        0.2   
3                                 0.2       0.3333333333333333        0.2   
4                                 0.2       0.3333333333333333        0.1   
5                                 0.2       0.3333333333333333        0.5   
6                                 0.2       0.3333333333333333        0.2   
7                                 0.5      0.16666666666666666        0.5   
8                                 1.0      0.26666666666666666        0.2   
9                                 0.5       0.3333333333333333        0.2   
10                                0.5      0.16666666666666666        0.2   
11                                0.2       0.3333333333333333        0.2   

In [13]:
# повторяем над тестовым DF
df_test = pd.DataFrame(test_set['x'], columns=test_set['x'].columns)
df_test['Results'] = test_set['y']

spark_df_test = spark.createDataFrame(df_test)

df_test_h2o = hc.as_h2o_frame(spark_df_test, "phishing_websites_test")

for col in list(df_test.columns):
    df_test_h2o[col] = df_test_h2o[col].asfactor()

In [14]:
performance_test = best_model_gbm.model_performance(df_test_h2o)

print(performance_test)


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.038409138690364206
RMSE: 0.19598249587747424
LogLoss: 0.14552904445966308
Mean Per-Class Error: 0.04365552337747125
AUC: 0.9900174933458326
Gini: 0.9800349866916651
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.46020107046395525: 


0,1,2,3,4
,-1.0,1.0,Error,Rate
-1,886.0,54.0,0.0574,(54.0/940.0)
1,40.0,1231.0,0.0315,(40.0/1271.0)
Total,926.0,1285.0,0.0425,(94.0/2211.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4602011,0.9632238,226.0
max f2,0.3719842,0.9695550,247.0
max f0point5,0.6305106,0.9686033,190.0
max accuracy,0.4602011,0.9574853,226.0
max precision,0.9942215,1.0,0.0
max recall,0.0459753,1.0,356.0
max specificity,0.9942215,1.0,0.0
max absolute_mcc,0.4602011,0.9129291,226.0
max min_per_class_accuracy,0.5604329,0.9553191,207.0


Gains/Lift Table: Avg response rate: 57.49 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0104025,0.9904273,1.7395751,1.7395751,1.0,1.0,0.0180960,0.0180960,73.9575138,73.9575138
,2,0.0208051,0.9893965,1.7395751,1.7395751,1.0,1.0,0.0180960,0.0361920,73.9575138,73.9575138
,3,0.0303030,0.9885533,1.7395751,1.7395751,1.0,1.0,0.0165224,0.0527144,73.9575138,73.9575138
,4,0.0402533,0.9874578,1.7395751,1.7395751,1.0,1.0,0.0173092,0.0700236,73.9575138,73.9575138
,5,0.0502035,0.9862758,1.7395751,1.7395751,1.0,1.0,0.0173092,0.0873328,73.9575138,73.9575138
,6,0.1004071,0.9803184,1.7395751,1.7395751,1.0,1.0,0.0873328,0.1746656,73.9575138,73.9575138
,7,0.1501583,0.9753533,1.7395751,1.7395751,1.0,1.0,0.0865460,0.2612116,73.9575138,73.9575138
,8,0.2012664,0.9688764,1.7395751,1.7395751,1.0,1.0,0.0889064,0.3501180,73.9575138,73.9575138
,9,0.3007689,0.9502169,1.7316680,1.7369592,0.9954545,0.9984962,0.1723053,0.5224233,73.1667978,73.6959235






In [15]:
print(performance_test.accuracy()[0][1])

0.9574853007688828


## Random Forest

In [23]:
rf_model = H2ORandomForestEstimator(
                                     nfolds=5,
                                     seed=SEED,
                                     fold_assignment='Stratified'
                                    )

tuned_parameters = {'ntrees': [50, 100, 200],
                    'mtries': [-1],
                    'max_depth': [4, 8, 10, 15],
                    'min_rows': [0.02, 0.05, 0.1]
                   }

search_criteria = {
  "strategy": "RandomDiscrete", 
  "max_runtime_secs": 1800, 
  "stopping_metric": "AUC",
  "stopping_tolerance": 0.00001,
  "seed": SEED 
}

In [24]:
# grid search
gs = H2OGridSearch(rf_model, tuned_parameters, search_criteria=search_criteria)

In [25]:
gs.train(x=list(range(0, (len(df_train.columns)-1) )), y=29, training_frame=df_train_h2o)

drf Grid Build progress: |████████████████████████████████████████████████| 100%


In [26]:
grid = gs.get_grid(sort_by='Precision', decreasing=True)

best_model_id = grid.model_ids[0]
print(gs.get_hyperparams(best_model_id))
best_model_rf = h2o.get_model(best_model_id)

Hyperparameters: [min_rows, mtries, ntrees, max_depth]
[0.02, -1, 200, 15]


In [27]:
print(gs.summary())


Grid Summary:



0,1,2,3,4,5,6,7,8,9
Model Id,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
Grid_DRF_py_30_sid_bf3c_model_python_1505155590336_19525_model_17,200.0,200.0,897903.0,15.0,15.0,15.0,256.0,477.0,352.815
Grid_DRF_py_30_sid_bf3c_model_python_1505155590336_19525_model_27,200.0,200.0,897905.0,15.0,15.0,15.0,256.0,477.0,352.815
Grid_DRF_py_30_sid_bf3c_model_python_1505155590336_19525_model_0,200.0,200.0,897898.0,15.0,15.0,15.0,256.0,477.0,352.815
Grid_DRF_py_30_sid_bf3c_model_python_1505155590336_19525_model_16,100.0,100.0,449469.0,15.0,15.0,15.0,256.0,477.0,353.3
Grid_DRF_py_30_sid_bf3c_model_python_1505155590336_19525_model_5,100.0,100.0,449466.0,15.0,15.0,15.0,256.0,477.0,353.3
Grid_DRF_py_30_sid_bf3c_model_python_1505155590336_19525_model_14,50.0,50.0,218406.0,15.0,15.0,15.0,273.0,477.0,343.3
Grid_DRF_py_30_sid_bf3c_model_python_1505155590336_19525_model_10,50.0,50.0,218399.0,15.0,15.0,15.0,273.0,477.0,343.3
Grid_DRF_py_30_sid_bf3c_model_python_1505155590336_19525_model_25,50.0,50.0,218403.0,15.0,15.0,15.0,273.0,477.0,343.3
Grid_DRF_py_30_sid_bf3c_model_python_1505155590336_19525_model_24,100.0,100.0,242896.0,10.0,10.0,10.0,134.0,258.0,188.51


None


In [28]:
print(gs.sorted_metric_table())

     max_depth min_rows mtries ntrees  \
0           15     0.02     -1    200   
1           15     0.05     -1    200   
2           15      0.1     -1    200   
3           15     0.02     -1    100   
4           15      0.1     -1    100   
5           15     0.05     -1     50   
6           15     0.02     -1     50   
7           15      0.1     -1     50   
8           10     0.02     -1    100   
9           10      0.1     -1     50   
10          10     0.02     -1     50   
11          10     0.05     -1     50   
12          10      0.1     -1    200   
13          10     0.05     -1    200   
14          10     0.02     -1     15   
15           8      0.1     -1     50   
16           8     0.05     -1     50   
17           8     0.02     -1     50   
18           8     0.02     -1    100   
19           8      0.1     -1    100   
20           8     0.02     -1    200   
21           8      0.1     -1    200   
22           4      0.1     -1     50   
23           4  

In [31]:
performance_test = best_model_rf.model_performance(df_test_h2o)

print(performance_test)


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.027665373213013794
RMSE: 0.16632911114117635
LogLoss: 0.10509879056943468
Mean Per-Class Error: 0.0302802283341983
AUC: 0.9955199457622579
Gini: 0.9910398915245158
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.48991800505667926: 


0,1,2,3,4
,-1.0,1.0,Error,Rate
-1,906.0,34.0,0.0362,(34.0/940.0)
1,31.0,1240.0,0.0244,(31.0/1271.0)
Total,937.0,1274.0,0.0294,(65.0/2211.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4899180,0.9744597,221.0
max f2,0.2554940,0.9826410,271.0
max f0point5,0.5978853,0.9770487,196.0
max accuracy,0.4899180,0.9706015,221.0
max precision,0.9999965,1.0,0.0
max recall,0.1006564,1.0,320.0
max specificity,0.9999965,1.0,0.0
max absolute_mcc,0.4899180,0.9398337,221.0
max min_per_class_accuracy,0.5138052,0.9677419,215.0


Gains/Lift Table: Avg response rate: 57.49 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0104025,0.9998509,1.7395751,1.7395751,1.0,1.0,0.0180960,0.0180960,73.9575138,73.9575138
,2,0.0221619,0.9996927,1.7395751,1.7395751,1.0,1.0,0.0204563,0.0385523,73.9575138,73.9575138
,3,0.0307553,0.9995152,1.7395751,1.7395751,1.0,1.0,0.0149489,0.0535012,73.9575138,73.9575138
,4,0.0407056,0.9992777,1.7395751,1.7395751,1.0,1.0,0.0173092,0.0708104,73.9575138,73.9575138
,5,0.0502035,0.9990550,1.7395751,1.7395751,1.0,1.0,0.0165224,0.0873328,73.9575138,73.9575138
,6,0.1004071,0.9975135,1.7395751,1.7395751,1.0,1.0,0.0873328,0.1746656,73.9575138,73.9575138
,7,0.1501583,0.9946785,1.7395751,1.7395751,1.0,1.0,0.0865460,0.2612116,73.9575138,73.9575138
,8,0.2003618,0.9897450,1.7395751,1.7395751,1.0,1.0,0.0873328,0.3485445,73.9575138,73.9575138
,9,0.3003166,0.9750156,1.7395751,1.7395751,1.0,1.0,0.1738788,0.5224233,73.9575138,73.9575138






In [32]:
print(performance_test.accuracy()[0][1])

0.9706015377657169
