In [45]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
import warnings
import h2o

from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator
from h2o.estimators.estimator_base import H2OEstimator

h2o.init(nthreads=-1, max_mem_size="20g")

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,1 hour 42 mins
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.7
H2O cluster version age:,19 days
H2O cluster name:,H2O_from_python_paperspace_fhd724
H2O cluster total nodes:,1
H2O cluster free memory:,15.26 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


In [46]:
import sqlite3
from contextlib import closing

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%config InlineBackend.figure_format = 'retina'

## 1. Loading the dataset

In [47]:
# Read the whole `Review` table into a DataFrame.
# sqlite3's own connection context manager only commits/rolls back the open
# transaction; it does NOT close the connection. closing() guarantees the
# connection (and its file handle) is released when the block exits.
with closing(sqlite3.connect('./data/reviewsV1.db')) as conn:
    data = pd.read_sql_query('SELECT * FROM Review', conn)

In [48]:
# Remove the column literally named 'index' from the loaded table
# (presumably a leftover of how the table was written — TODO confirm).
data.drop(columns=['index'], inplace=True)

## 2. Time Based Splitting

In [49]:
# Put the rows in chronological order and renumber them 0..n-1 so that
# positional slicing below yields a time-based split.
data.sort_values('Time', inplace=True)
data.reset_index(inplace=True, drop=True)

# First 70% of the rows go to training, the remainder to testing.
n_rows = len(data)
TRAIN_SIZE = int(n_rows * 0.7)
TEST_SIZE = n_rows - TRAIN_SIZE

In [50]:
TRAIN_SIZE

254883

In [51]:
TEST_SIZE

109236

In [52]:
# Positional split: the first TRAIN_SIZE rows are the training set,
# everything after them is the test set.
data_train, data_test = data.iloc[:TRAIN_SIZE], data.iloc[TRAIN_SIZE:]

#### 2.1 Check that the split was performed properly

In [53]:
# Both partitions must have exactly the sizes computed above.
assert len(data_train) == TRAIN_SIZE
assert len(data_test) == TEST_SIZE
# The last test row must carry the overall maximum Time and the first
# training row the overall minimum Time.
assert data_test.Time.iloc[TEST_SIZE - 1] == data.Time.max()
assert data_train.Time.iloc[0] == data.Time.min()

## 3. Training TFIDF Model on data_train

#### 3.1 Creating a TFIDF on train data

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [55]:
# Fit the vectorizer on the training text only, capping the vocabulary at
# 8000 features; fit() returns the vectorizer itself.
tfidf = TfidfVectorizer(max_features=8000).fit(data_train['Text'])
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=8000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [56]:
# Vectorize the training text with the fitted TF-IDF model and show the
# resulting matrix dimensions.
Dtrain = tfidf.transform(data_train['Text'])
Dtrain.shape

(254883, 8000)

#### 3.2 Converting sparse data into h2o frame

In [57]:
# Upload the TF-IDF matrix to the H2O cluster as an H2OFrame.
# NOTE: this parse is slow (the author observed roughly 5 to 7 minutes on this
# data). Per the author's original note, the scipy sparse input is stored as a
# sparse H2O frame — TODO confirm against the H2O version in use.
Dtrain_h2o = h2o.H2OFrame(Dtrain)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [58]:
Dtrain_h2o.shape  # No loss in data...:)

(254883, 8000)

In [59]:
del Dtrain # Release some memory

In [60]:
label = h2o.H2OFrame(data_train.Polarity.values)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [61]:
label = label.set_names(['label'])

In [62]:
label

label
positive
positive
positive
positive
negative
positive
positive
positive
positive
positive




In [63]:
Dtrain_h2o = Dtrain_h2o.concat(label, axis=1)

In [64]:
Dtrain_h2o.shape

(254883, 8001)

#### 3.3 Some housekeeping for H2O

In [65]:
# Replace the 'label' column with its asfactor() (categorical) version; this
# column is the response `y` passed to the grid's train() call further down.
Dtrain_h2o[['label']] = Dtrain_h2o[['label']].asfactor()

In [66]:
warnings.filterwarnings("ignore",category=FutureWarning)

In [67]:
from h2o.two_dim_table import H2OTwoDimTable
import itertools
from functools import reduce

class NaiveBayesGrid(H2OGridSearch):
    """H2OGridSearch variant that can tabulate grid metrics per ``laplace`` value.

    ``get_cv_performace`` re-sorts the grid once per requested metric and joins
    the resulting tables on the ``laplace`` column, so this class assumes that
    ``laplace`` is one of the searched hyper-parameters.
    """

    def __init__(self, model, hyper_params, grid_id=None, search_criteria=None):
        """Forward all arguments unchanged to ``H2OGridSearch.__init__``."""
        # Name the class explicitly: ``super(self.__class__, self)`` resolves to
        # the *runtime* class and therefore recurses forever as soon as this
        # class is subclassed.
        super(NaiveBayesGrid, self).__init__(model, hyper_params, grid_id, search_criteria)

    def get_cv_performace(self, metrics, combine=True):
        """Collect the grid's metric table for each metric in ``metrics``.

        :param metrics: iterable of metric names accepted by the grid's
            ``sort_by`` option (e.g. ``"accuracy"``, ``"f1"``). The argument is
            not modified.
        :param bool combine: if false, return the list of per-metric tables
            (one per metric, ``model_ids`` column removed). If true, merge them
            on ``laplace`` and return a single frame.

        :returns: a list of DataFrames when ``combine`` is false, otherwise one
            DataFrame indexed by ``laplace`` whose columns are ``metrics`` in
            the given order.
        """
        frames = []
        for metric in metrics:
            table = self._get_grid(sort_by=metric, decreasing=True).get()
            # Non in-place drop: leave the table object returned by get() untouched.
            frames.append(table.drop('model_ids', axis=1))
        if not combine:
            return frames

        merged = reduce(lambda left, right: pd.merge(left, right, on='laplace'), frames)
        # Build the column selection on a copy. Extending the caller's list in
        # place leaked 'laplace' into it and broke a second call with the same
        # list (duplicate 'laplace' column selection).
        columns = list(metrics) + ['laplace']
        return merged[columns].set_index('laplace')

    def _get_grid(self, sort_by=None, decreasing=None):
        """
        Retrieve a NaiveBayesGrid instance sorted by the given metric.

        Optionally specify a metric by which to sort models and a sort order.
        Note that if neither cross-validation nor a validation frame is used in the grid search, then the
        training metrics will display in the "get grid" output. If a validation frame is passed to the grid, and
        ``nfolds = 0``, then the validation metrics will display. However, if ``nfolds`` > 1, then cross-validation
        metrics will display even if a validation frame is provided.

        :param str sort_by: A metric by which to sort the models in the grid space. Choices are: ``"logloss"``,
            ``"residual_deviance"``, ``"mse"``, ``"auc"``, ``"r2"``, ``"accuracy"``, ``"precision"``, ``"recall"``,
            ``"f1"``, etc.
        :param bool decreasing: Sort the models in decreasing order of metric if true, otherwise sort in increasing
            order (default).

        :returns: ``self`` when both arguments are ``None``; otherwise a new NaiveBayesGrid built from the
            grid JSON the server returns for the requested ordering.
        """
        if sort_by is None and decreasing is None:
            return self

        # Ask the server for this grid with the requested ordering.
        grid_json = h2o.api("GET /99/Grids/%s" % self._id, data={"sort_by": sort_by, "decreasing": decreasing})
        grid = NaiveBayesGrid(self.model, self.hyper_params, self._id)
        grid.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]  # reordered
        # The metrics class is derived from the first model of the sorted grid.
        first_model_json = h2o.api("GET /99/Models/%s" % grid_json['model_ids'][0]['name'])['models'][0]
        model_class = H2OGridSearch._metrics_class(first_model_json)
        m = model_class()
        m._id = self._id
        m._grid_json = grid_json
        m._parms = grid._parms
        # Attach the metrics class's methods to the grid, then copy the state
        # prepared on `m` (id, grid JSON, params) onto it.
        H2OEstimator.mixin(grid, model_class)
        grid.__dict__.update(m.__dict__.copy())
        return grid

    def get(self):
        """Return the grid's metric table, or a listing of the search space.

        :returns: ``self.sorted_metric_table()`` when the grid has models;
            otherwise an ``H2OTwoDimTable`` enumerating every hyper-parameter
            combination (numbered from 1).
        """
        if self.models:
            return self.sorted_metric_table()

        # No models yet: enumerate the Cartesian product of the hyper-parameter values.
        hyper_combos = itertools.product(*list(self.hyper_params.values()))
        c_values = [[idx + 1, list(val)] for idx, val in enumerate(hyper_combos)]
        return H2OTwoDimTable(
            col_header=['Model', 'Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']'],
            table_header='Grid Search of Model ' + self.model.__class__.__name__, cell_values=c_values)

In [68]:
# Values of the `laplace` hyper-parameter to search over.
hyper_parameters = {
    'laplace': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 500, 1000],
}

# Base estimator: 10-fold cross-validation with stratified fold assignment,
# keeping the cross-validation predictions.
estimator = H2ONaiveBayesEstimator(
    nfolds=10,
    fold_assignment="stratified",
    keep_cross_validation_predictions=True,
    compute_metrics=True,
    # balance_classes=True,  # left disabled
)

gs = NaiveBayesGrid(model=estimator, hyper_params=hyper_parameters, grid_id='nb')

In [None]:
gs.train(y="label", training_frame=Dtrain_h2o)

naivebayes Grid Build progress: |█████████████████████████████████████████

In [20]:
metrics=['accuracy', 'f1', 'recall', 'precision']

In [21]:
k = gs.get_cv_performace(metrics)

In [22]:
k.sort_values(by=metrics, ascending=[False] * len(metrics))

Unnamed: 0_level_0,accuracy,f1,recall,precision
laplace,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,0.9411764705882352,0.926517571884984,0.9602649006622516,0.8950617283950617
0.1,0.9411764705882352,0.9260450160771704,0.9536423841059604,0.9
0.001,0.9411764705882352,0.9204152249134948,0.8807947019867549,0.9637681159420288
0.0001,0.938618925831202,0.9230769230769232,0.9536423841059604,0.8944099378881988
0.01,0.938618925831202,0.9172413793103448,0.8807947019867549,0.9568345323741008
50.0,0.9360613810741688,0.9201277955271564,0.9536423841059604,0.8888888888888888
5.0,0.9360613810741688,0.9201277955271564,0.9536423841059604,0.8888888888888888
10.0,0.9360613810741688,0.919614147909968,0.9470198675496688,0.89375
100.0,0.9335038363171356,0.9197530864197532,0.9867549668874172,0.861271676300578
500.0,0.9335038363171356,0.9192546583850932,0.9801324503311258,0.8654970760233918
