In [1]:
import h2o

In [2]:
h2o.init(max_mem_size="25g", nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_181"; OpenJDK Runtime Environment (build 1.8.0_181-8u181-b13-0ubuntu0.16.04.1-b13); OpenJDK 64-Bit Server VM (build 25.181-b13, mixed mode)
  Starting server from /home/paperspace/anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp2xtkh862
  JVM stdout: /tmp/tmp2xtkh862/h2o_paperspace_started_from_python.out
  JVM stderr: /tmp/tmp2xtkh862/h2o_paperspace_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.7
H2O cluster version age:,26 days
H2O cluster name:,H2O_from_python_paperspace_4ifl06
H2O cluster total nodes:,1
H2O cluster free memory:,22.22 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


In [3]:
import sqlite3

In [6]:
!ls ../Databases

reviewsV1db  reviewsV1.db  reviewsV2.db  reviewsV3.db


In [7]:
conn = sqlite3.connect('../Databases/reviewsV1.db')

In [8]:
import pandas as pd

In [9]:
with sqlite3.connect('../Databases/reviewsV1.db') as conn:
    data = pd.read_sql_query('SELECT * FROM Review', conn)

In [10]:
data.sort_values(by='Time', inplace=True)
data.reset_index(drop=True, inplace=True)
TRAIN_SIZE = int(data.shape[0] * 0.7)
TEST_SIZE = data.shape[0] - TRAIN_SIZE

In [11]:
TRAIN_SIZE

254883

In [12]:
TEST_SIZE

109236

In [191]:
data_train = data[0: TRAIN_SIZE]
data_test = data[TRAIN_SIZE:]

In [14]:
from h2o.estimators.word2vec import H2OWord2vecEstimator

In [16]:
from h2o import H2OFrame

In [24]:
data_train = H2OFrame(data_train)

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [33]:
corpus = data_train['Text'].tokenize("\\W+")

In [36]:
w2v_model = H2OWord2vecEstimator(vec_size = 100, model_id = "w2v.hex")
w2v_model.train(training_frame=corpus)

word2vec Model Build progress: |██████████████████████████████████████████| 100%


In [37]:
!ls

Avg W2V.ipynb  BOW -- Logistic Regression.ipynb  tfidf_best.model
best.model     README.md			 TFIDF.ipynb


In [38]:
w2v_model.save_model_details()

'/home/paperspace/Amazon-Review----Logistic-Regression/w2v.hex.json'

In [41]:
w2v_model.find_synonyms("tasty", count = 5)

OrderedDict([('super', 0.7557393908500671),
             ('although', 0.749390184879303),
             ('pack', 0.7356115579605103),
             ('prefer', 0.7264364957809448),
             ('mock', 0.7117513418197632)])

In [42]:
w2v_model.find_synonyms("disappointed", count = 5)

OrderedDict([('sadly', 0.7593790292739868),
             ('sorely', 0.7421629428863525),
             ('returning', 0.7421483993530273),
             ('apprehensive', 0.7391559481620789),
             ('buying', 0.7303898334503174)])

In [44]:
Dtrain = w2v_model.transform(corpus, aggregate_method = "AVERAGE")  # Performs average W2V

In [46]:
Dtrain.shape

(254883, 100)

In [89]:
Dtrain['Label'] = data_train["Polarity"].asfactor()

In [90]:
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [91]:
hyper_params = {
    'alpha': [0, .25, .5, .75, 1],
    'lambda': [1, 0.5, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0]
}

In [92]:
glm = H2OGeneralizedLinearEstimator(family = 'binomial', # Binomial for logistic regression
                                    nfolds=10,
                                    keep_cross_validation_predictions=True,
                                    fold_assignment="stratified",
                                    standardize = True,
                                    seed=42)   

In [93]:
grid = H2OGridSearch(model = glm, hyper_params = hyper_params,
                     search_criteria = {'strategy': "Cartesian"})    # Cartesian for fitting all the hp combinations

In [94]:
grid.train(y = "Label", training_frame = Dtrain)

glm Grid Build progress: |████████████████████████████████████████████████| 100%


In [141]:
param_combs = []
for ids in grid.model_ids:
    param_combs.append(grid.get_hyperparams_dict(ids, False))

cv_results = []
for iters, param in enumerate(param_combs):
    res = grid._grid_json['cross_validation_metrics_summary'][iters].as_data_frame()
    res.set_index('', inplace=True)
#     print(param)
    cvRes = res.T[['accuracy', 'f1', 'recall', 'precision']].loc['mean']
    cvRes['alpha'] = param['alpha']
    cvRes['lambda'] = param['lambda']
    cv_results.append(cvRes)

In [169]:
results = pd.DataFrame(cv_results).reset_index(drop=True)

In [172]:
results.sort_values(by='f1', ascending=False).head()

Unnamed: 0,accuracy,f1,recall,precision,alpha,lambda
1,0.89728844,0.94129634,0.9678962,0.91613555,0.5,1e-05
6,0.8972028,0.94129527,0.9686958,0.9154179,0.0,1e-05
9,0.89721453,0.94129366,0.9685539,0.9155394,1.0,0.0
11,0.89721453,0.94129366,0.9685539,0.9155394,0.25,0.0
7,0.89721453,0.94129366,0.9685539,0.9155394,0.75,0.0


So as we can see that the best value of F1 score is from the alpha = 0.5 and the lambda = 0.00001

In [175]:
clf = H2OGeneralizedLinearEstimator(family = 'binomial', # Binomial for logistic regression
                                    nfolds=0,
                                    keep_cross_validation_predictions=False,
                                    standardize = True,
                                    seed=42,
                                    alpha=0.5,
                                    lambda_=0.00001)

In [176]:
clf.train(y='Label', training_frame=Dtrain)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [192]:
data_test = H2OFrame(data_test)
corpus = data_test['Text'].tokenize("\\W+")

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [193]:
Dtest = w2v_model.transform(corpus, aggregate_method = "AVERAGE")

In [194]:
Dtest.shape

(109236, 100)

In [195]:
Dtest['Label'] = data_test["Polarity"].asfactor()

In [196]:
Dtest.head(5)

C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35,C36,C37,C38,C39,C40,C41,C42,C43,C44,C45,C46,C47,C48,C49,C50,C51,C52,C53,C54,C55,C56,C57,C58,C59,C60,C61,C62,C63,C64,C65,C66,C67,C68,C69,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79,C80,C81,C82,C83,C84,C85,C86,C87,C88,C89,C90,C91,C92,C93,C94,C95,C96,C97,C98,C99,C100,Label
-0.0811165,-0.0803215,-0.0336636,-0.103793,-0.0547823,-0.0841875,0.0188438,0.0274271,-0.00481949,0.0469253,0.104655,-0.0302858,-0.121666,-0.00913039,-0.014415,-0.0145939,0.180635,-0.0688314,-0.0828794,0.10184,-0.328082,-0.00637254,-0.0942929,0.0290757,0.0192483,-0.181837,-0.145791,-0.0879789,-0.0638319,0.0560504,-0.055323,0.0186724,-0.114894,0.127237,-0.036918,-0.100001,0.179494,-0.075413,-0.0658221,0.00109704,-0.00473262,0.0154825,-0.213608,0.0271758,0.146747,-0.0732614,-0.0509814,0.119581,0.0332473,-0.16616,0.0510228,-0.0520874,-0.0369934,-0.25704,0.0234957,0.0231302,-0.0292449,0.0407036,0.054094,-0.0183176,0.164926,0.00462391,0.0705556,0.0375905,-0.0389699,-0.200092,0.0838363,0.0989617,-0.0169785,0.113406,0.0988472,0.0365938,0.0380149,-0.00543264,3.05288e-06,0.208588,0.146993,0.165111,0.0393786,-0.0629135,-0.0355512,0.14917,0.013998,0.0473533,0.122279,0.0329617,0.139322,-0.107491,0.0479625,-0.063389,0.152561,-0.170565,0.105882,0.103273,-0.142177,-0.0667417,-0.173864,-0.0490656,0.00917091,-0.0363559,positive
0.0205021,-0.165466,0.0113958,-0.13754,-0.127519,0.00323893,-0.143967,-0.170892,0.1052,0.0703633,0.113251,-0.062599,-0.0376514,-0.0557374,0.101851,0.0541932,0.0129429,0.0119299,-0.0371769,0.0041082,-0.0655403,-0.0723812,-0.124848,-0.0997183,0.00295157,-0.174578,-0.0827804,0.0724277,-0.0667448,-0.0317697,-0.0226917,-0.0859436,-0.202249,0.175156,0.163163,0.117243,0.167959,0.175641,-0.106399,-0.156009,-0.105977,-0.0349188,-0.0252956,0.121822,0.0197896,0.119748,-0.0531078,0.208683,0.0756448,0.199906,-0.113473,0.0512793,-0.0994539,-0.142196,0.0471209,0.161815,-0.0203249,0.0278624,-0.0229132,-0.026676,0.174203,0.0618324,-0.147551,0.267534,-0.0361523,-0.125435,0.163924,0.112849,-0.134062,0.132554,-0.0427346,0.0600259,0.10593,0.198305,-0.0672954,0.101777,-0.107409,-0.0701598,-0.0980992,-0.0935637,0.00833548,0.265662,0.0392818,-0.0897258,0.107489,-0.0741501,0.135833,0.0640829,0.0800151,0.122526,0.0461561,-0.128246,-0.141923,0.11436,-0.155642,0.134814,0.0989109,-0.0525174,-0.0965649,-0.073218,positive
-0.0300266,-0.0774715,0.0158763,0.0138112,0.0483541,-0.012576,-0.0465112,-0.043096,-0.0182889,0.0132047,0.0920811,-0.0571766,-0.0966778,-0.0513781,0.0435777,0.054261,0.135046,-0.040214,-0.040473,0.00109839,-0.122983,-0.0456106,-0.052722,-0.0122191,0.109677,-0.154464,-0.0370185,-0.0156445,-0.0232651,0.0990379,-0.00922851,0.0204283,-0.183302,0.0893498,0.0693676,-0.000178192,0.223722,0.0534113,-0.128497,0.00244342,-0.110544,-0.0625153,-0.168856,0.124218,0.0545067,-0.120823,0.00257057,0.0791614,-0.00334674,0.098529,-0.0199161,-0.013678,-0.117751,-0.112827,0.0467203,0.169246,0.0249376,-0.00472849,-0.0143022,-0.0794176,0.149886,0.0939491,-0.10148,0.0628088,-0.0962232,-0.144186,0.0658782,0.113765,-0.0771283,0.0773388,0.0690667,-0.0286337,0.068038,0.0696626,0.0304662,0.233369,0.10009,-0.000324637,-0.0397532,-0.00475507,0.0705414,0.0771543,-0.0296498,-0.0431993,0.0790168,-0.00647494,0.127454,-0.10165,0.0876053,-0.0443461,0.125389,-0.123542,0.0227924,0.171478,-0.094382,0.0617737,-0.0219504,-0.0441576,0.0518979,0.0159508,positive
-0.00373758,-0.180687,0.00534436,0.0372322,-0.0233362,0.0339849,0.0202284,-0.0440014,0.0549587,0.0832781,0.1053,-0.00739376,-0.0741288,-0.0716986,0.0886715,0.00738196,0.129294,-0.107934,-0.0209502,0.0643498,-0.124082,-0.0224766,-0.0430142,-0.0392409,0.102203,-0.046844,0.04392,0.0640543,0.0229777,-0.0469806,-0.0533902,-0.0659166,-0.0900956,0.0985171,0.00989937,-0.0846132,0.170755,-0.00939771,-0.102259,-0.0194039,-0.106857,-0.0819121,-0.0460772,0.0543467,0.136909,-0.056267,-0.0228944,0.0072829,0.0536907,0.00243512,-0.0183058,-0.0132267,0.0391507,-0.172441,0.0568884,0.0801557,0.0500506,0.0208112,0.11718,-0.122427,0.124398,0.127346,-0.0553345,0.0543196,-0.0896683,-0.146269,0.13297,0.0910446,-0.0297654,0.0671919,0.0566812,0.00947502,0.0663402,0.0464224,-0.124017,0.105508,0.0574148,-0.00515434,0.0167618,-0.0389177,-0.0386788,0.0938291,-0.0548251,-0.0200181,0.110357,-0.115052,0.198284,-0.0521946,0.147442,-0.0335026,0.242455,-0.159285,0.055382,0.0448773,-0.0409257,-0.115868,-0.124268,-0.0223679,0.130964,-0.0357328,positive
-0.058977,-0.131072,-0.0527665,-0.0865414,-0.0161196,-0.0192357,-0.0505091,0.0324884,0.0724568,0.0281864,0.129099,-0.0572646,-0.0352554,-0.0149479,-0.0636034,0.048542,0.0310397,0.0358071,-0.0913153,-0.056166,-0.111529,0.0319539,-0.0525935,0.00284119,0.09313,-0.0869873,-0.0324177,-0.0202552,0.04047,0.058133,0.00821839,0.0655714,-0.180545,0.0810469,0.0745845,-0.115295,0.196244,0.117471,-0.238035,-0.0394386,-0.176513,0.0131177,-0.193768,0.076535,0.103351,-0.121202,0.0941532,-0.00437828,-0.100062,0.0766182,-0.0411809,-0.0175017,-0.0428799,-0.162121,0.0180615,0.0393091,0.0561416,-0.0158448,-0.0264854,-0.0995744,0.136804,0.0721665,-0.113975,0.0321404,-0.163044,-0.181728,0.137097,0.134314,-0.0979797,0.162572,0.0229825,0.0191574,0.0320695,-0.0662178,-0.0520614,0.214593,0.125183,-0.0311344,0.0601311,-0.0460493,0.0119002,0.140013,-0.0932242,-0.0355831,0.146309,0.000989272,0.123004,-0.0765913,0.101531,-0.0590904,0.0240321,-0.140105,0.00570312,0.116393,-0.132099,0.0821007,-0.127014,-0.0761184,-0.0450169,0.0649057,positive




In [201]:
from sklearn.metrics import f1_score

In [208]:
predict = clf.predict(Dtest)

glm prediction progress: |████████████████████████████████████████████████| 100%


In [210]:
predict.summary()

Unnamed: 0,predict,negative,positive
type,enum,real,real
mins,,6.218323148621252e-07,3.916595841314959e-06
mean,,0.16338188782043545,0.8366181121795644
maxs,,0.9999960834041587,0.9999993781676851
sigma,,0.23520870436929783,0.2352087043692978
zeros,,0,0
missing,0,0,0
0,positive,0.0654132321035984,0.9345867678964016
1,positive,0.25098149234069,0.74901850765931
2,positive,0.108727675662454,0.891272324337546


In [234]:
from prettytable import PrettyTable

In [222]:
mp = clf.model_performance(Dtest, train=False, valid=False)

In [233]:
f1 = mp.F1()[0][1]

In [232]:
ac = mp.accuracy()[0][1]

In [231]:
pre = mp.precision()[0][1]

In [236]:
x = PrettyTable(['f1', 'accuracy', 'precision'])

In [237]:
x.add_row([f1, ac, pre])

In [238]:
print(x)

+--------------------+--------------------+--------------------+
|         f1         |      accuracy      |     precision      |
+--------------------+--------------------+--------------------+
| 0.9328953037483844 | 0.8860632026071991 | 0.9984358706986444 |
+--------------------+--------------------+--------------------+


In [239]:
mp.confusion_matrix()

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5617146491016723: 


0,1,2,3,4
,negative,positive,Error,Rate
negative,10166.0,8910.0,0.4671,(8910.0/19076.0)
positive,3550.0,86610.0,0.0394,(3550.0/90160.0)
Total,13716.0,95520.0,0.1141,(12460.0/109236.0)




In [242]:
h2o.cluster().shutdown()

H2O session _sid_8e06 closed.
