In [1]:
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators import H2OWord2vecEstimator

In [2]:
# Connect this Python session to an H2O cluster. In the recorded run an
# instance was already running at http://localhost:54321 and was reused.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 38 mins
H2O_cluster_timezone:,Asia/Colombo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.1
H2O_cluster_version_age:,"21 days, 5 hours and 29 minutes"
H2O_cluster_name:,H2O_from_python_mathanraj_5vin33
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.786 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [3]:
# Load the Craigslist job-titles CSV into an H2OFrame, naming the two
# columns explicitly and reading both of them as plain strings.
job_titles = h2o.import_file(
    "https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv",
    header=1,  # first line of the file is a header row
    col_names=["category", "jobtitle"],
    col_types=["string", "string"],
)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [4]:
# Words removed from the tokenized job titles. The original order and the
# repeated entries ("in", "a", "by", "not", "from") are preserved as-is.
STOP_WORDS = """
    ax i you edu s t m subject can
    lines re what there all we one the
    a an of or in for by on but is
    in a not with as was if they are
    this and it have from at my be by
    not that to from com org like likes
    so
""".split()

In [5]:
# Make the 'tokenize' function:
def tokenize(sentences, stop_word = STOP_WORDS):
    """Split a column of sentences into one column of cleaned, lower-cased words.

    Args:
        sentences: single-column H2OFrame of strings to tokenize.
        stop_word: words to drop from the result. Defaults to STOP_WORDS.

    Returns:
        H2OFrame of tokens. NA rows are kept by the length and stop-word
        filters (presumably they mark sentence boundaries for word2vec --
        confirm against the H2O tokenize documentation).
    """
    # Split on runs of non-word characters, then normalize the case.
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    # Drop tokens shorter than two characters; NA rows pass through.
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    # Drop every token that contains a digit.
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    # Filter against the caller-supplied list; the previous code ignored the
    # `stop_word` argument and always used the global STOP_WORDS.
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(stop_word)),:]
    return tokenized_words

In [6]:
# Make the `predict` function:
def predict(job_title, w2v, gbm):
    """Print the category predicted for a job title.

    Args:
        job_title: data accepted by h2o.H2OFrame holding the title text
            (the notebook passes a one-element list of strings).
        w2v: trained word2vec model used to embed the title's words.
        gbm: trained classifier applied to the averaged embedding.

    Returns:
        None. The prediction frame is printed, not returned.
    """
    title_frame = h2o.H2OFrame(job_title).ascharacter()
    tokens = tokenize(title_frame)
    # Collapse the per-word vectors into one averaged vector for the title.
    title_embedding = w2v.transform(tokens, aggregate_method="AVERAGE")
    prediction = gbm.predict(test_data=title_embedding)
    print(prediction)

In [7]:
# Tokenize every job title into a single column of words.
words = tokenize(job_titles["jobtitle"])

# Train the word2vec embedding on those tokens.
w2v_model = H2OWord2vecEstimator(epochs=10, sent_sample_rate=0.0)
w2v_model.train(training_frame=words)

# Sanity check: the five closest words to "teacher" in the embedding.
w2v_model.find_synonyms("teacher", count=5)

word2vec Model Build progress: |█████████████████████████████████████████████████| (done) 100%


OrderedDict([('teaching', 0.6702170372009277),
             ('infant', 0.6643931269645691),
             ('preschool', 0.6431465148925781),
             ('aide', 0.6305525302886963),
             ('toddler', 0.6170276403427124)])

In [8]:
# Calculate a vector for each job title (the average of its word vectors):
job_title_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")

# Prepare training & validation data (keep only job titles made of known words,
# i.e. rows whose first embedding column is not NA):
valid_job_titles = ~ job_title_vecs["C1"].isna()
job_titles["category"] = job_titles["category"].asfactor()
data = job_titles[valid_job_titles,:].cbind(job_title_vecs[valid_job_titles,:])

# Split by ratio: 0.7 train, 0.2 validation, remainder test. The split is
# random, so it is seeded to make the partition (and every metric computed
# from it) reproducible across kernel restarts.
train, valid, test = data.split_frame(ratios=[0.7, 0.2], seed=1234)

In [10]:
# Build an AutoML model on the word2vec features. The run is capped at
# 5 models or 60 seconds and uses 5-fold cross-validation.
aml = H2OAutoML(nfolds=5, max_models=5, max_runtime_secs=60, seed=1234)

aml.train(
    x=job_title_vecs.names,
    y="category",
    training_frame=train,
    validation_frame=valid,
)

AutoML progress: |
23:19:56.465: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
23:19:56.466: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
,multinomial,multinomial,Ridge ( lambda = 5.606E-4 ),"nlambda = 30, lambda.max = 19.97, lambda.min = 5.606E-4, lambda.1se = 0.006069",606,600,59,AutoML_2_20230711_231956_training_py_71_sid_b6ba

accounting,administrative,customerservice,education,foodbeverage,labor,Error,Rate
891.0,115.0,55.0,17.0,16.0,11.0,0.1936652,"214 / 1,105"
88.0,1386.0,184.0,36.0,30.0,36.0,0.2125,"374 / 1,760"
27.0,178.0,993.0,43.0,128.0,235.0,0.3809227,"611 / 1,604"
10.0,41.0,36.0,1576.0,15.0,17.0,0.0702065,"119 / 1,695"
2.0,46.0,117.0,12.0,1426.0,157.0,0.1897727,"334 / 1,760"
13.0,31.0,159.0,31.0,117.0,1376.0,0.2032426,"351 / 1,727"
1031.0,1797.0,1544.0,1715.0,1732.0,1832.0,0.2075433,"2,003 / 9,651"

k,hit_ratio
1,0.7924568
2,0.9141022
3,0.9686043
4,0.984872
5,0.9950264
6,1.0

accounting,administrative,customerservice,education,foodbeverage,labor,Error,Rate
271.0,23.0,13.0,6.0,2.0,2.0,0.1451104,46 / 317
20.0,359.0,58.0,14.0,9.0,16.0,0.2457983,117 / 476
10.0,50.0,273.0,16.0,38.0,66.0,0.397351,180 / 453
3.0,14.0,12.0,422.0,12.0,8.0,0.104034,49 / 471
2.0,10.0,35.0,1.0,415.0,33.0,0.1633065,81 / 496
2.0,8.0,44.0,9.0,41.0,388.0,0.2113821,104 / 492
308.0,464.0,435.0,468.0,517.0,513.0,0.2133087,"577 / 2,705"

k,hit_ratio
1,0.7866913
2,0.9038817
3,0.9611829
4,0.981146
5,0.992976
6,1.0

accounting,administrative,customerservice,education,foodbeverage,labor,Error,Rate
879.0,123.0,57.0,19.0,14.0,13.0,0.2045249,"226 / 1,105"
101.0,1352.0,194.0,40.0,35.0,38.0,0.2318182,"408 / 1,760"
39.0,191.0,960.0,47.0,134.0,233.0,0.4014963,"644 / 1,604"
11.0,49.0,40.0,1555.0,20.0,20.0,0.0825959,"140 / 1,695"
5.0,49.0,115.0,15.0,1399.0,177.0,0.2051136,"361 / 1,760"
16.0,31.0,171.0,33.0,125.0,1351.0,0.2177186,"376 / 1,727"
1051.0,1795.0,1537.0,1709.0,1727.0,1832.0,0.2232929,"2,155 / 9,651"

k,hit_ratio
1,0.776707
2,0.9013573
3,0.9613512
4,0.9815563
5,0.9930577
6,0.9999999

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.7767069,0.009113,0.7783532,0.7813472,0.788601,0.7683937,0.7668394
auc,,0.0,,,,,
err,0.2232931,0.009113,0.2216468,0.2186528,0.211399,0.2316062,0.2331606
err_count,431.0,17.578396,428.0,422.0,408.0,447.0,450.0
logloss,0.6639877,0.0213329,0.6580191,0.6511244,0.6448423,0.6666728,0.6992798
max_per_class_error,0.3958723,0.0209478,0.3956386,0.375,0.3831776,0.3956386,0.4299065
mean_per_class_accuracy,0.7762642,0.0091159,0.7769651,0.7806857,0.7886436,0.7692381,0.7657885
mean_per_class_error,0.2237358,0.0091159,0.2230348,0.2193143,0.2113564,0.230762,0.2342115
mse,0.2030884,0.0033281,0.2002412,0.2043177,0.1994183,0.2038301,0.2076345
null_deviance,6873.3027,1.556745,6876.0854,6872.4976,6872.644,6872.644,6872.644

Unnamed: 0,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test,deviance_xval,deviance_se,alpha,iterations,training_rmse,training_logloss,training_r2,training_classification_error,training_auc,training_pr_auc,validation_rmse,validation_logloss,validation_r2,validation_classification_error,validation_auc,validation_pr_auc
,2023-07-11 23:20:40,0.000 sec,2,.2E2,603,3.3002648,3.2974311,3.3479999,0.0012813,0.0,,,,,,,,,,,,,
,2023-07-11 23:20:40,0.228 sec,4,.12E2,606,3.1686976,3.1629597,3.2364666,0.0019635,0.0,,,,,,,,,,,,,
,2023-07-11 23:20:41,0.455 sec,6,.77E1,606,2.9917940,2.9820589,3.0814674,0.0029182,0.0,,,,,,,,,,,,,
,2023-07-11 23:20:41,0.676 sec,8,.48E1,606,2.7746131,2.7599241,2.8821871,0.0041502,0.0,,,,,,,,,,,,,
,2023-07-11 23:20:41,1.022 sec,11,.3E1,606,2.5350400,2.5149144,2.6509369,0.0055622,0.0,,,,,,,,,,,,,
,2023-07-11 23:20:41,1.375 sec,14,.18E1,606,2.2976666,2.2725605,2.4103677,0.0069945,0.0,,,,,,,,,,,,,
,2023-07-11 23:20:42,1.736 sec,17,.11E1,606,2.0822289,2.0534011,2.1839153,0.0082397,0.0,,,,,,,,,,,,,
,2023-07-11 23:20:42,2.096 sec,20,.71E0,606,1.8984580,1.8675159,1.9861266,0.0091842,0.0,,,,,,,,,,,,,
,2023-07-11 23:20:43,2.459 sec,23,.44E0,606,1.7480453,1.7165088,1.8221971,0.0098459,0.0,,,,,,,,,,,,,
,2023-07-11 23:20:43,2.824 sec,26,.27E0,606,1.6283814,1.5977769,1.6910412,0.0103115,0.0,,,,,,,,,,,,,

variable,relative_importance,scaled_importance,percentage
C32,2.1748502,1.0,0.0178158
C97,2.0583851,0.9464491,0.0168618
C34,2.0555570,0.9451488,0.0168386
C57,1.9459052,0.8947307,0.0159404
C91,1.9442985,0.8939919,0.0159272
C55,1.9252015,0.8852111,0.0157708
C88,1.8353012,0.8438747,0.0150343
C82,1.8067181,0.8307322,0.0148002
C4,1.8065215,0.8306418,0.0147986
C39,1.7864355,0.8214062,0.0146340


In [13]:
# Display the leaderboard of models produced by the AutoML run.
aml.leaderboard

model_id,mean_per_class_error,logloss,rmse,mse
GLM_1_AutoML_2_20230711_231956,0.223878,0.664172,0.450748,0.203173


In [16]:
# Fetch the best model from the AutoML run; it is used for the predictions below.
best_model = aml.get_best_model()

In [17]:
# Predict the category of a few unseen job titles.
# predict() prints the prediction frame itself and returns None, so it is
# called directly; wrapping it in print() only added a stray "None" line
# after every prediction.
predict(["school teacher having holidays every month"], w2v_model, best_model)
predict(["developer with 3+ Java experience, jumping"], w2v_model, best_model)
predict(["Financial accountant CPA preferred"], w2v_model, best_model)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
predict      accounting    administrative    customerservice    education    foodbeverage      labor
education    0.00103003         0.0459678          0.0214857     0.899527     0.000945136  0.0310447
[1 row x 7 columns]

None
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
predict      accounting    administrative    customerservice    education    foodbeverage     labor
labor         0.0135229          0.306551           0.202388   0.00520171       0.0911395  0.381197
[1 row x 7 columns]

None
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |█████████████████████████████████████████████

In [18]:
# NOTE: explain() needs matplotlib for plotting. The recorded run failed
# with "ImportError: Plotting functionality requires matplotlib" because it
# was not installed in the kernel's environment; install it before running.
best_model.explain()

ImportError: Plotting functionality requires matplotlib. Please install matplotlib.