In [1]:
import numpy as np
import pandas as pd
import pickle 
from sklearn.model_selection import train_test_split
import h2o
from h2o.automl import H2OAutoML

In [2]:
# Read the pickled word2vec training dataset into `data`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at files produced by this project.
with open('pickled/train_word2vec_dataset.pkl', 'rb') as dataset_file:
    data = pickle.load(dataset_file)

In [3]:
# Hold out 30% of the rows for validation, stratifying on the 'mbti' label.
# A fixed random_state makes the split reproducible across kernel restarts,
# so validation metrics from different runs are comparable.
train, valid = train_test_split(
    data,
    test_size=0.3,
    stratify=data['mbti'],
    random_state=42,
)

In [4]:
# Target column and feature column(s) handed to AutoML.
# NOTE(review): 'vector' is assumed to be a column H2O can parse as a feature;
# if it holds one array per row it probably needs expanding into numeric
# columns first -- TODO confirm against the pickled dataset.
response_column = "mbti"
predictors = ['vector']

# JVM heap size for a locally started H2O server. None keeps H2O's default.
# A previous run of this cell failed with "java.lang.OutOfMemoryError: Java
# heap space" while uploading a frame, so set this (for example "24G") when
# the machine has memory to spare.
h2o_max_mem_size = None


def to_h2o_frame(pandas_frame, target):
    """Upload a pandas DataFrame to the H2O cluster for classification.

    Args:
        pandas_frame: The pandas DataFrame to upload.
        target: Name of the label column; it is converted to a factor so H2O
            treats the task as classification.

    Returns:
        The uploaded H2OFrame with `target` converted to a factor.
    """
    frame = h2o.H2OFrame(pandas_frame)
    frame[target] = frame[target].asfactor()
    return frame


# Start (or attach to) the local H2O cluster.
h2o.init(max_mem_size=h2o_max_mem_size)

# Both frames must stay alive in the cluster until training and evaluation are
# finished, so they are uploaded once and not deleted in between.
h2o_train = to_h2o_frame(train, response_column)
print("Training set uploaded")
h2o_valid = to_h2o_frame(valid, response_column)
print("Validation set uploaded")

# Train a classification model using H2OAutoML.
# AutoML is fitted once on the full training frame: a separate train() call
# per data chunk would fit new models on that chunk alone rather than continue
# the previous fit.
aml = H2OAutoML(max_runtime_secs=7200, seed=42, balance_classes=True)
aml.train(
    x=predictors,
    y=response_column,
    training_frame=h2o_train,
    validation_frame=h2o_valid,
)

# View the AutoML Leaderboard
lb = aml.leaderboard
print(lb)

# Make predictions on the validation set
predictions = aml.predict(h2o_valid)
print(predictions)

# Evaluate the model performance
performance = aml.leader.model_performance(h2o_valid)
print(performance)

# Shutdown H2O cluster (this also releases every frame held by the server).
h2o.cluster().shutdown(prompt=False)


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "21.0.3" 2024-04-16; OpenJDK Runtime Environment (build 21.0.3+9-Ubuntu-1ubuntu1); OpenJDK 64-Bit Server VM (build 21.0.3+9-Ubuntu-1ubuntu1, mixed mode, sharing)
  Starting server from /mnt/c/Users/zhang/Code/MBTI_Classifier/.venv/lib/python3.12/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpo_bbhhou
  JVM stdout: /tmp/tmpo_bbhhou/h2o_ubuntu24zmh_started_from_python.out
  JVM stderr: /tmp/tmpo_bbhhou/h2o_ubuntu24zmh_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,08 secs
H2O_cluster_timezone:,Asia/Shanghai
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.4
H2O_cluster_version_age:,6 days
H2O_cluster_name:,H2O_from_python_ubuntu24zmh_eylib2
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,9.78 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


H2OServerError: HTTP 500 Server Error:
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=ISO-8859-1"/>
<title>Error 500 java.lang.OutOfMemoryError: Java heap space</title>
</head>
<body><h2>HTTP ERROR 500 java.lang.OutOfMemoryError: Java heap space</h2>
<table>
<tr><th>URI:</th><td>/3/PostFile</td></tr>
<tr><th>STATUS:</th><td>500</td></tr>
<tr><th>MESSAGE:</th><td>java.lang.OutOfMemoryError: Java heap space</td></tr>
<tr><th>SERVLET:</th><td>water.api.PostFileServlet-7d42c224</td></tr>
<tr><th>CAUSED BY:</th><td>java.lang.OutOfMemoryError: Java heap space</td></tr>
</table>
<h3>Caused by:</h3><pre>java.lang.OutOfMemoryError: Java heap space
</pre>

</body>
</html>
