In [None]:
!pip install h2o

In [None]:
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
import os
from score import pipeline_score
import time
import csv
import sys
import io
from google.colab import files

# Connect to an H2O cluster before any frame/AutoML call below
# (per the H2O docs, init() starts a local server if none is reachable).
h2o.init()

In [3]:
# Captures everything a function prints and stores it as a one-column CSV log
def capturar_e_salvar_prints_em_csv(func, *args, csv_filename="log.csv", **kwargs):
    """Call ``func`` with stdout captured and save the printed lines to a CSV file.

    While ``func`` runs, ``sys.stdout`` is redirected to an in-memory buffer, so
    nothing it prints reaches the notebook output. Afterwards each captured line
    becomes one row of ``csv_filename`` under the header ``mensagem``.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        csv_filename: Path of the CSV log to (over)write.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns.

    Raises:
        Any exception raised by ``func``. The output captured up to the failure
        is still written to ``csv_filename`` before the exception propagates.
    """
    import contextlib  # local import: keeps this cell independent of the imports cell

    buffer = io.StringIO()
    try:
        # redirect_stdout restores the previous sys.stdout on exit, even on error
        with contextlib.redirect_stdout(buffer):
            resultado = func(*args, **kwargs)
    finally:
        # Written in `finally` so the log survives a failure inside func
        with open(csv_filename, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(["mensagem"])
            writer.writerows([linha] for linha in buffer.getvalue().splitlines())

    return resultado


# Path of the CSV file where the benchmark results are accumulated
result_csv_path = "H2O_results.csv"

# Input datasets: every CSV in the working directory except the files this
# notebook writes itself (the results file and the "tabela <dataset>" score
# logs). Without this filter, re-running the cell after a benchmark run would
# queue the notebook's own outputs as datasets. The list is sorted because
# os.listdir() returns entries in arbitrary order.
datasets = sorted(
    f for f in os.listdir()
    if f.endswith(".csv")
    and f != result_csv_path
    and not f.startswith("tabela ")
)

# If the results file does not exist yet, create it with only the header row
if not os.path.exists(result_csv_path):
    pd.DataFrame(columns=["dataset", "test_score", "execution_time"]).to_csv(result_csv_path, index=False)

In [4]:
# Run H2O AutoML on every dataset, score the leader on a held-out split and
# append one row per dataset to the results CSV.
for dataset in datasets:
    # The results file is itself a CSV in the working directory, not an input
    if dataset == result_csv_path:
        continue
    print(f"Rodando {dataset}")

    df = h2o.import_file(dataset)

    # The last column is used as the response below; convert it to a factor
    df[-1] = df[-1].asfactor()

    train, test = df.split_frame(ratios=[.75], seed=42)

    x = df.columns[:-1]
    y = df.columns[-1]

    aml = H2OAutoML(verbosity='info', exclude_algos=["StackedEnsemble", "DeepLearning"])

    # perf_counter() is monotonic, so the measured duration cannot be skewed by
    # a system clock adjustment during the (long) training run
    start_time = time.perf_counter()
    aml.train(x=x, y=y, training_frame=train)
    end_time = time.perf_counter()
    execution_time = end_time - start_time

    # First column of the prediction frame vs. the last column of the test split
    y_pred = aml.leader.predict(test).as_data_frame().iloc[:, 0]
    y_test = test[-1].as_data_frame().values.ravel()
    # Whatever pipeline_score prints is captured into "tabela <dataset>"
    final_score = capturar_e_salvar_prints_em_csv(pipeline_score, y_test, y_pred, verbosity=True, csv_filename=f"tabela {dataset}")

    result = {
        "dataset": dataset,
        "test_score": final_score,
        "execution_time": execution_time,
    }

    # Append to the results CSV immediately, so finished datasets are kept
    # even if a later iteration fails
    pd.DataFrame([result]).to_csv(result_csv_path, mode="a", header=False, index=False)

    # Download the results so far and this dataset's score log
    files.download(result_csv_path)
    files.download(f"tabela {dataset}")

Rodando Maternal Health Risk.csv
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
17:20:38.349: Project: AutoML_1_20250426_172038
17:20:38.353: 5-fold cross-validation will be used.
17:20:38.354: User didn't set any runtime constraints (max runtime or max models), using default 1h time limit
17:20:38.354: Setting stopping tolerance adaptively based on the training frame: 0.03649051825844134
17:20:38.355: Build control seed: -1 (random)
17:20:38.369: training frame: Frame key: AutoML_1_20250426_172038_training_py_3_sid_bf12    cols: 7    rows: 751  chunks: 1    size: 7154  checksum: -8894078642253502692
17:20:38.369: validation frame: NULL
17:20:38.370: leaderboard frame: NULL
17:20:38.370: blending frame: NULL
17:20:38.370: response column: RiskLevel
17:20:38.370: fold column: null
17:20:38.370: weights column: null
17:20:38.442: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Rodando AIDS Clinical Trials Group Study 175.csv
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
18:11:18.798: Project: AutoML_2_20250426_181118
18:11:18.799: 5-fold cross-validation will be used.
18:11:18.799: User didn't set any runtime constraints (max runtime or max models), using default 1h time limit
18:11:18.799: Setting stopping tolerance adaptively based on the training frame: 0.024937733402690822
18:11:18.799: Build control seed: -1 (random)
18:11:18.799: training frame: Frame key: AutoML_2_20250426_181118_training_py_11_sid_bf12    cols: 24    rows: 1608  chunks: 8    size: 52120  checksum: 7227175635959498066
18:11:18.799: validation frame: NULL
18:11:18.799: leaderboard frame: NULL
18:11:18.799: blending frame: NULL
18:11:18.799: response column: cid
18:11:18.799: fold column: null
18:11:18.799: weights column: null
18:11:18.800: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Rodando Breast Cancer Wisconsin (Diagnostic).csv
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
18:59:26.632: Project: AutoML_3_20250426_185926
18:59:26.633: 5-fold cross-validation will be used.
18:59:26.633: User didn't set any runtime constraints (max runtime or max models), using default 1h time limit
18:59:26.633: Setting stopping tolerance adaptively based on the training frame: 0.048795003647426664
18:59:26.633: Build control seed: -1 (random)
18:59:26.633: training frame: Frame key: AutoML_3_20250426_185926_training_py_19_sid_bf12    cols: 31    rows: 420  chunks: 8    size: 55538  checksum: 6253737325151822802
18:59:26.633: validation frame: NULL
18:59:26.633: leaderboard frame: NULL
18:59:26.633: blending frame: NULL
18:59:26.633: response column: Diagnosis
18:59:26.633: fold column: null
18:59:26.633: weights column: null
18:59:26.633: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), de





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# `aml` is the AutoML object left by the last iteration of the benchmark loop.
# as_data_frame is a method: it must be called to obtain a pandas DataFrame,
# otherwise `lb` is a bound method and lb.head() raises AttributeError.
lb = aml.leaderboard.as_data_frame()
lb.head()

In [None]:
# Keep a handle on the leader model of `aml` (the AutoML object left by the
# last iteration of the benchmark loop)
lm = aml.leader