In [9]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator

# Initialize H2O
h2o.init()

# Load the training data from "base_completa.csv"
train_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv"
train_data = h2o.import_file(train_data_path)

# Load the test data from "teste.csv"
test_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/teste.csv"
test_df = pd.read_csv(test_data_path)

# Process Training Data
train_df = h2o.as_list(train_data)
train_df['Date'] = pd.to_datetime(train_df['Date'], unit='ms')
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df['Date'] = train_df['Date'].dt.strftime('%Y-%m-%d')
train_df['Diferenca_1_Dia'] = train_df['Preco_Real'].diff(periods=1)
train_df = train_df.dropna()  # Removing rows with NaN values after diff calculation

# Process Test Data (including sliding window)
test_df['Date'] = pd.to_datetime(test_df['Date'])
for lag in range(1, 4):  # Example for 3 days lag
    test_df[f'Preco_Real_Lag{lag}'] = test_df['Preco_Real'].shift(lag)
window_size = 1
test_df['Rolling_Mean_3d'] = test_df['Preco_Real'].rolling(window=window_size).mean()
test_df['Diferenca_1_Dia'] = test_df['Preco_Real'].diff(periods=1)
test_df.fillna(method='bfill', inplace=True)

# Convert the DataFrames back to H2O Frames
train_data = h2o.H2OFrame(train_df)
test_data = h2o.H2OFrame(test_df)

# Define the target variable and predictor variables
target_variable = 'Diferenca_1_Dia'
predictors = train_data.columns
predictors.remove(target_variable)

# Initialize and Train the GLM Model
glm_model = H2ORandomForestEstimator(model_id="rf_regressor.hex")
glm_model.train(x=predictors, y=target_variable, training_frame=train_data)

# Make predictions on the test set
predictions = glm_model.predict(test_data)

# Convert predictions to Pandas DataFrame
predictions_df = h2o.as_list(predictions)
predictions_df['Date'] = test_df['Date'].dt.strftime('%Y-%m-%d')
predictions_df['Diferenca_Pontos_Predita'] = predictions_df['predict']
predictions_df['Diferenca_Pontos_Real'] = test_df['Diferenca_1_Dia']

# Display the predictions along with actual differences
print(predictions_df[['Date', 'Diferenca_Pontos_Predita', 'Diferenca_Pontos_Real']])

# Display performance metrics
performance = glm_model.model_performance(test_data=test_data)
print("\nPerformance Metrics:")
print("MSE:", performance.mse())
print("RMSE:", performance.rmse())
print("MAE:", performance.mae())
print("R2:", performance.r2())


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,58 mins 15 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_o1cevs
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.007 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%




drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%




           Date  Diferenca_Pontos_Predita  Diferenca_Pontos_Real
0    2022-01-03                  4.893933                  40.10
1    2022-01-04                  6.742200                  40.10
2    2022-01-05                  9.347900                  -1.32
3    2022-01-06                  6.755400                  13.34
4    2022-01-07                 10.018800                  27.04
..          ...                       ...                    ...
453  2023-10-24                  2.814340                   4.38
454  2023-10-25                  2.257629                 -25.06
455  2023-10-26                 -3.345167                  -3.07
456  2023-10-27                  2.118133                  -6.19
457  2023-10-30                 -0.474980                  -4.67

[458 rows x 3 columns]

Performance Metrics:
MSE: 330.51712776387956
RMSE: 18.180130026044356
MAE: 13.59044496244763
R2: -0.10782036405837436


In [3]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator
from datetime import datetime

# Function to convert timestamp to standard date format
def convert_timestamp_to_date(timestamp):
    return datetime.utcfromtimestamp(timestamp / 1000).strftime('%Y-%m-%d %H:%M:%S')

# Inicializar o H2O
h2o.init()

# Carregar os dados de treino
train_data = h2o.import_file("C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv")

# Carregar os dados de teste
test_data = h2o.import_file("C:/Users/milen/OneDrive/Documentos/TCC/Bases/teste.csv")

# Conversão para DataFrame do Pandas para manipulação (treino)
df_train = h2o.as_list(train_data)

# Calcular a diferença de pontos para intervalos de 1 dia (treino)
df_train['Diferenca_1_Dia'] = df_train['Preco_Real'].diff(periods=-1)

# Remover a última linha, pois não tem um valor de diferença válido (treino)
df_train = df_train[:-1]

# Converter de volta para H2O DataFrame (treino)
train_data = h2o.H2OFrame(df_train)

# Definir a variável alvo e as variáveis preditoras (treino)
target_variable = 'Diferenca_1_Dia'
predictors = train_data.columns
predictors.remove(target_variable)

# Construir e treinar o modelo
rf_model = H2ORandomForestEstimator(model_id="rf_regressor.hex")
rf_model.train(x=predictors, y=target_variable, training_frame=train_data)

# Fazer previsões no conjunto de teste
predictions = rf_model.predict(test_data)

# Converter as previsões para um DataFrame do Pandas
predictions_df = h2o.as_list(predictions)

# Adicionar a coluna de data à DataFrame das previsões (supondo que 'Data' é o nome da coluna de datas no teste)
test_data_df = h2o.as_list(test_data)
predictions_df['Date'] = test_data_df['Date'].apply(convert_timestamp_to_date)

# Exibir as previsões com datas
print(predictions_df)


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 37 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_yi06mc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.043 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%




drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
      predict                 Date
0   -1.884891  2022-01-03 00:00:00
1    4.137400  2022-01-04 00:00:00
2   -1.923000  2022-01-05 00:00:00
3   -4.333800  2022-01-06 00:00:00
4    4.274800  2022-01-07 00:00:00
..        ...                  ...
453 -2.325662  2023-10-24 00:00:00
454 -0.836428  2023-10-25 00:00:00
455 -3.821384  2023-10-26 00:00:00
456 -1.269600  2023-10-27 00:00:00
457  0.665600  2023-10-30 00:00:00

[458 rows x 2 columns]


In [7]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator
from datetime import datetime

# Function to convert timestamp to standard date format
def convert_timestamp_to_date(timestamp):
    return datetime.utcfromtimestamp(timestamp / 1000).strftime('%Y-%m-%d %H:%M:%S')

# Function to create a sliding window on the test data
def create_sliding_window(data, window_size=3):
    windowed_data = []
    for i in range(len(data) - window_size + 1):
        window = data[i:i + window_size]
        windowed_data.append(window)
    return pd.concat(windowed_data, axis=0)

# Inicializar o H2O
h2o.init()

# Carregar os dados de treino
train_data = h2o.import_file("C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv")

# Carregar os dados de teste
test_data = h2o.import_file("C:/Users/milen/OneDrive/Documentos/TCC/Bases/teste.csv")

# Conversão para DataFrame do Pandas para manipulação (treino)
df_train = h2o.as_list(train_data)

# Calcular a diferença de pontos para intervalos de 1 dia (treino)
df_train['Diferenca_1_Dia'] = df_train['Preco_Real'].diff(periods=-1)

# Remover a última linha, pois não tem um valor de diferença válido (treino)
df_train = df_train[:-1]

# Converter de volta para H2O DataFrame (treino)
train_data = h2o.H2OFrame(df_train)

# Definir a variável alvo e as variáveis preditoras (treino)
target_variable = 'Diferenca_1_Dia'
predictors = train_data.columns
predictors.remove(target_variable)

# Construir e treinar o modelo
rf_model = H2ORandomForestEstimator(model_id="rf_regressor.hex")
rf_model.train(x=predictors, y=target_variable, training_frame=train_data)

# Converter o conjunto de teste para DataFrame do Pandas e aplicar a janela deslizante
test_data_df = h2o.as_list(test_data)
test_data_windowed = create_sliding_window(test_data_df)

# Converter de volta para H2O DataFrame (teste)
test_data_windowed_h2o = h2o.H2OFrame(test_data_windowed)

# Fazer previsões no conjunto de teste
predictions = rf_model.predict(test_data_windowed_h2o)

# Converter as previsões para um DataFrame do Pandas
predictions_df = h2o.as_list(predictions)

# Preparar a coluna de data para combinar com as previsões
# Selecionar apenas a primeira ocorrência de cada data no conjunto de teste deslizante
unique_dates = test_data_windowed.drop_duplicates(subset=['Data'])
unique_dates['Data'] = unique_dates['Data'].apply(convert_timestamp_to_date)

# Assegurar que a quantidade de linhas em unique_dates seja igual à de predictions_df
if len(unique_dates) != len(predictions_df):
    raise ValueError("Mismatch in number of rows between predictions and unique dates")

# Adicionar a coluna de data à DataFrame das previsões
predictions_df['Data'] = unique_dates['Data']

# Exibir as previsões com datas
print(predictions_df)


Checking whether there is an H2O instance running at http://localhost:54321.

 connected.


0,1
H2O_cluster_uptime:,1 hour 48 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_yi06mc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.038 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='localhost', port=54321): Max retries exceeded with url: /3/Parse (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000269602F3910>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [None]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator

# Inicializar o H2O
h2o.init()

# Carregar os dados
data = h2o.import_file("C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv")

# Conversão para DataFrame do Pandas para manipulação
df = h2o.as_list(data)

# Garantir que a coluna 'Date' esteja no formato correto
df['Date'] = pd.to_datetime(df['Date'], unit='ms')

# Separar a data em ano, mês e dia
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Agora pode converter a data para string, se necessário
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

# Calcular a diferença de pontos para intervalos de 5 dias
df['Diferenca_5_Dias'] = df['Preco_Real'].diff(periods=-5)

# Salvar a diferença de pontos real para comparação futura
df['Diferenca_Pontos_Real'] = df['Diferenca_5_Dias']

# Remover as últimas 5 linhas, pois não têm um valor de diferença válido
df = df[:-5]

# Converter de volta para H2O DataFrame
data = h2o.H2OFrame(df)

# Definir a variável alvo e as variáveis preditoras
target_variable = 'Diferenca_5_Dias'
predictors = data.columns
predictors.remove(target_variable)

# Dividir em conjuntos de treino e teste
train, test = data.split_frame(ratios=[.7], seed=123)
test = test[test['Year'] >= 2021]

# Construir e treinar o modelo
rf_model = H2ORandomForestEstimator(model_id="rf_regressor.hex")
rf_model.train(x=predictors, y=target_variable, training_frame=train)

# Fazer previsões no conjunto de teste
predictions = rf_model.predict(test)

# Converter o conjunto de teste e as previsões para DataFrames do Pandas
test_df = h2o.as_list(test)
predictions_df = h2o.as_list(predictions)

# Adicionar a coluna 'Date' ao DataFrame de previsões
predictions_df['Date'] = df.loc[test_df.index, 'Date'].tolist()

# Converter a coluna 'Date' no DataFrame de previsões para o formato 'YYYY-MM-DD'
predictions_df['Date'] = pd.to_datetime(predictions_df['Date']).dt.strftime('%Y-%m-%d')

# Adicionar a coluna 'Diferenca_Pontos_Real' ao DataFrame de previsões
predictions_df['Diferenca_Pontos_Real'] = test_df['Diferenca_Pontos_Real']

# Obter o desempenho do modelo no conjunto de teste
performance = rf_model.model_performance(test_data=test)

# Exibir métricas de desempenho
print("MSE:", performance.mse())
print("RMSE:", performance.rmse())
print("MAE:", performance.mae())
print("R2:", performance.r2())

# Exibir as previsões com Date e Diferença de Pontos Real
print(predictions_df)


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,42 mins 55 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_yi06mc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.098 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%




drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
MSE: 48.77897318375703
RMSE: 6.984194526483138
MAE: 3.4339247849622336
R2: 0.9622188471187418
       predict        Date  Diferenca_Pontos_Real
0   -23.754400  2006-12-01                 -23.62
1   -34.792900  2006-12-04                 -34.12
2   -34.271200  2006-12-05                 -33.49
3    -7.906278  2006-12-06                  -8.14
4   -14.518467  2006-12-07                 -14.73
..         ...         ...                    ...
197 -21.676500  2007-09-17                 -21.66
198 -33.082200  2007-09-18                 -31.09
199 -23.045200  2007-09-19                 -18.05
200   8.738100  2007-09-20                   9.76
201  18.508800  2007-09-21                  18.89

[202 rows x 3 columns]
