JANELA 1 - PREVISÃO 1 DIA

In [44]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator

# Função para remover outliers
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Initialize H2O
h2o.init()

# Load the training and test data
train_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_train.csv"
train_data = h2o.import_file(train_data_path)
test_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_test.csv"
test_df = pd.read_csv(test_data_path)

# Process Training Data
train_df = h2o.as_list(train_data)
train_df['Date'] = pd.to_datetime(train_df['Date'], unit='ms')
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df['Date'] = train_df['Date'].dt.strftime('%Y-%m-%d')
train_df['Diferenca_1_Dia'] = train_df['Preco_Real'].diff(periods=1)
train_df = train_df.dropna()  # Removing rows with NaN values after diff calculation

# Process Test Data (including sliding window)
test_df['Date'] = pd.to_datetime(test_df['Date'])
for lag in range(1, 2):  # Example for 3 days lag
    test_df[f'Preco_Real_Lag{lag}'] = test_df['Preco_Real'].shift(lag)
window_size = 1
test_df['Rolling_Mean_3d'] = test_df['Preco_Real'].rolling(window=window_size).mean()
test_df['Diferenca_1_Dia'] = test_df['Preco_Real'].diff(periods=1)
test_df.fillna(method='bfill', inplace=True)

# Remove outliers from both train_df and test_df
train_df = remove_outliers(train_df, 'Diferenca_1_Dia')
test_df = remove_outliers(test_df, 'Diferenca_1_Dia')

# Convert the DataFrames back to H2O Frames
train_data = h2o.H2OFrame(train_df)
test_data = h2o.H2OFrame(test_df)

# Define the target variable and predictor variables
target_variable = 'Diferenca_1_Dia'
predictors = train_data.columns
predictors.remove(target_variable)

# Initialize and Train the GLM Model
rf = H2ORandomForestEstimator(model_id="rf_regressor.hex")
rf.train(x=predictors, y=target_variable, training_frame=train_data)

# Make predictions on the test set
predictions = rf.predict(test_data)

# Convert predictions to Pandas DataFrame
predictions_df = h2o.as_list(predictions)
predictions_df['Date'] = test_df['Date'].dt.strftime('%Y-%m-%d')
predictions_df['Diferenca_Pontos_Predita'] = predictions_df['predict']
predictions_df['Diferenca_Pontos_Real'] = test_df['Diferenca_1_Dia']

# Display the predictions along with actual differences
print(predictions_df[['Date', 'Diferenca_Pontos_Predita', 'Diferenca_Pontos_Real']])

# Display performance metrics
performance = rf.model_performance(test_data=test_data)
print("\nPerformance Metrics:")
print("MSE:", performance.mse())
print("RMSE:", performance.rmse())
print("MAE:", performance.mae())
print("R2:", performance.r2())


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,6 hours 11 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_o1cevs
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.905 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%




           Date  Diferenca_Pontos_Predita  Diferenca_Pontos_Real
0    2023-03-01                  0.193979                   2.45
1    2023-03-02                 -0.675328                   2.45
2    2023-03-03                 -2.334141                 -22.61
3    2023-03-06                 -0.329720                   9.53
4    2023-03-07                 -0.912782                  10.10
..          ...                       ...                    ...
155  2023-10-11                  0.043560                  -1.30
156  2023-10-13                  0.429372                  19.00
157  2023-10-16                  2.432453                   5.03
158  2023-10-17                  0.002184                   2.39
159  2023-10-18                  0.046867                   5.97

[160 rows x 3 columns]

Performance Metrics:
MSE: 108.41176238171886
RMSE: 10.412096925294101
MAE: 8.070443281148679
R2: -0.022059744647849877


In [45]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator

# Função para remover outliers
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Initialize H2O
h2o.init()

# Load the training and test data
train_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/dolar_train.csv"
train_data = h2o.import_file(train_data_path)
test_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/dolar_test.csv"
test_df = pd.read_csv(test_data_path)

# Process Training Data
train_df = h2o.as_list(train_data)
train_df['Date'] = pd.to_datetime(train_df['Date'], unit='ms')
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df['Date'] = train_df['Date'].dt.strftime('%Y-%m-%d')
train_df['Diferenca_1_Dia'] = train_df['Preco_Real'].diff(periods=1)
train_df = train_df.dropna()  # Removing rows with NaN values after diff calculation

# Process Test Data (including sliding window)
test_df['Date'] = pd.to_datetime(test_df['Date'])
for lag in range(1, 2):  # Example for 3 days lag
    test_df[f'Preco_Real_Lag{lag}'] = test_df['Preco_Real'].shift(lag)
window_size = 1
test_df['Rolling_Mean_3d'] = test_df['Preco_Real'].rolling(window=window_size).mean()
test_df['Diferenca_1_Dia'] = test_df['Preco_Real'].diff(periods=1)
test_df.fillna(method='bfill', inplace=True)

# Remove outliers from both train_df and test_df
train_df = remove_outliers(train_df, 'Diferenca_1_Dia')
test_df = remove_outliers(test_df, 'Diferenca_1_Dia')

# Convert the DataFrames back to H2O Frames
train_data = h2o.H2OFrame(train_df)
test_data = h2o.H2OFrame(test_df)

# Define the target variable and predictor variables
target_variable = 'Diferenca_1_Dia'
predictors = train_data.columns
predictors.remove(target_variable)

# Initialize and Train the GLM Model
rf = H2ORandomForestEstimator(model_id="rf_regressor.hex")
rf.train(x=predictors, y=target_variable, training_frame=train_data)

# Make predictions on the test set
predictions = rf.predict(test_data)

# Convert predictions to Pandas DataFrame
predictions_df = h2o.as_list(predictions)
predictions_df['Date'] = test_df['Date'].dt.strftime('%Y-%m-%d')
predictions_df['Diferenca_Pontos_Predita'] = predictions_df['predict']
predictions_df['Diferenca_Pontos_Real'] = test_df['Diferenca_1_Dia']

# Display the predictions along with actual differences
print(predictions_df[['Date', 'Diferenca_Pontos_Predita', 'Diferenca_Pontos_Real']])

# Display performance metrics
performance = rf.model_performance(test_data=test_data)
print("\nPerformance Metrics:")
print("MSE:", performance.mse())
print("RMSE:", performance.rmse())
print("MAE:", performance.mae())
print("R2:", performance.r2())


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,6 hours 12 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_o1cevs
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.904 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%




           Date  Diferenca_Pontos_Predita  Diferenca_Pontos_Real
0    2023-03-01                 -2.247791                   2.45
1    2023-03-02                 -0.370272                   2.45
2    2023-03-03                 -0.502072                 -22.61
3    2023-03-06                 -1.885091                   9.53
4    2023-03-07                 -0.558228                  10.10
..          ...                       ...                    ...
155  2023-10-11                  0.161421                  -1.30
156  2023-10-13                 -0.142179                  19.00
157  2023-10-16                 -0.142179                   5.03
158  2023-10-17                  0.469421                   2.39
159  2023-10-18                 -0.428139                   5.97

[160 rows x 3 columns]

Performance Metrics:
MSE: 112.58220446083621
RMSE: 10.610476165603323
MAE: 8.31690685120132
R2: -0.061376889511183474


In [46]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator

# Função para remover outliers
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Initialize H2O
h2o.init()

# Load the training and test data
train_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/ibov_train.csv"
train_data = h2o.import_file(train_data_path)
test_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/ibov_test.csv"
test_df = pd.read_csv(test_data_path)

# Process Training Data
train_df = h2o.as_list(train_data)
train_df['Date'] = pd.to_datetime(train_df['Date'], unit='ms')
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df['Date'] = train_df['Date'].dt.strftime('%Y-%m-%d')
train_df['Diferenca_1_Dia'] = train_df['Preco_Real'].diff(periods=1)
train_df = train_df.dropna()  # Removing rows with NaN values after diff calculation

# Process Test Data (including sliding window)
test_df['Date'] = pd.to_datetime(test_df['Date'])
for lag in range(1, 2):  # Example for 3 days lag
    test_df[f'Preco_Real_Lag{lag}'] = test_df['Preco_Real'].shift(lag)
window_size = 1
test_df['Rolling_Mean_3d'] = test_df['Preco_Real'].rolling(window=window_size).mean()
test_df['Diferenca_1_Dia'] = test_df['Preco_Real'].diff(periods=1)
test_df.fillna(method='bfill', inplace=True)

# Remove outliers from both train_df and test_df
train_df = remove_outliers(train_df, 'Diferenca_1_Dia')
test_df = remove_outliers(test_df, 'Diferenca_1_Dia')

# Convert the DataFrames back to H2O Frames
train_data = h2o.H2OFrame(train_df)
test_data = h2o.H2OFrame(test_df)

# Define the target variable and predictor variables
target_variable = 'Diferenca_1_Dia'
predictors = train_data.columns
predictors.remove(target_variable)

# Initialize and Train the GLM Model
rf = H2ORandomForestEstimator(model_id="rf_regressor.hex")
rf.train(x=predictors, y=target_variable, training_frame=train_data)

# Make predictions on the test set
predictions = rf.predict(test_data)

# Convert predictions to Pandas DataFrame
predictions_df = h2o.as_list(predictions)
predictions_df['Date'] = test_df['Date'].dt.strftime('%Y-%m-%d')
predictions_df['Diferenca_Pontos_Predita'] = predictions_df['predict']
predictions_df['Diferenca_Pontos_Real'] = test_df['Diferenca_1_Dia']

# Display the predictions along with actual differences
print(predictions_df[['Date', 'Diferenca_Pontos_Predita', 'Diferenca_Pontos_Real']])

# Display performance metrics
performance = rf.model_performance(test_data=test_data)
print("\nPerformance Metrics:")
print("MSE:", performance.mse())
print("RMSE:", performance.rmse())
print("MAE:", performance.mae())
print("R2:", performance.r2())


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,6 hours 13 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_o1cevs
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.901 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%




           Date  Diferenca_Pontos_Predita  Diferenca_Pontos_Real
0    2023-03-01                  2.828733                   2.45
1    2023-03-02                  1.482939                   2.45
2    2023-03-03                  0.932856                 -22.61
3    2023-03-06                  0.991286                   9.53
4    2023-03-07                  1.239957                  10.10
..          ...                       ...                    ...
155  2023-10-11                 -1.330617                  -1.30
156  2023-10-13                 -1.957883                  19.00
157  2023-10-16                 -0.930917                   5.03
158  2023-10-17                 -1.962717                   2.39
159  2023-10-18                 -1.424583                   5.97

[160 rows x 3 columns]

Performance Metrics:
MSE: 112.34016600887466
RMSE: 10.5990643930903
MAE: 8.22669767713873
R2: -0.059095054468821795


In [47]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator

# Função para remover outliers
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Initialize H2O
h2o.init()

# Load the training and test data
train_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/selic_train.csv"
train_data = h2o.import_file(train_data_path)
test_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/selic_test.csv"
test_df = pd.read_csv(test_data_path)

# Process Training Data
train_df = h2o.as_list(train_data)
train_df['Date'] = pd.to_datetime(train_df['Date'], unit='ms')
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df['Date'] = train_df['Date'].dt.strftime('%Y-%m-%d')
train_df['Diferenca_1_Dia'] = train_df['Preco_Real'].diff(periods=1)
train_df = train_df.dropna()  # Removing rows with NaN values after diff calculation

# Process Test Data (including sliding window)
test_df['Date'] = pd.to_datetime(test_df['Date'])
for lag in range(1, 2):  # Example for 3 days lag
    test_df[f'Preco_Real_Lag{lag}'] = test_df['Preco_Real'].shift(lag)
window_size = 1
test_df['Rolling_Mean_3d'] = test_df['Preco_Real'].rolling(window=window_size).mean()
test_df['Diferenca_1_Dia'] = test_df['Preco_Real'].diff(periods=1)
test_df.fillna(method='bfill', inplace=True)

# Remove outliers from both train_df and test_df
train_df = remove_outliers(train_df, 'Diferenca_1_Dia')
test_df = remove_outliers(test_df, 'Diferenca_1_Dia')

# Convert the DataFrames back to H2O Frames
train_data = h2o.H2OFrame(train_df)
test_data = h2o.H2OFrame(test_df)

# Define the target variable and predictor variables
target_variable = 'Diferenca_1_Dia'
predictors = train_data.columns
predictors.remove(target_variable)

# Initialize and Train the GLM Model
rf = H2ORandomForestEstimator(model_id="rf_regressor.hex")
rf.train(x=predictors, y=target_variable, training_frame=train_data)

# Make predictions on the test set
predictions = rf.predict(test_data)

# Convert predictions to Pandas DataFrame
predictions_df = h2o.as_list(predictions)
predictions_df['Date'] = test_df['Date'].dt.strftime('%Y-%m-%d')
predictions_df['Diferenca_Pontos_Predita'] = predictions_df['predict']
predictions_df['Diferenca_Pontos_Real'] = test_df['Diferenca_1_Dia']

# Display the predictions along with actual differences
print(predictions_df[['Date', 'Diferenca_Pontos_Predita', 'Diferenca_Pontos_Real']])

# Display performance metrics
performance = rf.model_performance(test_data=test_data)
print("\nPerformance Metrics:")
print("MSE:", performance.mse())
print("RMSE:", performance.rmse())
print("MAE:", performance.mae())
print("R2:", performance.r2())


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,6 hours 14 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_o1cevs
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.901 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%




drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%




           Date  Diferenca_Pontos_Predita  Diferenca_Pontos_Real
0    2023-03-01                 -0.952638                   2.45
1    2023-03-02                 -3.206888                   2.45
2    2023-03-03                 -3.378488                 -22.61
3    2023-03-06                 -3.033488                   9.53
4    2023-03-07                 -3.196288                  10.10
..          ...                       ...                    ...
155  2023-10-11                 -2.997000                  -1.30
156  2023-10-13                 -2.503000                  19.00
157  2023-10-16                 -2.503000                   5.03
158  2023-10-17                 -2.503000                   2.39
159  2023-10-18                 -2.654800                   5.97

[160 rows x 3 columns]

Performance Metrics:
MSE: 113.35590304487133
RMSE: 10.646872923298716
MAE: 8.335588396291717
R2: -0.06867098896922053


In [52]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator

# Função para remover outliers
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Initialize H2O
h2o.init()

# Load the training and test data
train_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_train.csv"
train_data = h2o.import_file(train_data_path)
test_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_test.csv"
test_df = pd.read_csv(test_data_path)

# Process Training Data
train_df = h2o.as_list(train_data)
train_df['Date'] = pd.to_datetime(train_df['Date'], unit='ms')
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df['Date'] = train_df['Date'].dt.strftime('%Y-%m-%d')
train_df['Diferenca_1_Dia'] = train_df['Preco_Real'].diff(periods=1)
train_df = train_df.dropna()  # Removing rows with NaN values after diff calculation

# Process Test Data (including sliding window)
test_df['Date'] = pd.to_datetime(test_df['Date'])
for lag in range(1, 2):  # Example for 3 days lag
    test_df[f'Preco_Real_Lag{lag}'] = test_df['Preco_Real'].shift(lag)
window_size = 1
test_df['Rolling_Mean_3d'] = test_df['Preco_Real'].rolling(window=window_size).mean()
test_df['Diferenca_1_Dia'] = test_df['Preco_Real'].diff(periods=1)
test_df.fillna(method='bfill', inplace=True)

# Remove outliers from both train_df and test_df
train_df = remove_outliers(train_df, 'Diferenca_1_Dia')
test_df = remove_outliers(test_df, 'Diferenca_1_Dia')

# Convert the DataFrames back to H2O Frames
train_data = h2o.H2OFrame(train_df)
test_data = h2o.H2OFrame(test_df)

# Define the target variable and predictor variables
target_variable = 'Diferenca_1_Dia'
predictors = train_data.columns
predictors.remove(target_variable)

# Initialize and Train the GLM Model
rf = H2ORandomForestEstimator(model_id="rf_regressor.hex")
rf.train(x=predictors, y=target_variable, training_frame=train_data)

# Make predictions on the test set
predictions = rf.predict(test_data)

# Convert predictions to Pandas DataFrame
predictions_df = h2o.as_list(predictions)
predictions_df['Date'] = test_df['Date'].dt.strftime('%Y-%m-%d')
predictions_df['Diferenca_Pontos_Predita'] = predictions_df['predict']
predictions_df['Diferenca_Pontos_Real'] = test_df['Diferenca_1_Dia']

# Display the predictions along with actual differences
print(predictions_df[['Date', 'Diferenca_Pontos_Predita', 'Diferenca_Pontos_Real']])

# Display performance metrics
performance = rf.model_performance(test_data=test_data)
print("\nPerformance Metrics:")
print("MSE:", performance.mse())
print("RMSE:", performance.rmse())
print("MAE:", performance.mae())
print("R2:", performance.r2())


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,7 hours 36 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_o1cevs
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.688 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%




drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%




           Date  Diferenca_Pontos_Predita  Diferenca_Pontos_Real
0    2023-01-06                 -1.087906                 -20.62
1    2023-01-09                 -0.239576                 -20.62
2           NaN                  0.038493                    NaN
3    2023-01-11                 -1.971426                 -23.81
4    2023-01-12                 -2.678744                   8.44
..          ...                       ...                    ...
187  2023-10-05                 -2.369923                  -1.59
188  2023-10-06                 -3.134393                   5.21
189  2023-10-09                 -2.040431                   2.00
190  2023-10-10                 -1.374732                  -3.07
191  2023-10-11                 -0.975151                  -1.30

[192 rows x 3 columns]

Performance Metrics:
MSE: 132.0876742462364
RMSE: 11.492940191536558
MAE: 9.04907768762729
R2: -0.031089551556690465


In [51]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator

# Função para remover outliers
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Initialize H2O
h2o.init()

# Load the training and test data
train_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_train.csv"
train_data = h2o.import_file(train_data_path)
test_data_path = "C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_test.csv"
test_df = pd.read_csv(test_data_path)

# Process Training Data
train_df = h2o.as_list(train_data)
train_df['Date'] = pd.to_datetime(train_df['Date'], unit='ms')
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df['Date'] = train_df['Date'].dt.strftime('%Y-%m-%d')
train_df['Diferenca_1_Dia'] = train_df['Preco_Real'].diff(periods=1)
train_df = train_df.dropna()  # Removing rows with NaN values after diff calculation

# Process Test Data
test_df['Date'] = pd.to_datetime(test_df['Date'])
for lag in range(1, 2):  # Example for 1 day lag
    test_df[f'Preco_Real_Lag{lag}'] = test_df['Preco_Real'].shift(lag)
window_size = 1
test_df['Rolling_Mean_3d'] = test_df['Preco_Real'].rolling(window=window_size).mean()
test_df['Diferenca_1_Dia'] = test_df['Preco_Real'].diff(periods=1)
test_df.fillna(method='bfill', inplace=True)

# Remove outliers
train_df = remove_outliers(train_df, 'Diferenca_1_Dia')
test_df = remove_outliers(test_df, 'Diferenca_1_Dia')

# Garantir que os conjuntos de dados tenham as mesmas colunas
common_columns = set(train_df.columns).intersection(set(test_df.columns))
train_df = train_df[list(common_columns)]
test_df = test_df[list(common_columns)]

# Convert the DataFrames back to H2O Frames
train_data = h2o.H2OFrame(train_df)
test_data = h2o.H2OFrame(test_df)

# Define the target variable and predictor variables
target_variable = 'Diferenca_1_Dia'
predictors = list(common_columns)
predictors.remove(target_variable)

# Initialize the Random Forest Model
rf = H2ORandomForestEstimator(model_id="rf_regressor.hex")

# Walk-forward validation
results = []
for i in range(len(test_df)):
    # Update train_data with the current point from test_df
    current_point = h2o.H2OFrame(test_df.iloc[i:i+1])
    train_data = train_data.rbind(current_point)

    # Train the model on the updated dataset
    rf.train(x=predictors, y=target_variable, training_frame=train_data)

    # If not the last point, make a prediction for the next point
    if i < len(test_df) - 1:
        next_point = h2o.H2OFrame(test_df.iloc[i+1:i+2])
        prediction = rf.predict(next_point)
        predicted_value = h2o.as_list(prediction)[0][0]
        actual_value = test_df.iloc[i+1][target_variable]
        results.append({'Date': test_df.iloc[i+1]['Date'].strftime('%Y-%m-%d'), 'Predicted': predicted_value, 'Actual': actual_value})

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)
print(results_df)

# Display performance metrics
performance = rf.model_performance(test_data=test_data)
print("\nPerformance Metrics:")
print("MSE:", performance.mse())
print("RMSE:", performance.rmse())
print("MAE:", performance.mae())
print("R2:", performance.r2())


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,7 hours 35 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_o1cevs
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.690 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%




drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%


KeyError: 0