In [4]:
import pandas as pd
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Initialize H2O
h2o.init()

# Load the data from "base_completa.csv"
data = h2o.import_file("C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv")

# Convert H2O Frame to a Pandas DataFrame for manipulation
df = h2o.as_list(data)

# Convert the 'Date' column from timestamp in milliseconds to 'YYYY-MM-DD' format
df['Date'] = pd.to_datetime(df['Date'], unit='ms')
df['Date'] = pd.to_datetime(df['Date'])

# Calculate the difference of points for intervals of 5 days
df['Diferenca_5_Dias'] = df['Preco_Real'].diff(periods=5)

# Remove rows where 'Diferenca_5_Dias' is NaN (such as in the first 5 rows)
df.dropna(subset=['Diferenca_5_Dias'], inplace=True)

# Save the actual difference of points for future comparison
df['Diferenca_Pontos_Real'] = df['Diferenca_5_Dias']

# Convert back to H2O DataFrame
data = h2o.H2OFrame(df)

# Define the target variable and predictor variables
target_variable = 'Diferenca_5_Dias'
predictors = data.columns
predictors.remove(target_variable)

# Split into training and test sets
train, test = data.split_frame(ratios=[.7])

# Set the `alpha` parameter to 0.25
glm_model = H2OGeneralizedLinearEstimator(alpha=0.25)

# Train the GLM model
glm_model.train(x=predictors, y=target_variable, training_frame=train)

# Make predictions on the test set
predictions = glm_model.predict(test)

# Convert the test set and predictions to Pandas DataFrames
test_df = h2o.as_list(test)
predictions_df = h2o.as_list(predictions)

# Add the 'Date' and 'Diferenca_Pontos_Real' columns to the predictions DataFrame
predictions_df['Date'] = test_df['Date']
predictions_df['Diferenca_Pontos_Real'] = test_df['Diferenca_Pontos_Real']

# Display predictions with 'Date' and 'Diferenca_Pontos_Real'
print(predictions_df)


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 11 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_gnqnw6
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.899 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%




glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
        predict           Date  Diferenca_Pontos_Real
0      3.278666  1165881600000                   3.56
1     -4.748508  1166659200000                  -4.93
2      5.371459  1167177600000                   5.78
3      1.855038  1167696000000                   2.10
4    -10.360781  1167955200000                 -10.90
...         ...            ...                    ...
1270  21.478236  1696550400000                  22.94
1271   6.865935  1696809600000                   7.46
1272  46.151922  1698105600000                  49.08
1273  16.873291  1698192000000                  18.05
1274 -32.611145  1698624000000                 -34.61

[1275 rows x 3 columns]


In [8]:
import pandas as pd
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator


# Initialize H2O
h2o.init()

# Load the data from "base_completa.csv"
data = h2o.import_file("C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv")

# Convert H2O Frame to a Pandas DataFrame for manipulation
df = h2o.as_list(data)

# Convert the 'Date' column from timestamp in milliseconds to 'YYYY-MM-DD' format
df['Date'] = pd.to_datetime(df['Date'], unit='ms')
df['Date'] = pd.to_datetime(df['Date'])

# Calculate the difference of points for intervals of 5 days
df['Diferenca_5_Dias'] = df['Preco_Real'].diff(periods=5)

# Remove rows where 'Diferenca_5_Dias' is NaN (such as in the first 5 rows)
df.dropna(subset=['Diferenca_5_Dias'], inplace=True)

# Save the actual difference of points for future comparison
df['Diferenca_Pontos_Real'] = df['Diferenca_5_Dias']

# Convert back to H2O DataFrame
data = h2o.H2OFrame(df)

# Define the target variable and predictor variables
target_variable = 'Diferenca_5_Dias'
predictors = data.columns
predictors.remove(target_variable)

# Split into training and test sets
train, test = data.split_frame(ratios=[.7])

# Set the `alpha` parameter to 0.25
glm_model = H2OGeneralizedLinearEstimator(alpha=0.25)

# Train the GLM model
glm_model.train(x=predictors, y=target_variable, training_frame=train)

# Make predictions on the test set
predictions = glm_model.predict(test)

# Convert the test set and predictions to Pandas DataFrames
test_df = h2o.as_list(test)
predictions_df = h2o.as_list(predictions)

# Add the 'Date' and 'Diferenca_Pontos_Real' columns to the predictions DataFrame
predictions_df['Date'] = test_df['Date']
predictions_df['Diferenca_Pontos_Real'] = test_df['Diferenca_Pontos_Real']

# Display predictions with 'Date' and 'Diferenca_Pontos_Real'
print(predictions_df)
performance = glm_model.model_performance(test_data=test)
# Exibir métricas de desempenho
print("MSE:", performance.mse())
print("RMSE:", performance.rmse())
print("MAE:", performance.mae())
print("R2:", performance.r2())

# Exibir as previsões com Date e Diferença de Pontos Real
print(predictions_df)



Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 15 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_milen_gnqnw6
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.898 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%




glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [None]:
import pandas as pd
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.model.metrics_base import H2OBinomialModelMetrics, H2OMultinomialModelMetrics

# Initialize H2O
h2o.init()

# Load the data from "base_completa.csv"
data = h2o.import_file("C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv")

# Convert H2O Frame to a Pandas DataFrame for manipulation
df = h2o.as_list(data)

# Convert the 'Date' column from timestamp in milliseconds to 'YYYY-MM-DD' format
df['Date'] = pd.to_datetime(df['Date'], unit='ms')
df['Date'] = pd.to_datetime(df['Date'])

# Calculate the difference of points for intervals of 5 days
df['Diferenca_5_Dias'] = df['Preco_Real'].diff(periods=5)

# Remove rows where 'Diferenca_5_Dias' is NaN (such as in the first 5 rows)
df.dropna(subset=['Diferenca_5_Dias'], inplace=True)

# Save the actual difference of points for future comparison
df['Diferenca_Pontos_Real'] = df['Diferenca_5_Dias']

# Convert back to H2O DataFrame
data = h2o.H2OFrame(df)

# Define the target variable and predictor variables
target_variable = 'Diferenca_5_Dias'
predictors = data.columns
predictors.remove(target_variable)

# Define the sliding window size and step size
window_size = 30  # Adjust as needed
step_size = 10     # Adjust as needed

# Initialize lists to store MSE and R-squared values for each window
mse_list = []
rsquared_list = []

# Perform sliding window validation
for i in range(0, len(data) - window_size + 1, step_size):
    train_window = data[i:i + window_size]
    valid_window = data[i + window_size:i + window_size + step_size]
    
    # Set up and train the GLM model for each window
    glm_model = H2OGeneralizedLinearEstimator(alpha=0.25)
    glm_model.train(x=predictors, y=target_variable, training_frame=train_window)
    
    # Make predictions on the validation window
    predictions = glm_model.predict(valid_window)
    
    # Calculate MSE and R-squared for the validation window
    mse = glm_model.model_performance(valid_window)['mse']
    rsquared = glm_model.model_performance(valid_window)['r2']
    
    mse_list.append(mse)
    rsquared_list.append(rsquared)

# Display the MSE and R-squared values for each window
for i, (mse, rsquared) in enumerate(zip(mse_list, rsquared_list)):
    print(f"Window {i+1}:")
    print(f"MSE: {mse}")
    print(f"R-squared (R²): {rsquared}")
    print()
