In [2]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from datetime import datetime

# Build the dataset used by the rest of this cell.
#
# The series is simulated: one row per calendar day from 2021-01-01 through
# 2023-08-01, with 'Preco_Real' drawn uniformly from [1400, 1500).
# A CSV used to be read from a machine-specific absolute path right before
# this, but its contents were overwritten by the simulated frame without ever
# being used, so the only effect of that read was a FileNotFoundError on any
# other machine. It has been dropped.
np.random.seed(0)  # Seed for reproducibility
dates = pd.date_range(start='2021-01-01', end='2023-08-01', freq='D')
prices = np.random.rand(len(dates)) * 100 + 1400  # Simulated 'Preco_Real'

df = pd.DataFrame({'Date': dates, 'Preco_Real': prices})

# Function to perform walk-forward validation with multi-step forecasting
def walk_forward_validation(data, start_date, end_date, forecast_horizon=7, order=(5, 1, 0)):
    """Evaluate an ARIMA model with walk-forward, multi-step forecasts.

    Rows dated before ``start_date`` seed the training history. Rows dated
    from ``start_date`` through ``end_date`` (both inclusive) are consumed in
    consecutive windows of ``forecast_horizon`` rows; the last window may be
    shorter. For every window a fresh ARIMA model is fitted on the history,
    the window is forecast, and the window's observed values are then appended
    to the history before the next refit.

    Args:
        data: DataFrame with a 'Date' column and a 'Preco_Real' column. Rows
            are used in their existing order; they are not sorted here, so the
            caller is expected to pass them in chronological order.
        start_date: First date (inclusive) of the validation period.
        end_date: Last date (inclusive) of the validation period.
        forecast_horizon: Number of rows forecast per model refit. Must be at
            least 1.
        order: The ``(p, d, q)`` order passed to ``ARIMA``. Defaults to
            ``(5, 1, 0)``.

    Returns:
        A tuple ``(prediction_dates, actuals, predictions)`` of three
        equal-length lists aligned by position. All three are empty when no
        row falls inside the validation period.

    Raises:
        ValueError: If ``forecast_horizon`` is less than 1.
    """
    # A non-positive step would either make range() fail with an unrelated
    # message (0) or silently produce no forecasts at all (negative).
    if forecast_horizon < 1:
        raise ValueError(
            f'forecast_horizon must be a positive integer, got {forecast_horizon!r}'
        )

    predictions = []
    actuals = []
    prediction_dates = []

    # Split into the initial training history and the validation rows
    in_validation = (data['Date'] >= start_date) & (data['Date'] <= end_date)
    history = data.loc[data['Date'] < start_date, 'Preco_Real'].tolist()
    validation_data = data.loc[in_validation]

    for window_start in range(0, len(validation_data), forecast_horizon):
        # Positional slicing clamps at the end of the frame, so the final
        # window is simply shorter when the rows do not divide evenly.
        window = validation_data.iloc[window_start:window_start + forecast_horizon]
        actual = window['Preco_Real'].tolist()

        # Refit on everything observed so far and forecast this window;
        # the forecast is trimmed to the window length for the short last window.
        model_fit = ARIMA(history, order=order).fit()
        yhat = model_fit.forecast(steps=forecast_horizon)[:len(actual)]

        predictions.extend(yhat)
        actuals.extend(actual)
        prediction_dates.extend(window['Date'].tolist())

        # The observed values become training data for the next iteration
        history.extend(actual)

    return prediction_dates, actuals, predictions

# Experiment settings: how many days are forecast per model refit, and the
# first and last date of the validation period.
forecast_horizon = 14
start_date = datetime(year=2022, month=1, day=1)
end_date = datetime(year=2023, month=8, day=1)

# Run the walk-forward validation over the configured period
prediction_dates, actuals, predictions = walk_forward_validation(
    df,
    start_date,
    end_date,
    forecast_horizon,
)

# Error metrics comparing the walk-forward forecasts with the observed values
mse = mean_squared_error(actuals, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(actuals, predictions)
r2 = r2_score(actuals, predictions)

# MAPE: mean of |actual - predicted| / actual, expressed as a percentage
actual_values = np.array(actuals)
predicted_values = np.array(predictions)
mape = np.mean(np.abs((actual_values - predicted_values) / actual_values)) * 100

# Report every metric on its own line
print(
    f'MSE: {mse}',
    f'RMSE: {rmse}',
    f'MAE: {mae}',
    f'MAPE: {mape}%',
    f'R-squared: {r2}',
    sep='\n',
)

# Side-by-side table of observed and forecast values, one row per forecast date
results = pd.DataFrame(
    {'Date': prediction_dates, 'Actual': actuals, 'Predicted': predictions}
)

# Keep only the rows inside the inspection window (both bounds inclusive)
specific_start_date = datetime(2023, 7, 1)
specific_end_date = datetime(2023, 7, 7)
in_window = results['Date'].between(specific_start_date, specific_end_date)
specific_results = results[in_window]

print(specific_results)


MSE: 1111.5454773349086
RMSE: 33.33984819003992
MAE: 28.24513872615632
MAPE: 1.950395717879663%
R-squared: -0.2964181512702011
          Date       Actual    Predicted
546 2023-07-01  1405.687726  1440.777313
547 2023-07-02  1498.048566  1442.371694
548 2023-07-03  1409.645086  1436.089758
549 2023-07-04  1486.347065  1440.029153
550 2023-07-05  1456.650611  1444.309983
551 2023-07-06  1436.791749  1447.350443
552 2023-07-07  1434.234238  1442.420017
