In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from datetime import datetime, timedelta
import numpy as np
import time

# Load the dataset
df = pd.read_csv('C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima.csv')  # Change this to the path of your CSV file

# Convert the Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Forward fill to handle missing values if there's any
df.fillna(method='ffill', inplace=True)

# Function to perform walk-forward validation
def walk_forward_validation(data, start_date, end_date):
    predictions = []
    actuals = []
    # Split the data
    training_data = data[data['Date'] < start_date].copy()
    validation_data = data[(data['Date'] >= start_date) & (data['Date'] <= end_date)].copy()
    
    total_days = (end_date - start_date).days + 1
    start_time = time.time()
    
    # Train and make predictions day by day
    for i, current_date in enumerate(pd.date_range(start_date, end_date), 1):
        elapsed_time = time.time() - start_time
        average_time_per_iteration = elapsed_time / i
        estimated_time_remaining = average_time_per_iteration * (total_days - i)
        
        print(f"Processing date {current_date.strftime('%Y-%m-%d')} ({i}/{total_days}). "
              f"Estimated time remaining: {timedelta(seconds=estimated_time_remaining)}")
        
        train_X = training_data.drop(['Date', 'Preco_Real'], axis=1)
        train_y = training_data['Preco_Real']
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(train_X, train_y)
        
        test_X = validation_data[validation_data['Date'] == current_date].drop(['Date', 'Preco_Real'], axis=1)
        if not test_X.empty:
            yhat = model.predict(test_X)
            actual = validation_data.loc[validation_data['Date'] == current_date, 'Preco_Real'].values[0]
            predictions.append(yhat[0])
            actuals.append(actual)
            
            # Add the new observation to the training dataset
            new_obs = validation_data[validation_data['Date'] == current_date]
            training_data = pd.concat([training_data, new_obs])
            
    return predictions, actuals

# Define the start and end dates for the validation period
start_date = datetime(2022, 1, 1)
end_date = datetime(2023, 12, 31)

# Perform walk-forward validation
predictions, actuals = walk_forward_validation(df, start_date, end_date)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(actuals, predictions))

# Print the RMSE
print(f'RMSE: {rmse}')

# Combine the actual and predicted values for comparison
results = pd.DataFrame({
    'Date': pd.date_range(start_date, min(end_date, df['Date'].max())),
    'Actual': actuals,
    'Predicted': predictions
})

# Save the results to a CSV file
results.to_csv('path_to_save_results/walk_forward_validation_results.csv', index=False)

# Print the first few rows of the results
print(results.head())



Processing date 2022-01-01 (1/730). Estimated time remaining: 0:00:00.725645
Processing date 2022-01-02 (2/730). Estimated time remaining: 1:08:21.520109
Processing date 2022-01-03 (3/730). Estimated time remaining: 1:30:42.777325
Processing date 2022-01-04 (4/730). Estimated time remaining: 1:41:58.857756
Processing date 2022-01-05 (5/730). Estimated time remaining: 1:49:51.738414
Processing date 2022-01-06 (6/730). Estimated time remaining: 2:00:08.760761
Processing date 2022-01-07 (7/730). Estimated time remaining: 2:08:08.638961
Processing date 2022-01-08 (8/730). Estimated time remaining: 2:12:40.090887
Processing date 2022-01-09 (9/730). Estimated time remaining: 2:15:06.868962
Processing date 2022-01-10 (10/730). Estimated time remaining: 2:14:11.193598
Processing date 2022-01-11 (11/730). Estimated time remaining: 2:13:40.216971
Processing date 2022-01-12 (12/730). Estimated time remaining: 2:13:33.368152
Processing date 2022-01-13 (13/730). Estimated time remaining: 2:12:44.17

ValueError: All arrays must be of the same length