In [None]:
import gdown
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from torch.utils.data import DataLoader, Dataset
from google.colab import drive
from google.colab import files

In [None]:
# Mount Google Drive at /content/drive so that later cells can write and read
# artifacts under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def download_excel_from_drive(file_id, output_path):
    """Download a Google Drive file with gdown and load it as a DataFrame.

    Args:
        file_id: Google Drive file identifier, inserted into the `uc?id=` URL.
        output_path: Local filename the downloaded workbook is written to.

    Returns:
        The DataFrame returned by `pd.read_excel` for the downloaded file.
    """
    url = f'https://drive.google.com/uc?id={file_id}'
    gdown.download(url, output_path, quiet=False)
    return pd.read_excel(output_path)


# Google Drive file IDs of the source workbooks.
wind_data_2022_file_id = "14VKyrH9-XQrsYd3RvY5YfZOb9lwehZt7"
wind_data_2023_file_id = "1urT9E3QFX-xQQ7mTrioPBrXx3KmiTK35"
wind_data_2024_file_id = "1_XfCSTREiiU5RbjiRznw4hhCC5AZt8R4"
solar_data_file_id = "1AbDDWUy5ZHC8HqlxRvzeMesGhNQZh36U"
price_data_file_id = "1VI3vC-55JfauOvn6JNN-yPRpJ7uCnkPK"

# One workbook per year for wind, a single workbook each for solar and prices.
wind_data_2022_df = download_excel_from_drive(wind_data_2022_file_id, 'wind_data_2022.xlsx')
wind_data_2023_df = download_excel_from_drive(wind_data_2023_file_id, 'wind_data_2023.xlsx')
wind_data_2024_df = download_excel_from_drive(wind_data_2024_file_id, 'wind_data_2024.xlsx')
solar_data_df = download_excel_from_drive(solar_data_file_id, 'solar_data_2022_to_2024_df.xlsx')
price_data_df = download_excel_from_drive(price_data_file_id, 'hubavg_rtm_2022_to_2024_prices.xlsx')

Downloading...
From: https://drive.google.com/uc?id=14VKyrH9-XQrsYd3RvY5YfZOb9lwehZt7
To: /content/wind_data_2022.xlsx
100%|██████████| 1.30M/1.30M [00:00<00:00, 152MB/s]
Downloading...
From: https://drive.google.com/uc?id=1urT9E3QFX-xQQ7mTrioPBrXx3KmiTK35
To: /content/wind_data_2023.xlsx
100%|██████████| 1.30M/1.30M [00:00<00:00, 143MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_XfCSTREiiU5RbjiRznw4hhCC5AZt8R4
To: /content/wind_data_2024.xlsx
100%|██████████| 895k/895k [00:00<00:00, 79.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1AbDDWUy5ZHC8HqlxRvzeMesGhNQZh36U
To: /content/solar_data_2022_to_2024_df.xlsx
100%|██████████| 2.45M/2.45M [00:00<00:00, 140MB/s]
Downloading...
From: https://drive.google.com/uc?id=1VI3vC-55JfauOvn6JNN-yPRpJ7uCnkPK
To: /content/hubavg_rtm_2022_to_2024_prices.xlsx
100%|██████████| 3.31M/3.31M [00:00<00:00, 164MB/s]


In [None]:
# Stack the three yearly wind tables (2022, 2023, 2024 in that order) into a
# single frame with a fresh 0..n-1 index.
yearly_wind_frames = [wind_data_2022_df, wind_data_2023_df, wind_data_2024_df]
wind_data_df = pd.concat(yearly_wind_frames, ignore_index=True)

In [None]:
# Join wind and solar rows on the shared hour-ending timestamp (pandas' default
# inner join). Column names present in both tables receive the '_wind' /
# '_solar' suffixes.
merged_solar_and_wind_df = wind_data_df.merge(
    solar_data_df,
    on='Time (Hour-Ending)',
    suffixes=('_wind', '_solar'),
)

# Preview: head(-5) shows every row except the last five.
merged_solar_and_wind_df.head(-5)

Unnamed: 0,Time (Hour-Ending),Date_wind,ERCOT.LOAD_wind,ERCOT.WIND.GEN,"Total Wind Installed, MW","Wind Output, % of Load","Wind Output, % of Installed",Wind 1-hr MW change,Wind 1-hr % change,Date_solar,ERCOT.LOAD_solar,ERCOT.PVGR.GEN,"Total Solar Installed, MW","Solar Output, % of Load","Solar Output, % of Installed",Solar 1-hr MW change,Solar 1-hr % change,Daytime Hour,Ramping Daytime Hour
0,2022-01-01 01:00:00,2022-01-01,38124.261975,12067.479497,34173,31.653018,35.312906,,,2022-01-01,38124.261975,0.122304,9323,0.000321,0.001312,,,0.0,0.0
1,2022-01-01 02:00:00,2022-01-01,37122.946803,12884.367833,34173,34.707287,37.703356,816.888337,6.769337,2022-01-01,37122.946803,0.110247,9323,0.000297,0.001183,0.0,0.000000,0.0,0.0
2,2022-01-01 03:00:00,2022-01-01,35936.747949,14366.542968,34173,39.977304,42.040626,1482.175134,11.503670,2022-01-01,35936.747949,0.111524,9323,0.000310,0.001196,0.0,0.000000,0.0,0.0
3,2022-01-01 04:00:00,2022-01-01,35132.555947,16463.459644,34173,46.860979,48.176805,2096.916676,14.595833,2022-01-01,35132.555947,0.116857,9323,0.000333,0.001253,0.0,0.000000,0.0,0.0
4,2022-01-01 05:00:00,2022-01-01,34602.741810,18337.533839,34173,52.994453,53.660884,1874.074195,11.383234,2022-01-01,34602.741810,0.125148,9323,0.000362,0.001342,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26300,2024-12-31 15:00:00,2024-12-31,46622.073665,4114.358041,39357,8.824914,10.453942,-973.863943,-19.139573,2024-12-31,46622.073665,19535.237201,28793,41.901262,67.847175,727.1,3.865885,,
26301,2024-12-31 16:00:00,2024-12-31,46785.657174,3637.299542,39357,7.774390,9.241811,-477.058499,-11.594968,2024-12-31,46785.657174,19392.288824,28793,41.449218,67.350706,-142.9,-0.731499,,
26302,2024-12-31 17:00:00,2024-12-31,47143.970586,3813.464953,39357,8.088977,9.689420,176.165411,4.843302,2024-12-31,47143.970586,11971.255212,28793,25.392972,41.576964,-7421.0,-38.267788,,
26303,2024-12-31 18:00:00,2024-12-31,48047.625284,4072.616780,39357,8.476208,10.347884,259.151827,6.795705,2024-12-31,48047.625284,1455.119061,28793,3.028493,5.053725,-10516.1,-87.844590,,


In [None]:
# Columns removed from the merged wind/solar table before modelling:
# the per-source date columns, the "% of Load" / "% of Installed" ratios,
# and the two daytime flags.
columns_to_drop = [
    'Date_wind', 'Date_solar',
    'Wind Output, % of Load', 'Solar Output, % of Load',
    'Wind Output, % of Installed', 'Solar Output, % of Installed',
    'Daytime Hour', 'Ramping Daytime Hour',
]

# Mutates the frame in place, so running this cell twice raises KeyError.
merged_solar_and_wind_df.drop(columns_to_drop, axis='columns', inplace=True)

In [None]:
# Keep only the price rows for delivery interval 1 at the HB_HUBAVG
# settlement point.
is_first_interval = price_data_df['Delivery Interval'] == 1
is_hub_average = price_data_df['Settlement Point Name'] == 'HB_HUBAVG'
filtered_rtm_df = price_data_df[is_first_interval & is_hub_average]

In [None]:
# Display the (rows, columns) shape of the filtered price table.
filtered_rtm_df.shape

(26304, 8)

In [None]:
# `filtered_rtm_df` was produced by boolean indexing on `price_data_df`, so
# assigning a column on it directly triggers pandas' SettingWithCopyWarning.
# Take an explicit copy first so the assignment targets an independent frame.
filtered_rtm_df = filtered_rtm_df.copy()

# Parse both join keys as datetimes so the merge compares like with like.
filtered_rtm_df['Delivery Timestamp'] = pd.to_datetime(filtered_rtm_df['Delivery Timestamp'])
merged_solar_and_wind_df['Time (Hour-Ending)'] = pd.to_datetime(merged_solar_and_wind_df['Time (Hour-Ending)'])

# Join prices with the wind/solar table on matching timestamps (pandas'
# default inner join: rows without a match on either side are dropped).
final_merged_df = pd.merge(
    filtered_rtm_df,
    merged_solar_and_wind_df,
    left_on='Delivery Timestamp',
    right_on='Time (Hour-Ending)'
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rtm_df['Delivery Timestamp'] = pd.to_datetime(filtered_rtm_df['Delivery Timestamp'])


In [None]:
# Calendar month taken from the delivery timestamp.
final_merged_df['month'] = final_merged_df['Delivery Timestamp'].dt.month

# One-hot encode the month. drop_first=True omits the first category, so the
# encoded columns are named 'month_<value>' for every month value but the lowest.
month_encoded_df = pd.get_dummies(
    final_merged_df['month'],
    prefix='month',
    drop_first=True,
)

# Append the indicator columns to the right of the existing columns.
final_merged_df = pd.concat((final_merged_df, month_encoded_df), axis='columns')

In [None]:
# Price-table bookkeeping columns and the installed-capacity columns are not
# used downstream, so remove them.
columns_to_drop = [
    'Delivery Date', 'Delivery Hour', 'Delivery Interval',
    'Repeated Hour Flag',
    'Settlement Point Name', 'Settlement Point Type',
    'Total Wind Installed, MW', 'Total Solar Installed, MW',
]

# Mutates the frame in place, so running this cell twice raises KeyError.
final_merged_df.drop(columns_to_drop, axis='columns', inplace=True)

# Reorder so that 'Delivery Timestamp' is the first column; all other columns
# keep their relative order.
columns = ['Delivery Timestamp']
columns.extend(col for col in final_merged_df.columns if col != 'Delivery Timestamp')
final_merged_df = final_merged_df[columns]


In [None]:
# Month indicator columns produced by the one-hot encoding step
# (there is no 'month_1': the first category was dropped).
month_columns = [
    'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
    'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
]

# Continuous inputs: system load, wind generation and solar generation.
# An earlier variant also used the 'Wind 1-hr MW change', 'Wind 1-hr % change',
# 'Solar 1-hr MW change' and 'Solar 1-hr % change' columns; they are
# currently left out.
base_feature_columns = ['ERCOT.LOAD_wind', 'ERCOT.WIND.GEN', 'ERCOT.PVGR.GEN']

# Feature matrix: continuous inputs followed by the month indicators.
X = final_merged_df[base_feature_columns + month_columns]

# Target: the settlement point price.
y = final_merged_df['Settlement Point Price']

In [None]:
# Feature matrix as a NumPy array (one row per sample, one column per feature).
features = X.values

# Scale the features to [0, 1] with their own scaler. Previously a single
# scaler was refitted on the target afterwards, which discarded the fitted
# feature min/max and made it impossible to scale new feature rows later.
# NOTE(review): both scalers are fitted on the full dataset, i.e. before the
# chronological train/test split performed in a later cell, so test-period
# statistics influence the scaling — consider fitting on the training rows only.
feature_scaler = MinMaxScaler()
features_scaled = feature_scaler.fit_transform(features)

# Scale the target (prices) with a dedicated scaler. It keeps the name
# `scaler` because later cells use `scaler` to inverse-transform predictions
# and to persist the scaler to disk.
target = y.values.reshape(-1, 1)
scaler = MinMaxScaler()
target_scaled = scaler.fit_transform(target)

# Scaled features with the scaled target as the LAST column; the sequence
# builder relies on that column order.
scaled_data = np.column_stack((features_scaled, target_scaled))

In [None]:
# Number of consecutive rows (time steps) fed to the model per sample.
# NOTE(review): the merged rows appear to be hourly, so this is presumably a
# 30-hour look-back rather than 30 days — confirm.
SEQ_LEN = 30

# Chronological split: the first 80% of rows train the model.
split_idx = int(0.8 * len(scaled_data))
train_data = scaled_data[:split_idx]

# The test slice starts SEQ_LEN rows before the split point so that the first
# test window ends right before row `split_idx`, which becomes its target.
test_data = scaled_data[split_idx - SEQ_LEN:]

In [None]:
def create_sequences(data, seq_length):
    """Build sliding-window samples from a 2-D array whose last column is the target.

    For every start row ``i`` in ``range(len(data) - seq_length)``:
      * the input is rows ``i .. i + seq_length - 1`` of all columns except the last;
      * the label is the last-column value of row ``i + seq_length``.

    Args:
        data: 2-D NumPy array, features first and target in the final column.
        seq_length: Number of rows per input window.

    Returns:
        Tuple ``(inputs, labels)`` of NumPy arrays. When at least one window
        exists, ``inputs`` has shape ``(n_windows, seq_length, n_columns - 1)``
        and ``labels`` has shape ``(n_windows,)``. With no windows both are
        empty arrays of shape ``(0,)``.
    """
    window_starts = range(len(data) - seq_length)
    inputs = [data[start:start + seq_length, :-1] for start in window_starts]
    labels = [data[start + seq_length, -1] for start in window_starts]
    return np.array(inputs), np.array(labels)

# Build (window, next-step target) pairs for each split. Each X window holds
# SEQ_LEN consecutive rows of features; each y is the scaled target of the row
# that immediately follows its window.
X_train, y_train = create_sequences(train_data, SEQ_LEN)
X_test, y_test = create_sequences(test_data, SEQ_LEN)


In [None]:
class EnergyDataset(Dataset):
    """Torch dataset pairing input windows with their target values.

    Both arrays are copied into float32 tensors at construction time.
    """

    def __init__(self, X, y):
        """Store `X` (inputs) and `y` (targets) as float32 tensors."""
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Number of samples, i.e. the size of the first dimension of `X`."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the `(input, target)` pair stored at position `idx`."""
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label

# Wrap the window arrays as torch datasets.
train_dataset = EnergyDataset(X=X_train, y=y_train)
test_dataset = EnergyDataset(X=X_test, y=y_test)

# Mini-batches of 64 samples. Training batches are reshuffled every epoch;
# test batches keep their original order so predictions can be aligned with
# the targets afterwards.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)


In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer that emits one value per sequence."""

    def __init__(self, input_size, hidden_size, num_layers):
        """Create the LSTM stack and the output layer.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of each LSTM layer's hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Map a `(batch, seq_len, input_size)` tensor to a `(batch, 1)` prediction."""
        # sequence_output: (batch_size, seq_len, hidden_size)
        sequence_output, _state = self.lstm(x)
        # Only the hidden state of the final time step feeds the output layer.
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

# Model hyper-parameters. The input width is read from the training windows
# (third axis = features per time step).
input_size = X_train.shape[2]
hidden_size = 64
num_layers = 2

# Build the network with the sizes chosen above.
model = LSTMModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)


In [None]:
# Mean-squared-error loss on the scaled target, optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 100
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        predictions = model(batch_X)
        # squeeze(-1) removes only the trailing size-1 output dimension, so the
        # result always has shape (batch,) like batch_y. A bare squeeze() would
        # also drop the batch dimension when a batch holds a single sample,
        # leaving a 0-d tensor that no longer matches the target's shape.
        loss = criterion(predictions.squeeze(-1), batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report the mean per-batch loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {running_loss / len(train_loader)}")


Epoch 10/100, Loss: 0.0011685781586449019
Epoch 20/100, Loss: 0.0010284722287806814
Epoch 30/100, Loss: 0.0008874406403016513
Epoch 40/100, Loss: 0.0008233451220012921
Epoch 50/100, Loss: 0.0007260518074587645
Epoch 60/100, Loss: 0.0006218682377600108
Epoch 70/100, Loss: 0.0005345372459028513
Epoch 80/100, Loss: 0.0004932535618196325
Epoch 90/100, Loss: 0.0003781886664186138
Epoch 100/100, Loss: 0.000461913131264759


In [None]:
# Switch to inference mode and collect predictions batch by batch.
model.eval()
predictions = []
actuals = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        output = model(batch_X)
        # squeeze(-1) keeps the batch dimension. A bare squeeze() turns a
        # single-sample final batch into a 0-d array, which np.concatenate
        # below rejects.
        predictions.append(output.squeeze(-1).numpy())
        actuals.append(batch_y.numpy())

# Flatten all batches into single 1-D arrays (order matches the test loader,
# which does not shuffle).
predictions = np.concatenate(predictions)
actuals = np.concatenate(actuals)

# The scaler expects a 2-D column, so reshape before inverse scaling.
pred_scaled = predictions.reshape(-1, 1)
actual_scaled = actuals.reshape(-1, 1)

# Map scaled values back to the original price units using `scaler`, which was
# last fitted on the target column.
pred_original = scaler.inverse_transform(pred_scaled).flatten()
actual_original = scaler.inverse_transform(actual_scaled).flatten()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def _mean_or_nan(values):
    """Return the arithmetic mean of `values`, or NaN when `values` is empty."""
    return sum(values) / len(values) if values else float('nan')


# Evaluate and plot the test period in consecutive windows of 100 samples.
# Samples left over after the last full window are not evaluated.
window_size = 100
num_windows = len(y_test) // window_size

# Index labels of the rows the test targets come from: the first test target
# is row `split_idx` because the test slice starts SEQ_LEN rows earlier.
y_test_index = y.index[split_idx : split_idx + len(y_test)]

mse_arr = []
mae_arr = []
r2_arr = []

for i in range(num_windows):
    start_idx = i * window_size
    end_idx = (i + 1) * window_size

    plt.figure(figsize=(12, 5))
    plt.plot(y_test_index[start_idx:end_idx], actual_original[start_idx:end_idx], label="Actual", color='black')
    plt.plot(y_test_index[start_idx:end_idx], pred_original[start_idx:end_idx], label="LSTM", color='blue')
    plt.title(f"Electricity Price Forecast: ({y_test_index[start_idx]}, {y_test_index[end_idx-1]})")
    plt.xlabel("Time")
    plt.ylabel("Energy Price")
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Per-window error metrics in original price units.
    mse = mean_squared_error(actual_original[start_idx:end_idx], pred_original[start_idx:end_idx])
    mae = mean_absolute_error(actual_original[start_idx:end_idx], pred_original[start_idx:end_idx])
    r2 = r2_score(actual_original[start_idx:end_idx], pred_original[start_idx:end_idx])

    mse_arr.append(mse)
    mae_arr.append(mae)
    r2_arr.append(r2)

    print(f"--- Evaluation for LSTM ({start_idx}, {end_idx}) ---")
    print(f"RMSE: {mse**0.5:.4f}, MAE: {mae:.4f}, R2 Score: {r2:.4f}")
    print("-----------------------------------------------")

# NOTE(review): windows with negative R2 are excluded from the R2 average only,
# which biases that average upwards; MAE and MSE averages use all windows.
r2_arr = [r for r in r2_arr if r >= 0]  # Filter out negative values

# Averages over windows. NaN is printed instead of raising ZeroDivisionError
# when there are no full windows or every R2 value was filtered out.
print(_mean_or_nan(r2_arr))   # mean of the non-negative R2 values
print(_mean_or_nan(mae_arr))  # mean MAE
print(_mean_or_nan(mse_arr))  # mean MSE





Output hidden; open in https://colab.research.google.com to view.

In [None]:
import os

import joblib
import numpy as np
import torch

# Folder on the mounted Drive that receives every artifact; created if missing.
save_dir = '/content/drive/MyDrive/energy_lstm_project'
os.makedirs(save_dir, exist_ok=True)


def _artifact_path(filename):
    """Return the path of `filename` inside `save_dir`."""
    return os.path.join(save_dir, filename)


# Windowed train/test arrays.
np.save(_artifact_path('X_train.npy'), X_train)
np.save(_artifact_path('y_train.npy'), y_train)
np.save(_artifact_path('X_test.npy'), X_test)
np.save(_artifact_path('y_test.npy'), y_test)

# The object currently bound to `scaler` (last fitted on the target column).
joblib.dump(scaler, _artifact_path('scaler.save'))

# Model weights only (state dict), not the model class itself.
torch.save(model.state_dict(), _artifact_path('lstm_model.pt'))


In [None]:
# Re-mount Drive
from google.colab import drive
drive.mount('/content/drive')

import os
import torch
# This cell re-imports everything it needs so it can run in a fresh runtime,
# but the model class below also needs `nn`; without this import the cell
# raises NameError after a restart.
import torch.nn as nn
import numpy as np
import joblib

save_dir = '/content/drive/MyDrive/energy_lstm_project'

# Reload the windowed train/test arrays written by the save cell.
X_train = np.load(os.path.join(save_dir, 'X_train.npy'))
y_train = np.load(os.path.join(save_dir, 'y_train.npy'))
X_test = np.load(os.path.join(save_dir, 'X_test.npy'))
y_test = np.load(os.path.join(save_dir, 'y_test.npy'))

# Reload scaler
scaler = joblib.load(os.path.join(save_dir, 'scaler.save'))

# Recreate model architecture. Only the state dict was saved, so the class
# must be defined again with the same layer names and sizes before loading.
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer that emits one value per sequence."""

    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])  # Last time step

# Reload model: the input width comes from the saved windows; hidden size and
# layer count repeat the values used when the saved weights were trained.
input_size = X_train.shape[2]
model = LSTMModel(input_size=input_size, hidden_size=64, num_layers=2)
model.load_state_dict(torch.load(os.path.join(save_dir, 'lstm_model.pt')))
model.eval()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


LSTMModel(
  (lstm): LSTM(14, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)