In [8]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [9]:
import warnings

# Silence FutureWarning messages for the rest of the session; all other
# warning categories are still shown.
warnings.filterwarnings(action="ignore", category=FutureWarning)

### LSTM

In [10]:
# ─────── Hyperparameters ───────
# Data windowing / batching
SEQ_LEN = 60         # length of each input sequence (seconds)
BATCH_SIZE = 64      # samples per mini-batch

# Network shape
HIDDEN_SIZE = 64     # hidden units per LSTM layer
NUM_LAYERS = 2       # stacked LSTM layers
DROPOUT = 0.2        # dropout rate applied between LSTM layers

# Optimisation
LR = 0.001           # Adam learning rate
MAX_EPOCHS = 50      # upper bound on training epochs
PATIENCE = 7         # epochs without improvement before early stopping

In [12]:
# Input columns, in the order they are stacked into each model input row.
FEATURES = [
    "Ambient Temp Trace(F)", "Inlet Temp Trace(F)",
    "Outlet Temp Trace (F)", "Operational Mode Code",
]
# Column the models are trained to predict.
TARGET = "Power (W)"

# ─────── Device configuration ───────
# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ─────── Load and scale data ───────
# The CSV locations default to the original workstation paths, but they can be
# overridden without editing the notebook by setting the HPWH_TRAIN_CSV /
# HPWH_TEST_CSV environment variables before starting the kernel.
TRAIN_CSV = os.environ.get(
    "HPWH_TRAIN_CSV",
    r"C:\Users\jliu359\Downloads\Hpwh\Train\Train\Merged_HPWH_Train_1s.csv",
)
TEST_CSV = os.environ.get(
    "HPWH_TEST_CSV",
    r"C:\Users\jliu359\Downloads\Hpwh\Test\Test\Merged_HPWH_Test_1s.csv",
)

train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

# The scalers are fitted on the training set only and then applied unchanged
# to the test set, so no test-set statistics enter the scaling.
X_train = scaler_x.fit_transform(train_df[FEATURES])
y_train = scaler_y.fit_transform(train_df[[TARGET]]).squeeze(-1)  # (n, 1) -> (n,)
X_test  = scaler_x.transform(test_df[FEATURES])
y_test  = scaler_y.transform(test_df[[TARGET]]).squeeze(-1)

# ─────── Dataset definition (lazy slicing) ───────
class SequenceDataset(Dataset):
    """Sliding-window dataset over a feature matrix and an aligned target vector.

    Sample ``i`` is the pair ``(X[i : i + seq_len], y[i + seq_len])``: a window
    of ``seq_len`` consecutive feature rows and the target of the row that
    immediately follows the window. Windows are sliced on demand in
    ``__getitem__``; only the two full tensors are stored.

    Args:
        X: Array-like of feature rows; converted to a float32 tensor.
        y: Array-like of targets with the same number of rows as ``X``;
            converted to a float32 tensor.
        seq_len: Number of consecutive rows in each input window.

    Raises:
        ValueError: If ``X`` and ``y`` have different numbers of rows.
    """

    def __init__(self, X, y, seq_len):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        if self.X.size(0) != self.y.size(0):
            raise ValueError(
                f"X and y must have the same number of rows, "
                f"got {self.X.size(0)} and {self.y.size(0)}"
            )
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: a series no longer than one window yields no samples,
        # and a negative value would make len() itself raise.
        return max(0, self.X.size(0) - self.seq_len)

    def __getitem__(self, idx):
        """Return ``(window, target)`` for sample ``idx``.

        Negative indices count from the end. An out-of-range index raises
        ``IndexError`` instead of silently returning a truncated window.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for {n_samples} samples")
        x_seq = self.X[idx : idx + self.seq_len]  # rows idx .. idx + seq_len - 1
        y_val = self.y[idx + self.seq_len]        # target of the row after the window
        return x_seq, y_val

# Build the windowed train/test datasets and report how many samples each holds.
train_ds, test_ds = (
    SequenceDataset(features, targets, SEQ_LEN)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)
print("Train samples:", len(train_ds), "Test samples:", len(test_ds))

Using device: cpu
Train samples: 176281 Test samples: 50281


In [None]:
# Page-locked (pinned) host memory only helps host-to-GPU copies. Requesting it
# on a CPU-only run brings no benefit and makes recent PyTorch versions emit a
# warning, so it is enabled only when the selected device is CUDA.
use_pinned_memory = device.type == "cuda"

# Training batches are reshuffled every epoch; evaluation keeps the original
# time order so predictions can be plotted against the true series.
train_loader = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=use_pinned_memory
)
test_loader = DataLoader(
    test_ds, batch_size=BATCH_SIZE, shuffle=False, pin_memory=use_pinned_memory
)

# ─────── Model definition ───────
class HPWH_LSTM(nn.Module):
    """Stacked LSTM regressor that maps one input sequence to one scalar.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units in each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between stacked LSTM layers.
            Ignored (treated as 0) when ``num_layers`` is 1, because there is
            no inter-layer connection to apply it to.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        # nn.LSTM only applies dropout between layers; with a single layer a
        # non-zero value has no effect and triggers a UserWarning, so pass 0.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        # Final linear layer maps the hidden state to a single output value
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Predict one value per sequence.

        Args:
            x: Tensor of shape ``[batch, seq_len, features]``.

        Returns:
            Tensor of shape ``[batch]``.
        """
        _, (h_n, _) = self.lstm(x)
        # h_n stacks the final hidden state of every layer; index -1 is the top layer
        out = self.fc(h_n[-1])
        return out.squeeze(-1)

# Build the network on the selected device, then the loss and the optimizer.
model = HPWH_LSTM(len(FEATURES), HIDDEN_SIZE, NUM_LAYERS, DROPOUT)
model = model.to(device)

# Mean-squared error on the scaled target, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# ─────── Training loop with early stopping ───────
# NOTE(review): the "validation" pass below iterates test_loader, so the saved
# checkpoint and the stopping epoch are chosen on the same data that is later
# reported as test performance.
best_val_loss, no_improve = float("inf"), 0

for epoch in range(1, MAX_EPOCHS + 1):
    # ---- Training pass ----
    model.train()
    train_loss = 0.0
    loop = tqdm(train_loader, desc=f"Epoch {epoch}/{MAX_EPOCHS}", leave=False)
    for Xb, yb in loop:
        # Move the batch to the model's device
        Xb = Xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)

        optimizer.zero_grad()
        preds = model(Xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        # Weight the batch mean by batch size so the epoch figure is a true
        # per-sample mean even when the last batch is smaller.
        train_loss += loss.item() * Xb.size(0)
        loop.set_postfix(loss=loss.item())
    train_loss /= len(train_loader.dataset)

    # ---- Validation pass (no gradients, dropout disabled) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for Xb, yb in test_loader:
            Xb = Xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)
            preds = model(Xb)
            val_loss += criterion(preds, yb).item() * Xb.size(0)
    val_loss /= len(test_loader.dataset)

    print(f"Epoch {epoch:02d} | Train MSE: {train_loss:.5f} | Val MSE: {val_loss:.5f}")

    # ---- Early stopping check ----
    if val_loss < best_val_loss:
        # New best: checkpoint the weights and reset the patience counter
        best_val_loss, no_improve = val_loss, 0
        torch.save(model.state_dict(), "best_hpwh_lstm.pt")
        continue

    no_improve += 1
    if no_improve >= PATIENCE:
        print(f"Early stopping at epoch {epoch} (no improvement for {PATIENCE} epochs).")
        break



                                                                             

Epoch 01 | Train MSE: 0.02831 | Val MSE: 0.06213


                                                                              

Epoch 02 | Train MSE: 0.02184 | Val MSE: 0.05547


                                                                              

Epoch 03 | Train MSE: 0.01825 | Val MSE: 0.09964


                                                                              

Epoch 04 | Train MSE: 0.01120 | Val MSE: 0.12617


Epoch 5/50:  10%|█         | 280/2755 [00:10<01:30, 27.31it/s, loss=0.000816]

In [None]:
# ─────── Load best model & evaluate ───────
# map_location lets a checkpoint written on a GPU machine be restored on a
# CPU-only one (and vice versa).
model.load_state_dict(torch.load("best_hpwh_lstm.pt", map_location=device))
model.eval()

y_true_list, y_pred_list = [], []
with torch.no_grad():
    for Xb, yb in test_loader:
        Xb = Xb.to(device)
        preds = model(Xb).cpu().numpy()  # Move to CPU before converting
        y_true_list.append(yb.numpy())
        y_pred_list.append(preds)

# Concatenate batches and undo the target scaling so errors are in watts
y_true_scaled = np.concatenate(y_true_list).reshape(-1, 1)
y_pred_scaled = np.concatenate(y_pred_list).reshape(-1, 1)

y_true = scaler_y.inverse_transform(y_true_scaled).ravel()
y_pred = scaler_y.inverse_transform(y_pred_scaled).ravel()

mae  = mean_absolute_error(y_true, y_pred)
# The `squared=False` keyword of mean_squared_error was deprecated and later
# removed from scikit-learn; taking the square root explicitly gives the same
# RMSE on every version.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f"\nTest MAE: {mae:.2f} W | RMSE: {rmse:.2f} W")

# ─────── Plot predictions vs. actual ───────
# Both series are drawn on one axis, in test-set order.
plt.figure()
for series, series_label in ((y_pred, "Predicted"), (y_true, "Actual")):
    plt.plot(series, label=series_label)
plt.title("LSTM HPWH Power Prediction")
plt.xlabel("Time Step")
plt.ylabel("Power (W)")
plt.legend()
plt.show()

### GRU

In [None]:
# hpwh_gru.py

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

# ───── Hyperparameters ─────
# Data windowing / batching
SEQ_LEN = 60         # time steps per input window
BATCH_SIZE = 64      # samples per mini-batch

# Network shape
HIDDEN_SIZE = 64     # hidden units per GRU layer
NUM_LAYERS = 2       # stacked GRU layers
DROPOUT = 0.2        # dropout rate passed to the GRU

# Optimisation
LR = 0.001           # Adam learning rate
MAX_EPOCHS = 50      # upper bound on training epochs
PATIENCE = 7         # epochs without improvement before early stopping

# Input columns, in the order they are stacked into each model input row.
FEATURES = [
    "Ambient Temp Trace(F)", "Inlet Temp Trace(F)",
    "Outlet Temp Trace (F)", "Operational Mode Code",
]
# Column the model is trained to predict.
TARGET = "Power (W)"

# ───── Device ─────
# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ───── Load & Scale ─────
import os  # local import so this script-style cell stays self-contained

# The CSV locations default to the original workstation paths, but they can be
# overridden without editing the notebook by setting the HPWH_TRAIN_CSV /
# HPWH_TEST_CSV environment variables before starting the kernel.
TRAIN_CSV = os.environ.get(
    "HPWH_TRAIN_CSV",
    r"C:\Users\default.DESKTOP-C4C7JDR\Downloads\Trian\Trian\Merged_HPWH_Train_1s.csv",
)
TEST_CSV = os.environ.get(
    "HPWH_TEST_CSV",
    r"C:\Users\default.DESKTOP-C4C7JDR\Downloads\Test\Test\Merged_HPWH_Test_1s.csv",
)

train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

# The scalers are fitted on the training set only and then applied unchanged
# to the test set, so no test-set statistics enter the scaling.
X_train = scaler_x.fit_transform(train_df[FEATURES])
y_train = scaler_y.fit_transform(train_df[[TARGET]]).squeeze(-1)  # (n, 1) -> (n,)
X_test  = scaler_x.transform(test_df[FEATURES])
y_test  = scaler_y.transform(test_df[[TARGET]]).squeeze(-1)

# ───── Dataset ─────
class SequenceDataset(Dataset):
    """Sliding-window dataset over a feature matrix and an aligned target vector.

    Sample ``i`` is the pair ``(X[i : i + seq_len], y[i + seq_len])``: a window
    of ``seq_len`` consecutive feature rows and the target of the row that
    immediately follows the window.

    Args:
        X: Array-like of feature rows; converted to a float32 tensor.
        y: Array-like of targets with the same number of rows as ``X``;
            converted to a float32 tensor.
        seq_len: Number of consecutive rows in each input window.

    Raises:
        ValueError: If ``X`` and ``y`` have different numbers of rows.
    """

    def __init__(self, X, y, seq_len):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        if self.X.size(0) != self.y.size(0):
            raise ValueError(
                f"X and y must have the same number of rows, "
                f"got {self.X.size(0)} and {self.y.size(0)}"
            )
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: a series no longer than one window yields no samples,
        # and a negative value would make len() itself raise.
        return max(0, self.X.size(0) - self.seq_len)

    def __getitem__(self, idx):
        """Return ``(window, target)``; negative ``idx`` counts from the end.

        Raises ``IndexError`` for an out-of-range index instead of silently
        returning a truncated window.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for {n_samples} samples")
        return (
            self.X[idx:idx+self.seq_len],   # [seq_len, features]
            self.y[idx+self.seq_len]        # scalar target of the next row
        )

train_ds = SequenceDataset(X_train, y_train, SEQ_LEN)
test_ds  = SequenceDataset(X_test,  y_test,  SEQ_LEN)

# Pinned host memory only helps host-to-GPU copies. On a CPU-only run it brings
# no benefit and makes recent PyTorch versions emit a warning, so it is enabled
# only when the selected device is CUDA.
use_pinned_memory = device.type == "cuda"

# Training batches are reshuffled every epoch; evaluation keeps time order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=use_pinned_memory)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, pin_memory=use_pinned_memory)

# ───── Model ─────
class HPWH_GRU(nn.Module):
    """Stacked GRU regressor that maps one input sequence to one scalar.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units in each GRU layer.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied between stacked GRU layers.
            Ignored (treated as 0) when ``num_layers`` is 1, because there is
            no inter-layer connection to apply it to.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        # nn.GRU only applies dropout between layers; with a single layer a
        # non-zero value has no effect and triggers a UserWarning, so pass 0.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.gru = nn.GRU(
            input_size, hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Predict one value per sequence.

        Args:
            x: Tensor of shape ``[batch, seq_len, features]``.

        Returns:
            Tensor of shape ``[batch]``.
        """
        _, h_n = self.gru(x)
        # h_n: [num_layers, batch, hidden_size]; index -1 is the top layer
        out = self.fc(h_n[-1])
        return out.squeeze(-1)

# Build the network on the selected device, then the loss and the optimizer.
model = HPWH_GRU(len(FEATURES), HIDDEN_SIZE, NUM_LAYERS, DROPOUT)
model = model.to(device)

# Mean-squared error on the scaled target, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# ───── Training Loop ─────
# NOTE(review): the validation loss is computed on test_loader, so checkpoint
# selection and early stopping both see the data later reported as test results.
best_val = float('inf')
wait = 0
for epoch in range(1, MAX_EPOCHS+1):
    # Optimisation pass over the shuffled training windows
    model.train()
    train_loss = 0.
    for Xb, yb in tqdm(train_loader, desc=f"GRU Epoch {epoch}/{MAX_EPOCHS}", leave=False):
        Xb = Xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        preds = model(Xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch value is a per-sample mean
        train_loss += loss.item() * Xb.size(0)
    train_loss /= len(train_loader.dataset)

    # Evaluation pass: dropout off, no gradient tracking
    model.eval()
    val_loss = 0.
    with torch.no_grad():
        for Xb, yb in test_loader:
            Xb = Xb.to(device)
            yb = yb.to(device)
            val_loss += criterion(model(Xb), yb).item() * Xb.size(0)
    val_loss /= len(test_loader.dataset)

    print(f"[GRU] Epoch {epoch:02d} | Train MSE: {train_loss:.5f} | Val MSE: {val_loss:.5f}")

    if val_loss < best_val:
        # New best: checkpoint the weights and reset the patience counter
        best_val, wait = val_loss, 0
        torch.save(model.state_dict(), "best_gru.pt")
        continue

    wait += 1
    if wait >= PATIENCE:
        print("Early stopping GRU.")
        break

# ───── Evaluate ─────
# map_location lets a checkpoint written on a GPU machine be restored on a
# CPU-only one (and vice versa).
model.load_state_dict(torch.load("best_gru.pt", map_location=device))
model.eval()

# Per-batch results are collected under their own names so that y_true / y_pred
# only ever refer to the final 1-D arrays.
true_batches, pred_batches = [], []
with torch.no_grad():
    for Xb, yb in test_loader:
        Xb = Xb.to(device)
        out = model(Xb).cpu().numpy()
        pred_batches.append(out)
        true_batches.append(yb.numpy())

# Undo the target scaling so the errors are expressed in watts
y_true = scaler_y.inverse_transform(np.concatenate(true_batches).reshape(-1,1)).ravel()
y_pred = scaler_y.inverse_transform(np.concatenate(pred_batches).reshape(-1,1)).ravel()
mae  = mean_absolute_error(y_true,y_pred)
rmse = root_mean_squared_error(y_true,y_pred)
print(f"GRU Test MAE: {mae:.2f} W | RMSE: {rmse:.2f} W")

# ───── Plot ─────
# Predicted and true power on one axis, in test-set order.
plt.figure()
for series, series_label in ((y_pred, "Pred"), (y_true, "True")):
    plt.plot(series, label=series_label)
plt.title("GRU HPWH Prediction")
plt.xlabel("Time Step")
plt.ylabel("Power (W)")
plt.legend()
plt.show()


### 1D_CNN

In [None]:
# hpwh_cnn.py

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

# ───── Hyperparameters ─────
# Data windowing / batching
SEQ_LEN = 60         # time steps per input window
BATCH_SIZE = 64      # samples per mini-batch

# Network shape
NUM_FILTERS = 64     # output channels of each convolution
KERNEL_SIZE = 3      # convolution kernel width (time steps)
DROPOUT = 0.2        # dropout rate before the output layer

# Optimisation
LR = 0.001           # Adam learning rate
MAX_EPOCHS = 50      # upper bound on training epochs
PATIENCE = 7         # epochs without improvement before early stopping

# Input columns, in the order they are stacked into each model input row.
FEATURES = [
    "Ambient Temp Trace(F)", "Inlet Temp Trace(F)",
    "Outlet Temp Trace (F)", "Operational Mode Code",
]
# Column the model is trained to predict.
TARGET = "Power (W)"

# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ───── Load & Scale ─────
import os  # local import so this script-style cell stays self-contained

# The CSV locations default to the original workstation paths, but they can be
# overridden without editing the notebook by setting the HPWH_TRAIN_CSV /
# HPWH_TEST_CSV environment variables before starting the kernel.
TRAIN_CSV = os.environ.get(
    "HPWH_TRAIN_CSV",
    r"C:\Users\default.DESKTOP-C4C7JDR\Downloads\Trian\Trian\Merged_HPWH_Train_1s.csv",
)
TEST_CSV = os.environ.get(
    "HPWH_TEST_CSV",
    r"C:\Users\default.DESKTOP-C4C7JDR\Downloads\Test\Test\Merged_HPWH_Test_1s.csv",
)

train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

# The scalers are fitted on the training set only and then applied unchanged
# to the test set, so no test-set statistics enter the scaling.
X_train = scaler_x.fit_transform(train_df[FEATURES])
y_train = scaler_y.fit_transform(train_df[[TARGET]]).squeeze(-1)  # (n, 1) -> (n,)
X_test  = scaler_x.transform(test_df[FEATURES])
y_test  = scaler_y.transform(test_df[[TARGET]]).squeeze(-1)

# ───── Dataset ─────
class SeqDataset(Dataset):
    """Sliding-window dataset that yields channel-first windows for Conv1d.

    Sample ``i`` is ``(X[i : i + seq_len].T, y[i + seq_len])``: a window of
    ``seq_len`` consecutive feature rows transposed to ``[features, seq_len]``,
    paired with the target of the row that immediately follows the window.

    Args:
        X: Array-like of feature rows; converted to a float32 tensor.
        y: Array-like of targets with the same number of rows as ``X``;
            converted to a float32 tensor.
        seq_len: Number of consecutive rows in each input window.

    Raises:
        ValueError: If ``X`` and ``y`` have different numbers of rows.
    """

    def __init__(self, X, y, seq_len):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        if self.X.size(0) != self.y.size(0):
            raise ValueError(
                f"X and y must have the same number of rows, "
                f"got {self.X.size(0)} and {self.y.size(0)}"
            )
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: a series no longer than one window yields no samples,
        # and a negative value would make len() itself raise.
        return max(0, self.X.size(0) - self.seq_len)

    def __getitem__(self, idx):
        """Return ``(window, target)``; negative ``idx`` counts from the end.

        Raises ``IndexError`` for an out-of-range index instead of silently
        returning a truncated window.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for {n_samples} samples")
        # Conv1d consumes [batch, channels, seq_len], so put features first
        x = self.X[idx:idx+self.seq_len].transpose(0,1)  # [features, seq_len]
        y = self.y[idx+self.seq_len]
        return x, y

# Pinned host memory only helps host-to-GPU copies. On a CPU-only run it brings
# no benefit and makes recent PyTorch versions emit a warning, so it is enabled
# only when the selected device is CUDA.
use_pinned_memory = device.type == "cuda"

# Training batches are reshuffled every epoch; evaluation keeps time order.
train_dl = DataLoader(SeqDataset(X_train,y_train,SEQ_LEN), batch_size=BATCH_SIZE, shuffle=True, pin_memory=use_pinned_memory)
test_dl  = DataLoader(SeqDataset(X_test,y_test,SEQ_LEN),  batch_size=BATCH_SIZE, shuffle=False, pin_memory=use_pinned_memory)

# ───── Model ─────
class HPWH_CNN(nn.Module):
    """Two-layer 1-D convolutional regressor with global average pooling.

    Args:
        in_channels: Number of input channels (features per time step).
        num_filters: Output channels of both convolutions.
        kernel_size: Width of both convolution kernels.
        dropout: Dropout probability applied to the pooled features.
    """

    def __init__(self, in_channels, num_filters, kernel_size, dropout):
        super().__init__()
        half_width = kernel_size // 2  # padding used by both convolutions
        self.conv1 = nn.Conv1d(in_channels, num_filters, kernel_size, padding=half_width)
        self.conv2 = nn.Conv1d(num_filters, num_filters, kernel_size, padding=half_width)
        # Collapse the time axis to a single averaged value per channel
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters, 1)

    def forward(self, x):
        """Map ``[batch, features, seq_len]`` input to ``[batch]`` predictions."""
        for conv in (self.conv1, self.conv2):
            x = torch.relu(conv(x))
        pooled = self.pool(x).squeeze(-1)  # [batch, num_filters]
        pooled = self.dropout(pooled)
        return self.fc(pooled).squeeze(-1)

# Build the network on the selected device, then the loss and the optimizer.
model = HPWH_CNN(len(FEATURES), NUM_FILTERS, KERNEL_SIZE, DROPOUT)
model = model.to(device)

# Mean-squared error on the scaled target, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# ───── Train & Early Stop ─────
# NOTE(review): the validation loss is computed on test_dl, so checkpoint
# selection and early stopping both see the data later reported as test results.
best_val = float('inf')
wait = 0
for epoch in range(1, MAX_EPOCHS+1):
    # Optimisation pass over the shuffled training windows
    model.train()
    tloss = 0.
    for Xb, yb in tqdm(train_dl, desc=f"CNN Ep {epoch}", leave=False):
        Xb = Xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        out = model(Xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch value is a per-sample mean
        tloss += loss.item() * Xb.size(0)
    tloss /= len(train_dl.dataset)

    # Evaluation pass: dropout off, no gradient tracking
    model.eval()
    vloss = 0.
    with torch.no_grad():
        for Xb, yb in test_dl:
            Xb = Xb.to(device)
            yb = yb.to(device)
            vloss += criterion(model(Xb), yb).item() * Xb.size(0)
    vloss /= len(test_dl.dataset)

    print(f"[CNN] Ep{epoch:02d} Train MSE:{tloss:.5f} Val MSE:{vloss:.5f}")

    if vloss < best_val:
        # New best: checkpoint the weights and reset the patience counter
        best_val, wait = vloss, 0
        torch.save(model.state_dict(), "best_cnn.pt")
        continue

    wait += 1
    if wait >= PATIENCE:
        print("Early stopping CNN.")
        break

# ───── Evaluate ─────
# map_location lets a checkpoint written on a GPU machine be restored on a
# CPU-only one (and vice versa).
model.load_state_dict(torch.load("best_cnn.pt", map_location=device))
model.eval()

# Per-batch results are collected under their own names so that yt / yp only
# ever refer to the final 1-D arrays.
true_batches, pred_batches = [], []
with torch.no_grad():
    for Xb, yb in test_dl:
        out = model(Xb.to(device)).cpu().numpy()
        pred_batches.append(out)
        true_batches.append(yb.numpy())

# Undo the target scaling so the errors are expressed in watts
yt = scaler_y.inverse_transform(np.concatenate(true_batches).reshape(-1,1)).ravel()
yp = scaler_y.inverse_transform(np.concatenate(pred_batches).reshape(-1,1)).ravel()
mae = mean_absolute_error(yt, yp)
rmse = root_mean_squared_error(yt, yp)
print(f"CNN Test MAE:{mae:.2f} W RMSE:{rmse:.2f} W")

# Predicted and true power on one axis, in test-set order.
plt.figure()
for series, series_label in ((yp, "Pred"), (yt, "True")):
    plt.plot(series, label=series_label)
plt.title("CNN HPWH")
plt.xlabel("Step")
plt.ylabel("W")
plt.legend()
plt.show()


### Transformer

In [None]:
# hpwh_transformer.py

import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

# ───── Hyperparams ─────
# Data windowing / batching
SEQ_LEN = 60         # time steps per input window
BATCH_SIZE = 64      # samples per mini-batch

# Network shape
D_MODEL = 64         # embedding dimension
N_HEAD = 4           # attention heads per encoder layer
NUM_LAYERS = 2       # stacked encoder layers
DIM_FF = 128         # feed-forward width inside each encoder layer
DROPOUT = 0.1        # dropout rate inside the encoder layers

# Optimisation
LR = 0.001           # Adam learning rate
MAX_EPOCHS = 50      # upper bound on training epochs
PATIENCE = 7         # epochs without improvement before early stopping

# Input columns, in the order they are stacked into each model input row.
FEATURES = [
    "Ambient Temp Trace(F)", "Inlet Temp Trace(F)",
    "Outlet Temp Trace (F)", "Operational Mode Code",
]
# Column the model is trained to predict.
TARGET = "Power (W)"
# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ───── Load & Scale ─────
import os  # local import so this script-style cell stays self-contained

# The CSV locations default to the original workstation paths, but they can be
# overridden without editing the notebook by setting the HPWH_TRAIN_CSV /
# HPWH_TEST_CSV environment variables before starting the kernel.
TRAIN_CSV = os.environ.get(
    "HPWH_TRAIN_CSV",
    r"C:\Users\default.DESKTOP-C4C7JDR\Downloads\Trian\Trian\Merged_HPWH_Train_1s.csv",
)
TEST_CSV = os.environ.get(
    "HPWH_TEST_CSV",
    r"C:\Users\default.DESKTOP-C4C7JDR\Downloads\Test\Test\Merged_HPWH_Test_1s.csv",
)

train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

# The scalers are fitted on the training set only and then applied unchanged
# to the test set, so no test-set statistics enter the scaling.
X_train = scaler_x.fit_transform(train_df[FEATURES])
y_train = scaler_y.fit_transform(train_df[[TARGET]]).squeeze(-1)  # (n, 1) -> (n,)
X_test  = scaler_x.transform(test_df[FEATURES])
y_test  = scaler_y.transform(test_df[[TARGET]]).squeeze(-1)

# ───── Dataset ─────
class SeqDataset(Dataset):
    """Sliding-window dataset over a feature matrix and an aligned target vector.

    Sample ``i`` is the pair ``(X[i : i + seq_len], y[i + seq_len])``: a window
    of ``seq_len`` consecutive feature rows and the target of the row that
    immediately follows the window.

    Args:
        X: Array-like of feature rows; converted to a float32 tensor.
        y: Array-like of targets with the same number of rows as ``X``;
            converted to a float32 tensor.
        seq_len: Number of consecutive rows in each input window.

    Raises:
        ValueError: If ``X`` and ``y`` have different numbers of rows.
    """

    def __init__(self, X, y, seq_len):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        if self.X.size(0) != self.y.size(0):
            raise ValueError(
                f"X and y must have the same number of rows, "
                f"got {self.X.size(0)} and {self.y.size(0)}"
            )
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: a series no longer than one window yields no samples,
        # and a negative value would make len() itself raise.
        return max(0, self.X.size(0)-self.seq_len)

    def __getitem__(self, idx):
        """Return ``(window, target)``; negative ``idx`` counts from the end.

        Raises ``IndexError`` for an out-of-range index instead of silently
        returning a truncated window.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for {n_samples} samples")
        # window: [seq_len, features]; target: value of the row after the window
        return self.X[idx:idx+self.seq_len], self.y[idx+self.seq_len]

# Pinned host memory only helps host-to-GPU copies. On a CPU-only run it brings
# no benefit and makes recent PyTorch versions emit a warning, so it is enabled
# only when the selected device is CUDA.
use_pinned_memory = device.type == "cuda"

# Training batches are reshuffled every epoch; evaluation keeps time order.
train_dl = DataLoader(SeqDataset(X_train,y_train,SEQ_LEN), batch_size=BATCH_SIZE, shuffle=True, pin_memory=use_pinned_memory)
test_dl  = DataLoader(SeqDataset(X_test,y_test,SEQ_LEN),  batch_size=BATCH_SIZE, shuffle=False, pin_memory=use_pinned_memory)

# ───── Positional Encoding ─────
class PosEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch-first sequence.

    Even embedding columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``, with frequencies spaced geometrically by
    ``10000 ** (-2i / d_model)``.

    Args:
        d_model: Embedding dimension of the input (odd values are supported).
        max_len: Longest sequence length the precomputed table can serve.
    """

    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        # With an odd d_model there is one fewer cosine column than sine column,
        # so trim the frequency vector to match.
        pe[:, 1::2] = torch.cos(pos * div[: d_model // 2])
        # Register as a buffer so the table follows the module through
        # .to()/.cuda()/.half(). persistent=False keeps it out of state_dict,
        # so checkpoints stay interchangeable with ones saved before this change.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)  # [1, max_len, d_model]

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``[batch, seq_len, d_model]``.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` given at construction.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        # .to() is a no-op when the buffer already lives on x's device
        return x + self.pe[:, :seq_len, :].to(x.device)

# ───── Model ─────
class HPWH_Transformer(nn.Module):
    """Transformer-encoder regressor that maps one input sequence to one scalar.

    Each time step is projected to ``d_model`` dimensions, positional encodings
    are added, the sequence passes through a stack of encoder layers, and the
    encoding of the last time step is mapped to a single value.

    Args:
        feature_size: Number of features per time step.
        d_model: Embedding dimension used inside the encoder.
        nhead: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_ff: Width of the feed-forward sub-layer.
        dropout: Dropout probability inside the encoder layers.
    """

    def __init__(self, feature_size, d_model, nhead, num_layers, dim_ff, dropout):
        super().__init__()
        self.input_proj = nn.Linear(feature_size, d_model)
        self.pos_enc = PosEncoding(d_model)
        layer_options = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
        )
        encoder_layer = nn.TransformerEncoderLayer(**layer_options)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, 1)

    def forward(self, x):
        """Map ``[batch, seq_len, features]`` input to ``[batch]`` predictions."""
        embedded = self.pos_enc(self.input_proj(x))   # [batch, seq_len, d_model]
        encoded = self.transformer(embedded)          # [batch, seq_len, d_model]
        last_step = encoded[:, -1, :]                 # encoding of the final time step
        return self.fc(last_step).squeeze(-1)

# Build the network on the selected device, then the loss and the optimizer.
model = HPWH_Transformer(len(FEATURES), D_MODEL, N_HEAD, NUM_LAYERS, DIM_FF, DROPOUT)
model = model.to(device)

# Mean-squared error on the scaled target, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# ───── Train & Early Stop ─────
# NOTE(review): the validation loss is computed on test_dl, so checkpoint
# selection and early stopping both see the data later reported as test results.
best_val = float('inf')
wait = 0
for epoch in range(1, MAX_EPOCHS+1):
    # Optimisation pass over the shuffled training windows
    model.train()
    tloss = 0.
    for Xb, yb in tqdm(train_dl, desc=f"Trans Ep{epoch}", leave=False):
        Xb = Xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        out = model(Xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch value is a per-sample mean
        tloss += loss.item() * Xb.size(0)
    tloss /= len(train_dl.dataset)

    # Evaluation pass: dropout off, no gradient tracking
    model.eval()
    vloss = 0.
    with torch.no_grad():
        for Xb, yb in test_dl:
            Xb = Xb.to(device)
            yb = yb.to(device)
            vloss += criterion(model(Xb), yb).item() * Xb.size(0)
    vloss /= len(test_dl.dataset)

    print(f"[Trans] Ep{epoch:02d} Train MSE:{tloss:.5f} Val MSE:{vloss:.5f}")

    if vloss < best_val:
        # New best: checkpoint the weights and reset the patience counter
        best_val, wait = vloss, 0
        torch.save(model.state_dict(), "best_trans.pt")
        continue

    wait += 1
    if wait >= PATIENCE:
        print("Early stopping Transformer.")
        break

# ───── Evaluate ─────
# map_location lets a checkpoint written on a GPU machine be restored on a
# CPU-only one (and vice versa).
model.load_state_dict(torch.load("best_trans.pt", map_location=device))
model.eval()

# Per-batch results are collected under their own names so that yt / yp only
# ever refer to the final 1-D arrays.
true_batches, pred_batches = [], []
with torch.no_grad():
    for Xb, yb in test_dl:
        out = model(Xb.to(device)).cpu().numpy()
        pred_batches.append(out)
        true_batches.append(yb.numpy())

# Undo the target scaling so the errors are expressed in watts
yt = scaler_y.inverse_transform(np.concatenate(true_batches).reshape(-1,1)).ravel()
yp = scaler_y.inverse_transform(np.concatenate(pred_batches).reshape(-1,1)).ravel()
mae = mean_absolute_error(yt, yp)
rmse = root_mean_squared_error(yt, yp)
print(f"Trans Test MAE:{mae:.2f} W RMSE:{rmse:.2f} W")

# Predicted and true power on one axis, in test-set order.
plt.figure()
for series, series_label in ((yp, "Pred"), (yt, "True")):
    plt.plot(series, label=series_label)
plt.title("Transformer HPWH")
plt.xlabel("Step")
plt.ylabel("W")
plt.legend()
plt.show()
