In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# Load datasets (already loaded in previous step)

# Convert timestamp to datetime
madalena_df["timestamp"] = pd.to_datetime(madalena_df["timestamp"])

# Select relevant features
feature_cols = ["total_energy", "T_out [°C]"]  # Input features
comfort_cols = ["T_in[°C]", "RH [%]", "CO2[ppm]", "PM2_5[ug/m3]"]  # Target comfort metrics

# Normalize features and targets
scaler_X = MinMaxScaler()
scaler_Y = MinMaxScaler()

# Fit the scalers on the training portion only. The train/validation split
# further down takes the first 80% of rows for training; fitting on every row
# would let the validation min/max influence the training data (leakage).
# NOTE: keep this fraction in sync with the split below.
train_fraction = 0.8
n_fit_rows = int(train_fraction * len(madalena_df))
scaler_X.fit(madalena_df[feature_cols].iloc[:n_fit_rows])
scaler_Y.fit(madalena_df[comfort_cols].iloc[:n_fit_rows])

# Apply the train-fitted scaling to all rows. Validation rows may therefore
# fall slightly outside [0, 1]; that is expected.
madalena_df[feature_cols] = scaler_X.transform(madalena_df[feature_cols])
madalena_df[comfort_cols] = scaler_Y.transform(madalena_df[comfort_cols])

# Define sequence length (60-minute history)
sequence_length = 60

# Prepare sequences for LSTM using PyTorch Dataset
class EnergyDataset(Dataset):
    """Sliding-window dataset pairing a history window with the next target row.

    Sample ``i`` is ``(data[i:i + sequence_length], targets[i + sequence_length])``:
    ``sequence_length`` consecutive input rows and the target row at the
    position immediately after the window.

    Args:
        data: Row-indexable inputs supporting ``len()`` and slicing
            (e.g. a 2-D NumPy array).
        targets: Row-indexable targets, addressed with the same row positions
            as ``data``.
        sequence_length: Number of consecutive input rows per window.
    """

    def __init__(self, data, targets, sequence_length):
        self.data = data
        self.targets = targets
        self.sequence_length = sequence_length

    def __len__(self):
        """Return the number of complete windows (0 if ``data`` is too short)."""
        # Clamp at zero: a negative value would make ``len()`` raise ValueError.
        return max(0, len(self.data) - self.sequence_length)

    def __getitem__(self, idx):
        """Return ``(window, target)`` as float32 tensors for sample ``idx``.

        Negative indices count from the end, as with built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        position = idx + n_samples if idx < 0 else idx
        if not 0 <= position < n_samples:
            raise IndexError(
                f"index {idx} out of range for dataset of length {n_samples}"
            )

        window_end = position + self.sequence_length
        x_seq = self.data[position:window_end]
        y_target = self.targets[window_end]
        return (
            torch.tensor(x_seq, dtype=torch.float32),
            torch.tensor(y_target, dtype=torch.float32),
        )

# Pull the input and target columns out of the frame as NumPy arrays
data_Y = madalena_df[comfort_cols].values
data_X = madalena_df[feature_cols].values

# Positional 80/20 cut: leading rows train, trailing rows validate
train_size = int(0.8 * len(data_X))
X_train, Y_train = data_X[:train_size], data_Y[:train_size]
X_val, Y_val = data_X[train_size:], data_Y[train_size:]

# Wrap each partition in a windowed dataset
train_dataset = EnergyDataset(X_train, Y_train, sequence_length)
val_dataset = EnergyDataset(X_val, Y_val, sequence_length)

# Batch the windows; only the training loader shuffles
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    shuffle=False,
)

# Compute per-house summary statistics (mean, std, max, min) of total energy.
# The "total_energy" column in madalena_df was overwritten with MinMax-scaled
# values by the normalization step, so map the feature columns back to their
# original units first; otherwise the statistics and the plot below would be
# reported in scaled units rather than actual consumption.
raw_features = np.asarray(scaler_X.inverse_transform(madalena_df[feature_cols]))
house_energy_df = pd.DataFrame(
    {
        "house_id": madalena_df["house_id"].values,
        "total_energy": raw_features[:, feature_cols.index("total_energy")],
    }
)
house_energy_stats = house_energy_df.groupby("house_id")["total_energy"].agg(["mean", "std", "max", "min"])

# Visualize energy distribution per house
plt.figure(figsize=(12, 6))
sns.boxplot(x="house_id", y="total_energy", data=house_energy_df)
plt.xticks(rotation=45)
plt.title("Energy Consumption Distribution per House")
plt.xlabel("House ID")
plt.ylabel("Total Energy Consumption")
plt.show()

# Display summary statistics.
# ace_tools is only present in some hosted environments; fall back to plain
# printing so the cell still runs elsewhere.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="House Energy Statistics", dataframe=house_energy_stats)
else:
    print("House Energy Statistics")
    print(house_energy_stats)

# Display dataset shape
len(train_dataset), len(val_dataset)