In [25]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import time


In [26]:
print(torch.backends.mps.is_available())

True


In [27]:
# Load the dataset
file_path = 'household_power_consumption.txt'
data = pd.read_csv(file_path, sep=';', na_values='?', low_memory=False)

# Combine and convert the Date and Time into a single Datetime column and set as index
data['Datetime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'], dayfirst=True)
data.drop(['Date', 'Time'], axis=1, inplace=True)
data.set_index('Datetime', inplace=True)



# Impute missing values based on the mean of the same time slot across different years
time_mean = data.groupby(data.index.time).mean()
for col in ['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']:
    fill_values = data.index.map(lambda x: time_mean.at[x.time(), col])
    data[col] = data[col].fillna(pd.Series(fill_values, index=data.index))

# Calculate the new feature 'Active_Energy_Not_Measured'
data['Active_Energy_Not_Measured'] = (data['Global_active_power'] * 1000 / 60) - (data['Sub_metering_1'] + data['Sub_metering_2'] + data['Sub_metering_3'])

## Resample the dataset to every 15 minutes
data_resampled = data.resample('15min').mean()

# Display the first few rows of the resampled dataframe to verify
data_resampled.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Active_Energy_Not_Measured
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-12-16 17:15:00,4.587333,0.484,234.366667,19.7,0.0,1.333333,16.833333,58.288889
2006-12-16 17:30:00,4.140667,0.327733,234.768667,17.773333,0.0,0.733333,16.866667,51.411111
2006-12-16 17:45:00,4.159333,0.028267,234.63,17.786667,0.0,0.0,16.866667,52.455556
2006-12-16 18:00:00,4.121067,0.152533,235.566,17.706667,0.0,19.933333,17.0,31.751111
2006-12-16 18:15:00,3.768533,0.0172,234.803333,16.226667,0.0,2.866667,16.933333,43.008889


In [28]:
data_resampled.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 138352 entries, 2006-12-16 17:15:00 to 2010-11-26 21:00:00
Freq: 15min
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Global_active_power         138352 non-null  float64
 1   Global_reactive_power       138352 non-null  float64
 2   Voltage                     138352 non-null  float64
 3   Global_intensity            138352 non-null  float64
 4   Sub_metering_1              138352 non-null  float64
 5   Sub_metering_2              138352 non-null  float64
 6   Sub_metering_3              138352 non-null  float64
 7   Active_Energy_Not_Measured  138352 non-null  float64
dtypes: float64(8)
memory usage: 9.5 MB


In [29]:
# Split the data into training and test sets by holding out the last year
test_start_date = '2009-12-01'
train_data = data_resampled[:test_start_date]
test_data = data_resampled[test_start_date:]

# For validation, split the remaining training data by holding out one month every five months
train_indices = []
val_indices = []

current_date = train_data.index.min()
end_date = train_data.index.max()

while current_date <= end_date:
    month_start = current_date
    month_end = month_start + pd.DateOffset(months=1)
    five_month_end = month_start + pd.DateOffset(months=5)
    
    # Add the indices of the current month to validation, rest to train
    month_indices = train_data[month_start:month_end].index
    train_month_indices = train_data[month_end:five_month_end].index
    
    val_indices.extend(month_indices)
    train_indices.extend(train_month_indices)
    
    # Move current date by five months
    current_date = five_month_end

train_final = train_data.loc[train_indices]
val_final = train_data.loc[val_indices]

# Scaling the data
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_final.values)
val_scaled = scaler.transform(val_final.values)
test_scaled = scaler.transform(test_data.values)

# Display the first few rows of the resampled dataframe to verify
train_final.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 81607 entries, 2007-01-16 17:15:00 to 2009-11-16 17:15:00
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Global_active_power         81607 non-null  float64
 1   Global_reactive_power       81607 non-null  float64
 2   Voltage                     81607 non-null  float64
 3   Global_intensity            81607 non-null  float64
 4   Sub_metering_1              81607 non-null  float64
 5   Sub_metering_2              81607 non-null  float64
 6   Sub_metering_3              81607 non-null  float64
 7   Active_Energy_Not_Measured  81607 non-null  float64
dtypes: float64(8)
memory usage: 5.6 MB


In [30]:
def create_sequences(data, n_steps):
    X, y = [], []
    for i in range(len(data) - n_steps):
        X.append(data[i:i+n_steps, :-1])  # all features except the target
        y.append(data[i+n_steps, 0])  # target feature
    return np.array(X), np.array(y)

# Number of time steps to look back
n_steps = 96  # corresponds to 1 day (24 hours * 4 intervals per hour)

# Create sequences for training, validation, and testing sets
X_train, y_train = create_sequences(train_scaled, n_steps)
X_val, y_val = create_sequences(val_scaled, n_steps)
X_test, y_test = create_sequences(test_scaled, n_steps)

In [31]:
# Check for MPS availability and set the device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)

class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, device=x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, device=x.device)
        out, (hn, cn) = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Assume X_train, y_train, X_val, y_val, X_test, y_test are already defined

# Convert to PyTorch tensors without specifying device here
train_features = torch.Tensor(X_train)
train_targets = torch.Tensor(y_train)
val_features = torch.Tensor(X_val)
val_targets = torch.Tensor(y_val)
test_features = torch.Tensor(X_test)
test_targets = torch.Tensor(y_test)

# DataLoader
train_loader = DataLoader(TensorDataset(train_features, train_targets), batch_size=64, shuffle=True)

# Initialize model, loss function, and optimizer
model = LSTMModel(input_dim=train_features.shape[2], hidden_dim=20, num_layers=2, output_dim=1).to(device) # hidDIm - 20
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 30
for epoch in range(num_epochs):
    start_time_epoch = time.time()  # Start time for the epoch
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.view(-1, 1))
        loss.backward()
        optimizer.step()
    
    # Validation
    model.eval()
    with torch.no_grad():
        val_features, val_targets = val_features.to(device), val_targets.to(device)
        val_predictions = model(val_features)
        val_loss = criterion(val_predictions, val_targets.view(-1, 1))

    end_time_epoch = time.time()  # End time for the epoch
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}')
    print(f'Epoch {epoch+1} completed in {end_time_epoch - start_time_epoch:.2f} seconds')
    print(f'----------------------------------------------------------------------------')

# Evaluate on test data
model.eval()
with torch.no_grad():
    test_features, test_targets = test_features.to(device), test_targets.to(device)
    predictions = model(test_features)
    test_loss = criterion(predictions, test_targets.view(-1, 1))

# Calculate RMSE and MAE
rmse = np.sqrt(mean_squared_error(test_targets.cpu().numpy(), predictions.cpu().numpy()))
mae = mean_absolute_error(test_targets.cpu().numpy(), predictions.cpu().numpy())
print(f'Test RMSE: {rmse}, Test MAE: {mae}')


Using device: mps
Epoch 1, Loss: 0.0040953392162919044, Validation Loss: 0.005072918254882097
Epoch 1 completed in 12.21 seconds
----------------------------------------------------------------------------
Epoch 2, Loss: 0.0010416405275464058, Validation Loss: 0.0041052247397601604
Epoch 2 completed in 11.77 seconds
----------------------------------------------------------------------------
Epoch 3, Loss: 0.00480554299429059, Validation Loss: 0.0038613954093307257
Epoch 3 completed in 11.79 seconds
----------------------------------------------------------------------------
Epoch 4, Loss: 0.005062546581029892, Validation Loss: 0.004227539524435997
Epoch 4 completed in 11.83 seconds
----------------------------------------------------------------------------
Epoch 5, Loss: 0.003094468731433153, Validation Loss: 0.0037207792047411203
Epoch 5 completed in 11.82 seconds
----------------------------------------------------------------------------
Epoch 6, Loss: 0.003807675326243043, Valida