# Water Demand Forecasting Project
This project involves water demand forecasting using WaterDMA2 data. This notebook walks through data preprocessing, sequence generation, model training using an LSTM model, and final evaluation.

In [1]:
from WaterDMA2 import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
sns.set()

## Defining File Paths and Setting Random Seeds
We define the file paths for the dataset and set random seeds for reproducibility.

In [2]:
# Fix the RNG state first so every later cell starts from the same seeds
np.random.seed(0)
torch.manual_seed(0)

# Locations of the input CSV files (placeholders; point these at the real data)
Training_WaterDMA_2_path = '/path/to/Training_WaterDMA_2.csv'
Testing_WaterDMA_2_path = '/path/to/Testing_WaterDMA_2.csv'
WaterDMA_2_Number_of_Meters_path = '/path/to/WaterDMA_2_Number_of_Meters.csv'
Weather_data_path = '/path/to/Weather_Bronderslev_20152022.csv'

## Data Preprocessing
We initialize and run the data preprocessing on the training and testing datasets.

In [3]:
# Build the preprocessing object from the training, meter-count and weather files
prepro = Water_data_preprocessing(
    Training_WaterDMA_2_path,
    WaterDMA_2_Number_of_Meters_path,
    Weather_data_path,
)

# fit() yields the processed training frame; transform() processes the testing file
# with the same object
train_data = prepro.fit()
test_data = prepro.transform(Testing_WaterDMA_2_path)

# Keep the raw consumption / meter-count columns aside (needed later to rescale
# predictions), then remove them from the frames fed to the model
train_data_scaling_info = train_data.loc[:, ['comsumption', 'meters']]
test_data_scaling_info = test_data.loc[:, ['comsumption', 'meters']]
train_data = train_data.drop(columns=['comsumption', 'meters'])
test_data = test_data.drop(columns=['comsumption', 'meters'])

## Sequence Generation
We create sequences of data to be used as input (X) and labels (y) for the model. The sequence length for input is 192, and for labels, it's 24.

In [4]:
# Window sizes: length of each input window and of each label window
sequence_length_x = 192
sequence_length_y = 24  # Number of future points to predict

# Slice the training frame into (input window, label window, scaling info) triples
sequences, labels, train_scaling_info = create_sequences(
    train_data,
    sequence_length_x,
    sequence_length_y,
    train_data_scaling_info,
)

# Split the windows into a training part (75%) and a held-out part used for validation
(
    train_X,
    train_y,
    train_scale_info,
    test_X,
    test_y,
    test_scale_info,
) = train_test_split(sequences, labels, train_scaling_info, train_size=0.75)

# Run the model on a batch of sequences; rescaling of the output is left to the caller
def predict_inverse(model, data):
    """Return the model's predictions for ``data`` as a DataFrame.

    Despite its name, this function does not undo any scaling: it only runs a
    forward pass and wraps the raw output.

    Parameters
    ----------
    model : callable (normally a ``torch.nn.Module``)
        Model mapping a float tensor built from ``data`` to a 2-D tensor.
    data : array-like
        Anything accepted by ``torch.Tensor(...)``.

    Returns
    -------
    pandas.DataFrame
        One row per input sample, one column per model output.
    """
    is_module = isinstance(model, torch.nn.Module)
    was_training = model.training if is_module else False
    if is_module:
        # Inference must not use train-mode layers (e.g. dropout), otherwise
        # repeated calls on the same input give different predictions.
        model.eval()
    try:
        with torch.no_grad():
            inputs = torch.Tensor(data)
            if is_module:
                first_param = next(model.parameters(), None)
                if first_param is not None:
                    # Keep inputs on the same device as the model weights.
                    inputs = inputs.to(first_param.device)
            prediction = model(inputs)
    finally:
        if is_module:
            # Leave the model in whatever mode the caller had it in.
            model.train(was_training)
    # .cpu() is a no-op on CPU tensors and is required before .numpy() otherwise.
    return pd.DataFrame(prediction.cpu().numpy())

100%|██████████| 20232/20232 [00:01<00:00, 12128.80it/s]


## Model Setup
Here we define the model, set hyperparameters, and initialize the dataset and data loaders for training.

In [5]:
# Hyperparameter definition
input_size = 23  # Number of features per time step
hidden_size = 128
num_layers = 5
# One model output per forecast step: tie this to the label window length so the
# two cannot drift apart (sequence_length_y is 24 in the sequence-generation cell).
output_size = sequence_length_y
batch_size = 32
learning_rate = 0.001
# NOTE(review): the training cell loops over its own `epochs = 50`; this value is
# not read anywhere in the visible notebook. Confirm which one is intended.
num_epochs = 500

# Get the device (CPU or GPU)
device = get_device()
print(device)

# Define the custom Dataset class for handling sequences and labels
class WaterData(Dataset):
    """Dataset that pairs each input sequence with the label stored at the same index."""

    def __init__(self, sequences, labels):
        # Both containers are stored as given; no copy or conversion happens here.
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # The number of samples is the size of the first axis of `sequences`.
        n_samples = self.sequences.shape[0]
        return n_samples

    def __getitem__(self, idx):
        # Conversion to tensors happens lazily, one sample at a time.
        features = torch.Tensor(self.sequences[idx])
        target = torch.Tensor(self.labels[idx])
        return features, target

# Wrap the two splits in Dataset objects
waterData_train = WaterData(train_X, train_y)
waterData_val = WaterData(test_X, test_y)

# Batches are reshuffled every epoch for training; the held-out split keeps its order
train_dataloader = DataLoader(
    waterData_train,
    batch_size=batch_size,
    shuffle=True,
)
test_dataloader = DataLoader(
    waterData_val,
    batch_size=batch_size,
    shuffle=False,
)

# LSTM model, mean-squared-error objective and Adam optimizer over the model's parameters
model = LSTMModel(input_size, hidden_size, output_size, num_layers)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

mps


## Model Training
This section trains the LSTM model, evaluates it on the validation split after every epoch, and saves a checkpoint whenever the validation loss improves.

In [6]:
# Train the model
list_train_loss = []  # mean training loss of each epoch
list_val_loss = []    # mean validation loss of each epoch

prev_val_loss = float('inf')  # best validation loss seen so far
epochs = 50

# Batches must live on the same device as the model weights. Following the model
# (instead of moving batches to CUDA unconditionally) avoids a device mismatch
# when CUDA is available but the model was left on the CPU.
model_device = next(model.parameters()).device

for e in tqdm(range(epochs)):
    # ---- training pass ----
    model.train()
    running_train_loss = 0.0
    for batch_X, batch_y in train_dataloader:
        batch_X, batch_y = batch_X.to(model_device), batch_y.to(model_device)
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        running_train_loss += loss.item()
    # Report the epoch average rather than the loss of the last batch only.
    train_loss = running_train_loss / len(train_dataloader)
    list_train_loss.append(train_loss)

    # ---- validation pass ----
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():  # no autograd graph is needed for evaluation
        for batch_X, batch_y in test_dataloader:
            batch_X, batch_y = batch_X.to(model_device), batch_y.to(model_device)
            output = model(batch_X)
            valid_loss += criterion(output, batch_y).item()
    valid_loss = valid_loss / len(test_dataloader)
    list_val_loss.append(valid_loss)

    # Checkpoint every epoch that improves on the best validation loss so far.
    # The file name carries the zero-based epoch index `e`.
    if valid_loss < prev_val_loss:
        prev_val_loss = valid_loss
        torch.save(model.state_dict(), f'model_watertest{e}.pth')

    print(f'Epoch {e+1} \t Training Loss: {train_loss} \t Validation Loss: {valid_loss}')

## Model Evaluation and Prediction
We load the trained model and use it to generate predictions on the held-out split of the training sequences (`test_X`), then rescale the predictions and report MAE and MAPE.

In [7]:
# Pick the checkpoint to load. The training cell writes 'model_watertest{e}.pth'
# only for epochs whose validation loss improved, so the epoch with the lowest
# recorded validation loss always has a file. A hard-coded epoch index would only
# be valid for one particular training run.
try:
    best_epoch = int(np.argmin(list_val_loss))
    best_model_path = f'model_watertest{best_epoch}.pth'
except (NameError, ValueError):
    # No validation history in this session (training cell not run, or no epochs):
    # fall back to the checkpoint name used previously.
    best_model_path = 'model_watertest7.pth'

# Load the trained LSTM model weights
model = LSTMModel(input_size, hidden_size, output_size, num_layers)
# map_location lets a checkpoint saved from an accelerator load on a CPU-only machine.
model.load_state_dict(torch.load(best_model_path, map_location='cpu'))
model.eval()  # a freshly constructed module starts in training mode

# Generate predictions
predictions = predict_inverse(model, test_X)

# Apply inverse scaling: multiply by channel 1 of the scaling info
# (presumably the meter count, given the ['comsumption', 'meters'] column order; verify)
prediction_for_all_meter = pd.DataFrame(test_scale_info[:, :, 1]) * predictions

# Calculate MAE and MAPE against channel 0 of the scaling info
actual_consumption = test_scale_info[:, :, 0]
predicted_consumption = prediction_for_all_meter.to_numpy()
mae = mean_absolute_error(actual_consumption, predicted_consumption)
mape = mean_absolute_percentage_error(actual_consumption, predicted_consumption)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Absolute Percentage Error: {mape}")

## Predicting Missing Values
Using the trained model, we fill in the missing values in the test data.

In [8]:
# Predict and fill missing values in the test data using the model.
def test_pred(model, test_data):
    torch.no_grad()
    test_data = test_data.reset_index()
    idx = test_data[test_data.Per_meter_comsumption_with_inter.isna()].timestamp.dt.date.drop_duplicates(keep='first').index.tolist()
    for i in idx:
        pre = model(torch.Tensor((test_data.loc[i-192:i-1]).drop('timestamp', axis=1).values).reshape(1,192,23))
        test_data.loc[i:i+23,'Per_meter_comsumption_with_inter'] = pre.detach().numpy().reshape(-1,1)
    return test_data

In [9]:
# Fill the gaps with model forecasts and put the timestamps back as the index
preTest = test_pred(model, test_data).set_index('timestamp')

# Scale per-meter consumption by the number of meters to recover the original
# consumption value
preTest['Per_meter_comsumption_with_inter'] = preTest['Per_meter_comsumption_with_inter'].mul(
    test_data_scaling_info['meters']
)

# Write the filled series (with its timestamp index) to CSV
preTest[['Per_meter_comsumption_with_inter']].to_csv('WaterDMA_2.csv', index=True)