# Water Demand Forecasting Project
This notebook performs water demand forecasting using time series data with LSTM-based models.

In [1]:
from WaterDMA1 import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
sns.set()

## Defining File Paths and Setting Random Seeds
File paths for the dataset are defined, and random seeds are set for reproducibility.

In [2]:
# Seed both RNG sources used by the notebook (NumPy and PyTorch) for reproducibility.
np.random.seed(0)
torch.manual_seed(0)

# Input CSV locations. The '/path/to/' prefixes are placeholders to be replaced
# with the local copies of the files.
Training_WaterDMA_1_path = '/path/to/Training_WaterDMA_1.csv'
Testing_WaterDMA_1_path = '/path/to/Testing_WaterDMA_1.csv'
WaterDMA_1_Number_of_Meters_path = '/path/to/WaterDMA_1_Number_of_Meters.csv'
Weather_data_path = '/path/to/Weather_Bronderslev_20152022.csv'

## Data Preprocessing
Preprocessing is performed on the training and testing datasets. Scaling information is also saved for later use.

In [3]:
# Build the preprocessing object from the training, meter-count and weather files.
prepro = Water_data_preprocessing(
    Training_WaterDMA_1_path,
    WaterDMA_1_Number_of_Meters_path,
    Weather_data_path,
)

# fit() yields the processed training frame; transform() processes the testing file
# (presumably with the state learned in fit() -- confirm in WaterDMA1).
train_data = prepro.fit()
test_data = prepro.transform(Testing_WaterDMA_1_path)

# These two columns are kept aside for later rescaling of predictions and are
# removed from the frames that feed the model.
scaling_cols = ['comsumption', 'meters']

train_data_scaling_info = train_data[scaling_cols]
train_data = train_data.drop(columns=scaling_cols)

test_data_scaling_info = test_data[scaling_cols]
test_data = test_data.drop(columns=scaling_cols)

## Outlier Detection
A box plot is created to visualize potential outliers in the data.

In [4]:
# Draw on an explicit Axes so the title/label calls target this figure unambiguously.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(y=train_data['Per_meter_comsumption_with_inter'], ax=ax)
ax.set_title('Box Plot of Per Meter Consumption with Inter')
ax.set_ylabel('Per Meter Consumption')
plt.show()

## Outlier Calculation
Quartiles and IQR (Interquartile Range) are calculated to detect outliers.

In [5]:
data = train_data['Per_meter_comsumption_with_inter']

# First and third quartiles in a single call.
Q1, Q3 = np.percentile(data, [25, 75])
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles.
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
print(upper_bound)

Q1: 0.00504349748810322, Q3: 0.014305854200097142, IQR: 0.009262356711993922
0.028199389268088022


## Clipping Outliers
Values above the upper IQR fence are capped (not removed) at a fixed threshold of 0.0281, which is the upper bound computed above (≈0.0282) rounded down.

In [6]:
# Upper cap for per-meter consumption; hard-coded just below the IQR upper fence
# printed by the previous cell (0.02819...).
threshold = 0.0281
target_col = 'Per_meter_comsumption_with_inter'

train_data = pd.DataFrame(train_data)
# clip(upper=...) caps values above the threshold; no rows are dropped.
train_data[target_col] = train_data[target_col].clip(upper=threshold)

## Creating Sequences for LSTM Model
We create sequences of data for model training and testing.

In [7]:
# Window sizes passed to create_sequences: input length and target length.
sequence_length_x, sequence_length_y = 192, 24

sequences, labels, train_scaling_info = create_sequences(
    train_data,
    sequence_length_x,
    sequence_length_y,
    train_data_scaling_info,
)

# NOTE(review): the unpacking order below (all train parts, then all held-out parts)
# assumes train_test_split is the project's own helper from WaterDMA1. scikit-learn's
# function of the same name returns train/test pairs interleaved per input -- confirm.
(
    train_X, train_y, train_scale_info,
    test_X, test_y, test_scale_info,
) = train_test_split(sequences, labels, train_scaling_info, train_size=0.75)

100%|██████████| 31200/31200 [00:02<00:00, 10697.09it/s]


## LSTM Model Setup
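We set the hyperparameters, wrap the sequences in a PyTorch `Dataset`, and create the data loaders, the model (the imported `LSTMModel`), the loss function, and the optimizer.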

In [8]:
# Compute device chosen by the project helper.
device = get_device()
print(device)

# --- Data dimensions ---
input_size = 29          # passed to LSTMModel as its input size
sequence_length = 192    # same value as sequence_length_x used when building sequences
output_size = 24         # passed to LSTMModel as its output size

# --- Network capacity ---
hidden_size = 256
num_layers = 4

# --- Optimisation settings ---
# NOTE(review): num_epochs is not referenced by the training cell, which sets its
# own `epochs` value -- confirm which one is intended.
num_epochs = 500
batch_size = 32
learning_rate = 0.001

class WaterData(Dataset):
    """Map-style dataset pairing each input sequence with its label.

    Parameters
    ----------
    sequences : array-like exposing ``.shape`` and integer indexing
        Input windows; the first axis indexes samples.
    labels : array-like supporting integer indexing
        Targets aligned with ``sequences`` along the first axis.
    """

    def __init__(self, sequences, labels):
        self.labels = labels
        self.sequences = sequences

    def __len__(self):
        """Return the number of samples (size of the first axis of ``sequences``)."""
        n_samples = self.sequences.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` for ``idx`` as float32 tensors."""
        # torch.Tensor(...) builds a default-dtype (float32) tensor from the array data.
        features = torch.Tensor(self.sequences[idx])
        target = torch.Tensor(self.labels[idx])
        return features, target

# Training batches are reshuffled each epoch; the held-out loader keeps its order.
train_dataloader = DataLoader(WaterData(train_X, train_y), batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(WaterData(test_X, test_y), batch_size=batch_size, shuffle=False)

# Place the model on the selected device explicitly, and do it before the optimizer
# is created so the optimizer is built over the parameters that will be trained.
# (.to() is a no-op if the model already lives on that device.)
model = LSTMModel(input_size, hidden_size, output_size, num_layers).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

mps


## Model Training
The model is trained for the specified number of epochs, and the training and validation losses are calculated.

In [9]:
list_train_loss = []   # mean training loss per epoch
list_val_loss = []     # mean validation loss per epoch
prev_val_loss = float('inf')  # best validation loss so far; inf so the first epoch always checkpoints
# NOTE(review): the setup cell defines num_epochs = 500, but this loop uses its own
# value of 50 -- confirm which is intended.
epochs = 50
for e in tqdm(range(epochs)):
    # ---- training pass ----
    model.train()
    running_train_loss = 0.0
    for data, labels in train_dataloader:
        # Use the device selected by get_device() rather than a hard-coded backend.
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        target = model(data)
        loss = criterion(target, labels)
        loss.backward()
        optimizer.step()
        running_train_loss += loss.item()
    # Average over batches so the value is comparable with the validation loss
    # (which is also a per-batch mean), instead of logging only the last batch.
    train_loss = running_train_loss / len(train_dataloader)
    list_train_loss.append(train_loss)

    # ---- validation pass ----
    valid_loss = 0.0
    model.eval()
    # No gradients are needed for validation; skip building the autograd graph.
    with torch.no_grad():
        for data, labels in test_dataloader:
            data, labels = data.to(device), labels.to(device)
            target = model(data)
            val_loss = criterion(target, labels)
            valid_loss += val_loss.item()
    valid_loss /= len(test_dataloader)
    list_val_loss.append(valid_loss)

    # Checkpoint whenever the validation loss strictly improves; the file name
    # carries the zero-based epoch index.
    if prev_val_loss > valid_loss:
        prev_val_loss = valid_loss
        torch.save(model.state_dict(), f'model_water{e}.pth')
    print(f'Epoch {e+1} \t Training Loss: {train_loss} \t Validation Loss: {valid_loss}')

100%|██████████| 500/500 [00:32<00:00, 15.23it/s]


## Model Evaluation and Predictions
A saved checkpoint is loaded and evaluated on the held-out split of the training sequences (`test_X`), generating predictions and computing the mean absolute error and mean absolute percentage error.

In [10]:
# Pick the checkpoint from the epoch with the lowest validation loss instead of a
# hard-coded epoch number. The training loop saves 'model_water{e}.pth' (zero-based
# epoch index) on every strict improvement, so the first minimum of list_val_loss
# always has a checkpoint on disk.
best_epoch = int(np.argmin(list_val_loss))
model.load_state_dict(torch.load(f'model_water{best_epoch}.pth'))

predictions = predict_inverse(model, test_X)

# Scale predictions by column 1 of the scaling info and compare against column 0.
# (Presumably 'meters' and 'comsumption' respectively, following the column order
# used when the scaling info was extracted -- confirm against create_sequences.)
prediction_for_all_meter = pd.DataFrame(test_scale_info[:,:,1]) * predictions
mae = mean_absolute_error(test_scale_info[:,:,0], prediction_for_all_meter.to_numpy())
mape = mean_absolute_percentage_error(test_scale_info[:,:,0], prediction_for_all_meter.to_numpy())
print(f"Mean Absolute Error: {mae}")
print(f"Mean Absolute Percentage Error: {mape}")

Mean Absolute Error: 0.015
Mean Absolute Percentage Error: 4.5%


## Handling Missing Values
The model is used to predict and fill in any missing values in the test data.

In [11]:
def test_pred(model, test_data):
    torch.no_grad()
    test_data = test_data.reset_index()
    idx = test_data[test_data.Per_meter_comsumption_with_inter.isna()].timestamp.dt.date.drop_duplicates(keep='first').index.tolist()
    print(idx)
    for i in idx:
        pre = model(torch.Tensor((test_data.loc[i-192 :i-1]).drop('timestamp', axis=1).values).reshape(1,192,29))
        test_data.loc[i:i+23,'Per_meter_comsumption_with_inter'] = pre.detach().numpy().reshape(-1,1)
    return test_data

# Fill the gaps, then put the timestamp column back as the index.
preTest = test_pred(model, test_data).set_index('timestamp')

# Multiply the per-meter values by the `meters` column of the scaling info
# (pandas aligns the two on the timestamp index), then export that single column.
per_meter = preTest['Per_meter_comsumption_with_inter']
preTest['Per_meter_comsumption_with_inter'] = per_meter * test_data_scaling_info.meters
preTest[['Per_meter_comsumption_with_inter']].to_csv('WaterDMA_1.csv', index=True)

[192, 384, 576, 768, 960, 1152, 1344, 1536]
