# Exploratory Data Analysis - EDA

In [None]:
import os 
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller

# Read the first three columns of the imputed Hanoi CSV. Missing 'Date'
# entries are forward-filled before the column is parsed to datetime.
hn_data = pd.read_csv(
    r"/media/khanhxoe/New Volume/Studying/Project_HUST/Finished Project/Impute_misvalues_hanoi.csv",
    usecols=[0, 1, 2],
)
hn_data['Date'] = pd.to_datetime(hn_data['Date'].ffill())

# Build DataLoader

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader

class Series_Dataset(Dataset):
    """Sliding-window dataset for sequence-to-sequence forecasting.

    Every sample pairs ``past_len`` consecutive observations (the model input)
    with the ``pred_len`` observations that immediately follow (the target).

    Args:
        data: Ordered observations. Anything ``np.asarray`` can convert to a
            numeric array works (pandas Series/DataFrame, ndarray, list).
            The first axis is treated as time.
        past_len: Number of time steps in each input window (>= 1).
        pred_len: Number of time steps in each target window (>= 1).

    Attributes:
        X: float32 tensor of input windows, ``[n_windows, past_len, ...]``.
        y: float32 tensor of target windows, ``[n_windows, pred_len, ...]``.
    """

    def __init__(self, data, past_len, pred_len):
        super().__init__()
        self.past_len = past_len  # length of the input sequence
        self.pred_len = pred_len  # length of the forecast horizon
        self.df = data
        self.X, self.y = self.create_sequences_with_cores_time(self.df, self.past_len, self.pred_len)

    def create_sequences_with_cores_time(self, data, past_len: int, pred_len: int):
        """Cut ``data`` into overlapping (input, target) windows with stride 1.

        Returns:
            Tuple ``(X, y)`` of float32 tensors. When ``data`` is shorter than
            ``past_len + pred_len`` both tensors have a leading dimension of 0
            (but keep the window dimension), so the dataset is simply empty.

        Raises:
            ValueError: If ``past_len`` or ``pred_len`` is smaller than 1.
        """
        if past_len < 1 or pred_len < 1:
            raise ValueError(
                f"past_len and pred_len must be >= 1, got {past_len} and {pred_len}"
            )

        values = np.asarray(data, dtype=np.float32)
        window = past_len + pred_len
        n_windows = values.shape[0] - window + 1
        feature_shape = values.shape[1:]

        if n_windows <= 0:
            X = np.empty((0, past_len) + feature_shape, dtype=np.float32)
            y = np.empty((0, pred_len) + feature_shape, dtype=np.float32)
        else:
            # One strided view over all windows replaces the per-window slicing loop.
            windows = np.lib.stride_tricks.sliding_window_view(values, window, axis=0)
            # The window axis is appended last; move it next to the sample axis
            # so the layout is [n_windows, window, *features].
            windows = np.moveaxis(windows, -1, 1)
            # Copy out of the read-only view into fresh, C-contiguous buffers.
            X = np.array(windows[:, :past_len], order="C")
            y = np.array(windows[:, past_len:], order="C")

        return torch.from_numpy(X), torch.from_numpy(y)

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return window ``idx`` as ``(x, y)``; ``x`` gains a trailing feature axis."""
        x = self.X[idx].unsqueeze(-1)  # [past_len, 1] for univariate input
        y = self.y[idx]

        return x, y

## Phân chia kích thước cho tập huấn luyện và kiểm tra

**Chia dữ liệu Hà Nội**

In [None]:
# Hanoi split: years 2008-2016 (inclusive) for training, 2017 for testing.
train_data = hn_data.loc[hn_data['Date'].dt.year.between(2008, 2016), 'Waterlevel']
test_data = hn_data.loc[hn_data['Date'].dt.year == 2017, 'Waterlevel']

In [None]:
# Dry-season test subset: January through March of 2017.
drought_test_data = hn_data.loc[
    (hn_data['Date'].dt.year == 2017) & hn_data['Date'].dt.month.between(1, 3),
    'Waterlevel',
]

# Flood-season test subset: June through August of 2017.
flood_test_data = hn_data.loc[
    hn_data['Date'].dt.month.between(6, 8) & (hn_data['Date'].dt.year == 2017),
    'Waterlevel',
]
# The string literal below is not executed as code: it holds a disabled
# variant of the two seasonal subsets above that selects the year 2014
# instead of 2017.
"""
# Dữ liệu mùa hạn (tháng 11 đến tháng 4)
drought_test_data = hn_data.loc[
    (hn_data['Date'].dt.year == 2014) &
    (
        (hn_data['Date'].dt.month >= 1) &
        (hn_data['Date'].dt.month <= 3)
    ),
    'Waterlevel'
]

flood_test_data = hn_data.loc[
    (hn_data['Date'].dt.year == 2014) &
    (hn_data['Date'].dt.month.between(6, 8)),
    'Waterlevel'
]"""

**Chia dữ liệu Hưng Yên**

In [None]:
# Alternative split: 2008-2013 (inclusive) for training, 2014-2015 for testing.
# NOTE(review): this still slices `hn_data`; confirm which station's CSV is
# loaded into that frame before running this cell.
train_data = hn_data.loc[hn_data['Date'].dt.year.between(2008, 2013), 'Waterlevel']
test_data = hn_data.loc[hn_data['Date'].dt.year.between(2014, 2015), 'Waterlevel']

# Model Architecture

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMAttention(nn.Module):
    """Stacked LSTM followed by a self-attention layer and a dense forecast head.

    Args:
        seq_len: Length of the input window. The attention and output layers
            are sized from it, so inputs must have exactly this many steps.
        tar_len: Number of values produced by one forward pass.
        n_layers: Number of stacked LSTM layers.
        hidden_dim: LSTM hidden size.
        device: Stored on the instance only; the module is not moved here.
    """

    def __init__(self, seq_len, tar_len, n_layers, hidden_dim=128, device='cuda'):
        super(LSTMAttention, self).__init__()
        self.seq_len = seq_len
        self.hidden_dim = hidden_dim

        self.lstm = nn.LSTM(
            input_size=1, hidden_size=hidden_dim,
            num_layers=n_layers, batch_first=True
        )

        self.attention_dense = nn.Linear(seq_len, seq_len)
        self.output_dense = nn.Linear(hidden_dim*seq_len, tar_len)
        self.device = device

    def forward(self, x, mask=None):
        """Forecast ``tar_len`` values from one input window.

        Args:
            x: Input of shape ``[batch_size, seq_len, 1]``.
            mask: Optional tensor multiplied into ``x`` (after gaining a
                trailing axis), so positions where it is 0 are zeroed out.

        Returns:
            Tensor of shape ``[batch_size, tar_len]``.
        """
        # |x| = [batch_size, seq_len, input_dim]
        batch_size = x.size(0)

        if mask is not None:
            x = x * mask.unsqueeze(-1)  # masked positions become 0

        lstm_out, _ = self.lstm(x)

        # Pairwise dot-product scores between time steps.
        # |attention| = [batch_size, seq_len, seq_len]
        attention = torch.bmm(lstm_out, lstm_out.transpose(1, 2))
        # Learned linear map over the last (seq_len) axis, then normalise.
        attention = self.attention_dense(attention)
        attention = F.softmax(attention, dim=-1)

        context = torch.bmm(attention, lstm_out)  # |context| = [batch_size, seq_len, hidden_dim]
        flattened = context.reshape(batch_size, -1)  # |flattened| = [batch_size, seq_len * hidden_dim]
        output = self.output_dense(flattened)  # |output| = [batch_size, tar_len]
        return output

    def predict_AR_multi_step_ahead(self, x, pred_len, mask=None):
        """Roll the model forward autoregressively for ``pred_len`` steps.

        Each pass yields ``tar_len`` values. All of them are appended to the
        input window, which is then trimmed back to its original length, and
        the process repeats until at least ``pred_len`` values exist. For a
        one-step model (``tar_len == 1``) this is the classic one-value-at-a-
        time recursion. For a multi-output model the window advances by
        ``tar_len`` steps per pass so fed-back values stay in temporal order.

        The module is switched to eval mode and left in that mode.

        Args:
            x: Input window of shape ``[batch_size, seq_len, 1]``.
            pred_len: Number of future values to return.
            mask: Optional mask forwarded unchanged to every ``forward`` call.

        Returns:
            Tensor of shape ``[batch_size, pred_len]``. It is empty along
            dim 1 when ``pred_len <= 0``.
        """
        self.eval()
        batch_size = x.size(0)
        window_len = x.size(1)

        if pred_len <= 0:
            return x.new_empty((batch_size, 0))

        predictions = []
        produced = 0

        with torch.no_grad():
            current_input = x.clone()  # [batch_size, seq_len, 1]
            while produced < pred_len:
                step_output = self.forward(current_input, mask)  # [batch_size, tar_len]
                predictions.append(step_output)
                produced += step_output.size(1)

                # Append every predicted value, then keep the newest window_len steps.
                extended = torch.cat((current_input, step_output.unsqueeze(-1)), dim=1)
                current_input = extended[:, -window_len:, :]

        # The last pass may overshoot; keep exactly pred_len values.
        return torch.cat(predictions, dim=1)[:, :pred_len]

# Training loop

In [None]:
import time
def evaluate(model, data_loader, criterion, device, mask=None):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model is put in eval mode and gradients are disabled.

    Args:
        model: Callable module. It is called as ``model(x)`` when ``mask`` is
            None, otherwise as ``model(x, mask)``.
        data_loader: Iterable of batches whose first two items are ``(x, y)``.
        criterion: Loss called as ``criterion(y_pred, y)``.
        device: Device the batch tensors (and the mask) are moved to.
        mask: Optional input mask passed to the model on every batch.

    Returns:
        Arithmetic mean of the batch losses as a Python float.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no batches.
    """
    model.eval()
    if mask is not None:
        mask = mask.to(device)

    losses = []
    with torch.no_grad():
        for batch in data_loader:
            x, y = batch[:2]
            x, y = x.to(device), y.to(device)
            # Only pass the mask when one was given, so mask-unaware models still work.
            y_pred = model(x) if mask is None else model(x, mask)
            loss = criterion(y_pred, y)
            losses.append(loss.item())

    return sum(losses) / len(losses)

def train_epoch(model, train_loader, criterion, optimizer, device, mask=None):
    """Run one optimisation pass over ``train_loader`` and return the mean loss.

    Args:
        model: Module to train. It is called as ``model(x)`` when ``mask`` is
            None, otherwise as ``model(x, mask)``.
        train_loader: Iterable of batches whose first two items are ``(x, y)``.
        criterion: Loss called as ``criterion(y_pred, y)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors (and the mask) are moved to.
        mask: Optional input mask passed to the model on every batch.

    Returns:
        Arithmetic mean of the batch losses as a Python float.

    Raises:
        ZeroDivisionError: If ``train_loader`` yields no batches.
    """
    model.train()
    if mask is not None:
        mask = mask.to(device)

    epoch_losses = []

    for batch in train_loader:
        optimizer.zero_grad()
        x, y = batch[:2]
        x, y = x.to(device), y.to(device)
        # Only pass the mask when one was given, so mask-unaware models still work.
        y_pred = model(x) if mask is None else model(x, mask)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

    return sum(epoch_losses) / len(epoch_losses)

def fit(model, train_loader, test_loader, criterion, optimizer, epochs, device, mask=None, scheduler=None, min_lr=5e-7):
    """Train ``model`` for up to ``epochs`` epochs, evaluating after each one.

    Args:
        model: Module to train.
        train_loader: Batches used by ``train_epoch``.
        test_loader: Batches used by ``evaluate``. The resulting loss is the
            reported validation loss and drives ``ReduceLROnPlateau``.
        criterion: Loss function.
        optimizer: Optimizer. The learning rate of its first parameter group
            is logged and used for the early-stop test.
        epochs: Maximum number of epochs.
        device: Device forwarded to the training and evaluation helpers.
        mask: Optional input mask forwarded to both helpers.
        scheduler: Optional LR scheduler. ``ReduceLROnPlateau`` is stepped
            with the validation loss, any other scheduler without arguments.
        min_lr: Training stops once the learning rate is at or below this
            value. The default keeps the previous hard-coded threshold.

    Returns:
        Tuple ``(train_losses, val_losses)`` with one entry per completed epoch.
    """
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        start_time = time.time()
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device, mask)
        val_loss = evaluate(model, test_loader, criterion, device, mask)
        end_time = time.time()

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        if scheduler is not None:
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_loss)
            else:
                scheduler.step()

        current_lr = optimizer.param_groups[0]['lr']
        # Log before the early-stop test so the last epoch, whose losses were
        # just recorded, also appears in the output.
        print(
            f"Epoch: {epoch + 1}, Train loss: {train_loss:.6f}, "
            f"Val loss: {val_loss:.6f}, "
            f"Epoch time: {(end_time - start_time):.6f}s, "
            f"Learning rate: {current_lr:.6f}"
        )
        if current_lr <= min_lr:
            break

    return train_losses, val_losses

In [None]:
# Windowing and batching hyper-parameters.
seq_len = 16
pred_len = 40
batch_size = 16
num_workers = 6
pin_mem = True

# Training windows: shuffled, incomplete final batch dropped.
train_dataset = Series_Dataset(train_data, seq_len, pred_len)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=pin_mem,
    drop_last=True,
)

# Held-out windows, iterated in order.
test_dataset = Series_Dataset(test_data, seq_len, pred_len)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=8,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=pin_mem,
)

# Seasonal subsets used by the metric-evaluation cells.
drought_dataset = Series_Dataset(drought_test_data, seq_len, pred_len)
drought_loader = DataLoader(
    dataset=drought_dataset,
    batch_size=1,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=pin_mem,
)
flood_test_dataset = Series_Dataset(flood_test_data, seq_len, pred_len)
flood_loader = DataLoader(
    dataset=flood_test_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=pin_mem,
)
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Model hyper-parameters. `dropout` is defined here but is not passed to the
# model constructor below.
hidden_dim = 256
n_layers = 4
dropout = 0.1
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = LSTMAttention(seq_len, pred_len, n_layers, hidden_dim=hidden_dim).to(device)
# Optional warm start from a saved checkpoint (disabled):
#model.load_state_dict(torch.load(rf'/media/khanhxoe/New Volume/Studying/Project_HUST/Finished Project/Code/Pretrained_Model/LSTM_direct_model/HaNoi/lstm_attention_hidden_dim=256_n_layers=4_seq=16_pred=1.pth', weights_only=True))

In [None]:
# Huber loss, AdamW, and an LR schedule that halves the rate after the
# monitored loss has not improved for more than 3 consecutive epochs.
criterion = nn.HuberLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4, weight_decay=1e-9)
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=3)

# `test_loader` is passed as the evaluation loader, so its loss is what the
# scheduler monitors.
train_losses, val_losses = fit(
    model,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    epochs=50,
    device=device,
    scheduler=scheduler,
)

In [None]:
# Persist the trained weights (state dict only).
torch.save(
    model.state_dict(),
    r"/media/khanhxoe/New Volume/Studying/Project_HUST/Finished Project/Code/Model_Plot/lstm_attention_hidden_dim=256_n_layers=4_seq=16_pred=40.pth",
)

# Metric Evaluation Function

**Direct multi-step-ahead forecasting on randomly sampled windows**

In [None]:
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2
import numpy as np
import torch
import random

def FSD(y, x):
    """Return the fractional standard deviation between two series.

    Computed as ``2 * |SD_y - SD_x| / (SD_y + SD_x)`` using sample standard
    deviations (``ddof=1``).

    Args:
        y: First series (array-like).
        x: Second series (array-like).

    Returns:
        The FSD value, or ``0.0`` when both standard deviations are zero
        (two constant series have identical spread).
    """
    SD_y = np.std(y, ddof=1)
    SD_x = np.std(x, ddof=1)
    total = SD_y + SD_x
    if total == 0:
        # Avoid 0/0 -> NaN for two constant series.
        return 0.0
    return 2 * abs(SD_y - SD_x) / total

def similarity(y, x):
    """Return the mean range-normalised similarity between ``y`` and ``x``.

    Each element contributes ``1 / (1 + |y_i - x_i| / (max(x) - min(x)))``,
    so identical series score 1 and the score decreases towards 0 as the
    absolute error grows relative to the range of ``x``.

    Args:
        y: Series being scored (array-like).
        x: Reference series (array-like) whose range normalises the error.

    Returns:
        Mean similarity score.
    """
    y = np.asarray(y)
    x = np.asarray(x)
    x_min, x_max = np.min(x), np.max(x)
    # The small epsilon keeps the division finite when x is constant.
    sim_score = np.mean(1 / (1 + (np.abs(y - x) / (x_max - x_min + 1e-8))))
    return sim_score

def NSE(y_obs, y_pred):
    """Return the Nash-Sutcliffe efficiency of ``y_pred`` against ``y_obs``.

    Defined as one minus the ratio of the squared-error sum to the sum of
    squared deviations of the observations from their own mean.
    """
    residual_ss = np.sum(np.square(y_obs - y_pred))
    centred_obs = y_obs - np.mean(y_obs)
    total_ss = np.sum(np.square(centred_obs))
    return 1 - (residual_ss / total_ss)

def evaluation_random_samples(model, test_loader, device, metrics, num_samples=1):
    """Average forecast metrics over randomly drawn single windows.

    For each of ``num_samples`` draws, a batch is picked at random from
    ``test_loader``, one sample is picked at random inside it, the model is
    run on that sample, and Similarity, MAE, RMSE, R2 and FSD are computed.

    Args:
        model: Forecasting model, called as ``model(x)``.
        test_loader: Loader yielding ``(x, y)`` batches. It is fully
            materialised into a list before sampling.
        device: Device the sampled input is moved to.
        metrics: Accepted for interface compatibility; not read here.
        num_samples: Number of random windows to score.

    Returns:
        Dict mapping "Similarity", "MAE", "RMSE", "R2" and "FSD" to the mean
        of the per-sample scores.
    """
    model.eval()
    collected = {"Similarity": [], "MAE": [], "RMSE": [], "R2": [], "FSD": []}
    batches = list(test_loader)

    with torch.no_grad():
        for _ in range(num_samples):
            xs, ys = random.choice(batches)
            pick = random.randint(0, len(xs) - 1)
            # Slice (rather than index) so the batch dimension is kept.
            x_one = xs[pick:pick + 1, :, :]
            y_one = ys[pick:pick + 1, :]

            forecast = model(x_one.to(device)).squeeze(0).cpu().numpy()
            truth = y_one.squeeze(0).cpu().numpy()

            collected["Similarity"].append(similarity(forecast, truth))
            collected["MAE"].append(mae(truth, forecast))
            collected["RMSE"].append(rmse(truth, forecast))
            collected["R2"].append(r2(truth, forecast))
            collected["FSD"].append(FSD(forecast, truth))

    return {name: np.mean(values) for name, values in collected.items()}

# Named handles for the two seasonal loaders.
test_loaders = [("test_loader_sub", drought_loader), ("test_loader1", flood_loader)]

# (label, function) pairs; the labels select entries from the score dicts below.
metrics = [("Similarity", similarity), ("MAE", mae), ("RMSE", rmse), ("R2", r2), ("FSD", FSD)]

score1 = evaluation_random_samples(model, drought_loader, device, metrics, num_samples=20)
score2 = evaluation_random_samples(model, flood_loader, device, metrics, num_samples=20)

# Report the per-metric average of the two subsets on one line.
print("Average Scores for 2 test_loader")
output = " ".join(f"{((score1[name] + score2[name]) / 2):.2f}" for name, _ in metrics)
print(output)


**Recursive multi-step-ahead forecasting on randomly sampled windows**

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score 
import math

def similarity(y, x):
    """Return the mean range-normalised similarity between ``y`` and ``x``.

    Each element contributes
    ``1 / (1 + |y_i - x_i| / (max(x) - min(x) + 1e-8))``; the epsilon keeps
    the division finite when ``x`` is constant.

    Args:
        y: Series being scored (array-like, including plain lists).
        x: Reference series (array-like) whose range normalises the error.

    Returns:
        Mean similarity score; 1 for identical series.
    """
    # Convert up front so list inputs support element-wise arithmetic.
    y = np.asarray(y)
    x = np.asarray(x)
    x_min, x_max = np.min(x), np.max(x)
    sim_score = np.mean(1 / (1 + (np.abs(y - x) / (x_max - x_min + 1e-8))))
    return sim_score

def compute_metrics(y_true, y_pred):
    """Score a forecast against its ground truth.

    Both inputs are flattened to 1-D arrays first.

    Returns:
        Dict with keys "Sim", "MAE", "RMSE", "R2" and "FSD". FSD uses
        population standard deviations (numpy's default ``ddof=0``) and is
        ``0.0`` when both are zero.
    """
    actual = np.asarray(y_true).ravel()
    forecast = np.asarray(y_pred).ravel()

    abs_err = mean_absolute_error(actual, forecast)
    root_sq_err = math.sqrt(mean_squared_error(actual, forecast))
    r_squared = r2_score(actual, forecast)
    sim = similarity(forecast, actual)

    spread_true = np.std(actual)
    spread_pred = np.std(forecast)
    spread_sum = spread_true + spread_pred
    # Guard the 0/0 case of two constant series.
    fsd = 0.0 if spread_sum == 0 else 2 * abs(spread_true - spread_pred) / spread_sum

    return {
        "Sim": sim,
        "MAE": abs_err,
        "RMSE": root_sq_err,
        "R2": r_squared,
        "FSD": fsd,
    }

def eval_random_multi_step_ahead(model, test_data, past_len, pred_len, num_samples, device):
    """Score recursive forecasts on randomly chosen windows of ``test_data``.

    ``num_samples`` start positions are drawn uniformly (with replacement).
    For each one, the ``past_len`` values from that position are fed to
    ``model.predict_AR_multi_step_ahead`` and the forecast is compared with
    the following ``pred_len`` values via ``compute_metrics``.

    Args:
        model: Model exposing ``predict_AR_multi_step_ahead(x, pred_len)``.
        test_data: Sliceable series whose slices provide ``to_numpy()``.
        past_len: Input window length.
        pred_len: Forecast horizon.
        num_samples: Number of random windows to evaluate.
        device: Device the input tensor is moved to.

    Returns:
        Dict of metric name -> mean over all sampled windows. The means are
        also printed on one line with four decimals.
    """
    model.eval()

    n_starts = len(test_data) - past_len - pred_len + 1
    starts = np.random.randint(0, n_starts, size=num_samples)
    per_sample = []
    with torch.no_grad():
        for start in starts:
            split = start + past_len
            history = test_data[start:split]
            target = test_data[split:split + pred_len]

            # [past_len] -> [1, past_len, 1]
            window = torch.tensor(history.to_numpy(), dtype=torch.float32)
            window = window.unsqueeze(0).unsqueeze(-1).to(device)

            forecast = model.predict_AR_multi_step_ahead(window, pred_len)
            forecast = forecast.squeeze(0).cpu().numpy()
            per_sample.append(compute_metrics(target, forecast))

        result = {
            key: np.mean([sample[key] for sample in per_sample])
            for key in per_sample[0].keys()
        }
        output = " ".join(f"{value:.4f}" for value in result.values())
        print(output)

        return result

# Recursive evaluation on 20 random test windows with a 4-step horizon.
result = eval_random_multi_step_ahead(
    model,
    test_data,
    past_len=seq_len,
    pred_len=4,
    num_samples=20,
    device=device,
)