# Supervised Time Series Analysis

In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import optuna
from utils import TrainingConfig, Loader
from typing import Union, List, Tuple
import time

# Apply seaborn's "dark" style to every plot drawn after this point.
sns.set_style("dark")
# Optional Apple-silicon (MPS) device handle; disabled, and nothing in the
# visible code moves models or tensors to a device.
# device = torch.device('mps')

In [None]:
# Available target columns: ["zone_022_co2","zone_052_co2","zone_072_co2","mels_S"]

# Hyperparameter Tuning

Hyperparameter tuning with Optuna. Reference implementations:

https://github.com/elena-ecn/optuna-optimization-for-PyTorch-CNN/blob/main/optuna_optimization.py   

https://medium.com/pytorch/using-optuna-to-optimize-pytorch-hyperparameters-990607385e36

In [None]:
class HyperparameterTuning:
    """Train/evaluate helpers shared by the Optuna objective functions.

    A ``Loader`` is built from the given config and kept on ``self.L``; the
    methods below read ``train_dataloader`` / ``test_dataloader`` (PyTorch
    path) and ``X_train`` / ``y_train`` / ``X_test`` / ``y_test`` (XGBoost
    path) from it.
    """

    def __init__(self, config: "TrainingConfig") -> None:
        self.L = Loader(config)

    def train(self, model, optimizer):
        """Run one pass over the training dataloader, minimising MSE.

        Args:
            model: ``torch.nn.Module`` called on each float32-cast batch.
            optimizer: optimizer already bound to ``model``'s parameters.
        """
        model.train()  # enable training-time behaviour of the module

        for data, target in self.L.train_dataloader:
            optimizer.zero_grad()
            output = model(data.type(torch.float32))
            loss = F.mse_loss(output, target.type(torch.float32))
            loss.backward()
            optimizer.step()

    def test(self, model):
        """Return the mean MSE of ``model`` over the test dataloader.

        Each batch's MSE is weighted by its number of samples, so a shorter
        final batch is not over-represented and scores stay comparable
        between trials that use different batch sizes.

        Raises:
            ValueError: if the test dataloader yields no samples.
        """
        model.eval()  # enable inference-time behaviour of the module
        weighted_sum = 0.0
        n_samples = 0
        with torch.no_grad():
            for data, target in self.L.test_dataloader:
                output = model(data.type(torch.float32))
                # mse_loss already reduces to the batch mean (a scalar).
                batch_mse = F.mse_loss(output, target.type(torch.float32)).item()
                batch_len = len(target)
                weighted_sum += batch_mse * batch_len
                n_samples += batch_len

        if n_samples == 0:
            raise ValueError("test dataloader yielded no samples")

        return weighted_sum / n_samples

    def xgb_train(self, model: "XGBRegressor"):
        """Fit ``model`` on the training split, flattening each window to one row."""
        X_train = np.array(self.L.X_train).reshape(len(self.L.X_train), -1)
        model.fit(X_train, np.array(self.L.y_train))

    def xgb_test(self, model: "XGBRegressor"):
        """Return the test-split MSE of a fitted ``model``."""
        X_test = np.array(self.L.X_test).reshape(len(self.L.X_test), -1)
        y_pred = model.predict(X_test)
        score = mean_squared_error(y_true=self.L.y_test, y_pred=y_pred)
        return score

In [None]:
class LSTM(nn.Module):
    """Univariate sequence regressor: stacked LSTM followed by a linear head.

    ``config`` must expose ``hidden_dim`` (LSTM hidden size) and ``n_layers``
    (number of stacked LSTM layers).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_dim = config.hidden_dim

        # Recurrent encoder: one input feature per time step, batch-first
        # layout, i.e. inputs are (batch, time, 1).
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=self.hidden_dim,
            num_layers=config.n_layers,
            batch_first=True,
        )

        # Regression head: maps a hidden state to a single output value.
        self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=1)

    def forward(self, x):
        """Return one prediction per sequence, computed from the last time step."""
        hidden_seq, _state = self.lstm(x)
        last_step = hidden_seq[:, -1, :]
        return self.fc1(last_step)

## LSTM Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective for the LSTM model.

    Samples a hyperparameter set, trains a fresh ``LSTM`` for the configured
    number of epochs and returns the score produced by
    ``HyperparameterTuning.test`` after the last epoch. That score is an MSE,
    so the study should minimise it. The score is reported after every epoch
    and the trial is aborted when the pruner asks for it.
    """
    # Search space (sampled in a fixed order).
    params = {
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 96, 128, 196, 256]),
        "lr": trial.suggest_float("lr", 1e-5, 1e-2, log=True),
        "n_layers": trial.suggest_int("n_layers", 1, 3),
        "hidden_size": trial.suggest_categorical("hidden_size", [48, 64, 96, 128, 196, 256]),
        "block_size": trial.suggest_categorical("block_size", [16, 32, 48, 64, 96, 128]),
    }

    cfg = TrainingConfig(
        dataset_name="co2-1",
        column_name='zone_052_co2',
        model_type="LSMT",
        standardized=True,
        export=True,
        epochs=3,
        batch_size=params["batch_size"],
        block_size=params["block_size"],
        hidden_dim=params["hidden_size"],
        n_layers=params["n_layers"],
        lr=params["lr"],
        # Tree-model settings do not apply to the LSTM.
        colsample_bytree=None,
        subsample=None,
        n_estimators=None,
        max_depth=None,
    )

    # Model and optimizer are created before the data is loaded.
    net = LSTM(config=cfg)
    opt = optim.AdamW(net.parameters(), lr=params["lr"], betas=(0.9, 0.95))
    tuner = HyperparameterTuning(config=cfg)

    for epoch in range(cfg.epochs):
        tuner.train(net, opt)
        test_score = tuner.test(net)

        # Let the pruner stop unpromising trials early.
        trial.report(test_score, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return test_score

In [None]:
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_jobs=10, n_trials=80)

# print(study.best_trials[0].values[0])
# study.best_params

In [None]:
# Best trial:
# {'batch_size': 64, 'lr': 0.00044187545383663645, 'n_layers': 2, 'hidden_size': 256, 'block_size': 128}
#   Value:  0.024644049937310425
# {'batch_size': 64, 'lr': 0.0006091567246063254, 'n_layers': 2, 'hidden_size': 256, 'block_size': 128}
#   Value:  0.024584050078826827
# {'batch_size': 32,
#  'lr': 0.0017066009255505266,
#  'n_layers': 1,
#  'hidden_size': 256,
#  'block_size': 64} values: 0.024468314413269215

## Gradient Boosted Trees Hyperparameter Search

In [None]:
def xgb_objective(trial):
    """Optuna objective for the gradient-boosted-tree model.

    Samples tree hyperparameters and a window size, fits an ``XGBRegressor``
    once on the training split and returns its test-split MSE (lower is
    better).

    The model is fitted a single time: calling ``fit`` again on the same
    estimator retrains it from scratch, so looping over "epochs" only
    multiplied the runtime without changing the score.
    """
    # Search space. The "n_estimator" key is kept as-is so stored studies and
    # previously recorded best-parameter dicts keep the same key.
    max_depth = trial.suggest_int("max_depth", 3, 6)
    n_estimators = trial.suggest_int("n_estimator", 100, 500)
    block_size = trial.suggest_categorical("block_size", [32, 48 ,64, 96])
    subsample = trial.suggest_float("subsample", 0.7, 1.0)

    TC = TrainingConfig(
        # dataset_name="ele-2",
        dataset_name="co2-2",
        column_name='zone_072_co2',
        # column_name='mels_S',
        model_type="XGB",
        standardized = True,
        epochs=3,  # Not used by the single-fit XGBoost path
        batch_size = 20, # Not used
        block_size = block_size,
        hidden_dim = None,
        n_layers = None,
        max_depth=max_depth,
        lr=None,
        n_estimators = n_estimators,
        subsample = subsample
    )

    # Generate the model
    model = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, subsample=subsample)

    HT = HyperparameterTuning(config=TC)

    # Fit once and evaluate on the test split.
    HT.xgb_train(model)
    score = HT.xgb_test(model)

    # Record the score as the single intermediate value; there is nothing to
    # prune because training is not iterative here.
    trial.report(score, 0)

    return score

In [None]:
# study = optuna.create_study(direction="minimize")
# study.optimize(xgb_objective, n_jobs=5, n_trials=50)

In [None]:
# print(study.best_trials[0].values[0])
# study.best_params

In [None]:
# 0.03524799340451928
# {'max_depth': 4,
#  'n_estimator': 50,
#  'block_size': 128,
#  'subsample': 0.9213629465796025}

# 0.13585137826503266
# {'max_depth': 3,
#  'n_estimator': 282,
#  'block_size': 32,
#  'subsample': 0.9423739511390844}

# Run Models

In [None]:
# Single-run XGBoost settings. block_size / max_depth / n_estimators / subsample
# equal the first result recorded in the XGB search notes of this notebook.
xgb_config = TrainingConfig(
    model_type="XGB",
    dataset_name="co2-2",
    column_name='zone_022_co2',
    standardized=True,
    block_size=128,
    n_estimators=50,
    max_depth=4,
    subsample=0.9213629465796025,
    # Left unset for this configuration.
    lr=None,
    epochs=None,
    batch_size=None,
)

# Single-run LSTM settings. batch_size / block_size / hidden_dim / n_layers / lr
# equal the last result recorded in the LSTM search notes of this notebook.
lsmt_config = TrainingConfig(
    model_type="LSMT",
    dataset_name="co2-2",
    column_name='zone_022_co2',
    standardized=True,
    epochs=5,
    batch_size=32,
    block_size=64,
    hidden_dim=256,
    n_layers=1,
    lr=0.0017066009255505266,
)

In [None]:
class Trainer:
    """Train the model described by a ``TrainingConfig``.

    Supported ``model_type`` values are ``"XGB"`` (``XGBRegressor``) and
    ``"LSMT"`` (the ``LSTM`` module). ``run`` is the entry point; it stores
    the config and a ``Loader`` on the instance and the other methods read
    from those.
    """

    def __init__(self):
        # Both attributes are populated by run(); they stay None until then.
        self.config = None
        self.L = None

    def get_model(self, model_type: str):
        """Build an untrained model for ``model_type`` from ``self.config``.

        For XGBoost the config's ``lr`` and ``colsample_bytree`` are forwarded
        as well, so tuned values stored on the config are actually applied.
        A value of ``None`` (or a missing attribute) leaves XGBoost's own
        default in place.

        Raises:
            ValueError: if ``model_type`` is not supported.
        """
        if model_type == "XGB":
            return XGBRegressor(
                n_estimators=self.config.n_estimators,
                max_depth=self.config.max_depth,
                subsample=self.config.subsample,
                learning_rate=getattr(self.config, "lr", None),
                colsample_bytree=getattr(self.config, "colsample_bytree", None),
            )
        if model_type == "LSMT":
            return LSTM(config=self.config)
        raise ValueError(f'No Model Type Selected: {model_type}')

    def get_training_data(self, model_type) -> Union[Tuple[np.ndarray,np.ndarray], DataLoader]:
        """Return the training data in the form the model type consumes.

        ``"XGB"`` gets ``(X, y)`` arrays with every sample flattened to one
        row; ``"LSMT"`` gets the loader's training dataloader.

        Raises:
            ValueError: if ``model_type`` is not supported.
        """
        if model_type == "XGB":
            X_train = np.array(self.L.X_train).reshape(len(self.L.X_train), -1)
            y_train = np.array(self.L.y_train)
            return X_train, y_train
        if model_type == "LSMT":
            return self.L.train_dataloader
        raise ValueError(f'No Model Type Selected: {model_type}')

    def lsmt_training(self):
        """Train the LSTM with AdamW on MSE for ``config.epochs`` epochs and return it."""
        model = self.get_model(self.config.model_type)
        optimizer = optim.AdamW(model.parameters(), lr=self.config.lr, betas=(0.9, 0.95))
        model.train()  # enable training-time behaviour of the module
        training_data_loader = self.get_training_data(self.config.model_type)

        for _epoch in range(self.config.epochs): # type: ignore
            for data, target in training_data_loader:
                optimizer.zero_grad()
                output = model(data.type(torch.float32))
                loss = F.mse_loss(output, target.type(torch.float32))
                loss.backward()
                optimizer.step()

        return model

    def xgb_fit_training(self):
        """Fit an ``XGBRegressor`` on the flattened training split and return it."""
        model = self.get_model(self.config.model_type)
        X_train, y_train = self.get_training_data(self.config.model_type)
        model.fit(X_train, y_train)
        return model

    def run(self, input_config: "TrainingConfig"):
        """Load the data for ``input_config``, train the model and return it.

        The model type is validated before any data is loaded so that a bad
        configuration fails immediately.

        Raises:
            ValueError: if ``input_config.model_type`` is not supported.
        """
        model_type = input_config.model_type
        trainers = {"XGB": self.xgb_fit_training, "LSMT": self.lsmt_training}
        if model_type not in trainers:
            raise ValueError(f'Failed to train: {model_type}')

        self.config = input_config
        self.L = Loader(input_config)
        print(f'Starting Training of {model_type} on dataset: {self.config.dataset_name} column: {self.config.column_name}')

        st = time.perf_counter()
        MODEL = trainers[model_type]()
        print(f'Completed Training of {model_type} in {(time.perf_counter()-st)/60:.2f} min')
        return MODEL

In [None]:
# Train the final models with the configurations defined above. The LSTM run
# is left disabled; only the XGBoost model is trained here.
# lsmt_MODEL = Trainer().run(lsmt_config)
xgb_MODEL = Trainer().run(xgb_config)

In [None]:
class Validate:
    """Score a trained model on the test split of a ``TrainingConfig``.

    ``run`` is the entry point; it returns ``{"mse": ..., "mape": ...}``.
    Both model types are scored over the whole test split at once, so their
    numbers are directly comparable.
    """

    def __init__(self) -> None:
        # All three attributes are populated by run(); None until then.
        self.config = None
        self.L = None
        self.MODEL = None

    def get_testing_data(self, model_type) -> Union[Tuple[np.ndarray,np.ndarray], DataLoader]:
        """Return the test data in the form the model type consumes.

        ``"XGB"`` gets ``(X, y)`` arrays with every sample flattened to one
        row; ``"LSMT"`` gets the loader's test dataloader.

        Raises:
            ValueError: if ``model_type`` is not supported.
        """
        if model_type == "XGB":
            X_test = np.array(self.L.X_test).reshape(len(self.L.X_test), -1)
            y_test = np.array(self.L.y_test)
            return X_test, y_test
        if model_type == "LSMT":
            return self.L.test_dataloader
        raise ValueError(f'No Model Type Selected: {model_type}')

    @torch.no_grad()
    def validate_lsmt(self) -> dict:
        """Return MSE and MAPE of the LSTM over the whole test dataloader.

        Predictions and targets of all batches are collected first and the
        metrics are computed once. This avoids over-weighting a shorter final
        batch and also works when a batch holds a single sample (flattening
        with ``reshape(-1)`` always yields a 1-D array, unlike ``squeeze()``).
        An empty dataloader yields NaN for both metrics.
        """
        self.MODEL.eval()  # enable inference-time behaviour of the module
        testing_data_loader = self.get_testing_data(self.config.model_type)

        targets, predictions = [], []
        for data, target in testing_data_loader:
            out = self.MODEL(data.type(torch.float32))
            targets.append(target.reshape(-1).numpy())
            predictions.append(out.reshape(-1).numpy())

        if not targets:
            return {"mse": float("nan"), "mape": float("nan")}

        y_true = np.concatenate(targets)
        y_pred = np.concatenate(predictions)
        return {
            "mse": float(mean_squared_error(y_true=y_true, y_pred=y_pred)),
            "mape": float(mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)),
        }

    def validate_xgb(self) -> dict:
        """Return MSE and MAPE of the XGBoost model over the test split."""
        X_test, y_test = self.get_testing_data(self.config.model_type)
        # Compare plain 1-D vectors of targets and predictions.
        y_true = np.asarray(y_test).ravel()
        y_pred = np.asarray(self.MODEL.predict(X_test)).ravel()
        mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
        mape = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)
        return {"mse": mse, "mape": mape}

    def run(self, trained_model, input_config: "TrainingConfig") -> dict:
        """Validate ``trained_model`` on the test data described by ``input_config``.

        The model type is checked before any data is loaded so that a bad
        configuration fails immediately.

        Raises:
            ValueError: if ``input_config.model_type`` is not supported.
        """
        validators = {"XGB": self.validate_xgb, "LSMT": self.validate_lsmt}
        if input_config.model_type not in validators:
            raise ValueError(f'Failed to validate: {input_config.model_type}')

        # Set up
        self.MODEL, self.config = trained_model, input_config
        self.L = Loader(input_config)
        print(f'Starting validation of {self.config.model_type}')

        # Run validation with test data
        return validators[self.config.model_type]()

In [None]:
# Score the trained XGBoost model on the test split; the call returns a dict
# with "mse" and "mape". The LSTM validation is left disabled.
# Validate().run(lsmt_MODEL, lsmt_config)
Validate().run(xgb_MODEL, xgb_config)

In [None]:
class Compare:
    """Benchmark XGBoost against the LSTM on every dataset/column pair.

    Each trial trains a model with ``Trainer``, scores it with ``Validate``
    and appends one result row (model type, dataset label, wall time and the
    returned metrics) to ``self.res``.
    """

    def __init__(self) -> None:
        self.datasets:list = self.get_datasets()
        self.res = []

        # Base XGBoost configuration; dataset and column are overwritten per trial.
        self.xgb_config = TrainingConfig(
            model_type="XGB",
            dataset_name="co2-1",
            column_name='zone_072_co2',
            standardized=True,
            block_size=96,
            n_estimators=300,
            max_depth=5,
            lr=0.07044179896591762,
            subsample=0.5681930635231892,
            colsample_bytree=0.8556373196284424,
            epochs=None,
            batch_size=None,
        )

        # Base LSTM configuration; dataset and column are overwritten per trial.
        self.lsmt_config = TrainingConfig(
            model_type="LSMT",
            dataset_name="co2-1",
            column_name='zone_072_co2',
            standardized=True,
            epochs=5,
            batch_size=32,
            block_size=64,
            hidden_dim=256,
            n_layers=1,
            lr=0.0017066009255505266,
        )

    def get_datasets(self) -> list[dict]:
        """Return the dataset/column pairs to benchmark, CO2 zones first."""
        co2_pairs = [
            {"dataset_name": name, "column_name": column}
            for name in ("co2-1", "co2-2")
            for column in ("zone_022_co2", "zone_052_co2", "zone_072_co2")
        ]
        ele_pairs = [
            {"dataset_name": name, "column_name": "mels_S"}
            for name in ("ele-1", "ele-2")
        ]
        return co2_pairs + ele_pairs

    def update_dataset(self, model_type:str, data:dict) -> TrainingConfig:
        """Point the matching base config at ``data`` and return that same object.

        ``"XGB"`` selects the XGBoost config; any other value selects the
        LSTM config. The config is modified in place.
        """
        if model_type == "XGB":
            target = self.xgb_config
        else:
            target = self.lsmt_config
        target.dataset_name = data["dataset_name"]
        target.column_name = data["column_name"]
        return target

    def run_trail(self, cfg:TrainingConfig):
        """Train and validate one configuration and record the outcome."""
        started = time.perf_counter()
        trained = Trainer().run(cfg)
        metrics = Validate().run(trained, cfg)

        # The recorded time covers both training and validation.
        row = {
            "model_type": cfg.model_type,
            "dataset": "_".join((cfg.dataset_name, cfg.column_name)),
            "time": time.perf_counter() - started,
        }
        row.update(metrics)

        self.res.append(row)

    def run(self, export=False)->pd.DataFrame:
        """Run every model on every dataset and return the results as a frame.

        With ``export=True`` the frame is also written to
        ``experiment_results.csv`` without the index column.
        """
        for model_type in ("XGB", "LSMT"):
            for pair in self.datasets:
                self.run_trail(self.update_dataset(model_type, pair))

        frame = pd.DataFrame(self.res)

        if export:
            frame.to_csv("experiment_results.csv", index=False)

        return frame

In [None]:
# Run the full XGB vs. LSMT comparison over every dataset/column pair and
# write the result table to experiment_results.csv.
Compare().run(export=True)

## All LSTM Models

In [None]:
class LSMTModels:
    """Train one LSTM per dataset/column pair and optionally save each to disk.

    Unlike ``Trainer``, training data comes from ``Loader.get_total_trainer()``
    rather than from the loader's train dataloader.
    """

    def __init__(self):
        self.datasets:list = self.get_datasets()
        # Base configuration; dataset and column are overwritten per model.
        self.config = TrainingConfig(
            dataset_name="co2-1",
            column_name='zone_072_co2',
            model_type="LSMT",
            standardized = True,
            epochs = 5,
            batch_size = 32,
            block_size = 64,
            hidden_dim = 256,
            n_layers = 1,
            lr=0.0017066009255505266
        )

    def get_datasets(self) -> list[dict]:
        """Return the dataset/column pairs to train on, CO2 zones first."""
        datasets = []
        for dataset in ["co2-1","co2-2"]:
            for col in ["zone_022_co2","zone_052_co2","zone_072_co2"]:
                datasets.append({"dataset_name":dataset, "column_name":col})

        datasets.append({"dataset_name":"ele-1", "column_name":"mels_S"})
        datasets.append({"dataset_name":"ele-2", "column_name":"mels_S"})

        return datasets

    def update_dataset(self, data:dict) -> "TrainingConfig":
        """Point ``self.config`` at ``data`` (in place) and return it."""
        cfg = self.config
        cfg.dataset_name = data["dataset_name"]
        cfg.column_name = data["column_name"]
        return cfg

    def _load_data(self):
        """Build a ``Loader`` for the current config and return its total-trainer loader."""
        self.L = Loader(self.config)
        return self.L.get_total_trainer()

    def lsmt_training(self):
        """Train a fresh LSTM with AdamW on MSE for ``config.epochs`` epochs and return it."""
        model = LSTM(config=self.config)
        optimizer = optim.AdamW(model.parameters(), lr=self.config.lr, betas=(0.9, 0.95))
        model.train()  # enable training-time behaviour of the module
        data_loader = self._load_data()

        for _epoch in range(self.config.epochs): # type: ignore
            for data, target in data_loader:
                optimizer.zero_grad()
                # A trailing feature axis is added so the input matches the
                # LSTM's input size of 1.
                output = model(data.unsqueeze(-1).type(torch.float32))
                # Drop only the trailing output axis so the batch axis
                # survives even when a batch holds a single sample.
                # NOTE(review): assumes target is 1-D per batch - confirm
                # against Loader.get_total_trainer().
                loss = F.mse_loss(output.squeeze(-1), target.type(torch.float32))
                loss.backward()
                optimizer.step()

        return model

    def train_lsmt_models(self, export=False):
        """Train one model per dataset and return them keyed by file name.

        Args:
            export: when true, each model is also saved under ``models/``
                (created if missing) and the elapsed time is printed.

        Returns:
            dict mapping ``"<model_type>_<dataset>_<column>.pk"`` to the
            trained model.
        """
        import os

        trained = {}
        for dataset in self.datasets:
            st = time.perf_counter()
            self.config = self.update_dataset(dataset)
            model_file_name = f'{self.config.model_type}_{self.config.dataset_name}_{self.config.column_name}.pk'
            MODEL = self.lsmt_training()
            trained[model_file_name] = MODEL
            if export:
                os.makedirs("models", exist_ok=True)
                # NOTE: this pickles the whole module, so loading the file
                # later requires the LSTM class to be importable.
                torch.save(MODEL, "models/"+model_file_name)
                print(f'Saved {model_file_name} in {(time.perf_counter()-st)/60:.1f} min')

        return trained

In [None]:
# sets = LSMTModels().train_lsmt_models(export=True)