## Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import os
import sys
import argparse
import torch.nn as nn
import torch.nn.functional as F
import torch.fft
import ray
import random
from ray import tune, train
from ray.train import Checkpoint
from ray.tune.schedulers import ASHAScheduler
from torch.utils.data import Dataset, DataLoader, random_split
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tqdm import tqdm
from torchmetrics.classification import (
    MulticlassAccuracy, MulticlassF1Score, MulticlassPrecision, MulticlassRecall,
    MulticlassConfusionMatrix, MulticlassROC
)
import warnings
warnings.filterwarnings("ignore")

## Utils

In [2]:
def transfer_labels(labels):
    """Re-encode class labels in place as consecutive integer ids.

    Every entry of ``labels`` is replaced by the position of its value in
    the sorted array of distinct values returned by ``np.unique``, so the
    smallest label becomes 0, the next one 1, and so on.

    Args:
        labels: numpy array of labels, indexed along its first axis.

    Returns:
        The same ``labels`` array object, modified in place.
    """
    classes = np.unique(labels)

    for row in range(labels.shape[0]):
        # Position of this label inside the sorted list of distinct labels.
        position = np.argwhere(labels[row] == classes)[0][0]
        labels[row] = position

    return labels

def collate_fn(data, device, max_len=None):
    """Collate a list of (X, y) samples into padded mini-batch tensors.

    Args:
        data: len(batch_size) list of tuples (X, y).
            - X: torch tensor of shape (seq_length, feat_dim); variable seq_length.
            - y: torch tensor holding the label(s) of the sample; all samples must
                share one shape so they can be stacked.
        device: device the returned tensors are moved to.
        max_len: global fixed sequence length. Used for architectures requiring fixed length input,
            where the batch length cannot vary dynamically. Longer sequences are clipped, shorter are
            padded with 0s. When None, the longest sequence of the batch is used.
    Returns:
        X: (batch_size, padded_length, feat_dim) torch tensor of zero-padded features
        targets: labels stacked along a new leading batch dimension
        padding_masks: (batch_size, padded_length) boolean tensor, 1 means keep vector at this position,
            0 means padding
    """

    batch_size = len(data)
    features, labels = zip(*data)

    # Original sequence length of every time series in the batch.
    lengths = [X.shape[0] for X in features]
    if max_len is None:
        max_len = max(lengths)

    # Zero-initialised buffer: positions past a sequence's end stay 0 (padding).
    X = torch.zeros(batch_size, max_len, features[0].shape[-1])  # (batch_size, padded_length, feat_dim)
    for i in range(batch_size):
        end = min(lengths[i], max_len)  # clip sequences longer than max_len
        X[i, :end, :] = features[i][:end, :]

    targets = torch.stack(labels, dim=0)

    # Lengths are stored as int64: int16 cannot represent lengths above 32767.
    padding_masks = padding_mask(torch.tensor(lengths, dtype=torch.long),
                                 max_len=max_len)  # (batch_size, padded_length) boolean tensor, "1" means keep

    return X.to(device), targets.to(device), padding_masks.to(device)


def padding_mask(lengths, max_len=None):
    """
    Used to mask padded positions: creates a (batch_size, max_len) boolean mask from a tensor of sequence lengths,
    where 1 means keep element at this position (time step)

    Args:
        lengths: integer tensor with one sequence length per batch element.
        max_len: number of mask columns. When None, the largest value in
            ``lengths`` is used (0 for an empty ``lengths`` tensor).

    Returns:
        Boolean tensor of shape (batch_size, max_len) on the device of
        ``lengths``; entry [b, t] is True iff t < lengths[b].
    """
    batch_size = lengths.numel()
    if max_len is None:
        # Tensors expose .max(), not .max_val(); convert to a plain int for arange.
        max_len = int(lengths.max()) if batch_size > 0 else 0

    positions = torch.arange(0, max_len, device=lengths.device).type_as(lengths)
    # Compare every time-step index against each sequence's own length.
    return positions.repeat(batch_size, 1).lt(lengths.unsqueeze(1))

def set_seed(args):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        args: object exposing an integer ``random_seed`` attribute.
    """
    seed = args.random_seed
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


class HARDataset(Dataset):
    """Map-style dataset pairing a 3-D sample tensor with per-sample targets."""

    def __init__(self, dataset, target):
        # The incoming tensor is stored with its last two axes swapped;
        # per the original note it arrives as
        # (num_size, num_dimensions, series_length).
        swapped = dataset.permute(0, 2, 1)
        self.dataset = swapped
        self.target = target

    def __len__(self):
        """Number of samples, taken from the target container."""
        return len(self.target)

    def __getitem__(self, index):
        """Return the (sample, target) pair stored at ``index``."""
        sample = self.dataset[index]
        label = self.target[index]
        return sample, label
    
def normalize(data_set):
    '''
    Scale ``data_set`` with a default-configured tslearn
    ``TimeSeriesScalerMeanVariance`` and return the transformed data.
    Works for series with several variables as well.
    '''
    scaler = TimeSeriesScalerMeanVariance()
    scaled = scaler.fit_transform(data_set)
    return scaled

## Arguments

In [3]:
def arg_parse():
    """Build the experiment configuration from the argparse defaults.

    The parser is always run on an empty argument list, so the returned
    namespace holds the defaults below. Afterwards ``device`` is replaced by
    a ``torch.device`` ('cuda:0' when CUDA is available, else 'cpu') and the
    random generators are seeded through ``set_seed``.

    Returns:
        argparse.Namespace with the experiment settings.
    """
    # (flag, type, default, help) -- registered in exactly this order.
    option_table = (
        # Base setup
        ('--random_seed', int, 42, 'shuffle seed'),
        # Dataset setup
        ('--dataset', str, 'UCI-HAR', 'dataset name [UCI-HAR, mHealth, PAMAP2]'),
        ('--num_classes', int, 0, 'number of class'),
        ('--normalize_way', str, 'single', 'single or train_set'),
        ('--input_size', int, 1, 'input_size'),
        # basic config
        ('--freq', str, 'h',
         'freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h'),
        # forecasting task
        ('--seq_len', int, 96, 'input sequence length'),
        ('--pred_len', int, 0, 'prediction sequence length'),
        # model define
        ('--top_k', int, 3, 'for TimesBlock'),
        ('--num_kernels', int, 6, 'for Inception'),
        ('--enc_in', int, 7, 'encoder input size'),
        ('--c_out', int, 7, 'output size'),
        ('--d_model', int, 64, 'dimension of model'),
        ('--n_heads', int, 8, 'num of heads'),
        ('--e_layers', int, 3, 'num of encoder layers'),
        ('--d_ff', int, 64, 'dimension of fcn'),
        ('--dropout', float, 0.1, 'dropout'),
        ('--embed', str, 'timeF', 'time features encoding, options:[timeF, fixed, learned]'),
        ('--att_loc', str, 'none', 'attention location, options:[block, model, none]'),
        # GPU
        ('--gpu', int, 1, 'gpu'),
        # training setup
        ('--optimizer', str, 'adam', 'optimizer'),
        ('--lr', float, 0.001, 'learning rate'),
        ('--weight_decay', float, 0.0, 'weight decay'),
        ('--batch_size', int, 8, ''),
        ('--epoch', int, 50, 'training epoch'),
        ('--device', str, 'cuda', 'device'),
    )

    parser = argparse.ArgumentParser()
    for flag, value_type, default_value, help_text in option_table:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)

    # An explicit empty list keeps argparse away from sys.argv.
    args = parser.parse_args(args=[])

    # The parsed --device string is superseded by the detected device.
    device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    args.device = torch.device(device_name)

    set_seed(args)
    return args

## Load Data

In [4]:
def load_UCI(args, data_dir=None):
    """Load the UCI-HAR CSV files and wrap them in train/test DataLoaders.

    Reads ``train.csv`` and ``test.csv``, drops the ``subject`` column, maps
    the ``Activity`` names to integer ids (index in the sorted list of
    activities found in the training file) and treats every remaining column
    as one time step of a single-channel series.

    Side effects on ``args``: ``num_classes``, ``seq_len``, ``input_size`` and
    ``enc_in`` are set from the data, and ``batch_size`` is halved until it no
    longer exceeds 60% of the number of training samples.

    Args:
        args: configuration namespace; reads ``batch_size``, ``normalize_way``
            and ``device`` (and an optional ``data_dir``).
        data_dir: directory containing ``train.csv`` and ``test.csv``. When
            None, ``args.data_dir`` is used if present, otherwise the
            original hard-coded location.

    Returns:
        (train_loader, test_loader)
    """
    default_dir = '/home/rifki/research/Kuliah/har-timesnet/dataset/UCI-HAR-Dataset'
    if data_dir is None:
        data_dir = getattr(args, 'data_dir', default_dir)

    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

    train = train.drop('subject', axis=1)
    test = test.drop('subject', axis=1)

    # Encode activity names as their index in the sorted list of training labels.
    labelNames = list(np.unique(train['Activity']))
    labelNames.sort()
    train['Activity'] = train['Activity'].apply(lambda x: labelNames.index(x))
    test['Activity'] = test['Activity'].apply(lambda x: labelNames.index(x))

    # Features are every column but the last; the last column is the label.
    x_train = train.iloc[:, :-1].to_numpy(dtype=np.float32)
    y_train = train.iloc[:, -1].to_numpy(dtype=np.int64)

    x_test = test.iloc[:, :-1].to_numpy(dtype=np.float32)
    y_test = test.iloc[:, -1].to_numpy(dtype=np.int64)

    # Define the number of classes
    num_classes = len(np.unique(y_train))

    # Add a trailing channel axis: (samples, features) -> (samples, features, 1).
    x_train = x_train[:, :, np.newaxis]
    x_test = x_test[:, :, np.newaxis]

    args.num_classes = num_classes
    args.seq_len = x_train.shape[1]
    args.input_size = x_train.shape[2]

    args.enc_in = x_train.shape[2]

    # Shrink the batch size for very small training sets.
    while x_train.shape[0] * 0.6 < args.batch_size:
        args.batch_size = args.batch_size // 2

    if args.normalize_way == 'single':
        x_train = normalize(x_train)
        x_test = normalize(x_test)

    def to_dataset(x, y):
        # HARDataset swaps the last two axes again, so the data is handed over
        # pre-swapped and ends up as (samples, seq_len, channels).
        features = torch.from_numpy(x).float().to(args.device).permute(0, 2, 1)
        targets = torch.from_numpy(y).to(args.device)  # already int64
        return HARDataset(features, targets)

    train_set = to_dataset(x_train, y_train)
    test_set = to_dataset(x_test, y_test)

    # NOTE(review): the training loader does not shuffle; left unchanged here
    # to keep training runs reproducible against earlier results -- confirm
    # whether shuffle=True is wanted.
    train_loader = DataLoader(train_set, batch_size=args.batch_size, num_workers=0, drop_last=True,
                              collate_fn=lambda x: collate_fn(x, args.device, max_len=args.seq_len))
    test_loader = DataLoader(test_set, batch_size=args.batch_size, num_workers=0,
                             collate_fn=lambda x: collate_fn(x, args.device, max_len=args.seq_len))

    return train_loader, test_loader

## Modelling

### Utils

In [5]:
import torch
import torch.nn as nn


class Inception_Block_V1(nn.Module):
    """Bank of parallel 2-D convolutions whose outputs are averaged.

    Branch ``i`` uses a square kernel of size ``2 * i + 1`` with padding
    ``i``, so every branch preserves the spatial size of its input.
    """

    def __init__(self, in_channels, out_channels, num_kernels=6, init_weight=True):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_kernels = num_kernels
        self.kernels = nn.ModuleList([
            nn.Conv2d(in_channels, out_channels, kernel_size=2 * idx + 1, padding=idx)
            for idx in range(self.num_kernels)
        ])
        if init_weight:
            self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal weights and zero biases for every Conv2d."""
        for module in self.modules():
            if not isinstance(module, nn.Conv2d):
                continue
            nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Apply every branch to ``x`` and return the element-wise mean."""
        branch_outputs = [self.kernels[idx](x) for idx in range(self.num_kernels)]
        return torch.stack(branch_outputs, dim=-1).mean(-1)

In [6]:
import torch
import torch.nn as nn
import math


class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding.

    A (1, max_len, d_model) table is precomputed and registered as the buffer
    ``pe``: even columns hold sines, odd columns cosines.

    Args:
        d_model: width of the encoding (odd values are supported).
        max_len: number of positions that are precomputed.
    """

    def __init__(self, d_model, max_len=25000):
        super(PositionalEmbedding, self).__init__()
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.requires_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float()
                    * -(math.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return the encodings for the first ``x.size(1)`` positions.

        Only the size of ``x`` along dim 1 is used; the result has shape
        (1, min(x.size(1), max_len), d_model).
        """
        return self.pe[:, :x.size(1)]


class TokenEmbedding(nn.Module):
    """Project input channels to ``d_model`` with a circular 1-D convolution.

    Uses a bias-free kernel-size-3 convolution; the padding is chosen from
    the installed torch version and the weights are Kaiming-normal
    initialised.
    """

    def __init__(self, c_in, d_model):
        super().__init__()
        if torch.__version__ >= '1.5.0':
            pad = 1
        else:
            pad = 2
        self.tokenConv = nn.Conv1d(c_in, d_model, kernel_size=3, padding=pad,
                                   padding_mode='circular', bias=False)
        # tokenConv is the only Conv1d inside this module.
        nn.init.kaiming_normal_(
            self.tokenConv.weight, mode='fan_in', nonlinearity='leaky_relu')

    def forward(self, x):
        """Embed ``x``: dims 1 and 2 are swapped for the convolution and
        swapped back afterwards."""
        channels_first = x.permute(0, 2, 1)
        return self.tokenConv(channels_first).transpose(1, 2)


class FixedEmbedding(nn.Module):
    """Non-trainable embedding table filled with sinusoidal values.

    Row ``i`` of the (c_in, d_model) table holds sines of ``i`` in the even
    columns and cosines in the odd columns.

    Args:
        c_in: number of rows (distinct indices) in the table.
        d_model: embedding width (odd values are supported).
    """

    def __init__(self, c_in, d_model):
        super(FixedEmbedding, self).__init__()

        w = torch.zeros(c_in, d_model).float()
        w.requires_grad = False

        position = torch.arange(0, c_in).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float()
                    * -(math.log(10000.0) / d_model)).exp()

        w[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine column fewer than sine columns.
        w[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        self.emb = nn.Embedding(c_in, d_model)
        # Replace the randomly initialised weights with the frozen table.
        self.emb.weight = nn.Parameter(w, requires_grad=False)

    def forward(self, x):
        """Look up the rows for integer indices ``x``; the result is detached."""
        return self.emb(x).detach()


class TemporalEmbedding(nn.Module):
    """Sum of per-field embeddings for integer calendar features.

    The last axis of the input is read as month (0), day (1), weekday (2),
    hour (3) and, only when ``freq == 't'``, minute (4).
    """

    def __init__(self, d_model, embed_type='fixed', freq='h'):
        super().__init__()

        # 'fixed' selects the sinusoidal table, anything else a learned one.
        if embed_type == 'fixed':
            Embed = FixedEmbedding
        else:
            Embed = nn.Embedding

        # Table sizes: minute 4, hour 24, weekday 7, day 32, month 13.
        if freq == 't':
            self.minute_embed = Embed(4, d_model)
        self.hour_embed = Embed(24, d_model)
        self.weekday_embed = Embed(7, d_model)
        self.day_embed = Embed(32, d_model)
        self.month_embed = Embed(13, d_model)

    def forward(self, x):
        """Embed every calendar field of ``x`` and add the results."""
        marks = x.long()

        # The minute table only exists for minutely frequency.
        if hasattr(self, 'minute_embed'):
            minute_x = self.minute_embed(marks[:, :, 4])
        else:
            minute_x = 0.

        hour_x = self.hour_embed(marks[:, :, 3])
        weekday_x = self.weekday_embed(marks[:, :, 2])
        day_x = self.day_embed(marks[:, :, 1])
        month_x = self.month_embed(marks[:, :, 0])

        return hour_x + weekday_x + day_x + month_x + minute_x


class TimeFeatureEmbedding(nn.Module):
    """Bias-free linear projection of time features to ``d_model``.

    The input width is looked up from ``freq``; an unknown frequency code
    raises ``KeyError``. ``embed_type`` is accepted but not used.
    """

    def __init__(self, d_model, embed_type='timeF', freq='h'):
        super().__init__()

        # Number of time features per frequency code.
        features_per_freq = {
            'h': 4,
            't': 5,
            's': 6,
            'm': 1,
            'a': 1,
            'w': 2,
            'd': 3,
            'b': 3,
        }
        self.embed = nn.Linear(features_per_freq[freq], d_model, bias=False)

    def forward(self, x):
        """Project the time-feature tensor ``x``."""
        projected = self.embed(x)
        return projected


class DataEmbedding(nn.Module):
    """Value + positional (+ optional temporal) embedding followed by dropout."""

    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
        super().__init__()

        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
        self.position_embedding = PositionalEmbedding(d_model=d_model)

        # 'timeF' projects time features linearly; every other type uses
        # the calendar-field embedding tables.
        if embed_type == 'timeF':
            temporal = TimeFeatureEmbedding(
                d_model=d_model, embed_type=embed_type, freq=freq)
        else:
            temporal = TemporalEmbedding(
                d_model=d_model, embed_type=embed_type, freq=freq)
        self.temporal_embedding = temporal

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, x_mark):
        """Embed ``x``; ``x_mark`` may be None to skip the temporal term."""
        embedded = self.value_embedding(x)
        if x_mark is not None:
            embedded = embedded + self.temporal_embedding(x_mark)
        embedded = embedded + self.position_embedding(x)
        return self.dropout(embedded)

In [7]:
def FFT_for_Period(x, k=2):
    """Pick the ``k`` dominant periods of a batch of series via the real FFT.

    Args:
        x: tensor laid out as [B, T, C]; the FFT runs along dim 1.
        k: number of frequencies to keep.

    Returns:
        period: numpy integer array of length ``k`` holding
            ``T // frequency_index`` for each selected frequency.
        weights: tensor of shape (B, k) with the channel-averaged amplitude
            of every sample at the selected frequencies.
    """
    spectrum = torch.fft.rfft(x, dim=1)
    amplitude = spectrum.abs()

    # Rank frequencies by amplitude averaged over batch and channels.
    mean_amplitude = amplitude.mean(0).mean(-1)
    mean_amplitude[0] = 0  # the zero-frequency (DC) term must never win
    top_indices = torch.topk(mean_amplitude, k)[1]
    top_indices = top_indices.detach().cpu().numpy()

    period = x.shape[1] // top_indices
    per_sample_weight = amplitude.mean(-1)[:, top_indices]
    return period, per_sample_weight

### TimesBlock + Attention

In [8]:
class TimesBlockAtt(nn.Module):
    """TimesNet block followed by multi-head self-attention.

    The sequence is folded into a 2-D layout for each of the top-k FFT
    periods, processed by two inception blocks, unfolded, and the k results
    are mixed with softmax-normalised amplitude weights. Self-attention with
    a residual LayerNorm is applied to the mix before the input is added back.
    """

    def __init__(self, configs):
        super().__init__()
        self.seq_len = configs.seq_len
        self.pred_len = configs.pred_len
        self.k = configs.top_k
        self.d_model = configs.d_model
        self.num_heads = configs.n_heads

        # Parameter-efficient design: d_model -> d_ff -> d_model.
        expand = Inception_Block_V1(configs.d_model, configs.d_ff,
                                    num_kernels=configs.num_kernels)
        contract = Inception_Block_V1(configs.d_ff, configs.d_model,
                                      num_kernels=configs.num_kernels)
        self.conv = nn.Sequential(expand, nn.GELU(), contract)

        # Self-attention over the aggregated representation.
        self.attention = nn.MultiheadAttention(
            embed_dim=self.d_model, num_heads=self.num_heads, dropout=configs.dropout)
        self.layer_norm = nn.LayerNorm(self.d_model)

    def forward(self, x):
        """Process ``x`` (unpacked as B, T, N) and return a same-shaped tensor."""
        batch, steps, channels = x.size()
        total_len = self.seq_len + self.pred_len
        period_list, period_weight = FFT_for_Period(x, self.k)

        per_period = []
        for idx in range(self.k):
            period = period_list[idx]

            if total_len % period == 0:
                length = total_len
                folded = x
            else:
                # Zero-pad up to the next multiple of the period.
                length = ((total_len // period) + 1) * period
                filler = torch.zeros(
                    [x.shape[0], (length - total_len), x.shape[2]]).to(x.device)
                folded = torch.cat([x, filler], dim=1)

            # Fold to (B, N, length // period, period) for the 2-D convolutions.
            folded = folded.reshape(batch, length // period, period, channels)
            folded = folded.permute(0, 3, 1, 2).contiguous()
            folded = self.conv(folded)

            # Unfold and drop the padded tail.
            folded = folded.permute(0, 2, 3, 1).reshape(batch, -1, channels)
            per_period.append(folded[:, :total_len, :])

        stacked = torch.stack(per_period, dim=-1)

        # Adaptive aggregation across the k periods.
        mix = F.softmax(period_weight, dim=1)
        mix = mix.unsqueeze(1).unsqueeze(1).repeat(1, steps, channels, 1)
        aggregated = torch.sum(stacked * mix, -1)

        # MultiheadAttention is used sequence-first here: [T, B, N].
        seq_first = aggregated.permute(1, 0, 2)
        attended, _ = self.attention(seq_first, seq_first, seq_first)
        seq_first = self.layer_norm(seq_first + attended)
        aggregated = seq_first.permute(1, 0, 2)  # back to [B, T, N]

        # Residual connection with the block input.
        return aggregated + x


class TimesNetAttBlock(nn.Module):
    """
    TimesNet classifier whose encoder layers are ``TimesBlockAtt`` blocks.

    Paper link: https://openreview.net/pdf?id=ju_Uqw384Oq
    """

    def __init__(self, configs):
        super().__init__()
        self.configs = configs
        encoder_layers = [TimesBlockAtt(configs) for _ in range(configs.e_layers)]
        self.model = nn.ModuleList(encoder_layers)
        self.enc_embedding = DataEmbedding(
            configs.enc_in, configs.d_model, configs.embed, configs.freq, configs.dropout)
        self.layer = configs.e_layers
        self.layer_norm = nn.LayerNorm(configs.d_model)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(configs.dropout)

        # Classification head over the flattened (seq_len * d_model) features.
        self.projection = nn.Linear(configs.d_model * configs.seq_len, configs.num_classes)

    def classification(self, x_enc, x_mark_enc):
        """Return class logits for ``x_enc``.

        ``x_mark_enc`` is multiplied into the features (broadcast over the
        last axis) before flattening; the embedding itself runs without marks.
        """
        hidden = self.enc_embedding(x_enc, None)  # [B, T, C]

        # Encoder: every block is followed by the shared LayerNorm.
        for layer_idx in range(self.layer):
            hidden = self.model[layer_idx](hidden)
            hidden = self.layer_norm(hidden)

        features = self.dropout(self.act(hidden))
        features = features * x_mark_enc.unsqueeze(-1)
        flat = features.reshape(features.shape[0], -1)
        return self.projection(flat)

    def forward(self, x_enc, x_mark_enc, x_dec=None, x_mark_dec=None, mask=None):
        """Run classification; ``x_dec``, ``x_mark_dec`` and ``mask`` are unused."""
        return self.classification(x_enc, x_mark_enc)  # [B, N]


### TimesNet + Attention

In [9]:
class TimesBlockBase(nn.Module):
    """Plain TimesNet block (no attention).

    The sequence is folded into a 2-D layout for each of the top-k FFT
    periods, processed by two inception blocks, unfolded, and the k results
    are mixed with softmax-normalised amplitude weights before the input is
    added back.
    """

    def __init__(self, configs):
        super().__init__()
        self.seq_len = configs.seq_len
        self.pred_len = configs.pred_len
        self.k = configs.top_k

        # Parameter-efficient design: d_model -> d_ff -> d_model.
        expand = Inception_Block_V1(configs.d_model, configs.d_ff,
                                    num_kernels=configs.num_kernels)
        contract = Inception_Block_V1(configs.d_ff, configs.d_model,
                                      num_kernels=configs.num_kernels)
        self.conv = nn.Sequential(expand, nn.GELU(), contract)

    def forward(self, x):
        """Process ``x`` (unpacked as B, T, N) and return a same-shaped tensor."""
        batch, steps, channels = x.size()
        total_len = self.seq_len + self.pred_len
        period_list, period_weight = FFT_for_Period(x, self.k)

        per_period = []
        for idx in range(self.k):
            period = period_list[idx]

            if total_len % period == 0:
                length = total_len
                folded = x
            else:
                # Zero-pad up to the next multiple of the period.
                length = ((total_len // period) + 1) * period
                filler = torch.zeros(
                    [x.shape[0], (length - total_len), x.shape[2]]).to(x.device)
                folded = torch.cat([x, filler], dim=1)

            # Fold to (B, N, length // period, period).
            folded = folded.reshape(batch, length // period, period, channels)
            folded = folded.permute(0, 3, 1, 2).contiguous()
            # 2D conv: from 1d Variation to 2d Variation
            folded = self.conv(folded)
            # Unfold and drop the padded tail.
            folded = folded.permute(0, 2, 3, 1).reshape(batch, -1, channels)
            per_period.append(folded[:, :total_len, :])

        stacked = torch.stack(per_period, dim=-1)

        # Adaptive aggregation across the k periods.
        mix = F.softmax(period_weight, dim=1)
        mix = mix.unsqueeze(1).unsqueeze(1).repeat(1, steps, channels, 1)
        aggregated = torch.sum(stacked * mix, -1)

        # Residual connection with the block input.
        return aggregated + x


class TimesNetAttModel(nn.Module):
    """
    TimesNet classifier with one self-attention layer after the encoder stack.

    Paper link: https://openreview.net/pdf?id=ju_Uqw384Oq
    """

    def __init__(self, configs):
        super().__init__()
        self.configs = configs
        encoder_layers = [TimesBlockBase(configs) for _ in range(configs.e_layers)]
        self.model = nn.ModuleList(encoder_layers)
        self.enc_embedding = DataEmbedding(
            configs.enc_in, configs.d_model, configs.embed, configs.freq, configs.dropout)
        self.layer = configs.e_layers
        self.layer_norm = nn.LayerNorm(configs.d_model)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(configs.dropout)

        # Model-level self-attention.
        self.attention = nn.MultiheadAttention(
            embed_dim=configs.d_model, num_heads=configs.n_heads, dropout=configs.dropout)

        # Classification head over the flattened (seq_len * d_model) features.
        self.projection = nn.Linear(configs.d_model * configs.seq_len, configs.num_classes)

    def classification(self, x_enc, x_mark_enc):
        """Return class logits for ``x_enc``.

        ``x_mark_enc`` is multiplied into the features (broadcast over the
        last axis) before flattening; the embedding itself runs without marks.
        """
        hidden = self.enc_embedding(x_enc, None)  # [B, T, C]

        # Encoder: every block is followed by the shared LayerNorm.
        for layer_idx in range(self.layer):
            hidden = self.model[layer_idx](hidden)
            hidden = self.layer_norm(hidden)

        # MultiheadAttention is used sequence-first here: [T, B, C].
        seq_first = hidden.permute(1, 0, 2)
        attended, _ = self.attention(seq_first, seq_first, seq_first)
        # The same LayerNorm instance normalises the attention residual.
        seq_first = self.layer_norm(seq_first + attended)
        hidden = seq_first.permute(1, 0, 2)  # back to [B, T, C]

        features = self.dropout(self.act(hidden))
        features = features * x_mark_enc.unsqueeze(-1)
        flat = features.reshape(features.shape[0], -1)
        return self.projection(flat)

    def forward(self, x_enc, x_mark_enc, x_dec=None, x_mark_dec=None, mask=None):
        """Run classification; ``x_dec``, ``x_mark_dec`` and ``mask`` are unused."""
        return self.classification(x_enc, x_mark_enc)  # [B, N]


### Model Final

In [10]:
class TimesBlock(nn.Module):
    """TimesNet block with optional block-level self-attention.

    Behaves like the plain block unless ``configs.att_loc == 'block'``, in
    which case self-attention with a residual LayerNorm is applied to the
    aggregated output before the input is added back.
    """

    def __init__(self, configs):
        super().__init__()
        self.seq_len = configs.seq_len
        self.pred_len = configs.pred_len
        self.k = configs.top_k
        self.d_model = configs.d_model
        self.num_heads = configs.n_heads
        self.att_loc = configs.att_loc

        # Parameter-efficient design: d_model -> d_ff -> d_model.
        expand = Inception_Block_V1(configs.d_model, configs.d_ff,
                                    num_kernels=configs.num_kernels)
        contract = Inception_Block_V1(configs.d_ff, configs.d_model,
                                      num_kernels=configs.num_kernels)
        self.conv = nn.Sequential(expand, nn.GELU(), contract)

        # The attention sub-layers only exist for block-level attention.
        if self.att_loc == 'block':
            self.attention = nn.MultiheadAttention(
                embed_dim=self.d_model, num_heads=self.num_heads, dropout=configs.dropout)
            self.layer_norm = nn.LayerNorm(self.d_model)

    def forward(self, x):
        """Process ``x`` (unpacked as B, T, N) and return a same-shaped tensor."""
        batch, steps, channels = x.size()
        total_len = self.seq_len + self.pred_len
        period_list, period_weight = FFT_for_Period(x, self.k)

        per_period = []
        for idx in range(self.k):
            period = period_list[idx]

            if total_len % period == 0:
                length = total_len
                folded = x
            else:
                # Zero-pad up to the next multiple of the period.
                length = ((total_len // period) + 1) * period
                filler = torch.zeros(
                    [x.shape[0], (length - total_len), x.shape[2]]).to(x.device)
                folded = torch.cat([x, filler], dim=1)

            # Fold to (B, N, length // period, period) for the 2-D convolutions.
            folded = folded.reshape(batch, length // period, period, channels)
            folded = folded.permute(0, 3, 1, 2).contiguous()
            folded = self.conv(folded)

            # Unfold and drop the padded tail.
            folded = folded.permute(0, 2, 3, 1).reshape(batch, -1, channels)
            per_period.append(folded[:, :total_len, :])

        stacked = torch.stack(per_period, dim=-1)

        # Adaptive aggregation across the k periods.
        mix = F.softmax(period_weight, dim=1)
        mix = mix.unsqueeze(1).unsqueeze(1).repeat(1, steps, channels, 1)
        aggregated = torch.sum(stacked * mix, -1)

        if self.att_loc == 'block':
            # MultiheadAttention is used sequence-first here: [T, B, N].
            seq_first = aggregated.permute(1, 0, 2)
            attended, _ = self.attention(seq_first, seq_first, seq_first)
            seq_first = self.layer_norm(seq_first + attended)
            aggregated = seq_first.permute(1, 0, 2)  # back to [B, T, N]

        # Residual connection with the block input.
        return aggregated + x


class TimesNetModel(nn.Module):
    """
    TimesNet classifier with configurable attention placement.

    ``configs.att_loc == 'model'`` adds one self-attention layer after the
    encoder stack; the value is also read by every ``TimesBlock``.

    Paper link: https://openreview.net/pdf?id=ju_Uqw384Oq
    """

    def __init__(self, configs):
        super().__init__()
        self.configs = configs
        encoder_layers = [TimesBlock(configs) for _ in range(configs.e_layers)]
        self.model = nn.ModuleList(encoder_layers)
        self.enc_embedding = DataEmbedding(
            configs.enc_in, configs.d_model, configs.embed, configs.freq, configs.dropout)
        self.layer = configs.e_layers
        self.layer_norm = nn.LayerNorm(configs.d_model)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(configs.dropout)
        self.att_loc = configs.att_loc

        # The model-level attention layer only exists when requested.
        if self.att_loc == 'model':
            self.attention = nn.MultiheadAttention(
                embed_dim=configs.d_model, num_heads=configs.n_heads, dropout=configs.dropout)

        # Classification head over the flattened (seq_len * d_model) features.
        self.projection = nn.Linear(configs.d_model * configs.seq_len, configs.num_classes)

    def classification(self, x_enc, x_mark_enc):
        """Return class logits for ``x_enc``.

        ``x_mark_enc`` is multiplied into the features (broadcast over the
        last axis) before flattening; the embedding itself runs without marks.
        """
        hidden = self.enc_embedding(x_enc, None)  # [B, T, C]

        # Encoder: every block is followed by the shared LayerNorm.
        for layer_idx in range(self.layer):
            hidden = self.model[layer_idx](hidden)
            hidden = self.layer_norm(hidden)

        if self.att_loc == 'model':
            # MultiheadAttention is used sequence-first here: [T, B, C].
            seq_first = hidden.permute(1, 0, 2)
            attended, _ = self.attention(seq_first, seq_first, seq_first)
            # The same LayerNorm instance normalises the attention residual.
            seq_first = self.layer_norm(seq_first + attended)
            hidden = seq_first.permute(1, 0, 2)  # back to [B, T, C]

        features = self.dropout(self.act(hidden))
        features = features * x_mark_enc.unsqueeze(-1)
        flat = features.reshape(features.shape[0], -1)
        return self.projection(flat)

    def forward(self, x_enc, x_mark_enc, x_dec=None, x_mark_dec=None, mask=None):
        """Run classification; ``x_dec``, ``x_mark_dec`` and ``mask`` are unused."""
        return self.classification(x_enc, x_mark_enc)  # [B, N]


## Training

In [None]:
def train_model(configs):
    """Train a TimesNet classifier on UCI-HAR and report to Ray Tune each epoch.

    The default arguments from ``arg_parse`` are overridden with every
    key/value pair of ``configs``. After each epoch the model and optimizer
    state dicts are saved to ``models/checkpoint.pt`` and reported together
    with the average training loss and accuracy.

    Args:
        configs: mapping of argument names to values (e.g. a Ray Tune trial
            config); every entry replaces the attribute of the same name.
    """
    args = arg_parse()
    for key, value in configs.items():
        setattr(args, key, value)
    # arg_parse() seeds the RNGs before the overrides above are applied;
    # re-seed so a `random_seed` supplied through `configs` takes effect.
    set_seed(args)

    # Load the dataset (also fills in num_classes, seq_len, enc_in, ...).
    train_loader, _ = load_UCI(args)

    # NOTE(review): every att_loc other than 'block' -- including the default
    # 'none' -- builds the model-level attention variant. Confirm whether
    # TimesNetModel(args), which honours 'none', was intended here.
    if args.att_loc == 'block':
        model = TimesNetAttBlock(args)
    else:
        model = TimesNetAttModel(args)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    if torch.cuda.device_count() > 1:
        # NOTE(review): with DataParallel the saved state_dict keys carry a
        # 'module.' prefix; loaders must wrap the model the same way.
        model = nn.DataParallel(model)
    model.to(device)

    # Define loss function and optimizer. weight_decay is passed through so
    # the configured value is no longer silently ignored (default is 0.0).
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)

    # Initialize accuracy metric
    accuracy_metric = MulticlassAccuracy(num_classes=args.num_classes).to(device)

    temp_checkpoint_dir = "models/"

    # Training loop
    for epoch in range(args.epoch):
        print(f"Epoch {epoch + 1}/{args.epoch}")
        model.train()
        running_loss = 0.0
        accuracy_metric.reset()

        for inputs, labels, padding_x_mask in tqdm(train_loader, disable=True):
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass; the padding mask is passed as x_mark_enc.
            outputs = model(inputs, padding_x_mask)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Accumulate loss
            running_loss += loss.item()

            # Update accuracy metric
            accuracy_metric.update(outputs, labels)

        avg_train_loss = running_loss / len(train_loader)
        train_accuracy = accuracy_metric.compute().item()
        print(f"Training loss: {avg_train_loss:.4f}, Training accuracy: {train_accuracy:.4f}")

        torch.cuda.empty_cache()

        # Save the model and optimizer state, then report the loss and
        # accuracy to Ray Tune together with the checkpoint directory.
        os.makedirs(temp_checkpoint_dir, exist_ok=True)
        torch.save(
            (model.state_dict(), optimizer.state_dict()),
            os.path.join(temp_checkpoint_dir, "checkpoint.pt"),
        )
        checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)

        train.report(
            {"loss": avg_train_loss, "accuracy": train_accuracy},
            checkpoint=checkpoint,
        )
    print("Finished Training")


## Testing

In [12]:
import torch
from torchmetrics.classification import (
    MulticlassAccuracy, MulticlassF1Score, MulticlassPrecision, MulticlassRecall,
    MulticlassConfusionMatrix, MulticlassROC
)
import os

def test_best_model(model, configs):
    """Evaluate ``model`` on the test split and print loss and classification metrics.

    The default options from ``arg_parse()`` are overridden with every entry of
    ``configs`` before the data is loaded, so the loader and the metrics use the
    same settings (e.g. ``batch_size``, ``num_classes``) as the given config.

    Args:
        model: Callable network invoked as ``model(x, padding_x_mask)``; it must
            provide ``eval()``. It is not moved to a device here, so it is
            presumably expected to already live on ``args.device`` -- TODO confirm.
        configs: Mapping of option name to value; each pair is set as an
            attribute on the parsed arguments.

    Returns:
        None. The averaged loss, accuracy, macro F1, macro precision and macro
        recall are printed.
    """
    args = arg_parse()

    for key, value in configs.items():
        setattr(args, key, value)

    # Only the second loader returned by load_UCI is needed for evaluation.
    _, test_loader = load_UCI(args)

    # Initialize metrics
    accuracy_metric = MulticlassAccuracy(num_classes=args.num_classes).to(args.device)
    f1_score_metric = MulticlassF1Score(average='macro', num_classes=args.num_classes).to(args.device)
    precision_metric = MulticlassPrecision(average='macro', num_classes=args.num_classes).to(args.device)
    recall_metric = MulticlassRecall(average='macro', num_classes=args.num_classes).to(args.device)

    # Set the model to evaluation mode
    model.eval()

    # Loss function
    loss_func = torch.nn.CrossEntropyLoss().to(args.device)

    test_loss = 0

    # No need to compute gradients for evaluation
    with torch.no_grad():
        for x, y, padding_x_mask in test_loader:
            x, y = x.to(args.device), y.to(args.device)
            # Forward pass
            pred = model(x, padding_x_mask)
            loss = loss_func(pred, y)
            test_loss += loss.item()

            # Update metrics; the torchmetrics objects accumulate across batches,
            # so no manual correct/total bookkeeping is required.
            accuracy_metric.update(pred, y)
            f1_score_metric.update(pred, y)
            precision_metric.update(pred, y)
            recall_metric.update(pred, y)

    # Compute average test loss (mean over batches) and finalize metrics
    test_loss /= len(test_loader)
    test_accuracy = accuracy_metric.compute().item()
    test_f1_score = f1_score_metric.compute().item()
    test_precision = precision_metric.compute().item()
    test_recall = recall_metric.compute().item()

    # Print test results
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, "
          f"Test F1 Score: {test_f1_score:.4f}, Test Precision: {test_precision:.4f}, "
          f"Test Recall: {test_recall:.4f}")

## Tuning

In [13]:
# Hyperparameter search space: each trial draws one value per sampled entry,
# while num_classes is passed through as a fixed value.
search_space = dict(
    lr=tune.loguniform(1e-5, 1e-2),
    batch_size=tune.choice([64, 128, 192, 256]),
    n_heads=tune.choice([4, 8, 16]),
    d_model=tune.choice([32, 64, 96]),
    num_kernels=tune.choice([2, 4, 6]),
    e_layers=tune.choice([2, 3, 4]),
    seq_len=tune.choice([64, 128, 192]),
    dropout=tune.uniform(0.0, 0.5),
    att_loc=tune.choice(['block', 'model']),
    num_classes=6,
)

# ASHA scheduler used by Ray Tune to decide which trials keep running.
scheduler = ASHAScheduler(max_t=100, grace_period=1, reduction_factor=2)

# Trainable wrapped with the per-trial resource request (6 CPUs, 1 GPU).
trainable_with_resources = tune.with_resources(
    tune.with_parameters(train_model),
    resources={"cpu": 6, "gpu": 1},
)

# Minimise the reported "loss" over 20 sampled configs, one trial at a time.
tuning_config = tune.TuneConfig(
    metric="loss",
    mode="min",
    scheduler=scheduler,
    num_samples=20,
    max_concurrent_trials=1,
)

# Build the tuner and launch the search.
tuner = tune.Tuner(
    trainable_with_resources,
    tune_config=tuning_config,
    param_space=search_space,
)

results = tuner.fit()

0,1
Current time:,2024-12-04 16:07:28
Running for:,00:01:05.97
Memory:,7.2/15.9 GiB

Trial name,# failures,error file
train_model_b4dc3_00000,1,"/tmp/ray/session_2024-12-04_16-06-21_490553_2937416/artifacts/2024-12-04_16-06-22/train_model_2024-12-04_16-06-21/driver_artifacts/train_model_b4dc3_00000_0_att_loc=model,batch_size=64,d_model=32,dropout=0.2226,e_layers=2,lr=0.0000,n_heads=4,num_kernels=2,seq_l_2024-12-04_16-06-22/error.txt"

Trial name,status,loc,att_loc,batch_size,d_model,dropout,e_layers,lr,n_heads,num_kernels,seq_len
train_model_b4dc3_00001,RUNNING,10.34.1.111:2938305,model,192,64,0.133095,4,2.25219e-05,4,6,64
train_model_b4dc3_00000,ERROR,10.34.1.111:2938015,model,64,32,0.222562,2,2.3518e-05,4,2,128


[36m(train_model pid=2938015)[0m Epoch 1/50


2024-12-04 16:06:36,213	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_b4dc3_00000
Traceback (most recent call last):
  File "/home/rifki/anaconda3/envs/timeseries/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/rifki/anaconda3/envs/timeseries/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/rifki/anaconda3/envs/timeseries/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/rifki/anaconda3/envs/timeseries/lib/python3.9/site-packages/ray/_private/worker.py", line 2753, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/home/rifki/anaconda3/envs/timeseries/lib/python3.9/site-packages/ray/_private/worker.py", line 904, in get_objects
    raise value.as_inst

[36m(train_model pid=2938015)[0m Training loss: 1.3567, Training accuracy: 0.4500
[36m(train_model pid=2938305)[0m Epoch 1/50


2024-12-04 16:07:28,770	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/rifki/ray_results/train_model_2024-12-04_16-06-21' in 0.1243s.
2024-12-04 16:07:38,776	ERROR tune.py:1037 -- Trials did not complete: [train_model_b4dc3_00000]
2024-12-04 16:07:38,777	INFO tune.py:1041 -- Total run time: 76.01 seconds (65.84 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/home/rifki/ray_results/train_model_2024-12-04_16-06-21", trainable=...)


In [None]:
# Select the trial whose reported "loss" is the smallest.
best_result = results.get_best_result("loss", "min")
best_config = best_result.config
best_metrics = best_result.metrics

print("Best trial config: {}".format(best_config))
print("Best trial final training loss: {}".format(best_metrics["loss"]))
print("Best trial final training accuracy: {}".format(best_metrics["accuracy"]))

# NOTE(review): best_result is the Ray Tune result object, not a trained
# network. test_best_model calls model.eval() on its first argument, and the
# recorded output of this cell ends with
# "AttributeError: 'Result' object has no attribute 'eval'".
# Presumably the model has to be rebuilt from best_config and its weights
# loaded from the trial's checkpoint before it is passed in -- TODO confirm.
test_best_model(best_result, best_config)

Best trial config: {'lr': 0.0007060636809167652, 'batch_size': 128, 'n_heads': 8, 'd_model': 32, 'num_kernels': 8, 'e_layers': 2, 'seq_len': 64, 'dropout': 0.22736732797949116, 'att_loc': 'model', 'num_classes': 6}
Best trial final validation loss: 0.01353292826452321
Best trial final validation accuracy: 0.9953708052635193


AttributeError: 'Result' object has no attribute 'eval'