# Use the encoder part of the Informer model for time series classification

Informer is a Transformer-based model for time series forecasting. Its encoder can also be used on its own to extract features from time series data, which are then pooled over time and passed to a classifier.

In [None]:
import math

import torch
import torch.nn as nn
from einops import rearrange, repeat
from einops.layers.torch import Rearrange

class TemporalAggregation(nn.Module):
    """Project a channel-first feature map and average it over dimension 2.

    A kernel-size-1 ``Conv1d`` maps ``dim_model`` input channels to
    ``dim_model`` output channels, and the result is averaged over
    dimension 2 (the length axis of the convolution), so a
    ``(batch, dim_model, length)`` input becomes ``(batch, dim_model)``.
    """

    def __init__(self, dim_model):
        super().__init__()
        # Kernel size 1: each position is projected independently.
        self.proj = nn.Conv1d(
            in_channels=dim_model,
            out_channels=dim_model,
            kernel_size=1,
        )

    def forward(self, x):
        """Return the mean over dimension 2 of the projected input."""
        projected = self.proj(x)
        return torch.mean(projected, dim=2)

class InformerEncoder(nn.Module):
    """Encoder stacks followed by temporal pooling and a linear classifier.

    Args:
        input_dim: Number of channels of the input series.
        dim_model: Channel width used inside the encoder.
        n_heads: Forwarded to every ``InformerStack``.
        num_stacks: Number of ``InformerStack`` layers.
        factor: Forwarded to every ``InformerStack``; also the default
            length of the positional table (see ``max_len``).
        distil: Forwarded to every ``InformerStack``.
        num_classes: Output size of the classifier. When ``None`` the
            module-level name ``num_classes`` is used, which is how this
            class obtained the value before the parameter existed.
        max_len: Longest sequence the learned positional table covers.
            Defaults to ``factor``, the original table length.

    Raises:
        NameError: If ``num_classes`` is neither passed nor defined at
            module level.
    """

    def __init__(self, input_dim, dim_model, n_heads, num_stacks, factor, distil,
                 num_classes=None, max_len=None):
        super().__init__()
        if num_classes is None:
            # Legacy behaviour: the class used to read a module-level global.
            try:
                num_classes = globals()["num_classes"]
            except KeyError:
                raise NameError(
                    "num_classes was not passed to InformerEncoder and no "
                    "module-level 'num_classes' is defined"
                ) from None

        self.input_dim = input_dim
        self.dim_model = dim_model
        self.n_heads = n_heads
        self.num_stacks = num_stacks
        self.factor = factor
        self.distil = distil
        self.num_classes = num_classes
        self.max_len = factor if max_len is None else max_len

        self.proj = nn.Conv1d(input_dim, dim_model, 1)
        # Learned positional table, stored channel-first like the Conv1d output.
        self.pos_enc = nn.Parameter(torch.randn(1, dim_model, self.max_len))

        self.stacks = nn.ModuleList(
            [InformerStack(dim_model, n_heads, factor, distil) for _ in range(num_stacks)]
        )

        self.aggregation = TemporalAggregation(dim_model)
        self.classifier = nn.Linear(dim_model, num_classes)

    def forward(self, x):
        """Map a ``(batch, input_dim, length)`` series to class scores.

        Returns:
            Tensor of shape ``(batch, num_classes)``.

        Raises:
            ValueError: If ``length`` exceeds the positional table.
        """
        seq_len = x.size(-1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional table "
                f"length {self.max_len}; build the model with a larger max_len"
            )

        x = self.proj(x)                            # (b, dim_model, l)
        # Add positions while still channel-first so the shapes line up;
        # out-of-place so the broadcast over the batch axis is explicit.
        x = x + self.pos_enc[:, :, :seq_len]

        # The stacks apply LayerNorm over the last axis, so go channel-last.
        x = rearrange(x, 'b c l -> b l c')
        for stack in self.stacks:
            x = stack(x)

        # TemporalAggregation convolves and pools channel-first input.
        x = rearrange(x, 'b l c -> b c l')
        x = self.aggregation(x)                     # (b, dim_model)
        return self.classifier(x)

class InformerStack(nn.Module):
    """One pre-norm encoder layer: attention, then feed-forward.

    Each sub-layer sees a ``LayerNorm``-ed copy of its input (normalised
    over a last axis of size ``dim_model``) and its output is added back
    to the un-normalised input.
    """

    def __init__(self, dim_model, n_heads, factor, distil):
        super().__init__()
        self.attn = ProbSparseSelfAttention(
            dim_model=dim_model, n_heads=n_heads, factor=factor
        )
        self.ff = FFN(dim_model=dim_model, distil=distil)
        self.norm1 = nn.LayerNorm(normalized_shape=dim_model)
        self.norm2 = nn.LayerNorm(normalized_shape=dim_model)

    def forward(self, x):
        """Apply the attention and feed-forward residual branches in turn."""
        after_attn = x + self.attn(self.norm1(x))
        after_ffn = after_attn + self.ff(self.norm2(after_attn))
        return after_ffn

class ProbSparseSelfAttention(nn.Module):
    """Multi-head ProbSparse self-attention on ``(batch, length, dim_model)``.

    Every query is scored by "max minus mean" of its scaled dot products
    with a random sample of keys. Only the highest-scoring queries attend
    to all keys with a softmax; every other position outputs the mean of
    the values. With ``n = min(length, max(1, factor * ceil(ln(length))))``,
    both the number of sampled keys per query and the number of queries
    kept are ``n``.

    Because keys are sampled with ``torch.randint``, outputs can differ
    between calls whenever ``n < length`` (including in eval mode).

    Args:
        dim_model: Channel width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        factor: Sampling factor used in the formula above.

    Raises:
        ValueError: If ``dim_model`` is not divisible by ``n_heads``.
    """

    def __init__(self, dim_model, n_heads, factor):
        super().__init__()
        if dim_model % n_heads != 0:
            raise ValueError(
                f"dim_model ({dim_model}) must be divisible by n_heads ({n_heads})"
            )
        self.dim_model = dim_model
        self.n_heads = n_heads
        self.factor = factor
        self.head_dim = dim_model // n_heads

        # Kernel-size-1 convolutions act as per-position linear maps; they
        # expect channel-first input, so forward() transposes around them.
        self.qkv = nn.Conv1d(dim_model, 3 * dim_model, 1)
        self.proj = nn.Conv1d(dim_model, dim_model, 1)
        self.scale = self.head_dim ** -0.5

    def _budget(self, seq_len):
        """Return how many keys to sample and how many queries to keep."""
        return min(seq_len, max(1, self.factor * math.ceil(math.log(seq_len))))

    def forward(self, x):
        """Return the attended sequence, same shape as ``x``."""
        b, l, c = x.size()
        h, d = self.n_heads, self.head_dim

        # (b, l, c) -> (b, c, l) for Conv1d, then split into q/k/v per head.
        qkv = self.qkv(x.transpose(1, 2)).reshape(b, 3, h, d, l)
        q, k, v = (t.transpose(-1, -2) for t in qkv.unbind(dim=1))  # (b, h, l, d)
        q = q * self.scale

        n_keep = self._budget(l)

        # Sparsity score of each query from its own random sample of keys.
        sample_idx = torch.randint(l, (l, n_keep), device=x.device)
        k_sample = k[:, :, sample_idx, :]                            # (b, h, l, s, d)
        sampled = torch.einsum('bhld,bhlsd->bhls', q, k_sample)
        sparsity = sampled.max(dim=-1).values - sampled.mean(dim=-1)  # (b, h, l)

        # Keep the most "active" queries and let them attend to every key.
        top_idx = sparsity.topk(n_keep, dim=-1).indices               # (b, h, u)
        row_idx = top_idx.unsqueeze(-1).expand(-1, -1, -1, d)         # (b, h, u, d)
        q_top = q.gather(2, row_idx)
        attn = (q_top @ k.transpose(-1, -2)).softmax(dim=-1)          # (b, h, u, l)
        out_top = attn @ v                                            # (b, h, u, d)

        # All remaining positions fall back to the mean of the values.
        context = v.mean(dim=2, keepdim=True).repeat(1, 1, l, 1)
        context = context.scatter(2, row_idx, out_top)                # (b, h, l, d)

        # Merge heads channel-first for the output projection, then restore
        # the channel-last layout of the input.
        merged = context.permute(0, 1, 3, 2).reshape(b, c, l)
        return self.proj(merged).transpose(1, 2)

class FFN(nn.Module):
    """Position-wise feed-forward block on ``(batch, length, dim_model)``.

    Two kernel-size-1 convolutions expand the channels to ``4 * dim_model``
    and back, with a GELU in between.

    Args:
        dim_model: Channel width of the input and output.
        distil: When truthy, the block's input is added to its output.
            NOTE(review): in the Informer paper "distilling" refers to a
            down-sampling step between encoder layers; here the flag only
            toggles this extra skip connection -- confirm that is intended.
    """

    def __init__(self, dim_model, distil):
        super().__init__()
        self.fc1 = nn.Conv1d(dim_model, dim_model * 4, 1)
        self.fc2 = nn.Conv1d(dim_model * 4, dim_model, 1)
        self.distil = distil

    def forward(self, x):
        """Return the transformed sequence, same shape as ``x``."""
        # Conv1d is channel-first, while the caller (after LayerNorm over
        # the last axis) supplies channel-last data.
        hidden = self.fc1(x.transpose(1, 2))
        # Tensors have no ``.gelu()`` method; use the functional form.
        hidden = nn.functional.gelu(hidden)
        out = self.fc2(hidden).transpose(1, 2)
        if self.distil:
            out = x + out
        return out


In [None]:
# Hyper-parameters of the demo model.
input_dim = 1      # channels of the input series
dim_model = 512    # channel width inside the encoder
n_heads = 8        # attention heads per stack
num_stacks = 2     # how many InformerStack layers to build
factor = 50        # passed to the encoder as its ``factor`` argument
distil = True      # passed to the encoder as its ``distil`` argument
# Read by InformerEncoder as a module-level name when it builds its
# classifier, so it must be defined before the model is constructed.
num_classes = 10

model = InformerEncoder(
    input_dim=input_dim,
    dim_model=dim_model,
    n_heads=n_heads,
    num_stacks=num_stacks,
    factor=factor,
    distil=distil,
)
