In [18]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
# from layers import PositionalEmbedding, MultiHeadSelfAttention, ConvLayer
import datetime
from tensorflow.python.keras.engine.base_layer import Layer
from math import sqrt

In [219]:
f = np.load("mfcc_fixed.npz")
X, Y = f['X'], f['Y']
X_train =  X[0:900]
Y_train =  Y[0:900]        # fold 10


x_test, y_test = X[900:1000], Y[900:1000]
x_train, y_train = X_train[0:800], Y_train[0:800]
x_val, y_val= X_train[800:900], Y_train[800:900]

In [220]:
batch_size = 32

train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=800, reshuffle_each_iteration=True).batch(batch_size, drop_remainder=True)
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.shuffle(buffer_size=800, reshuffle_each_iteration=True).batch(batch_size, drop_remainder=True)
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_dataset.shuffle(buffer_size=800, reshuffle_each_iteration=True).batch(batch_size, drop_remainder=True)


x = x_train[0:batch_size]
x_rank = tf.rank(x).numpy()
x_norm_resize_shape = [batch_size] + list(tf.ones(tf.rank(x), dtype=tf.int32).numpy())[1:]

In [221]:
class ProbAttention(Layer):
    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1):
        super(ProbAttention, self).__init__()
        self.factor = factor
        self.scale = scale
        self.mask_flag = mask_flag
        self.dropout = tf.keras.layers.Dropout(attention_dropout)

    def _prob_QK(self, Q, K, sample_k, n_top):
        # Q [B, H, L, D]
        B, H, L, E = K.shape
        _, _, S, _ = Q.shape

        # calculate the sampled Q_K
        K_expand = tf.broadcast_to(tf.expand_dims(K, -3), (B, H, S, L, E))

        indx_q_seq = tf.random.uniform((S,), maxval=L, dtype=tf.int32)
        indx_k_seq = tf.random.uniform((sample_k,), maxval=L, dtype=tf.int32)

        K_sample = tf.gather(K_expand, tf.range(S), axis=2)

        K_sample = tf.gather(K_sample, indx_q_seq, axis=2)
        K_sample = tf.gather(K_sample, indx_k_seq, axis=3)

        Q_K_sample = tf.squeeze(tf.matmul(tf.expand_dims(Q, -2), tf.einsum("...ij->...ji", K_sample)))
        # find the Top_k query with sparisty measurement
        M = tf.math.reduce_max(Q_K_sample, axis=-1) - tf.raw_ops.Div(x=tf.reduce_sum(Q_K_sample, axis=-1), y=L)
        M_top = tf.math.top_k(M, n_top, sorted=False)[1]
        batch_indexes = tf.tile(tf.range(Q.shape[0])[:, tf.newaxis, tf.newaxis], (1, Q.shape[1], n_top))
        head_indexes = tf.tile(tf.range(Q.shape[1])[tf.newaxis, :, tf.newaxis], (Q.shape[0], 1, n_top))

        idx = tf.stack(values=[batch_indexes, head_indexes, M_top], axis=-1)

        # use the reduced Q to calculate Q_K
        Q_reduce = tf.gather_nd(Q, idx)

        Q_K = tf.matmul(Q_reduce, tf.transpose(K, [0, 1, 3, 2]))

        return Q_K, M_top
    
    
    def _get_initial_context(self, V, L_Q):
        B, H, L_V, D = V.shape
        if not self.mask_flag:
            V_sum = tf.reduce_sum(V, -2)
            contex = tf.identity(tf.broadcast_to(tf.expand_dims(V_sum, -2), [B, H, L_Q, V_sum.shape[-1]]))
        else:  # use mask
            assert (L_Q == L_V)  # requires that L_Q == L_V, i.e. for self-attention only
            contex = tf.math.cumsum(V, axis=-1)
        return contex

    def _update_context(self, context_in, V, scores, index, L_Q):
        B, H, L_V, D = V.shape

        if self.mask_flag:
            attn_mask = ProbMask(B, H, L_Q, index, scores)

            # scores.masked_fill_(attn_mask.mask, -np.inf)
            num = 3.4 * math.pow(10, 38)
            scores = (scores * attn_mask.mask) + (-((attn_mask.mask * num + num) - num))

        attn = tf.keras.activations.softmax(scores, axis=-1)  # nn.Softmax(dim=-1)(scores)
        batch_indexes = tf.tile(tf.range(V.shape[0])[:, tf.newaxis, tf.newaxis], (1, V.shape[1], index.shape[-1]))
        head_indexes = tf.tile(tf.range(V.shape[1])[tf.newaxis, :, tf.newaxis], (V.shape[0], 1, index.shape[-1]))

        idx = tf.stack(values=[batch_indexes, head_indexes, index], axis=-1)

        context_in = tf.tensor_scatter_nd_update(context_in, idx, tf.matmul(attn, V))

        return tf.convert_to_tensor(context_in)

    def call(self, inputs, attn_mask=None):
        queries, keys, values = inputs
        B, L, H, D = queries.shape
        _, S, _, _ = keys.shape

        queries = tf.reshape(queries, (B, H, L, -1))
        keys = tf.reshape(keys, (B, H, S, -1))
        values = tf.reshape(values, (B, H, S, -1))

        U = self.factor * np.ceil(np.log(S)).astype('int').item()
        u = self.factor * np.ceil(np.log(L)).astype('int').item()

        scores_top, index = self._prob_QK(queries, keys, u, U)
        # add scale factor
        scale = self.scale or 1. / sqrt(D)
        if scale is not None:
            scores_top = scores_top * scale
        # get the context
        context = self._get_initial_context(values, L)
        # update the context with selected top_k queries
        context = self._update_context(context, values, scores_top, index, L)

        return context


class AttentionLayer(Layer):
    def __init__(self, attention, d_model, n_heads, d_keys=None,
                 d_values=None):
        super(AttentionLayer, self).__init__()

        d_keys = d_keys or (d_model//n_heads)
        d_values = d_values or (d_model//n_heads)
        self.d_model = d_model

        self.inner_attention = attention
        self.query_projection = tf.keras.layers.Dense(d_keys * n_heads)
        self.key_projection = tf.keras.layers.Dense(d_keys * n_heads)
        self.value_projection = tf.keras.layers.Dense(d_values * n_heads)
        self.out_projection = tf.keras.layers.Dense(d_model)
        self.n_heads = n_heads

    def build(self, input_shape):
        print(input_shape)
        B, L, _ = input_shape[0]
        _, S, _ = input_shape[1]
        H = self.n_heads

        self.queries = self.add_weight(shape=(B, L, H, self.d_model),
                                 initializer='random_normal',
                                 trainable=True)

        self.keys = self.add_weight(shape=(B, S, H, self.d_model),
                                       initializer='random_normal',
                                       trainable=True)

        self.values = self.add_weight(shape=(B, S, H, self.d_model),
                                       initializer='random_normal',
                                       trainable=True)

    def call(self, inputs, attn_mask=None):
        queries, keys, values = inputs
        B, L, _ = queries.shape
        _, S, _ = keys.shape
        H = self.n_heads

        self.queries = tf.reshape(self.query_projection(queries), (B, L, H, -1))
        self.keys = tf.reshape(self.key_projection(keys), (B, S, H, -1))
        self.values = tf.reshape(self.value_projection(values), (B, S, H, -1))

        out = tf.reshape(self.inner_attention([self.queries, self.keys, self.values], attn_mask=attn_mask), (B, L, -1))

        return self.out_projection(out)
    
    
class ConvLayer(Layer):
    def __init__(self, c_in):
        super(ConvLayer, self).__init__()
        self.downConv = tf.keras.layers.Conv1D(
                                  filters=c_in,
                                  kernel_size=3,
                                  padding='causal')
        self.norm = tf.keras.layers.BatchNormalization()
        self.activation = tf.keras.layers.ELU()
        self.maxPool = tf.keras.layers.MaxPool1D(pool_size=3, strides=2)

    def call(self, x, **kargs):
        x = self.downConv(x)
        x = self.norm(x)
        x = self.activation(x)
        x = self.maxPool(x)
        return x


class EncoderLayer(Layer):
    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
        super(EncoderLayer, self).__init__()
        d_ff = d_ff or 4*d_model
        self.attention = attention
        self.conv1 = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1)
        self.conv2 = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1)
        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()
        self.dropout = tf.keras.layers.Dropout(dropout)
        self.activation = tf.keras.activations.relu if activation == "relu" else tf.keras.activations.gelu

    def call(self, x, attn_mask=None):
        # x [B, L, D]
        print(x.shape)
        print(self.dropout(self.attention(
            [x, x, x],
            attn_mask = attn_mask
        )).shape)
        x = x + self.dropout(self.attention(
            [x, x, x],
            attn_mask = attn_mask
        ))

        y = x = self.norm1(x)
        y = self.dropout(self.activation(self.conv1(y)))
        y = self.dropout(self.conv2(y))
        return self.norm2(x+y)


class Encoder(Layer):
    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
        super(Encoder, self).__init__()
        self.attn_layers = attn_layers
        self.conv_layers = conv_layers if conv_layers is not None else None
        self.norm = norm_layer

    def call(self, x, attn_mask=None):
        # x [B, L, D]
        if self.conv_layers is not None:
            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
                x = attn_layer(x, attn_mask=attn_mask)
                x = conv_layer(x)
            x = self.attn_layers[-1](x)
        else:
            for attn_layer in self.attn_layers:
                x = attn_layer(x, attn_mask=attn_mask)

        if self.norm is not None:
            x = self.norm(x)

        return x
    
    
def positional_encoding(length, depth):
        depth = depth/2

        positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
        depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

        angle_rates = 1 / (10000**depths)         # (1, depth)
        angle_rads = positions * angle_rates      # (pos, depth)

        pos_encoding = np.concatenate(
            [np.sin(angle_rads), np.cos(angle_rads)],
            axis=-1)

        return tf.cast(pos_encoding, dtype=tf.float32)    
    
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, d_model):
        super(PositionalEmbedding, self).__init__()
        self.d_model = d_model
        self.embd = keras.layers.Conv1D(filters=d_model, kernel_size=1)
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embd(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x


In [222]:
class Informer(keras.Model):

    def __init__(self, seq_len, batch_size,
                 factor=5, d_model=512, n_heads=8, e_layers=2, d_layers=2, d_ff=512,
                 dropout=0.1, attn='prob', activation='gelu'):
        super(Informer, self).__init__()
        self.attn = attn
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.gap = keras.layers.GlobalAveragePooling1D(data_format="channels_first")
        self.dense = keras.layers.Dense(5, activation='softmax')
        self.embd = PositionalEmbedding(d_model=512)

        # Attention
        Attn = ProbAttention if attn == 'prob' else FullAttention
        # Encoder
        self.encoder = Encoder(
            [
                EncoderLayer(
                    AttentionLayer(Attn(False, factor, attention_dropout=dropout),
                                   d_model, n_heads),
                    d_model,
                    d_ff,
                    dropout=dropout,
                    activation=activation
                ) for l in range(e_layers)
            ],
            [
                ConvLayer(
                    d_model
                ) for l in range(e_layers - 1)
            ],
            norm_layer=tf.keras.layers.LayerNormalization()
        )
        
    def call(self, inputs):

        x = inputs
        x = self.embd(x)

#         x.set_shape((self.batch_size, self.seq_len, 512))

        x = self.encoder(x)

        x = self.gap(x)
        x = self.dense(x)

        return x

In [223]:
model = Informer(54, 32)

In [224]:
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
lds = lambda x, y: tf.math.reduce_sum(keras.losses.kl_divergence(x, y))
acc_metric = keras.metrics.SparseCategoricalAccuracy()
optimizer = keras.optimizers.Adam(learning_rate=1e-4)

In [225]:
es_callback = keras.callbacks.EarlyStopping(
                                            monitor="val_loss",
                                            min_delta=0,
                                            patience=100,
                                            verbose=1,
                                            mode="auto",
                                            baseline=None,
                                            restore_best_weights=True,
                                            start_from_epoch=0)

model.compile(optimizer=optimizer, loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), metrics=['accuracy'])

In [226]:
model.fit(train_dataset, validation_data=val_dataset, batch_size=batch_size, epochs=2000,
          callbacks=[es_callback])

Epoch 1/2000
(32, 54, 512)
[TensorShape([32, 54, 512]), TensorShape([32, 54, 512]), TensorShape([32, 54, 512])]
(32, 54, 512)
(32, 26, 512)
[TensorShape([32, 26, 512]), TensorShape([32, 26, 512]), TensorShape([32, 26, 512])]
(32, 26, 512)
(32, 54, 512)
(32, 54, 512)
(32, 26, 512)
(32, 26, 512)
(32, 54, 512)
(32, 26, 512)
(32, 26, 512)
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000

<keras.src.callbacks.History at 0x7fc5f337f4f0>

In [232]:
acc_metric.reset_states()
acc_metric.update_state(y_test, model(x_test))
acc = acc_metric.result().numpy()
loss = loss_fn(y_test, model(x_test)).numpy()
print(acc, loss)

(100, 54, 512)
(100, 54, 512)
(100, 26, 512)
(100, 26, 512)
(100, 54, 512)
(100, 54, 512)
(100, 26, 512)
(100, 26, 512)
1.0 0.0008677371


In [175]:
model.summary()

Model: "informer_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 global_average_pooling1d_1  multiple                  0         
 3 (GlobalAveragePooling1D)                                      
                                                                 
 dense_221 (Dense)           multiple                  135       
                                                                 
 positional_embedding_17 (P  multiple                  15872     
 ositionalEmbedding)                                             
                                                                 
Total params: 3961991 (15.11 MB)
Trainable params: 3960967 (15.11 MB)
Non-trainable params: 1024 (4.00 KB)
_________________________________________________________________
