# Mô hình phân loại bằng kiến trúc Transformer

## Đọc Dữ Liệu

In [1]:
import librosa
import numpy as np
import os
# Each sub-folder of `folder_path` is named after one vocabulary word; the
# folder name is used as the label for every audio file inside it.
folder_path = 'data'
# Sorted so that sample order (and therefore the fixed-seed train/test split)
# does not depend on the filesystem's directory listing order.
ds = sorted(os.listdir(folder_path))

# Every clip is trimmed / zero-padded to this many samples so that all MFCC
# matrices share one shape. The original note describes this as roughly 2.5 s;
# the real duration depends on each file's native sample rate (sr=None below).
TARGET_NUM_SAMPLES = 50000

# Collect the MFCC features and the label of every audio file.
labels = []
data = []
for step in ds:
    label_dir = os.path.join(folder_path, step)
    # Ignore stray files (e.g. OS metadata) sitting next to the label folders.
    if not os.path.isdir(label_dir):
        continue
    for file_name in sorted(os.listdir(label_dir)):
        # sr=None keeps the file's native sample rate (no resampling).
        # NOTE(review): if the files do not all share one sample rate, the
        # MFCC frames will cover different time spans per file - confirm.
        audio_data, sr = librosa.load(os.path.join(label_dir, file_name), sr=None)
        # Trim first: np.pad raises on a negative pad width, which is what a
        # clip longer than TARGET_NUM_SAMPLES would otherwise produce.
        audio_data = audio_data[:TARGET_NUM_SAMPLES]
        audio_data = np.pad(
            audio_data, (0, TARGET_NUM_SAMPLES - len(audio_data)), 'constant'
        )
        # Extract MFCC features (librosa defaults for n_mfcc / hop length).
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sr)
        data.append(mfccs)
        labels.append(step)

data_s = np.array(data)
labels = np.array(labels)
print(data_s.shape)


(29940, 20, 98)


##  Mã hóa Labels

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
# Turn each label string into an integer sequence with the Keras text
# Tokenizer, then one-hot encode the result.
# NOTE(review): Tokenizer is a text tokenizer - it presumably lower-cases,
# strips punctuation and splits on whitespace, so a label containing a space
# would become several tokens; verify that every label is a single token.
# NOTE(review): Tokenizer ids appear to start at 1, which would leave column 0
# of the one-hot matrix unused - confirm before changing the class count.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(labels)
labelss = tokenizer.texts_to_sequences(labels)
# One-hot encode the integer ids.
one_hot_labels = to_categorical(labelss)

## Phân Chia Dữ Liệu

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# repeatable for a given sample order.
X_train, X_test, y_train, y_test = train_test_split(
    data_s,
    one_hot_labels,
    random_state=42,
    test_size=0.2,
)


##  Tạo Model

In [4]:

from tensorflow.keras import layers

class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Both sub-layers are wrapped in dropout, a residual connection and layer
    normalization. Because of the residual connections the last axis of the
    input must have size ``embed_dim``.

    Args:
        embed_dim: Size of the last input axis; also used as the per-head
            ``key_dim`` of the attention layer and as the output width of the
            feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Standard ``Layer`` options (``name``, ``dtype``,
            ``trainable``, ...). Accepting them is required so the layer can be
            rebuilt from the dictionary returned by ``get_config``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs`` and return a tensor of the same shape.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Optional flag forwarded to the dropout layers; ``None``
                lets Keras decide from the calling context.
        """
        # Self-attention: the inputs act as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized.

        The keys match the ``__init__`` parameter names exactly, which is what
        allows ``from_config`` to pass the dictionary straight back in.
        """
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from a dictionary produced by ``get_config``."""
        return cls(**config)

In [5]:
# Position layer: adds each position's index along the last axis to the input (it contains no learned embedding layers).

class TokenAndPositionEmbedding(layers.Layer):
    """Add the index of every last-axis position to the input.

    Despite the name, the layer holds no embedding tables and has no trainable
    weights: for an input whose last axis has length ``n`` it returns
    ``x + [0, 1, ..., n - 1]``, with the index vector broadcast over all
    leading axes.
    """

    def call(self, x):
        """Return ``x`` with the position indices added along its last axis."""
        maxlen = tf.shape(x)[-1]
        # A vector of shape (maxlen,) broadcasts against every leading axis of
        # `x`, so the layer works for any number of rows, not only 20.
        # Casting to x.dtype keeps the addition valid for non-float32 inputs.
        positions = tf.cast(tf.range(maxlen), x.dtype)
        return x + positions

In [12]:
from tensorflow import keras
import tensorflow as tf
# Shapes are derived from the training arrays instead of being hard-coded, so
# the model stays consistent with the data if the features or the number of
# labels change.
feature_shape = tuple(X_train.shape[1:])  # per-sample shape of the MFCC matrix
num_classes = y_train.shape[1]            # width of the one-hot label matrix

# The residual connections inside TransformerBlock require embed_dim to equal
# the size of the last input axis.
embed_dim = feature_shape[-1]
num_heads = 5  # number of attention heads
ff_dim = 128   # hidden width of the feed-forward network inside the block
# Not used in this cell; kept in case other cells reference them.
sequence_len = 98
vocab_size = 2000

inputs = layers.Input(shape=feature_shape)
embedding_layer = TokenAndPositionEmbedding()
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
# NOTE(review): the block is fed `inputs`, so the output of `embedding_layer`
# computed just above is discarded and never reaches the model output. This is
# kept as-is because the training log below was produced with this wiring;
# switch to `transformer_block(x)` only together with a re-training run.
x = transformer_block(inputs)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

In [13]:
# Configure training for one-hot targets, then train for 50 epochs.
# NOTE(review): the held-out test split is also used as validation data here,
# so the reported val_* metrics are not an independent test estimate.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_test, y_test),
)

Epoch 1/50
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 29ms/step - accuracy: 8.8552e-04 - loss: 6.9117 - val_accuracy: 8.3500e-04 - val_loss: 6.8138
Epoch 2/50
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.0017 - loss: 6.6392 - val_accuracy: 0.0017 - val_loss: 6.2805
Epoch 3/50
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.0048 - loss: 6.1659 - val_accuracy: 0.0125 - val_loss: 5.5593
Epoch 4/50
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.0144 - loss: 5.4911 - val_accuracy: 0.0324 - val_loss: 4.9313
Epoch 5/50
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.0304 - loss: 5.0249 - val_accuracy: 0.0663 - val_loss: 4.3765
Epoch 6/50
[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.0559 - loss: 4.5110 - val_accuracy: 0.1119 - val_loss: 3.8258
Epoch 7/

[1m749/749[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 29ms/step - accuracy: 0.7103 - loss: 0.8834 - val_accuracy: 0.8223 - val_loss: 0.5374


<keras.src.callbacks.history.History at 0x2218b8a9c50>

In [31]:
# Write the model to 'encoderTrans.keras' (path relative to the working directory).
# NOTE(review): the model contains custom layer classes; loading it back will
# presumably need them supplied via `custom_objects` - confirm when loading.
model.save('encoderTrans.keras')