In [None]:
# Import thư viện
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout
from tensorflow.keras.layers import Conv2D, Reshape, MultiHeadAttention, Flatten, GlobalAveragePooling1D

In [None]:
# MFCC feature extraction
def extract_mfcc(audio, sr=16000, n_mfcc=40):
    """Compute MFCC features for one audio signal.

    Framing uses a 25 ms analysis window advanced in 10 ms steps, both
    converted to whole samples at the given sampling rate.

    Args:
        audio: audio time series, passed to librosa as ``y``.
        sr: sampling rate of ``audio`` in Hz.
        n_mfcc: number of MFCC coefficients to compute.

    Returns:
        The array produced by ``librosa.feature.mfcc``; per the librosa
        documentation its layout is (n_mfcc, frames).
    """
    frame_shift = int(0.01 * sr)   # 10 ms step between consecutive frames
    frame_size = int(0.025 * sr)   # 25 ms analysis window
    return librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=n_mfcc,
        hop_length=frame_shift,
        win_length=frame_size,
    )

# Load the dataset from a directory tree
def _fit_num_frames(mfcc, num_frames):
    """Truncate or zero-pad a (n_mfcc, frames) array to exactly ``num_frames`` frames."""
    mfcc = mfcc[:, :num_frames]
    missing = num_frames - mfcc.shape[1]
    if missing > 0:
        mfcc = np.pad(mfcc, ((0, 0), (0, missing)), mode='constant')
    return mfcc


def load_data(data_dir, sr=16000, n_mfcc=40, max_frames=None):
    """Load every ``.wav`` file under ``data_dir/<label>/`` as MFCC features.

    Each immediate sub-directory of ``data_dir`` is treated as one class and
    its name is used as the label of the files it contains. Directories and
    files are visited in sorted order so the sample order is reproducible
    (``os.listdir`` makes no ordering guarantee).

    Args:
        data_dir: root directory containing one sub-directory per label.
        sr: sampling rate the audio is resampled to when loaded.
        n_mfcc: number of MFCC coefficients per frame.
        max_frames: if given, every MFCC matrix is truncated or zero-padded
            along the frame axis to exactly this many frames, so clips of
            different durations can be stacked. If ``None`` (default) the
            matrices are stacked as-is and must all share one shape.

    Returns:
        ``(X, y)`` where ``X`` stacks the MFCC matrices (one per file) and
        ``y`` holds the matching label strings. Both are empty arrays when
        no ``.wav`` file is found.

    Raises:
        ValueError: if ``max_frames`` is ``None`` and the clips produce MFCC
            matrices of different shapes (they cannot form a regular array).
    """
    X, y = [], []
    for label in sorted(os.listdir(data_dir)):
        label_dir = os.path.join(data_dir, label)
        if not os.path.isdir(label_dir):
            continue
        for file in sorted(os.listdir(label_dir)):
            # Accept '.wav' regardless of case ('.WAV' is common on Windows).
            if not file.lower().endswith('.wav'):
                continue
            file_path = os.path.join(label_dir, file)
            audio, _ = librosa.load(file_path, sr=sr)
            mfcc = extract_mfcc(audio, sr, n_mfcc)
            if max_frames is not None:
                mfcc = _fit_num_frames(mfcc, max_frames)
            X.append(mfcc)
            y.append(label)

    # Fail with an actionable message instead of letting NumPy build (or
    # refuse to build) a ragged array.
    shapes = {m.shape for m in X}
    if len(shapes) > 1:
        raise ValueError(
            "MFCC matrices have different shapes %s; pass max_frames to pad/truncate "
            "every clip to a common length." % sorted(shapes)
        )
    return np.array(X), np.array(y)


In [None]:
# Vision Transformer encoder block
def transformer_block(inputs, num_heads, mlp_dim, dropout_rate=0.1):
    """Apply one pre-norm Transformer encoder block to a token sequence.

    The block is a self-attention sub-layer followed by a two-layer MLP.
    Each sub-layer normalises its input first and is wrapped in a residual
    connection, so the output keeps the last-axis width of ``inputs``.

    Args:
        inputs: symbolic Keras tensor whose last axis is the embedding width.
        num_heads: number of attention heads.
        mlp_dim: width of the hidden (GELU) layer of the MLP.
        dropout_rate: dropout rate used after attention and after each
            Dense layer of the MLP.

    Returns:
        Tensor with the same last-axis width as ``inputs``.
    """
    embed_dim = inputs.shape[-1]

    # --- Multi-head self-attention sub-layer ---
    normed = LayerNormalization(epsilon=1e-6)(inputs)
    # key_dim is the per-head projection size; here it is the full embedding width.
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(normed, normed)
    attended = Dropout(dropout_rate)(attended)
    attn_out = attended + inputs  # residual connection

    # --- Feed-forward (MLP) sub-layer ---
    hidden = LayerNormalization(epsilon=1e-6)(attn_out)
    hidden = Dense(mlp_dim, activation='gelu')(hidden)
    hidden = Dropout(dropout_rate)(hidden)
    hidden = Dense(embed_dim)(hidden)  # project back to the embedding width
    hidden = Dropout(dropout_rate)(hidden)
    return attn_out + hidden  # residual connection

# Build the Vision Transformer model
def build_vit(input_shape, num_classes, patch_size=4, num_patches=16, num_heads=4, mlp_dim=128, num_blocks=4):
    """Build and compile a small Vision-Transformer-style classifier.

    The input is cut into non-overlapping square patches by a strided
    convolution, the patch grid is flattened into a token sequence, a
    learnable class token is prepended, and the sequence is run through
    ``num_blocks`` encoder blocks before a softmax classification head.

    Args:
        input_shape: shape of one sample, ``(height, width, channels)``.
        num_classes: number of output classes (softmax units).
        patch_size: side of the square patches; used as both kernel size and
            stride of the patch-embedding convolution.
        num_patches: kept only for backward compatibility and ignored. The
            number of patch tokens is now derived from the convolution
            output, so the model builds for any input size instead of only
            for inputs that yield exactly this many patches.
        num_heads: number of attention heads in every encoder block.
        mlp_dim: width of the patch embeddings (convolution filters); also
            passed to ``transformer_block`` as the MLP hidden width.
        num_blocks: number of stacked encoder blocks.

    Returns:
        A compiled ``Model`` (Adam optimizer, categorical cross-entropy loss,
        accuracy metric).
    """

    class ClassToken(tf.keras.layers.Layer):
        """Prepend one learnable, zero-initialised token to every sequence.

        Owning the token through ``add_weight`` makes it a tracked, trainable
        weight of the model (a bare ``tf.Variable`` used in the functional
        graph is not owned by any layer). It is defined locally so this
        function carries everything it needs.
        """

        def build(self, input_shape):
            self.cls_token = self.add_weight(
                name='cls_token',
                shape=(1, 1, input_shape[-1]),
                initializer='zeros',
                trainable=True,
            )
            super().build(input_shape)

        def call(self, inputs):
            batch_size = tf.shape(inputs)[0]
            tokens = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
            return tf.concat([tf.cast(tokens, inputs.dtype), inputs], axis=1)

        def compute_output_shape(self, input_shape):
            seq_len = input_shape[1]
            return (input_shape[0], None if seq_len is None else seq_len + 1, input_shape[2])

    inputs = Input(shape=input_shape)

    # Patch embedding: one mlp_dim-wide vector per patch_size x patch_size patch.
    x = Conv2D(filters=mlp_dim, kernel_size=patch_size, strides=patch_size, padding='valid')(inputs)
    # Flatten the patch grid into a sequence. Fixing the last axis to mlp_dim
    # (and inferring the sequence length) keeps one token per patch.
    x = Reshape((-1, mlp_dim))(x)  # (batch_size, patches, mlp_dim)

    # Prepend the learnable class token.
    # NOTE(review): no positional embedding is added, so the encoder gets no
    # information about patch order - consider adding one.
    x = ClassToken()(x)

    # Transformer encoder blocks
    for _ in range(num_blocks):
        x = transformer_block(x, num_heads, mlp_dim)

    # Classification head: averages over all tokens, class token included.
    x = GlobalAveragePooling1D()(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [None]:
# Load data
# The dataset root can be overridden with the AUDIO_EVENT_DATA_DIR environment
# variable; the default keeps the original local location.
data_root = os.environ.get('AUDIO_EVENT_DATA_DIR', r'C:\Users\USER\Downloads\SV_NCKH_audio_event')
train_dir = os.path.join(data_root, 'Train')
test_dir = os.path.join(data_root, 'Test')

print("Loading training data...")
X_train, y_train = load_data(train_dir)
print("Loading testing data...")
X_test, y_test = load_data(test_dir)


def standardize_mfcc(batch, fitted_scaler):
    """Apply a fitted per-coefficient scaler to a (clips, n_mfcc, frames) array.

    Frames are flattened into rows with the MFCC coefficients as columns,
    which is the (samples, features) layout the scaler expects, and the
    result is restored to the original layout.
    """
    n_clips, n_coef, n_frames = batch.shape
    frames = batch.transpose(0, 2, 1).reshape(-1, n_coef)
    scaled = fitted_scaler.transform(frames)
    return scaled.reshape(n_clips, n_frames, n_coef).transpose(0, 2, 1)


# Normalise the data.
# Fit ONE scaler on all training frames (mean/std per MFCC coefficient) and
# apply the same statistics to both sets. Refitting per clip would scale the
# test set with whatever clip the scaler happened to see last.
scaler = StandardScaler()
scaler.fit(X_train.transpose(0, 2, 1).reshape(-1, X_train.shape[1]))
X_train = standardize_mfcc(X_train, scaler)
X_test = standardize_mfcc(X_test, scaler)

# Add the trailing channel axis expected by Conv2D.
X_train = np.expand_dims(X_train, axis=-1)  # (batch_size, n_mfcc, frames, 1)
X_test = np.expand_dims(X_test, axis=-1)

In [None]:
# Encode the string labels as integer class ids (fitted on the training labels)
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Convert the class ids to one-hot vectors
num_classes = len(le.classes_)
y_train, y_test = (
    to_categorical(class_ids, num_classes=num_classes)
    for class_ids in (y_train, y_test)
)

# Build and train the Vision Transformer model
input_shape = X_train.shape[1:]  # per-sample shape: every axis after the batch axis
model = build_vit(input_shape=input_shape, num_classes=num_classes)

print("Training Vision Transformer model...")
# NOTE(review): the test set is also used as validation data during training.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=30,
    validation_data=(X_test, y_test),
)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {test_acc:.4f}")

# Show the model architecture
model.summary()