In [1]:
import os

import librosa
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [2]:
# Extract MFCCs together with their delta and delta-delta coefficients
def extract_mfcc_with_deltas(audio, sr=16000, n_mfcc=13):
    """Compute MFCC, delta and delta-delta features for one audio signal.

    Args:
        audio: 1-D array of audio samples.
        sr: Sampling rate of ``audio`` in Hz.
        n_mfcc: Number of MFCC coefficients to compute per frame.

    Returns:
        2-D array laid out as (time_steps, 3 * n_mfcc): the MFCCs, then the
        first-order deltas, then the second-order deltas of every frame.
    """
    hop_length = int(0.01 * sr)   # 10 ms frame step
    win_length = int(0.025 * sr)  # 25 ms analysis window

    # librosa zero-pads the window up to n_fft and rejects a window that is
    # longer than n_fft. Keep librosa's default size (2048) whenever the window
    # fits, so results are unchanged for ordinary sampling rates, and grow to
    # the next power of two for very high rates (e.g. 96 kHz -> 2400 samples).
    default_n_fft = 2048
    if win_length > default_n_fft:
        n_fft = 1 << (win_length - 1).bit_length()
    else:
        n_fft = default_n_fft

    # Static MFCCs, shape (n_mfcc, frames)
    mfccs = librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window='hamming',
    )

    # First- and second-order temporal derivatives of the MFCCs
    mfcc_delta = librosa.feature.delta(mfccs)
    mfcc_delta2 = librosa.feature.delta(mfccs, order=2)

    # Stack along the coefficient axis, then transpose to (time_steps, features)
    mfcc_combined = np.vstack([mfccs, mfcc_delta, mfcc_delta2])
    return mfcc_combined.T

In [3]:
# Load features and labels from a directory tree
def load_data(data_dir):
    """Load every ``.wav`` file under ``data_dir`` and extract its features.

    Each sub-directory of ``data_dir`` is treated as one class; its name is
    used as the label for all ``.wav`` files directly inside it.

    Args:
        data_dir: Path of the directory that contains one folder per class.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked per-file feature
        matrices and the matching label strings.

    Raises:
        ValueError: If the files do not all produce feature matrices of the
            same shape, since they could not be stacked into one array.
    """
    feature_list = []
    label_list = []

    # Sorted traversal keeps the sample order reproducible across runs.
    for label in sorted(os.listdir(data_dir)):
        label_dir = os.path.join(data_dir, label)
        # Stray files next to the class folders (Thumbs.db, .DS_Store, ...)
        # are not classes; listing them as directories would raise.
        if not os.path.isdir(label_dir):
            continue

        for file_name in sorted(os.listdir(label_dir)):
            if not file_name.endswith(".wav"):
                continue
            file_path = os.path.join(label_dir, file_name)
            # sr=None keeps the file's native sampling rate
            audio, sr = librosa.load(file_path, sr=None)
            feature_list.append(extract_mfcc_with_deltas(audio, sr))
            label_list.append(label)

    # Fail early with a readable message instead of building a ragged array.
    shapes = {np.shape(features) for features in feature_list}
    if len(shapes) > 1:
        raise ValueError(
            f"Audio files in {data_dir!r} produce feature matrices of different "
            f"shapes {sorted(shapes)}; pad or trim the clips to a common length."
        )

    return np.array(feature_list), np.array(label_list)

In [4]:
# Paths to the training and test data
train_data_dir = r'C:\Users\USER\Downloads\SV_NCKH_audio_event\Train'
test_data_dir = r'C:\Users\USER\Downloads\SV_NCKH_audio_event\Test'


# Load features and string labels for both splits
X_train, y_train = load_data(train_data_dir)
X_test, y_test = load_data(test_data_dir)

# Encode string labels as integer ids; the mapping is fitted on the training
# labels only and then reused for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# One-hot encode the labels. The width comes from the fitted encoder rather
# than from the labels present in each split, so train and test always have
# the same number of columns even if a class is missing from the test set.
num_classes = len(le.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Standardise each feature column across all frames of all samples (z-score)
def reshape_and_scale(X, scaler=None):
    """Scale a batch of feature sequences feature-by-feature.

    The batch is flattened to (samples * time_steps, features), passed through
    the scaler, and reshaped back to its original 3-D layout.

    Args:
        X: Array of shape (samples, time_steps, features).
        scaler: Already fitted scaler exposing ``transform``. When ``None``, a
            new ``StandardScaler`` is fitted on ``X``.

    Returns:
        Tuple ``(X_scaled, scaler)``: the scaled array with the same shape as
        ``X`` and the scaler that was used.

    Raises:
        ValueError: If ``X`` is not 3-dimensional.
    """
    X = np.asarray(X)
    if X.ndim != 3:
        raise ValueError(
            f"X must have shape (samples, time_steps, features); got shape {X.shape}"
        )
    num_samples, num_time_steps, num_features = X.shape

    # Collapse samples and time steps so every row is one frame
    frames = X.reshape(-1, num_features)

    # Compare against None explicitly: a truthiness test would discard a valid
    # scaler whose class defines __len__ or __bool__ and evaluates to False.
    if scaler is None:
        scaler = StandardScaler().fit(frames)
    frames_scaled = scaler.transform(frames)

    # Restore the (samples, time_steps, features) layout
    X_scaled = frames_scaled.reshape(num_samples, num_time_steps, num_features)

    return X_scaled, scaler

# Fit the scaler on the training features, then apply that same scaler to the
# test features so both splits share one set of scaling statistics.
X_train, scaler = reshape_and_scale(X_train, scaler=None)
X_test, _ = reshape_and_scale(X_test, scaler=scaler)

In [5]:
# Build the RNN (LSTM) classifier
def create_rnn_model(input_shape, num_classes, lstm_units=64, dense_units=64,
                     dropout_rate=0.5, learning_rate=0.001):
    """Create and compile an LSTM sequence classifier.

    Architecture: Input -> LSTM -> Dropout -> Dense(relu) -> Dense(softmax).

    Args:
        input_shape: ``(time_steps, features)`` of one input sequence.
        num_classes: Number of output classes (width of the softmax layer).
        lstm_units: Number of units in the LSTM layer.
        dense_units: Number of units in the hidden dense layer.
        dropout_rate: Dropout rate applied after the LSTM layer.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model (categorical cross-entropy
        loss, accuracy metric).
    """
    model = Sequential([
        # An explicit Input layer replaces the deprecated `input_shape=`
        # argument on the first layer, which makes Keras emit a UserWarning.
        Input(shape=input_shape),
        # return_sequences=False: only the last time step's output is kept
        LSTM(lstm_units, return_sequences=False),
        Dropout(dropout_rate),
        Dense(dense_units, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])

    # Compile the model
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [6]:
# Derive the input shape and the number of classes from the prepared arrays
input_shape = (X_train.shape[1], X_train.shape[2])
num_classes = y_train.shape[1]

# Create the RNN model
model = create_rnn_model(input_shape, num_classes)
model.summary()

# Train the model
# NOTE(review): the test set is also used as validation data here, so the
# per-epoch val_* numbers and the final test metrics come from the same
# samples. Consider holding out part of the training set for validation.
model.fit(X_train, y_train, batch_size=32, epochs=50, validation_data=(X_test, y_test))

# Predict on the test set and convert one-hot / probabilities to class ids
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Evaluate the results
accuracy = accuracy_score(y_test_classes, y_pred_classes)
# Pass the full list of label ids so `target_names` always lines up with the
# reported classes; without it the report raises when a class occurs in
# neither the test labels nor the predictions.
report = classification_report(
    y_test_classes,
    y_pred_classes,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

  super().__init__(**kwargs)


Epoch 1/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - accuracy: 0.1398 - loss: 2.8869 - val_accuracy: 0.3004 - val_loss: 2.3199
Epoch 2/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 73ms/step - accuracy: 0.3506 - loss: 2.1974 - val_accuracy: 0.4005 - val_loss: 1.8957
Epoch 3/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 58ms/step - accuracy: 0.4722 - loss: 1.7610 - val_accuracy: 0.4554 - val_loss: 1.7483
Epoch 4/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step - accuracy: 0.5509 - loss: 1.4450 - val_accuracy: 0.4829 - val_loss: 1.7201
Epoch 5/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.6042 - loss: 1.3233 - val_accuracy: 0.5460 - val_loss: 1.6127
Epoch 6/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.6496 - loss: 1.1738 - val_accuracy: 0.5350 - val_loss: 1.6610
Epoch 7/50
[1m74/74[0m [32m━━━━