In [1]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, MaxPooling2D, Add, Flatten, Dense, GlobalAveragePooling2D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

In [2]:
# Hàm trích xuất MFCC
def extract_mfcc(audio, sr=16000, n_mfcc=40):
    hop_length = int(0.01 * sr)  # 10 ms hop
    win_length = int(0.025 * sr)  # 25 ms window
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length, win_length=win_length)
    return np.expand_dims(mfcc, axis=-1)  # Thêm chiều kênh (cho CNN/ResNet)

# Hàm tải dữ liệu
def load_data(data_dir):
    X = []
    y = []
    labels = os.listdir(data_dir)
    for label in labels:
        label_dir = os.path.join(data_dir, label)
        for file_name in os.listdir(label_dir):
            if file_name.endswith(".wav"):
                file_path = os.path.join(label_dir, file_name)
                # Tải tệp âm thanh
                audio, sr = librosa.load(file_path, sr=None)
                features = extract_mfcc(audio, sr)
                X.append(features)
                y.append(label)
    return np.array(X), np.array(y)

In [3]:
# Residual Block
def residual_block(x, filters, kernel_size=(3, 3), stride=(1, 1)):
    shortcut = x  # Đường tắt (identity connection)
    
    # Convolutional layer 1
    x = Conv2D(filters, kernel_size, strides=stride, padding="same")(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    
    # Convolutional layer 2
    x = Conv2D(filters, kernel_size, strides=(1, 1), padding="same")(x)
    x = BatchNormalization()(x)
    
    # Kết nối residual (shortcut connection)
    if shortcut.shape[-1] != filters:
        shortcut = Conv2D(filters, (1, 1), strides=stride, padding="same")(shortcut)
        shortcut = BatchNormalization()(shortcut)
    
    x = Add()([x, shortcut])
    x = Activation("relu")(x)
    return x

In [4]:
# Hàm xây dựng ResNet
def build_resnet(input_shape, num_classes):
    inputs = Input(shape=input_shape)
    
    # Initial convolutional layer
    x = Conv2D(64, (7, 7), strides=(2, 2), padding="same")(inputs)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
    
    # Residual blocks
    x = residual_block(x, 64)
    x = residual_block(x, 128, stride=(2, 2))  # Downsample
    x = residual_block(x, 256, stride=(2, 2))  # Downsample
    x = residual_block(x, 512, stride=(2, 2))  # Downsample
    
    # Global Average Pooling and Output layer
    x = GlobalAveragePooling2D()(x)
    outputs = Dense(num_classes, activation="softmax")(x)
    
    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss="categorical_crossentropy", metrics=["accuracy"])
    return model

In [5]:
# Load dữ liệu
train_data_dir = r'C:\Users\USER\Downloads\SV_NCKH_audio_event\Train'
test_data_dir = r'C:\Users\USER\Downloads\SV_NCKH_audio_event\Test'

X_train, y_train = load_data(train_data_dir)
X_test, y_test = load_data(test_data_dir)

# Encode nhãn
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Chuẩn hóa dữ liệu và one-hot encode nhãn
num_classes = len(np.unique(y_train))
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Định dạng đầu vào (ResNet yêu cầu 4D tensor)
X_train = X_train.astype("float32")
X_test = X_test.astype("float32")

In [6]:
# Xây dựng ResNet
input_shape = X_train.shape[1:]
model = build_resnet(input_shape, num_classes)
model.summary()

# Huấn luyện mô hình
model.fit(X_train, y_train, batch_size=32, epochs=50, validation_data=(X_test, y_test))

# Đánh giá mô hình
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")

# Báo cáo kết quả
y_pred = np.argmax(model.predict(X_test), axis=1)
y_true = np.argmax(y_test, axis=1)
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=le.classes_))

Epoch 1/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 492ms/step - accuracy: 0.4942 - loss: 1.6982 - val_accuracy: 0.1454 - val_loss: 17.8322
Epoch 2/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 587ms/step - accuracy: 0.7368 - loss: 0.8322 - val_accuracy: 0.2291 - val_loss: 8.4755
Epoch 3/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 563ms/step - accuracy: 0.8037 - loss: 0.6027 - val_accuracy: 0.2263 - val_loss: 9.8997
Epoch 4/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 790ms/step - accuracy: 0.8386 - loss: 0.4838 - val_accuracy: 0.4088 - val_loss: 5.0600
Epoch 5/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 933ms/step - accuracy: 0.8507 - loss: 0.4546 - val_accuracy: 0.3059 - val_loss: 4.3028
Epoch 6/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 662ms/step - accuracy: 0.8521 - loss: 0.4454 - val_accuracy: 0.6269 - val_loss: 2.2627
Epoch 7/50
[1m74/74

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
