In [1]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report

In [2]:
# MFCC feature extraction
def extract_mfcc(audio, sr=16000, n_mfcc=40):
    """Compute MFCC features for one clip and append a trailing channel axis.

    Framing is tied to the sample rate: a 25 ms analysis window advanced in
    10 ms hops.

    Args:
        audio: Waveform samples, forwarded to librosa as ``y``.
        sr: Sample rate of ``audio`` in Hz.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        The array returned by ``librosa.feature.mfcc`` with one extra axis of
        size 1 appended at the end (used as the CNN channel axis).
    """
    # Clamp to at least one sample so a very low sample rate cannot produce a
    # zero-length hop or window.
    hop_length = max(1, int(0.01 * sr))  # 10 ms hop
    win_length = max(1, int(0.025 * sr))  # 25 ms window
    # librosa zero-pads every window up to n_fft, so win_length must not exceed
    # it. The library default (2048) is kept whenever it is large enough, which
    # leaves results at common sample rates unchanged; for high sample rates
    # (e.g. 96 kHz -> 2400-sample window) grow to the next power of two.
    n_fft = max(2048, 1 << (win_length - 1).bit_length())
    mfcc = librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
    )
    return np.expand_dims(mfcc, axis=-1)  # add the channel axis (for the CNN)

# Dataset loading
def load_data(data_dir):
    """Load every WAV file under ``data_dir`` and return MFCC features and labels.

    ``data_dir`` is expected to contain one sub-directory per class; the
    sub-directory name is used as the label of every ``.wav`` file inside it.

    Args:
        data_dir: Path of the dataset root.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` stacks the per-file outputs
        of ``extract_mfcc`` and ``y`` holds the matching label strings. Both
        are empty arrays when no WAV file is found.

    Raises:
        ValueError: If the extracted feature arrays do not all share the same
            shape (e.g. clips of different duration), since they cannot be
            stacked into a single dense array.
    """
    X = []
    y = []
    # Sorted traversal makes the sample order reproducible across runs and
    # platforms (os.listdir order is arbitrary).
    for label in sorted(os.listdir(data_dir)):
        label_dir = os.path.join(data_dir, label)
        # Skip stray files in the dataset root (Thumbs.db, .DS_Store, ...);
        # listing them as directories would raise.
        if not os.path.isdir(label_dir):
            continue
        for file_name in sorted(os.listdir(label_dir)):
            # Case-insensitive match so "CLIP.WAV" is not silently dropped.
            if not file_name.lower().endswith(".wav"):
                continue
            file_path = os.path.join(label_dir, file_name)
            # sr=None keeps the file's native sample rate; extract_mfcc
            # derives its frame sizes from that rate.
            audio, sr = librosa.load(file_path, sr=None)
            features = extract_mfcc(audio, sr)
            X.append(features)
            y.append(label)

    # Fail early with an actionable message instead of producing a ragged
    # (object-dtype) array or an obscure NumPy error.
    shapes = {features.shape for features in X}
    if len(shapes) > 1:
        raise ValueError(
            "MFCC feature shapes differ between files "
            f"({sorted(shapes)}); pad or trim the clips to a common length "
            "before stacking."
        )
    return np.array(X), np.array(y)


In [3]:
# Dataset locations (train / test splits)
train_data_dir = r'C:\Users\USER\Downloads\SV_NCKH_audio_event\Train'
test_data_dir = r'C:\Users\USER\Downloads\SV_NCKH_audio_event\Test'

# Extract MFCC features and string labels for both splits
X_train, y_train = load_data(train_data_dir)
X_test, y_test = load_data(test_data_dir)

# Integer-encode the labels; the encoder is fitted on the training labels
# only and then reused for the test labels
le = LabelEncoder()
y_train, y_test = le.fit_transform(y_train), le.transform(y_test)

# One-hot encode the integer labels
num_classes = len(np.unique(y_train))
y_train, y_test = (
    to_categorical(label_ids, num_classes=num_classes)
    for label_ids in (y_train, y_test)
)

# Cast the features to float32 for the network input
X_train, X_test = (features.astype("float32") for features in (X_train, X_test))

In [4]:
# EfficientNet classifier
def build_efficientnet(input_shape, num_classes):
    """Build and compile an EfficientNetB0-based softmax classifier.

    Args:
        input_shape: Shape of one input sample, forwarded to EfficientNetB0.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled Keras ``Model`` (Adam with learning rate 0.001, categorical
        cross-entropy loss, accuracy metric).
    """
    # Backbone without its classification head and without pretrained weights
    backbone = EfficientNetB0(input_shape=input_shape, include_top=False, weights=None)

    # Classification head: global average pooling followed by a softmax layer
    pooled = GlobalAveragePooling2D()(backbone.output)
    class_probs = Dense(num_classes, activation="softmax")(pooled)

    classifier = Model(inputs=backbone.input, outputs=class_probs)
    classifier.compile(
        loss="categorical_crossentropy",
        optimizer=Adam(learning_rate=0.001),
        metrics=["accuracy"],
    )
    return classifier

# Build the EfficientNet classifier
input_shape = X_train.shape[1:]  # per-sample input shape: (frequency, time, channel)
model = build_efficientnet(input_shape=input_shape, num_classes=num_classes)
model.summary()

# Train the model; the test split is passed as the validation data
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=32,
)

Epoch 1/50
74/74 ━━━━━━━━━━━━━━━━━━━━ 148s 1s/step - accuracy: 0.2058 - loss: 2.6987 - val_accuracy: 0.1207 - val_loss: 2.9059
Epoch 2/50
74/74 ━━━━━━━━━━━━━━━━━━━━ 66s 837ms/step - accuracy: 0.4747 - loss: 1.7244 - val_accuracy: 0.0878 - val_loss: 3.1445
Epoch 3/50
74/74 ━━━━━━━━━━━━━━━━━━━━ 63s 856ms/step - accuracy: 0.5804 - loss: 1.2915 - val_accuracy: 0.1427 - val_loss: 3.1883
Epoch 4/50
74/74 ━━━━━━━━━━━━━━━━━━━━ 102s 1s/step - accuracy: 0.6820 - loss: 1.0660 - val_accuracy: 0.0247 - val_loss: 4.6785
Epoch 5/50
74/74 ━━━━━━━━━━━━━━━━━━━━ 90s 1s/step - accuracy: 0.7437 - loss: 0.8036 - val_accuracy: 0.0864 - val_loss: 6.0068
Epoch 6/50
74/74 ━━━━━━━━━━━━━━━━━━━━ 74s 994ms/step - accuracy: 0.7787 - loss: 0.6787 - val_accuracy: 0.1344 - val_loss: 5.7764
Epoch 7/50
74/74 … (output truncated)

<keras.src.callbacks.history.History at 0x159648fce80>