#**Imports**

In [8]:
!pip install librosa soundfile



In [9]:
import librosa
import tensorflow_datasets as tfds
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, BatchNormalization
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import tqdm

# **Carregar Dataset**

In [10]:
# Carregar o Google Speech Commands dataset (split de treino)
ds, ds_info = tfds.load('speech_commands', split='train', shuffle_files=True, with_info=True)

# Exibir informações sobre o dataset
print(ds_info)

#extrair palavras desejadas
palavras_desejadas = ['yes', 'no', 'up', 'down', 'left', 'right']
batch_size = 32
target_length = 16000

tfds.core.DatasetInfo(
    name='speech_commands',
    full_name='speech_commands/0.0.3',
    description="""
    An audio dataset of spoken words designed to help train and evaluate keyword
    spotting systems. Its primary goal is to provide a way to build and test small
    models that detect when a single word is spoken, from a set of ten target words,
    with as few false positives as possible from background noise or unrelated
    speech. Note that in the train and validation set, the label "unknown" is much
    more prevalent than the labels of the target words or background noise. One
    difference from the release version is the handling of silent segments. While in
    the test set the silence segments are regular 1 second files, in the training
    they are provided as long segments under "background_noise" folder. Here we
    split these background noise into 1 second clips, and also keep one of the files
    for the validation set.
    """,
    homepage='https://arxiv.or

In [11]:
# Mapeamento de rótulos para índices
label_to_index = {label: index for index, label in enumerate(palavras_desejadas)}

# **MFCCS**

In [12]:
# Função para obter MFCCs com comprimento padronizado
def get_mfccs(audio, sr=16000, n_mfcc=13, max_frames=50):
    max_length = sr * 5  # 5 segundos
    if len(audio) > max_length:
        audio = audio[:max_length]
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    if mfccs.shape[1] < max_frames:
        pad_width = max_frames - mfccs.shape[1]
        mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), mode='constant')
    else:
        mfccs = mfccs[:, :max_frames]
    return mfccs

In [13]:
# Função para padronizar o comprimento do áudio
def pad_or_truncate(audio, target_length=target_length):
    if len(audio) > target_length:
        return audio[:target_length]
    elif len(audio) < target_length:
        return np.pad(audio, (0, target_length - len(audio)), 'constant')
    return audio

In [14]:
# Carregar e processar o dataset com balanceamento de classes
X, y = [], []
for batch in tqdm.tqdm(ds):
    audio = batch['audio'].numpy().astype(np.float32)
    label = batch['label'].numpy()
    label_name = ds_info.features['label'].int2str(label)
    if label_name in palavras_desejadas:
        audio = pad_or_truncate(audio)
        mfccs = get_mfccs(audio)
        X.append(mfccs)
        y.append(label_to_index[label_name])

X = np.array(X)[..., np.newaxis]
y = np.array(y)

100%|██████████| 85511/85511 [04:59<00:00, 285.07it/s]


In [15]:
# Aplicar oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X.reshape(X.shape[0], -1), y)
X_resampled = X_resampled.reshape(-1, X.shape[1], X.shape[2], 1)

In [16]:
# One-hot encoding dos rótulos
y_categorical = to_categorical(y_resampled, num_classes=len(palavras_desejadas))

# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_categorical, test_size=0.2, random_state=42)


# **CNN Modelo**

In [18]:
# Criar o modelo
model = Sequential()

# Primeira camada convolucional
model.add(Conv2D(64, kernel_size=(1, 3), activation='relu', input_shape=(13, 50, 1)))
model.add(BatchNormalization())  # Normalização
model.add(MaxPooling2D(pool_size=(1, 2)))
model.add(Dropout(0.3))

# Segunda camada convolucional
model.add(Conv2D(128, kernel_size=(1, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(1, 2)))
model.add(Dropout(0.3))

# Terceira camada convolucional
model.add(Conv2D(256, kernel_size=(1, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(1, 2)))
model.add(Dropout(0.3))

# Flatten e camadas densas
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='softmax'))

In [19]:
# Compilar o modelo
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Treinamento do modelo
model.fit(X_train, y_train, epochs=40, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/40
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 196ms/step - accuracy: 0.3396 - loss: 1.7763 - val_accuracy: 0.6469 - val_loss: 1.0074
Epoch 2/40
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 196ms/step - accuracy: 0.5162 - loss: 1.2005 - val_accuracy: 0.7230 - val_loss: 0.8254
Epoch 3/40
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 202ms/step - accuracy: 0.5926 - loss: 1.0441 - val_accuracy: 0.7788 - val_loss: 0.6779
Epoch 4/40
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 194ms/step - accuracy: 0.6438 - loss: 0.9037 - val_accuracy: 0.8185 - val_loss: 0.5491
Epoch 5/40
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 196ms/step - accuracy: 0.6852 - loss: 0.8186 - val_accuracy: 0.8400 - val_loss: 0.4918
Epoch 6/40
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 204ms/step - accuracy: 0.7107 - loss: 0.7578 - val_accuracy: 0.8394 - val_loss: 0.4855
Epoch 7/4

<keras.src.callbacks.history.History at 0x7f5f17551690>

In [None]:
# Avaliação do modelo
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Acurácia do modelo: {accuracy:.2f}')

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9290 - loss: 0.2062
Acurácia do modelo: 0.93


In [None]:
model.summary()

In [None]:
# Verificar previsões
for i in range(10):  # Testando em 10 exemplos de teste
    pred = model.predict(X_test[i:i+1])
    predicted_label = palavras_desejadas[np.argmax(pred)]
    true_label = palavras_desejadas[np.argmax(y_test[i])]
    print(f"Predição: {predicted_label}, Rótulo Real: {true_label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
Predição: no, Rótulo Real: no
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Predição: up, Rótulo Real: left
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Predição: up, Rótulo Real: up
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Predição: right, Rótulo Real: right
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Predição: down, Rótulo Real: down
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Predição: no, Rótulo Real: no
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Predição: no, Rótulo Real: no
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Predição: yes, Rótulo Real: yes
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Predição: no, Rótulo Real: no
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/s