## Importamos librerias

In [25]:
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from sklearn.calibration import LabelEncoder
from keras.layers import LSTM, Dense, Dropout

## Preparativos antes de la RNN

In [26]:
def preprocess_audio(file_path,max_len):
    # Cargar el audio
    signal, sr = librosa.load(file_path,sr=96000)
    # Realizar preénfasis
    #filter_audio = librosa.effects.preemphasis(signal)
    # Extraer MFCCs
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)

    # Realizar padding o recorte
    if mfcc.shape[1] < max_len:
        num_zeros = max_len - mfcc.shape[1]
        padded_mfcc = np.pad(mfcc, ((0, 0), (0, num_zeros)), mode='constant', constant_values=0)
        return padded_mfcc
    else:
        mfcc = mfcc[:, :max_len]
        return mfcc

In [27]:
DATA_DIR = 'Data'
LABELS = ['happy','cat', 'bed']

mfccs = []
labels = []

padding = 500

for label in LABELS:
    path_file = DATA_DIR + f'/{label}'
    for file in os.listdir(path_file):
        file_path = path_file + f'/{file}'
        
        mfcc = preprocess_audio(file_path,padding)
        
        mfccs.append(mfcc)
        labels.append(label)

In [28]:
#Convertimos las listas a numpy array
mfccs_array = np.array(mfccs) #Matriz 3d (5187, 13, 500)

In [29]:
# Convertir etiquetas a números
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

print("Codificación realizada:")
for idx, label in enumerate(le.classes_):
    print(f"{label} -> {idx}")

Codificación realizada:
bed -> 0
cat -> 1
happy -> 2


In [30]:
# Dividir datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(mfccs_array, labels_encoded, test_size=0.2, random_state=312)

# Crear modelo LSTM (Long short-term memory)
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(128))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(3, activation='softmax'))  # 3 clases: 'happy', 'cat', 'bed'

# Compilar el modelo
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

## Entrenamiento

In [31]:
# Entrenar el modelo
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1dee8209610>

## Evaluación

In [32]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

Loss: 0.39247018098831177
Accuracy: 0.8535645604133606


## Prueba

In [33]:
file_paths = ['audio_bed.wav','audio_cat.wav','audio_happy.wav']

for file_path in file_paths:
    print(file_path)
    preprocessed_audio = preprocess_audio(file_path,padding)
    preprocessed_audio = preprocessed_audio.reshape(1, preprocessed_audio.shape[0], preprocessed_audio.shape[1])  # Convertir a formato (1, features, tiempo)
    prediction = model.predict(preprocessed_audio)

    print(f"{le.inverse_transform([0])[0]} : {prediction[0][0] * 100}%")
    print(f"{le.inverse_transform([1])[0]} : {prediction[0][1] * 100}%")
    print(f"{le.inverse_transform([2])[0]} : {prediction[0][2] * 100}%")

    predicted_label_encoded = np.argmax(prediction, axis=1)[0]
    predicted_label = le.inverse_transform([predicted_label_encoded])[0]
    print(f"\nLa palabra predicha es: {predicted_label}\n")

audio_bed.wav
bed : 40.73328077793121%
cat : 14.031212031841278%
happy : 45.23550570011139%

La palabra predicha es: happy

audio_cat.wav
bed : 24.552831053733826%
cat : 60.27589440345764%
happy : 15.171276032924652%

La palabra predicha es: cat

audio_happy.wav
bed : 24.098268151283264%
cat : 62.86776065826416%
happy : 13.033977150917053%

La palabra predicha es: cat



## Guardamos el modelo

In [35]:
# Entrenar el modelo
model.save('GUI/modelo.h5')