In [1]:
!pip install tensorflow librosa numpy matplotlib


Collecting librosa
  Downloading librosa-0.10.2.post1-py3-none-any.whl (260 kB)
     ---------------------------------------- 0.0/260.1 kB ? eta -:--:--
     - -------------------------------------- 10.2/260.1 kB ? eta -:--:--
     ---- -------------------------------- 30.7/260.1 kB 330.3 kB/s eta 0:00:01
     ---- -------------------------------- 30.7/260.1 kB 330.3 kB/s eta 0:00:01
     ---- -------------------------------- 30.7/260.1 kB 330.3 kB/s eta 0:00:01
     ----- ------------------------------- 41.0/260.1 kB 151.3 kB/s eta 0:00:02
     ----- ------------------------------- 41.0/260.1 kB 151.3 kB/s eta 0:00:02
     ----------- ------------------------- 81.9/260.1 kB 286.7 kB/s eta 0:00:01
     ----------- ------------------------- 81.9/260.1 kB 286.7 kB/s eta 0:00:01
     ------------- ----------------------- 92.2/260.1 kB 262.6 kB/s eta 0:00:01
     ------------- ----------------------- 92.2/260.1 kB 262.6 kB/s eta 0:00:01
     ----------------- ------------------ 122.9/260.1


[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
import numpy as np
import tensorflow as tf
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

In [None]:
# Указываем классы
CLASSES = ["left", "right", "up", "down"]
DATASET_PATH = "speech_commands/"

# есть ли файлы команд?
for command in CLASSES:
    path = os.path.join(DATASET_PATH, command)
    if not os.path.exists(path):
        print(f"Папка с командой {command} не найдена!")
    else:
        print(f"Найдено {len(os.listdir(path))} файлов для {command}")


Найдено 2353 файлов для left
Найдено 2367 файлов для right
Найдено 2375 файлов для up
Найдено 2359 файлов для down


In [None]:
# Параметры звука
SR = 16000  # Частота дискретизации
N_MFCC = 13  # Количество MFCC коэффициентов

def extract_mfcc(file_path, sr=SR, n_mfcc=N_MFCC):
    audio, _ = librosa.load(file_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)  # Передаем параметр y=
    return np.mean(mfcc, axis=1)  # Усредняем MFCC по времени


# массивы для хранения данных
X, y = [], []

# Проходим по каждому классу команд
for label, command in enumerate(CLASSES):
    folder_path = os.path.join(DATASET_PATH, command)
    for file in os.listdir(folder_path):
        if file.endswith(".wav"):
            file_path = os.path.join(folder_path, file)
            features = extract_mfcc(file_path)
            X.append(features)
            y.append(label)

X = np.array(X)
y = np.array(y)

print(f"Обработано {len(X)} аудиофайлов.")


Обработано 9454 аудиофайлов.


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Размер обучающей выборки: {len(X_train)}, тестовой: {len(X_test)}")


Размер обучающей выборки: 7563, тестовой: 1891


In [8]:
# Создаем модель
model = models.Sequential([
    layers.Dense(64, activation="relu", input_shape=(N_MFCC,)),  # Входной слой (13 признаков)
    layers.Dense(32, activation="relu"),
    layers.Dense(len(CLASSES), activation="softmax")  # 4 выхода (left, right, up, down)
])

# Компилируем модель
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Обучаем модель
model.fit(X_train, y_train, epochs=30, batch_size=16, validation_data=(X_test, y_test))

# Проверяем точность
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Точность модели: {test_acc:.2f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.3344 - loss: 5.1626 - val_accuracy: 0.4744 - val_loss: 1.5049
Epoch 2/30
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 971us/step - accuracy: 0.4941 - loss: 1.2678 - val_accuracy: 0.5108 - val_loss: 1.2418
Epoch 3/30
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 967us/step - accuracy: 0.4856 - loss: 1.2732 - val_accuracy: 0.4918 - val_loss: 1.2211
Epoch 4/30
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 912us/step - accuracy: 0.4957 - loss: 1.2317 - val_accuracy: 0.4939 - val_loss: 1.1708
Epoch 5/30
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 946us/step - accuracy: 0.4944 - loss: 1.2366 - val_accuracy: 0.5325 - val_loss: 1.1615
Epoch 6/30
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 985us/step - accuracy: 0.5113 - loss: 1.1718 - val_accuracy: 0.5209 - val_loss: 1.2174
Epoch 7/30
[1m473

In [None]:
CLASSES = ["left", "right", "up", "down"]
DATASET_PATH = "speech_commands/"
SR = 16000  # Частота дискретизации
N_MFCC = 20  # Количество MFCC увеличено

# Указываем фиксированную длину для MFCC 
FIXED_LENGTH = 32

def extract_mfcc(file_path, sr=SR, n_mfcc=N_MFCC, fixed_length=FIXED_LENGTH):
    audio, _ = librosa.load(file_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Приводим к фиксированной длине 
    if mfcc.shape[1] < fixed_length:
        pad_width = fixed_length - mfcc.shape[1]
        mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :fixed_length]

    return np.expand_dims(mfcc, axis=-1)  # Добавляем ось для CNN


# Собираем данные
X, y = [], []
for label, command in enumerate(CLASSES):
    folder_path = os.path.join(DATASET_PATH, command)
    for file in os.listdir(folder_path):
        if file.endswith(".wav"):
            file_path = os.path.join(folder_path, file)
            features = extract_mfcc(file_path)
            X.append(features)
            y.append(label)

X = np.array(X)
y = np.array(y)

# Разделяем данные
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Добавляем 4-ю ось 
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

print(f"Данные загружены! Размер обучающей выборки: {len(X_train)}, тестовой: {len(X_test)}")

# Создаем улучшенную модель CNN
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation="relu", input_shape=(N_MFCC, 32, 1)),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),

    layers.Flatten(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),  
    layers.Dense(len(CLASSES), activation="softmax")  # 4 выхода
])

# Компилируем
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

# Обучаем
history = model.fit(X_train, y_train, epochs=40, batch_size=16, validation_data=(X_test, y_test))

# Оцениваем точность
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Улучшенная точность модели: {test_acc:.2f}")

Данные загружены! Размер обучающей выборки: 7563, тестовой: 1891
Epoch 1/40


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.6745 - loss: 0.8944 - val_accuracy: 0.9286 - val_loss: 0.2066
Epoch 2/40
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9244 - loss: 0.2062 - val_accuracy: 0.9492 - val_loss: 0.1439
Epoch 3/40
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9493 - loss: 0.1455 - val_accuracy: 0.9529 - val_loss: 0.1250
Epoch 4/40
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9672 - loss: 0.0925 - val_accuracy: 0.9302 - val_loss: 0.1807
Epoch 5/40
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9760 - loss: 0.0722 - val_accuracy: 0.9545 - val_loss: 0.1247
Epoch 6/40
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9786 - loss: 0.0617 - val_accuracy: 0.9487 - val_loss: 0.1635
Epoch 7/40
[1m473/473[0m [32m━━━━━━━

In [None]:
import tensorflow.lite as tflite

# Конвертация модели в TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Сохраняем модель
with open("model.tflite", "wb") as f:
    f.write(tflite_model)

print("Модель успешно конвертирована в TFLite!")


INFO:tensorflow:Assets written to: C:\Users\ELIZAV~1\AppData\Local\Temp\tmpc7z47ybo\assets


INFO:tensorflow:Assets written to: C:\Users\ELIZAV~1\AppData\Local\Temp\tmpc7z47ybo\assets


Saved artifact at 'C:\Users\ELIZAV~1\AppData\Local\Temp\tmpc7z47ybo'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 20, 32, 1), dtype=tf.float32, name='keras_tensor_4')
Output Type:
  TensorSpec(shape=(None, 4), dtype=tf.float32, name=None)
Captures:
  2832536214112: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2832536220272: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2832536212528: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2832531447952: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2832536210064: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2834676908448: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2834676914432: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2834676917776: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2834676922704: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2834676916544: TensorSpec(shape=(), dtype=tf.resource, name=None)
  28

In [1]:
!pip install jiwer


Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Collecting rapidfuzz>=3.9.7
  Downloading rapidfuzz-3.12.2-cp310-cp310-win_amd64.whl (1.6 MB)
     ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
      --------------------------------------- 0.0/1.6 MB 435.7 kB/s eta 0:00:04
      --------------------------------------- 0.0/1.6 MB 435.7 kB/s eta 0:00:04
      --------------------------------------- 0.0/1.6 MB 435.7 kB/s eta 0:00:04
     - -------------------------------------- 0.0/1.6 MB 163.4 kB/s eta 0:00:10
     - -------------------------------------- 0.0/1.6 MB 163.4 kB/s eta 0:00:10
     - -------------------------------------- 0.1/1.6 MB 192.5 kB/s eta 0:00:09
     -- ------------------------------------- 0.1/1.6 MB 229.0 kB/s eta 0:00:07
     -- ------------------------------------- 0.1/1.6 MB 218.5 kB/s eta 0:00:08
     -- ------------------------------------- 0.1/1.6 MB


[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import jiwer

# Эталонные 
y_true = ["left", "right", "up", "down", "left", "up"]
# Предсказанные
y_pred = ["left", "right", "up", "up", "right", "up"]

# Вычисляем WER
wer_score = jiwer.wer(y_true, y_pred)
print(f"WER: {wer_score:.2f}")


WER: 0.33


In [None]:
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
import jiwer 

CLASSES = ["left", "right", "up", "down"]
DATASET_PATH = "speech_commands/"
SR = 16000  # Частота дискретизации
N_MFCC = 20  # Количество MFCC
FIXED_LENGTH = 32  # Фиксированная длина MFCC

def extract_mfcc(file_path, sr=SR, n_mfcc=N_MFCC, fixed_length=FIXED_LENGTH):
    audio, _ = librosa.load(file_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Приводим к фиксированной длине
    if mfcc.shape[1] < fixed_length:
        pad_width = fixed_length - mfcc.shape[1]
        mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :fixed_length]

    return np.expand_dims(mfcc, axis=-1)  # Добавляем ось для CNN

# Собираем данные
X, y = [], []
for label, command in enumerate(CLASSES):
    folder_path = os.path.join(DATASET_PATH, command)
    for file in os.listdir(folder_path):
        if file.endswith(".wav"):
            file_path = os.path.join(folder_path, file)
            features = extract_mfcc(file_path)
            X.append(features)
            y.append(label)

X = np.array(X)
y = np.array(y)

# Разделяем данные
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Добавляем 4-ю ось
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

print(f"Данные загружены! Размер обучающей выборки: {len(X_train)}, тестовой: {len(X_test)}")

# Создаем улучшенную модель CNN
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation="relu", input_shape=(N_MFCC, 32, 1)),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),

    layers.Flatten(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(len(CLASSES), activation="softmax")  # 4 выхода

])

# Компилируем
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

# Обучаем
history = model.fit(X_train, y_train, epochs=40, batch_size=16, validation_data=(X_test, y_test))

# Оцениваем точность
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Улучшенная точность модели: {test_acc:.2f}")


# Предсказание на тестовых данных
y_pred_probs = model.predict(X_test)
y_pred_classes = np.argmax(y_pred_probs, axis=1)

# Преобразуем в текстовые метки
y_true_words = [CLASSES[i] for i in y_test]
y_pred_words = [CLASSES[i] for i in y_pred_classes]

# Вычисляем WER
wer_score = jiwer.wer(y_true_words, y_pred_words)
print(f"WER: {wer_score:.2f}")


Данные загружены! Размер обучающей выборки: 7563, тестовой: 1891
Epoch 1/40


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6595 - loss: 0.9912 - val_accuracy: 0.9186 - val_loss: 0.2321
Epoch 2/40
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9289 - loss: 0.2133 - val_accuracy: 0.9276 - val_loss: 0.1789
Epoch 3/40
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9491 - loss: 0.1447 - val_accuracy: 0.9413 - val_loss: 0.1509
Epoch 4/40
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9640 - loss: 0.1098 - val_accuracy: 0.9513 - val_loss: 0.1294
Epoch 5/40
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9717 - loss: 0.0777 - val_accuracy: 0.9561 - val_loss: 0.1167
Epoch 6/40
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9830 - loss: 0.0486 - val_accuracy: 0.9355 - val_loss: 0.1934
Epoch 7/40
[1m473/473[0m [32m━━━━━━━

In [4]:
import tensorflow.lite as tflite

# Конвертация модели в TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Сохраняем модель
with open("model.tflite", "wb") as f:
    f.write(tflite_model)

print("Модель успешно конвертирована в TFLite!")


INFO:tensorflow:Assets written to: C:\Users\ELIZAV~1\AppData\Local\Temp\tmphepiigdh\assets


INFO:tensorflow:Assets written to: C:\Users\ELIZAV~1\AppData\Local\Temp\tmphepiigdh\assets


Saved artifact at 'C:\Users\ELIZAV~1\AppData\Local\Temp\tmphepiigdh'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 20, 32, 1), dtype=tf.float32, name='keras_tensor')
Output Type:
  TensorSpec(shape=(None, 4), dtype=tf.float32, name=None)
Captures:
  2537776681328: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2537776686256: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2537776687488: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2537776689248: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2537776679568: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2537776688016: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2537776692064: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2537776690304: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2537776692416: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2537776691536: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2537