# Распознавание голосовых команд

<hr>

С.Ю. Папулин (papulin.study@yandex.ru)

### Содержание

- [Метод опорных векторов](#Метод-опорных-векторов)
- [Распознавание цифр](#Распознавание-цифр)
- [Распознавание голосовых команд](#Разпознавание-голосовых-команд)
    - [Предобработка аудио данных](#Предобработка-аудио-данных)
    - [Загрузка датасета](#Загрузка-датасета)
    - [Оконное преобразование Фурье](#Оконное-преобразование-Фурье)
    - [Масштабирование спектрограммы](#Масштабирование-спектрограммы)
    - [Обучение и предсказание](#Обучение-и-предсказание)
- [Источники](#Источники)

In [None]:
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np

In [None]:
from scipy.io import wavfile
from scipy import signal

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# from sklearn.inspection import DecisionBoundaryDisplay

In [None]:
import sys
sys.path.insert(0, "../lib/")
from plot_utils import CPlot, RPlot

from matplotlib.colors import ListedColormap
from matplotlib import cm

## Метод опорных векторов

In [None]:
from sklearn.datasets import make_moons

In [None]:
# Generate a noisy two-class "moons" toy dataset (200 points, fixed seed)
# and show the shape of the feature matrix.
X, y = make_moons(n_samples=200, noise=0.3, random_state=12345)
X.shape

In [None]:
# Hold out 20% of the points for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12345)

In [None]:
# Two-colour map shared by this scatter plot and the decision-boundary plots below.
CLR_MAP = ListedColormap(['blue', 'green'])

# Scatter plot of the training points, coloured by class label.
fig, ax = plt.subplots(figsize=(4, 4))
ax.set_title('Initial dataset')
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=CLR_MAP)
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.grid()
plt.show()

In [None]:
# Candidate classifiers: a linear SVM (two implementations) and two kernel SVMs.
models = [
    ('Linear SVC', LinearSVC()),
    ('SVC with Linear Kernel (SVC)', SVC(kernel='linear')),
    ('SVC with Poly Kernel', SVC(kernel='poly', degree=4)),
    ('SVC with RBF Kernel', SVC(kernel='rbf', gamma='scale')),
]

# Fit each model, report its test accuracy and draw the train/test plots.
for name, model in models:
    model.fit(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    print(f'{name}: Accuracy on test = {test_accuracy}')
    CPlot.show_train_test_plots(
        model, X_train, y_train, X_test, y_test,
        title=name, cmap=CLR_MAP,
    )

## Распознавание цифр

In [None]:
from sklearn import datasets

In [None]:
# Load the source data (scikit-learn's bundled digits dataset)
digits = datasets.load_digits()

In [None]:
IMAGE_INDX = 3

# Pull out one example: its pixel matrix and its class label.
sample_image = digits.images[IMAGE_INDX]
sample_target = digits.target[IMAGE_INDX]

print("Features:\n", sample_image)
print("Target value:", sample_target)

# Render the pixel matrix as a small image without axes.
fig, ax = plt.subplots(figsize=(2, 2))
ax.imshow(sample_image)
ax.axis('off')
plt.show()

In [None]:
# Transform the source data: flatten every image into a 64-element feature vector
# Note:
#  digits.data already contains the transformed (flattened) data
X = digits['images'].reshape(-1, 64)
y = digits['target']
X.shape, y.shape

In [None]:
# Build the training and test subsets (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100)


# NOTE: `multi_class='multinomial'` is intentionally not passed. The parameter
# is deprecated since scikit-learn 1.5 and scheduled for removal; with the
# 'newton-cg' solver on a multi-class target the default already fits a
# multinomial (softmax) model, so the fitted model is the same.
models = [
    ('Multinomial Logistic Regression', LogisticRegression(
        C=1.0,
        solver='newton-cg',
        max_iter=200,
        random_state=12345)
    ),
    ('SVC with RBF Kernel', SVC(kernel='rbf', gamma='scale'))
]


# Fit each model, print its test accuracy and draw its confusion matrix
for name, model in models:
    model.fit(X_train, y_train)
    print(f'{name}: Accuracy on test = {model.score(X_test, y_test)}')
    ConfusionMatrixDisplay.from_predictions(
        y_true=y_test,
        y_pred=model.predict(X_test)
    )
    plt.show()

## Разпознавание голосовых команд

- Набор данных по [ссылке](http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip) (мини-версия, ~200MB)
- Описание набора данных: [Speech Commands Dataset](https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html)

In [None]:
"""
Util functions to load and process audio data
"""


# Root directory of the unpacked mini_speech_commands dataset.
# NOTE: machine-specific absolute path; adjust it to where the archive was extracted.
AUDIO_BASE_DIR = '/home/ubuntu/Downloads/mini_speech_commands/mini_speech_commands/'



def pad_audio_data(data, rate):
    """Return `data` as exactly `rate` samples, i.e. one second of audio.

    Clips shorter than `rate` samples are right-padded with zeros. Clips
    longer than `rate` samples are truncated to their first `rate` samples
    (previously this case produced a negative pad width and `np.pad` raised).

    Args:
        data: array of audio samples; assumed to be 1-D (mono) — the same
            pad width is applied to every axis otherwise.
        rate: sampling rate in Hz, used as the target number of samples.

    Returns:
        A new array of length `rate` with the same dtype as `data`.
    """
    pad_width = rate - data.shape[0]
    if pad_width < 0:
        # Clip is longer than one second: keep only the first `rate` samples.
        data = data[:rate]
        pad_width = 0
    return np.pad(
        array=data,
        pad_width=(0, pad_width),
        mode='constant',
        constant_values=(0, 0)
    )

    
def normalize_audio_amplitude(data):
    """Scale sample values by 1/32767.

    32767 is the largest positive 16-bit signed integer, so samples read
    from 16-bit PCM audio (assumed input) end up approximately in [-1, 1].
    """
    full_scale = 32767
    return data / full_scale

    
def load_audio_data(file_path):
    """Read a WAV file, pad it to one second and normalize its amplitude.

    The number of samples after padding equals the file's sampling rate.
    """
    sample_rate, samples = wavfile.read(file_path)
    return normalize_audio_amplitude(pad_audio_data(samples, sample_rate))


def load_audio_dataset(dataset_dir=None):
    """Load every WAV file of the dataset into memory.

    The dataset root is expected to contain one sub-directory per class,
    each holding the WAV clips of that class. Non-directory entries in the
    root (e.g. README.md) and non-WAV files inside class directories are
    ignored. Class directories and files are processed in sorted order so
    that labels and row order do not depend on the file system.

    Args:
        dataset_dir: dataset root directory; defaults to AUDIO_BASE_DIR.

    Returns:
        A tuple (X, y, targets_dirs, file_list) where
        X is a float16 array of shape (n_files, 16000) with the padded,
        normalized signals; y is an int array with y[i] being the index of
        the class of row i in targets_dirs; targets_dirs is the sorted list
        of class directory names; file_list holds the file path of each row.
    """
    if dataset_dir is None:
        dataset_dir = AUDIO_BASE_DIR

    # Only directories are classes; this also skips README.md without
    # failing when that file is absent.
    targets_dirs = sorted(
        entry for entry in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, entry))
    )

    # First pass: collect file paths and labels so the arrays can be sized once.
    file_list = []
    labels = []
    for label, target_dir in enumerate(targets_dirs):
        full_target_dir = os.path.join(dataset_dir, target_dir)
        for file_name in sorted(os.listdir(full_target_dir)):
            if not file_name.lower().endswith('.wav'):
                continue
            file_list.append(os.path.join(full_target_dir, file_name))
            labels.append(label)

    # 16000 samples = one second at 16 kHz, the length load_audio_data is
    # expected to return for this dataset. float16 is kept from the original
    # implementation (presumably to limit memory use).
    X = np.zeros((len(file_list), 16000), dtype=np.float16)
    y = np.array(labels, dtype=int)

    # Second pass: read, pad and normalize every clip.
    for i, full_file_name in enumerate(file_list):
        X[i] = load_audio_data(full_file_name)
    return X, y, targets_dirs, file_list

In [None]:
# Inspect the entries of the dataset root directory
os.listdir(AUDIO_BASE_DIR)

### Предобработка аудио данных

In [None]:
# Path of one example clip of the "left" command. Built with os.path.join
# because AUDIO_BASE_DIR already ends with a separator, so string formatting
# with an extra '/' would produce a doubled '//' in the path.
SAMPLE_PATH = os.path.join(AUDIO_BASE_DIR, 'left', '1b4c9b89_nohash_3.wav')

In [None]:
# Embed an audio player for the sample clip (playback starts only on demand)
from IPython.display import Audio 
Audio(SAMPLE_PATH, autoplay=False)

In [None]:
# Read the sampling rate and the audio samples from the file,
# then show the rate, the number of samples and the amplitude range
rate, audio_data = wavfile.read(SAMPLE_PATH)
rate, audio_data.shape, audio_data.min(), audio_data.max()

In [None]:
# Append zeros to the end of the signal if it is shorter than `rate` samples
# (one second of audio), then check the resulting length
audio_data_padded = pad_audio_data(audio_data, rate)
audio_data_padded.shape

In [None]:
# Normalize the amplitude (division by 32767) and check the resulting
# shape and value range
audio_data_normalized = normalize_audio_amplitude(audio_data_padded)
audio_data_normalized.shape, audio_data_normalized.min(), audio_data_normalized.max()

In [None]:
# Each signal gets its own sample axis: the raw clip may be shorter than the
# padded/normalized one, and plotting the padded signal against the raw
# length raises a shape-mismatch error for clips shorter than one second.
n_samples = np.arange(audio_data.shape[0])
n_samples_normalized = np.arange(audio_data_normalized.shape[0])

plt.figure(figsize=[12,4])

# Left panel: raw samples as read from the file
plt.subplot(1, 2, 1)
plt.title('Command: Left')
plt.plot(n_samples, audio_data)
plt.xlabel('Samples')
plt.ylabel('Amplitude')
plt.xlim([0, 16000])
# plt.ylim([-1, 1])
plt.grid()

# Right panel: zero-padded and amplitude-normalized signal
plt.subplot(1, 2, 2)
plt.title('Command: Left')
plt.plot(n_samples_normalized, audio_data_normalized)
plt.xlabel('Samples')
plt.ylabel('Amplitude')
plt.xlim([0, 16000])
# plt.ylim([-1, 1])
plt.grid()


plt.show()

### Загрузка датасета

In [None]:
# Load the whole dataset: signals, integer labels, class names and file paths
X, y, target_names, files = load_audio_dataset()
X.shape, y.shape, target_names, len(files)

In [None]:
# Number of observations per class: (class name, label, count) triples
class_ids, class_counts = np.unique(y, return_counts=True)
list(zip(target_names, class_ids, class_counts))

In [None]:
# Pick one example to inspect in the following cells and show its class name
DATA_INDEX = 1005
target_names[y[DATA_INDEX]]

In [None]:
# Number of samples in the selected signal
X[DATA_INDEX].shape

In [None]:
# Embed an audio player for the selected example (playback starts only on demand)
from IPython.display import Audio 
Audio(files[DATA_INDEX], autoplay=False)

### Оконное преобразование Фурье

#### Построение спектрограммы

<!-- ![image](https://docs.exponenta.ru/signal/ref/iscola_stft.png) -->


<img src="https://docs.exponenta.ru/signal/ref/iscola_stft.png" width="600px">

https://docs.exponenta.ru/signal/ref/stft.html

In [None]:
# Short-time Fourier transform of the selected signal:
# Hann window of 256 samples with 128 samples (50%) overlap, sampling rate 16 kHz.
freq, time_segments, Zxx = signal.stft(
    X[DATA_INDEX],
    fs=16e3,
    window='hann',
    nperseg=256,
    noverlap=128,
)

In [None]:
# Shape of the complex STFT matrix: (frequency bins, time segments)
Zxx.shape

In [None]:
# Magnitude spectrogram: modulus of the complex STFT coefficients,
# followed by its shape and value range
Z = np.abs(Zxx)
Z.shape, Z.min(), Z.max()

In [None]:
# Side by side: linear-magnitude spectrogram (left) and its logarithm (right).
# Machine epsilon is added before the log so that zero magnitudes stay finite.
fig, axes = plt.subplots(1, 2, figsize=[14, 6])
panels = (Z, np.log(Z + np.finfo(float).eps))

for ax, magnitude in zip(axes, panels):
    mesh = ax.pcolormesh(time_segments, freq, magnitude)
    ax.set_xlabel('Time')
    ax.set_ylabel('Frequency')
    fig.colorbar(mesh, ax=ax)

plt.show()

In [None]:
# Number of samples in the selected signal
X[DATA_INDEX].shape

In [None]:
plt.figure(figsize=[8,6])

# Time of each sample in seconds: sample n is taken at n / rate, so the axis
# covers [0, 1). np.linspace(0.0, 1.0, rate) would put the last sample at
# exactly 1.0 s and stretch the spacing to 1 / (rate - 1), slightly
# misaligning the waveform with the STFT time axis.
time_scale = np.arange(X.shape[1]) / rate

# Top panel: waveform of the selected signal
plt.subplot(2, 1, 1)
plt.title('Signal')
plt.plot(time_scale, X[DATA_INDEX])
plt.xlabel('Time')
plt.ylabel('Amplitude')
plt.xlim([0, 1])
plt.grid()

# Bottom panel: log-magnitude spectrogram (epsilon keeps log finite at zero)
plt.subplot(2, 1, 2)
plt.title('Short Time FFT (Spectrogram)')
plt.pcolormesh(time_segments, freq, np.log(Z + np.finfo(float).eps))
plt.xlabel('Time')
plt.ylabel('Frequency')

plt.tight_layout()

plt.show()

#### Спектрограммы для всего набора данных

In [None]:
# Convert every source signal into a log-magnitude spectrogram.
# The output is preallocated using the shape of the single spectrogram Z
# computed earlier; machine epsilon keeps the logarithm finite at zero.
log_floor = np.finfo(float).eps
X_spectrogram = np.zeros((X.shape[0], *Z.shape))
for i, waveform in enumerate(X):
    _, _, Zxx = signal.stft(
        waveform, window='hann', fs=16e3, nperseg=256, noverlap=128)
    X_spectrogram[i] = np.log(np.abs(Zxx) + log_floor)

In [None]:
# Shape: (number of signals, frequency bins, time segments)
X_spectrogram.shape

In [None]:
# Show 6 random signals and their spectrograms:
# rows 0 and 2 hold waveforms, rows 1 and 3 the matching spectrograms

cols = 3
rows = 4

np.random.seed(12345)

# Draw the indices from the actual dataset size rather than a hard-coded
# 8000, so the cell neither indexes out of bounds on a smaller dataset nor
# ignores the tail of a larger one.
indxs = np.random.randint(0, X.shape[0], (rows // 2, cols))
# Duplicate every row of indices: one copy for the waveform row,
# one for the spectrogram row directly below it.
indxs_ = np.repeat(indxs, repeats=2, axis=0)

plt.figure(figsize=[12,10])

for i in range(rows):
    for j in range(cols):
        sample_idx = indxs_[i, j]
        plt.subplot(rows, cols, i*cols + j + 1)
        plt.title(f'{target_names[y[sample_idx]]}: {sample_idx}')
        if i % 2 == 0:
            # Even row: waveform
            plt.plot(time_scale, X[sample_idx])
            plt.xlabel('Time')
            plt.ylabel('Amplitude')
            plt.xlim([0, 1])
            plt.grid()
        else:
            # Odd row: spectrogram of the signal shown above
            plt.pcolormesh(time_segments, freq, X_spectrogram[sample_idx])
            plt.xlabel('Time')
            plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

### Масштабирование спектрограммы

In [None]:
from skimage.transform import resize, resize_local_mean, downscale_local_mean

In [None]:
# Preallocate storage for the downscaled 32x32 spectrograms
X_resized = np.zeros((X_spectrogram.shape[0], 32, 32))

In [None]:
# Shrink every spectrogram to 32x32
for i, spectrogram in enumerate(X_spectrogram):
    X_resized[i] = resize_local_mean(spectrogram, (32, 32))

In [None]:
# Downscaled spectrogram of the selected example; the axes are bin indices 0..31
fig, ax = plt.subplots()
ax.set_title('Short Time FFT (Spectrogram)')
mesh = ax.pcolormesh(range(32), range(32), X_resized[DATA_INDEX])
fig.colorbar(mesh, ax=ax)
ax.set_xlabel('Time')
ax.set_ylabel('Frequency')
plt.show()

### Обучение и предсказание

In [None]:
# Build the feature vectors: flatten each 32x32 spectrogram into 1024 values
X_features = X_resized.reshape(X_resized.shape[0], -1)
X_features.shape

In [None]:
# Hold out 30% of the samples for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.3, random_state=100)

In [None]:
# Training and quality evaluation on the test set.
# Features are standardized inside the pipeline, so the scaler is fitted on
# the training data only; the last line shows the accuracy on the test set.
pipeline = Pipeline([
    ('standardizer', StandardScaler()),
    ('clf', SVC(kernel='rbf', C=10))
])
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

In [None]:
# Confusion matrix on the test set, with class names as tick labels
y_pred = pipeline.predict(X_test)
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test, y_pred=y_pred, display_labels=target_names)
plt.show()

## Источники

[Simple audio recognition: Recognizing keywords](https://www.tensorflow.org/tutorials/audio/simple_audio?hl=en)