# AP5 - Pattern Recognition
Train a Support Vector Machine (SVM) classifier to recognize 'Call', 'Open' and 'Close' voice commands.

> Name: Jonas Carvalho Fortes

> Mat: 494513

## Importing Libraries

In [44]:
import os
import numpy as np
from scipy.io import savemat
# install also ffmpeg in the system (https://ffmpeg.org/download.html) for extracting the signal from the audio files.
import ffmpeg

## Data preparation

In [45]:
def extract_audio_signal(file_path, max_length):
    """
    Extracts the audio signal from a file and returns a numpy array of the signal
    with a specified maximum length.
    
    Parameters:
        file_path (str): The path to the audio file.
        max_length (int): The maximum length of the signal in samples.
        
    Returns:
        np.ndarray: A numpy array containing the audio signal, padded or truncated to max_length.
    """
    try:
        # Use ffmpeg to extract the audio and convert to wav format in memory
        out, _ = (
            ffmpeg
            .input(file_path)
            .output('pipe:', format='wav')
            .run(capture_stdout=True, capture_stderr=True)
        )

        # Convert the audio signal to a numpy array (assuming 16-bit PCM)
        signal = np.frombuffer(out, dtype=np.int16)

        # If stereo, average the two channels
        if signal.ndim == 2:
            signal = signal.mean(axis=1)

        # Adjust the signal to match the max_length
        if len(signal) > max_length:
            signal = signal[:max_length]  # Truncate the signal if longer
        else:
            signal = np.pad(signal, (0, max_length - len(signal)), 'constant')  # Pad with zeros if shorter

        return signal

    except ffmpeg.Error as e:
        print(f"Error extracting audio signal from {file_path}: {e.stderr.decode()}")
        return None

In [46]:
def create_audio_dataset(base_path, class_dirs, max_length):
    """
    Cria um dataset de sinais de áudio a partir de pastas organizadas por classes.
    
    Parameters:
        base_path (str): Caminho para o diretório base que contém as pastas de classes.
        class_dirs (list): Lista com os nomes das pastas que representam as classes de áudio.
        max_length (int): O tamanho máximo do sinal de áudio (em amostras).

    Returns:
        np.ndarray: Um array 2D de dimensões (45, max_length) contendo os sinais de áudio.
        np.ndarray: Um array 1D de dimensões (45,) contendo os rótulos das classes.
    """
    dataset = []
    labels = []
    
    for label, class_dir in enumerate(class_dirs):
        class_path = os.path.join(base_path, class_dir)
        audio_files = sorted(os.listdir(class_path))  # Organiza os arquivos para garantir ordem consistente
        
        for audio_file in audio_files:
            audio_path = os.path.join(class_path, audio_file)
            print(f"Extracting audio signal from {audio_path}...")
            # Extrai o sinal de áudio usando a função extract_audio_signal
            signal = extract_audio_signal(audio_path, max_length)
            
            if signal is not None:
                dataset.append(signal)
                labels.append(label)  # Adiciona o rótulo correspondente à classe
    
    # Converte a lista para numpy array
    dataset = np.array(dataset)
    labels = np.array(labels)
    
    return dataset, labels



In [56]:
# Exemplo de uso:
base_path = './data/audios'  # Diretório base onde estão as pastas
class_dirs = ['abrir', 'fechar', 'ligar']  # Pastas com as classes de áudio
max_length = 200000  # Define o tamanho máximo do sinal de áudio

# Criar o dataset
dataset, labels = create_audio_dataset(base_path, class_dirs, max_length)

print(f"Dataset shape: {dataset.shape}")  # Deve ser (45, max_length)
print(f"Labels shape: {labels.shape}")    # Deve ser (45,)

Extracting audio signal from ./data/audios\abrir\abrir 01.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 02.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 03.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 04.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 05​.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 06.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 07.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 08.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 09.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 10.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 11.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 12.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 13.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 14.m4a...
Extracting audio signal from ./data/audios\abrir\abrir 15.m4a...
Extracting audio signal 

In [58]:
dataset

array([[18770, 17990,    -1, ...,     0,     0,     0],
       [18770, 17990,    -1, ...,  -140,  -128,  -138],
       [18770, 17990,    -1, ...,     0,     0,     0],
       ...,
       [18770, 17990,    -1, ...,     0,     0,     0],
       [18770, 17990,    -1, ...,     0,     0,     0],
       [18770, 17990,    -1, ...,     0,     0,     0]], dtype=int16)