In [1]:
import warnings
# silence every warning for the rest of the session; note that this also
# hides deprecation notices emitted by the audio libraries imported below
warnings.filterwarnings(action="ignore")

# import the necessary packages
from audiomentations import AddBackgroundNoise, PolarityInversion
from IPython.display import display as dp
from IPython.display import Audio
from imutils import paths

import soundfile as sf
import numpy as np
import librosa
import random
import os

In [2]:
sample_rate = 16000 # sampling rate in Hz (i.e. 16 kHz)
duration = 1 # audio length in seconds
speeds = [0.8, 1, 1.2] # time stretch rates
pitches = [-2, 0, 2] # pitch shifts in semitones
gain_min_factor, gain_max_factor  = 0.8, 1.2 # min-max gain factors
noise_percentage_factor = 0.05 # white gaussian noise scale, relative to the signal's standard deviation
num_background_noise = 1 # number of background-noise variants generated per augmented clip

# the augmentations below draw random numbers (random gain, white noise),
# so seed the global generators once to make a full re-run reproducible.
# NOTE(review): the audiomentations transforms presumably draw from these
# same global generators -- confirm for the installed version.
seed = 42
random.seed(seed)
np.random.seed(seed)

# 35 commands in Kazakh
commands = ["артқа", "алға", "оңға", "солға", "төмен", "жоғары", "жүр", "тоқта", "қос", "өшір", "иә", "жоқ", 
            "үйрен", "орында", "нөл", "бір", "екі", "үш", "төрт", "бес", "алты", "жеті", "сегіз", "тоғыз", "төсек", 
            "құс", "мысық", "ит", "бақытты", "үй", "оқы", "жаз", "ағаш", "көрнекі", "мәссаған"]

In [3]:
# paths to the synthetic, extracted, and background noise datasets
# the background noise can be downloaded from here:
# https://github.com/karolpiczak/ESC-50 
synthetic_dataset_path = "tts_synthetic_dataset"
extracted_dataset_path = "corpus_speech_commands"
# NOTE: absolute, machine-specific path -- adjust it to wherever
# the ESC-50 audio folder lives on your machine
background_noise_dataset_path = "/workspace/ESC-50-master/audio" 

# create one output folder per command for the augmented speech commands
augmented_dataset_path = "augmented_dataset"
for command in commands:
    path = os.path.join(augmented_dataset_path, command)
    # exist_ok=True creates the folder (and any missing parents) and is a
    # no-op when it already exists, so no separate existence check is needed
    os.makedirs(path, exist_ok=True)

# the original background noise files are 44100 Hz
# while our speech commands are 16000 Hz.
# it is recommended to convert the background noise
# files to 16 kHz to make augmentations faster;
# this only needs to be done once, so set the flag
# to True for a single run of the next cell
change_sr = False

In [4]:
if change_sr:
    # get the list of paths to the background noise audio files
    bnPaths = list(paths.list_files(background_noise_dataset_path, validExts="wav"))

    print("Number of  files (.wav):", len(bnPaths))

    for i, bnPath in enumerate(bnPaths,1):
        print("[INFO] Processing file: {}/{}".format(i,len(bnPaths)))
        # librosa.load resamples to `sr` while decoding, so load directly at
        # the target rate. Loading at 44100 Hz and then writing the samples
        # with a 16000 Hz header (as was done before) does NOT resample: it
        # only relabels the data, which slows the audio down and lowers its pitch.
        signal, _ = librosa.load(bnPath, sr=sample_rate)
        # overwrite the file in place; the rate written to the header now
        # matches the rate of the samples, so re-running this cell is harmless
        sf.write(bnPath, signal, samplerate=sample_rate)

In [5]:
# https://github.com/musikalkemist/audioDataAugmentationTutorial/blob/main/3/dataaugmentation.py
def add_white_noise(signal, noise_percentage_factor):
    """Add zero-mean Gaussian noise to `signal`.

    The noise is drawn from NumPy's global random generator with a standard
    deviation equal to `signal.std()` and is scaled by
    `noise_percentage_factor` before being added.

    Args:
        signal: NumPy array holding the audio samples. Any shape is accepted
            (e.g. mono `(n,)` or multi-channel `(channels, n)`).
        noise_percentage_factor: multiplier applied to the noise; 0 returns
            values equal to the input.

    Returns:
        A new array with the same shape as `signal`.
    """
    # draw the noise with the signal's own shape (not its flat size) so the
    # addition also works for multi-dimensional input; for 1-D input this
    # yields exactly the same samples as drawing `signal.size` values
    noise = np.random.normal(0, signal.std(), signal.shape)
    augmented_signal = signal + noise * noise_percentage_factor
    return augmented_signal


def time_stretch(signal, time_stretch_rate):
    """Time-stretch `signal` with `librosa.effects.time_stretch`.

    Args:
        signal: audio samples, passed to librosa as `y`.
        time_stretch_rate: stretch factor, passed to librosa as `rate`.

    Returns:
        Whatever `librosa.effects.time_stretch` returns, unchanged.
    """
    stretched = librosa.effects.time_stretch(y=signal, rate=time_stretch_rate)
    return stretched


def pitch_scale(signal, sample_rate, num_semitones):
    """Shift the pitch of `signal` with `librosa.effects.pitch_shift`.

    Args:
        signal: audio samples, passed to librosa as `y`.
        sample_rate: sampling rate of `signal`, passed to librosa as `sr`.
        num_semitones: size of the shift, passed to librosa as `n_steps`.

    Returns:
        Whatever `librosa.effects.pitch_shift` returns, unchanged.
    """
    shifted = librosa.effects.pitch_shift(
        y=signal,
        sr=sample_rate,
        n_steps=num_semitones,
    )
    return shifted


def random_gain(signal, min_factor, max_factor):
    """Scale `signal` by a single random gain factor.

    The factor is drawn once per call with
    `random.uniform(min_factor, max_factor)` and multiplies the whole signal.

    Args:
        signal: audio samples (anything that supports `* float`).
        min_factor: lower bound passed to `random.uniform`.
        max_factor: upper bound passed to `random.uniform`.

    Returns:
        The product of `signal` and the drawn factor.
    """
    return signal * random.uniform(min_factor, max_factor)


def invert_polarity(signal):
    """Return `signal` multiplied by -1, i.e. with every sample's sign flipped."""
    flipped = signal * -1
    return flipped


# audiomentations transform used to mix background sounds into a clip.
# the sounds are taken from `background_noise_dataset_path`; the SNR bounds
# are given in dB and each noise is passed through PolarityInversion first.
# NOTE(review): p=1.0 is presumably the probability of applying the
# transform (i.e. always) -- confirm against the audiomentations docs.
add_background_noise = AddBackgroundNoise(
    sounds_path=background_noise_dataset_path,
    min_snr_in_db=5.0,
    max_snr_in_db=30.0,
    noise_transform=PolarityInversion(),
    p=1.0,
)

In [6]:
dataset_paths = [synthetic_dataset_path, extracted_dataset_path]
total_samples = 0

# every augmented clip is written with exactly this many samples
target_length = int(sample_rate * duration)


def _fit_to_length(signal, target_length):
    """Return `signal` adjusted to exactly `target_length` samples.

    A signal that is too long is sped up with `time_stretch` so that it fits,
    and any samples still beyond `target_length` are dropped. A signal that
    is too short is zero-padded on both sides; when the number of missing
    samples is odd, the extra sample goes to the right so the total is exact.
    A signal that already has the right length is returned as an unmodified copy.
    """
    difference = target_length - len(signal)
    if difference < 0:
        # compress the clip in time instead of cutting the word off
        signal = time_stretch(signal, len(signal) / target_length)[:target_length]
        difference = target_length - len(signal)

    # difference >= 0 here; split the silence between both ends
    left = difference // 2
    right = difference - left
    return np.pad(signal, (left, right))


# loop over the commands
for command in commands:
    count = 0

    # path to save the augmented wav files of this command
    aug_wav_save_path = os.path.join(augmented_dataset_path, command)

    # loop over the datasets
    for dataset_path in dataset_paths:
        # get the list of wav files (sorted so that every run
        # visits the files in the same order)
        wavPaths = sorted(paths.list_files(os.path.join(dataset_path, command),
                                           validExts="wav"))
        # loop over the wav files
        for wavPath in wavPaths:
            # load the wav file, resampled to the configured sampling rate;
            # the returned rate is discarded so that the configured
            # `sample_rate` is never overwritten inside the loop
            raw_signal, _ = librosa.load(wavPath, sr=sample_rate)

            # get the wav file name without its directory and extension
            # (portable, and keeps any extra dots inside the name so that
            # e.g. "a.1.wav" and "a.2.wav" do not collapse to the same stem)
            org_wav_file_name = os.path.splitext(os.path.basename(wavPath))[0]

            # loop over the time stretch rates
            for speed in speeds:
                # apply time stretch augmentation
                # to the raw signal
                ts_signal = time_stretch(raw_signal, speed)

                # loop over the pitch scales
                for pitch in pitches:
                    # apply pitch scale augmentation
                    ps_signal = pitch_scale(ts_signal, sample_rate, pitch)

                    # apply gain scale augmentation
                    gs_signal = random_gain(ps_signal, gain_min_factor, gain_max_factor)

                    # bring the clip to the expected audio length:
                    # pad with silence if it is too short,
                    # speed it up if it is too long
                    gs_signal = _fit_to_length(gs_signal, target_length)

                    # save the augmented wav file
                    aug_wav_file_name = "{}-{}-{}-0.wav".format(org_wav_file_name, speed, pitch)
                    sf.write(os.path.join(aug_wav_save_path, aug_wav_file_name), gs_signal, sample_rate)
                    count += 1

                    # add white gaussian noise
                    gn_signal = add_white_noise(gs_signal, noise_percentage_factor)
                    gn_wav_file_name = "{}-{}-{}-1.wav".format(org_wav_file_name, speed, pitch)
                    sf.write(os.path.join(aug_wav_save_path, gn_wav_file_name), gn_signal, sample_rate)
                    count += 1

                    # add random background noise
                    # (file suffixes 0 and 1 are taken above, so start at 2)
                    for i in range(num_background_noise):
                        bn_signal = add_background_noise(gs_signal, sample_rate=sample_rate)
                        bn_wav_file_name = "{}-{}-{}-{}.wav".format(org_wav_file_name, speed, pitch, i+2)
                        sf.write(os.path.join(aug_wav_save_path, bn_wav_file_name), bn_signal, sample_rate)
                        count += 1

    total_samples += count
    print("[INFO] Command: {}, samples: {}, total samples: {}".format(command, count, total_samples))

[INFO] Command: артқа, samples: 162, total samples: 162
[INFO] Command: алға, samples: 162, total samples: 324
[INFO] Command: оңға, samples: 162, total samples: 486
[INFO] Command: солға, samples: 162, total samples: 648
[INFO] Command: төмен, samples: 162, total samples: 810
[INFO] Command: жоғары, samples: 162, total samples: 972
[INFO] Command: жүр, samples: 162, total samples: 1134
[INFO] Command: тоқта, samples: 162, total samples: 1296
[INFO] Command: қос, samples: 162, total samples: 1458
[INFO] Command: өшір, samples: 162, total samples: 1620
[INFO] Command: иә, samples: 162, total samples: 1782
[INFO] Command: жоқ, samples: 162, total samples: 1944
[INFO] Command: үйрен, samples: 162, total samples: 2106
[INFO] Command: орында, samples: 162, total samples: 2268
[INFO] Command: нөл, samples: 162, total samples: 2430
[INFO] Command: бір, samples: 162, total samples: 2592
[INFO] Command: екі, samples: 162, total samples: 2754
[INFO] Command: үш, samples: 162, total samples: 2916

In [7]:
# play some speech commands
example_commands = ['артқа', 'алға']

for example_command in example_commands:
    wavPaths = list(paths.list_files(os.path.join(augmented_dataset_path, example_command), 
                                     validExts="wav"))
    
    print('Command: {}'.format(example_command))
    for wavPath in wavPaths[:3]:
        dp(Audio(wavPath))

Command: артқа


Command: алға
