# Import libraries

In [1]:
import data_augmentation
from scipy.io import wavfile as wav
import IPython
import librosa

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


# Test data augmentation functions
## Random noise
### Original audio:

In [2]:
# Load one reference recording for the listening tests below.
# Keep the sample rate stored in the file rather than discarding it and
# hard-coding a value, so playback and pitch shifting always match the data.
fn = "my_recordings/0_khaled_0.wav"
rate, original_signal = wav.read(fn)
IPython.display.Audio(original_signal, rate=rate)

### Upper bound

In [3]:
noise_audio = data_augmentation.add_random_noise(original_signal, mu=0, stdev=0.1)

In [4]:
IPython.display.Audio(noise_audio,rate=rate)

A standard deviation of 0.1 may be too aggressive: the audio is intelligible, but there is a lot of background noise. Let's see what happens with 0.05:

In [5]:
noise_audio = data_augmentation.add_random_noise(original_signal, mu=0, stdev=0.05)
IPython.display.Audio(noise_audio,rate=rate)

It sounds better! For the moment, let's keep 0.05 as the upper bound for the standard deviation. Now let's find the lower bound.
### Lower bound

In [6]:
noise_audio = data_augmentation.add_random_noise(original_signal, mu=0, stdev=0.001)
IPython.display.Audio(noise_audio,rate=rate)

The noise can be heard distinctly only in the last part. Let's increase it to 0.005

In [7]:
noise_audio = data_augmentation.add_random_noise(original_signal, mu=0, stdev=0.005)
IPython.display.Audio(noise_audio,rate=rate)

Now the noise can be heard also during the speaking part.

**OPEN POINT: do we keep 0.001 or 0.005 as the lower bound?**

## PITCH SHIFT

### Original audio:

In [8]:
IPython.display.Audio(original_signal,rate=rate)

### Lower bound

In [9]:
pitch_shift_audio = data_augmentation.change_pitch(original_signal, sampling_rate=rate, pitch_step = -5)
IPython.display.Audio(pitch_shift_audio,rate=rate)

Nice caveman voice :) Let's try with -10:

In [10]:
pitch_shift_audio = data_augmentation.change_pitch(original_signal, sampling_rate=rate, pitch_step = -10)
IPython.display.Audio(pitch_shift_audio,rate=rate)

The voice is incomprehensible. Let's try with a pitch step of -7:

In [11]:
pitch_shift_audio = data_augmentation.change_pitch(original_signal, sampling_rate=rate, pitch_step = -7)
IPython.display.Audio(pitch_shift_audio,rate=rate)

And this is -6:

In [12]:
pitch_shift_audio = data_augmentation.change_pitch(original_signal, sampling_rate=rate, pitch_step = -6)
IPython.display.Audio(pitch_shift_audio,rate=rate)

**This is borderline: what do we do?** -7 may be a bit aggressive
### Upper bound

In [13]:
pitch_shift_audio = data_augmentation.change_pitch(original_signal, sampling_rate=rate, pitch_step = 5)
IPython.display.Audio(pitch_shift_audio,rate=rate)

In [14]:
pitch_shift_audio = data_augmentation.change_pitch(original_signal, sampling_rate=rate, pitch_step = 4)
IPython.display.Audio(pitch_shift_audio,rate=rate)

In [15]:
pitch_shift_audio = data_augmentation.change_pitch(original_signal, sampling_rate=rate, pitch_step = 6)
IPython.display.Audio(pitch_shift_audio,rate=rate)

**5 seems a good compromise, what do you think?**

# Enrich dataset

In [16]:
import os

In [17]:
"efhddfhdda".endswith('dda')

True

## Params

In [18]:
# Augmentation bounds chosen by ear in the listening tests above.
# Lower bound of the noise standard deviation (0.005 was also considered; still an open point).
MIN_STDEV = 0.001
# Upper bound of the noise standard deviation (0.1 was judged too aggressive).
MAX_STDEV = 0.05
# Lowest pitch step kept (-7 was judged possibly too aggressive).
MIN_PITCH_STEP = -6
# Highest pitch step kept.
MAX_PITCH_STEP = 5
# Sampling rate used by the storage helpers below when writing WAV files.
SAMPLING_RATE = 8000

In [19]:
MIN_PITCH_STEP

-6

## Enrich dataset Wrapper function

In [20]:
MIN_PITCH_STEP

-6

In [21]:
MAX_PITCH_STEP

5

In [22]:
def enrich_dataset(audio_dir, mode, n_noise, n_pitch, rate=8000):
    """Build augmented variants of every ``.wav`` file found in ``audio_dir``.

    Parameters
    ----------
    audio_dir : str
        Directory scanned (non-recursively) for files whose name ends in ``.wav``.
    mode : str
        ``"all_combinations"`` additionally pitch-shifts every noisy track;
        any other value only produces the noise and pitch variants of the
        original recording.
    n_noise : int
        Forwarded to ``data_augmentation.augment_audio_with_random_noise``.
    n_pitch : int
        Forwarded to ``data_augmentation.augment_audio_with_pitch_shift``.
    rate : int, optional
        Sampling rate the audio is loaded at. The same value is handed to the
        pitch-shift routine so that loading and shifting can never disagree.

    Returns
    -------
    dict
        Maps each file name to a dict with the keys ``'original'`` (a
        one-element list holding the loaded signal), ``'noise'``, ``'pitch'``
        and, only in ``"all_combinations"`` mode, ``'pitch_noise'``.
    """
    enriched_audio_tracks = {}
    # Sorted so the processing order does not depend on the file system.
    for audio_fn in sorted(os.listdir(audio_dir)):
        # Skip temporary / non-audio files
        if not audio_fn.endswith(".wav"):
            continue
        # `sr` is passed by keyword: newer librosa releases reject it positionally.
        original_signal, _ = librosa.load(os.path.join(audio_dir, audio_fn), sr=rate)
        # Apply various random noises to the original track
        noise_tracks = data_augmentation.augment_audio_with_random_noise(original_signal,
                                                                         MIN_STDEV,
                                                                         MAX_STDEV,
                                                                         n_noise)
        # Apply pitch shift to the original audio, at the rate it was loaded with
        pitch_tracks = data_augmentation.augment_audio_with_pitch_shift(audio_signal=original_signal,
                                                                        sampling_rate=rate,
                                                                        min_pitch_shift=MIN_PITCH_STEP,
                                                                        max_pitch_shift=MAX_PITCH_STEP,
                                                                        n=n_pitch)
        file_tracks = {
            'original': [original_signal],
            'noise': noise_tracks,
            'pitch': pitch_tracks,
        }
        if mode == "all_combinations":
            # Pitch-shift every noisy track as well
            pitch_noise_tracks = []
            for track in noise_tracks:
                # extend() avoids rebuilding the whole list on every iteration
                pitch_noise_tracks.extend(
                    data_augmentation.augment_audio_with_pitch_shift(track,
                                                                     rate,
                                                                     MIN_PITCH_STEP,
                                                                     MAX_PITCH_STEP,
                                                                     n_pitch))
            file_tracks['pitch_noise'] = pitch_noise_tracks
        enriched_audio_tracks[audio_fn] = file_tracks

    return enriched_audio_tracks


## Test
### Original tracks

In [23]:
# Collect the .wav recordings available for augmentation and show how many there are.
audio_dir = "./my_recordings/"
recs = [entry for entry in os.listdir(audio_dir) if entry.endswith(".wav")]
n_recs = len(recs)
n_recs

300

### Expected results:

mode = *all_combinations*

n_noise = 5

n_pitch = 5

Number of recordings = 300 + 5 * 300 (NOISE on original) + 5 * 300 (SHIFT on original) + 5 * (5 * 300) (SHIFT applied to NOISE) = 3300 + 5 * 1500 = 10800

Let's check the results:

In [24]:
%%time
all_comb_augmentation = enrich_dataset(audio_dir,
                     mode="all_combinations",
                     n_noise=5,
                     n_pitch=5)

CPU times: user 4min 19s, sys: 8.89 s, total: 4min 28s
Wall time: 6min 8s


Let's compute the number of tracks

In [25]:
# Total number of tracks across all four variant lists of every recording.
n_tracks = 0
for variants in all_comb_augmentation.values():
    n_tracks += sum(len(variants[kind]) for kind in ('original', 'noise', 'pitch', 'pitch_noise'))

In [26]:
n_tracks

10800

The results seem good! Let's try with the other augmentation mode.

mode = *normal*

n_noise = 5

n_pitch = 5

Number of recordings = 300 + 5 * 300 (NOISE on original) + 5 * 300 (SHIFT on original) = 3300

In [27]:
%%time
normal_augmentation = enrich_dataset(audio_dir,
                     mode="normal",
                     n_noise=5,
                     n_pitch=5)

CPU times: user 40.7 s, sys: 1.52 s, total: 42.3 s
Wall time: 48.6 s


In [28]:
# Total number of tracks across the three variant lists of every recording.
n_tracks = 0
for variants in normal_augmentation.values():
    n_tracks += sum(len(variants[kind]) for kind in ('original', 'noise', 'pitch'))

In [29]:
n_tracks

3300

## Store augmented dataset

In [30]:
def store_list_recordings(track_list, augmentation_strategy, augmentation_dir, base_fn):
    """Write every track of ``track_list`` to its own WAV file.

    Each file is named
    ``{augmentation_dir}/{base_fn}_{augmentation_strategy}_{i}.wav``, where
    ``i`` is the position of the track in ``track_list``. Tracks are written
    with ``librosa.output.write_wav`` using the module-level ``SAMPLING_RATE``.
    """
    augmentation_fn_structure = "{directory}/{base_fn}_{augmentation_strategy}_{n}{extension}"
    # Fields of the file name that are the same for every track in the list
    shared_fields = {
        "directory": augmentation_dir,
        "base_fn": base_fn,
        "augmentation_strategy": augmentation_strategy,
        "extension": ".wav",
    }
    for position, samples in enumerate(track_list):
        target_path = augmentation_fn_structure.format(n=position, **shared_fields)
        librosa.output.write_wav(target_path, samples, SAMPLING_RATE)
def store_recordings(augmentation_dir, recordings):
    """Write the original and augmented tracks of every recording to disk.

    Parameters
    ----------
    augmentation_dir : str
        Target directory; it is created if it does not exist yet.
    recordings : dict
        Maps each original file name to a dict with the keys ``'original'``
        (a list whose first element is written under the original file name),
        ``'noise'``, ``'pitch'`` and optionally ``'pitch_noise'``.
    """
    extension = ".wav"
    # Make the function safe to call on a fresh checkout or after a re-run.
    os.makedirs(augmentation_dir, exist_ok=True)
    for original_fn, current_fn_recordings in recordings.items():
        # Strip only the trailing extension: str.replace would also remove
        # ".wav" occurring in the middle of the file name.
        if original_fn.endswith(extension):
            base_fn = original_fn[:-len(extension)]
        else:
            base_fn = original_fn
        # Store original recording
        librosa.output.write_wav("{directory}/{fn}".format(directory=augmentation_dir,
                                                           fn=original_fn),
                                 current_fn_recordings['original'][0],
                                 SAMPLING_RATE)
        # Store noise recordings
        store_list_recordings(current_fn_recordings['noise'], "noise", augmentation_dir, base_fn)
        # Store pitch recordings
        store_list_recordings(current_fn_recordings['pitch'], "pitch", augmentation_dir, base_fn)
        # Combinations of pitch and noise only exist in "all_combinations" mode
        if "pitch_noise" in current_fn_recordings:
            store_list_recordings(current_fn_recordings['pitch_noise'], "pitch_noise", augmentation_dir, base_fn)
        

### Store "normal" augmented recordings

In [31]:
! mkdir ./augmentation_recs

In [32]:
store_recordings("./augmentation_recs", normal_augmentation)

In [33]:
! ls ./augmentation_recs | wc -l

    3300


### Store "all combinations" augmented recordings

In [34]:
! mkdir ./all_combination

In [35]:
store_recordings("./all_combination", all_comb_augmentation)

In [36]:
! ls ./all_combination | wc -l

   10800


### Zip recordings

In [None]:
!zip -r normal_augmentation.zip .augmentation_recs