Reference:
- https://towardsdatascience.com/speech-emotion-recognition-with-convolution-neural-network-1e6bb7130ce3
- https://www.kaggle.com/CVxTz/audio-data-augmentation/data
- https://www.kaggle.com/haqishen/augmentation-methods-for-audio

In [None]:
import lib._util.visualplot as vp

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

import IPython
from IPython.display import Audio

# Plotly
import plotly.express as px
import plotly.graph_objects as go

# LibROSA
import librosa

# Constant Variable

In [None]:
# Folder that holds audio.csv and the wav/ clips read by this notebook
SOURCE_PATH_DATA = 'resources/data/augmentation/'
# Folder handed to the plotting helpers as out_path
OUT_PATH_GRAPH = 'resources/output/augmentation/graph/'

# Data Loading

In [None]:
# Load audio.csv and index the rows by the 'fname' column
data_df = pd.read_csv(f'{SOURCE_PATH_DATA}audio.csv', sep=',').set_index('fname')

data_df

# Original Audio

In [None]:
# Play every original clip from its wav file, preceded by its label
for row in data_df.itertuples():
    print(row.label)
    wav_path = f'{SOURCE_PATH_DATA}wav/{row.Index}'
    IPython.display.display(Audio(wav_path))

In [None]:
# Placeholder columns (one empty list per row) whose cells later receive whole arrays
for column in ('amplitude', 'mel_spectogram'):
    data_df[column] = data_df.apply(lambda _: [], axis=1)

# Decode each clip (sr=None: no target sampling rate is requested) and store its features
for row in data_df.itertuples():
    signals, rate = librosa.load(f'{SOURCE_PATH_DATA}wav/{row.Index}', sr=None)

    data_df.at[row.Index, 'rate'] = rate
    data_df.at[row.Index, 'amplitude'] = signals

    # Mel spectrogram, converted with power_to_db using ref=np.max
    data_df.at[row.Index, 'mel_spectogram'] = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signals, sr=rate),
        ref=np.max
    )

### Wave-Plot

In [None]:
# Wave plot of the original clips, with the frame re-indexed by label
plot_df = data_df.reset_index().set_index('label')
vp.wave(plot_df, amplitude='amplitude', title='Audio Signal', out_path=OUT_PATH_GRAPH)

### Mel Spectrogram

In [None]:
# Mel spectrogram plot of the original clips, with the frame re-indexed by label
plot_df = data_df.reset_index().set_index('label')
vp.spectogram(
    plot_df,
    z='mel_spectogram',
    y_title='Frequency (Hz)',
    title='Mel Spectogram',
    out_path=OUT_PATH_GRAPH
)

# Audio Augmentation

### Noise Injection

In [None]:
# Add white noise
def inject_noise(signals, noise_factor=.005):
    """Return `signals` with scaled standard-normal noise added.

    Parameters
    ----------
    signals : numpy.ndarray
        Audio samples; expected to be 1-D, since the noise is drawn with
        ``len(signals)`` values.
    noise_factor : float
        Multiplier applied to the noise before it is added.

    Returns
    -------
    numpy.ndarray
        The noisy samples, cast back to the dtype of `signals`.
    """
    scaled_noise = noise_factor * np.random.randn(len(signals))
    noisy = signals + scaled_noise
    return noisy.astype(signals.dtype)

In [None]:
# Work on a copy; only cells of tmp_df are reassigned below
tmp_df = data_df.copy()
for row in tmp_df.itertuples():
    rate = row.rate
    signals = inject_noise(row.amplitude)

    tmp_df.at[row.Index, 'amplitude'] = signals

    # Mel spectrogram of the noisy signal, converted with power_to_db using ref=np.max
    tmp_df.at[row.Index, 'mel_spectogram'] = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signals, sr=rate),
        ref=np.max
    )

In [None]:
# Play each noise-injected clip from its in-memory samples, preceded by its label
for row in tmp_df.itertuples():
    print(row.label)
    clip = Audio(row.amplitude, rate=row.rate)
    IPython.display.display(clip)

In [None]:
# Wave plot of the noise-injected clips, with the frame re-indexed by label
plot_df = tmp_df.reset_index().set_index('label')
vp.wave(plot_df, amplitude='amplitude', title='Audio Signal - Noise Injection', out_path=OUT_PATH_GRAPH)

In [None]:
# Mel spectrogram plot of the noise-injected clips, with the frame re-indexed by label
plot_df = tmp_df.reset_index().set_index('label')
vp.spectogram(
    plot_df,
    z='mel_spectogram',
    y_title='Frequency (Hz)',
    title='Mel Spectogram - Noise Injection',
    out_path=OUT_PATH_GRAPH
)

### Time Shifting
- Shift the audio to the left or right by a random amount of time
- Shifting the audio to the left (fast forward) by x seconds sets the first x seconds to 0 (i.e. silence)
- Shifting the audio to the right (rewind) by x seconds sets the last x seconds to 0 (i.e. silence)
- Reference: https://medium.com/@makcedward/data-augmentation-for-audio-76912b01fdf6

In [None]:
def time_shift(signals, rate, shift_ratio=.1, shift_direction='both'):
    """Shift `signals` by a random number of samples and silence the vacated part.

    The shift size is drawn from ``[0, int(rate * shift_ratio))`` samples, i.e.
    less than `shift_ratio` seconds of audio at sampling rate `rate`. When that
    window is shorter than one sample (for example ``shift_ratio=0``) the
    signal is returned unshifted instead of raising.

    Parameters
    ----------
    signals : numpy.ndarray
        Audio samples; the input array is not modified.
    rate : int or float
        Sampling rate, in samples per second.
    shift_ratio : float
        Upper bound of the shift as a fraction of one second; must be in [0, 1].
    shift_direction : {'both', 'left', 'right'}
        'left'  : samples are rolled towards the end of the array and the
                  first `shift` samples are set to 0.
        'right' : samples are rolled towards the start of the array and the
                  last `shift` samples are set to 0.
        'both'  : one of the two is chosen at random.

    Returns
    -------
    numpy.ndarray
        A new array of the same length holding the shifted signal.

    Raises
    ------
    AssertionError
        If `shift_direction` or `shift_ratio` is outside its valid set/range.
    """
    shift_directions = ['both', 'left', 'right']
    assert shift_direction in shift_directions, f'shift_direction not in valid list: {shift_directions}'

    assert 0 <= shift_ratio <= 1, 'shift_ratio not in valid range: [0, 1]'

    # np.random.randint rejects an upper bound of 0, so an empty window
    # (shift_ratio=0, or rate * shift_ratio < 1) simply means "no shift".
    max_shift = int(rate * shift_ratio)
    shift = np.random.randint(max_shift) if max_shift > 0 else 0

    # A negative shift rolls the samples towards the start of the array
    if shift_direction == 'right':
        shift = -shift

    # Randomly determine left/right shift
    elif shift_direction == 'both' and np.random.randint(0, 2) == 1:
        shift = -shift

    # np.roll returns a new array, so the caller's data stays untouched
    new_signals = np.roll(signals, shift)

    # np.roll wraps samples around; overwrite the wrapped part with silence
    if shift > 0:
        new_signals[:shift] = 0
    elif shift < 0:
        new_signals[shift:] = 0

    return new_signals

In [None]:
# Work on a copy; only cells of tmp_df are reassigned below
tmp_df = data_df.copy()
for row in tmp_df.itertuples():
    rate = row.rate
    signals = time_shift(row.amplitude, rate)

    tmp_df.at[row.Index, 'amplitude'] = signals

    # Mel spectrogram of the shifted signal, converted with power_to_db using ref=np.max
    tmp_df.at[row.Index, 'mel_spectogram'] = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signals, sr=rate),
        ref=np.max
    )

In [None]:
# Play each time-shifted clip from its in-memory samples, preceded by its label
for row in tmp_df.itertuples():
    print(row.label)
    clip = Audio(row.amplitude, rate=row.rate)
    IPython.display.display(clip)

In [None]:
# Wave plot of the time-shifted clips, with the frame re-indexed by label
plot_df = tmp_df.reset_index().set_index('label')
vp.wave(plot_df, amplitude='amplitude', title='Audio Signal - Time Shifting', out_path=OUT_PATH_GRAPH)

In [None]:
# Mel spectrogram plot of the time-shifted clips, with the frame re-indexed by label
plot_df = tmp_df.reset_index().set_index('label')
vp.spectogram(
    plot_df,
    z='mel_spectogram',
    y_title='Frequency (Hz)',
    title='Mel Spectogram - Time Shifting',
    out_path=OUT_PATH_GRAPH
)

### Speed Changing

In [None]:
# Work on a copy; only cells of tmp_df are reassigned below
tmp_df = data_df.copy()
for row in tmp_df.itertuples():
    # Rate > 1 = Speed up, Rate < 1 = Slow down
    speed_factor = 3
    rate = row.rate
    signals = librosa.effects.time_stretch(row.amplitude, rate=speed_factor)

    tmp_df.at[row.Index, 'amplitude'] = signals

    # Mel spectrogram of the stretched signal, converted with power_to_db using ref=np.max
    tmp_df.at[row.Index, 'mel_spectogram'] = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signals, sr=rate),
        ref=np.max
    )

In [None]:
# Play each speed-changed clip from its in-memory samples, preceded by its label
for row in tmp_df.itertuples():
    print(row.label)
    clip = Audio(row.amplitude, rate=row.rate)
    IPython.display.display(clip)

In [None]:
# Wave plot of the speed-changed clips, with the frame re-indexed by label
plot_df = tmp_df.reset_index().set_index('label')
vp.wave(plot_df, amplitude='amplitude', title='Audio Signal - Speed Changing', out_path=OUT_PATH_GRAPH)

In [None]:
# Mel spectrogram plot of the speed-changed clips, with the frame re-indexed by label
plot_df = tmp_df.reset_index().set_index('label')
vp.spectogram(
    plot_df,
    z='mel_spectogram',
    y_title='Frequency (Hz)',
    title='Mel Spectogram - Speed Changing',
    out_path=OUT_PATH_GRAPH
)

### Pitch Shifting

In [None]:
# Work on a copy; only cells of tmp_df are reassigned below
tmp_df = data_df.copy()

# Loop-invariant, so set once: the value handed to pitch_shift as n_steps
pitch_factor = 5

for row in tmp_df.itertuples():
    rate = row.rate

    # sr must be passed by keyword: librosa >= 0.10 makes every argument
    # after `y` keyword-only, so a positional sampling rate raises TypeError.
    signals = librosa.effects.pitch_shift(row.amplitude, sr=rate, n_steps=pitch_factor)

    tmp_df.at[row.Index, 'amplitude'] = signals

    # Mel spectrogram of the pitch-shifted signal, converted with power_to_db using ref=np.max
    tmp_df.at[row.Index, 'mel_spectogram'] = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signals, sr=rate),
        ref=np.max
    )

In [None]:
# Play each pitch-shifted clip from its in-memory samples, preceded by its label
for row in tmp_df.itertuples():
    print(row.label)
    clip = Audio(row.amplitude, rate=row.rate)
    IPython.display.display(clip)

In [None]:
# Wave plot of the pitch-shifted clips, with the frame re-indexed by label
plot_df = tmp_df.reset_index().set_index('label')
vp.wave(plot_df, amplitude='amplitude', title='Audio Signal - Pitch Shifting', out_path=OUT_PATH_GRAPH)

In [None]:
# Mel spectrogram plot of the pitch-shifted clips, with the frame re-indexed by label
plot_df = tmp_df.reset_index().set_index('label')
vp.spectogram(
    plot_df,
    z='mel_spectogram',
    y_title='Frequency (Hz)',
    title='Mel Spectogram - Pitch Shifting',
    out_path=OUT_PATH_GRAPH
)