## Data Augmentation

In [80]:
#load data
import convenience

df_train_val, sample_rates = convenience.load_train()
df_train_val['augmented_data'] = df_train_val.audio
df_train_val.head()

Unnamed: 0,file_name,stratify,accent,gender,audio,length,augmented_data
0,1f_8793.wav,1f,1,f,"[tensor(-0.0001), tensor(-6.1035e-05), tensor(...",7.253312,"[tensor(-0.0001), tensor(-6.1035e-05), tensor(..."
1,4m_4676.wav,4m,4,m,"[tensor(-0.0010), tensor(-0.0009), tensor(-0.0...",9.130625,"[tensor(-0.0010), tensor(-0.0009), tensor(-0.0..."
2,2m_2335.wav,2m,2,m,"[tensor(6.1035e-05), tensor(0.0001), tensor(0....",4.522687,"[tensor(6.1035e-05), tensor(0.0001), tensor(0...."
3,4m_2107.wav,4m,4,m,"[tensor(0.0002), tensor(6.1035e-05), tensor(3....",4.693313,"[tensor(0.0002), tensor(6.1035e-05), tensor(3...."
4,2f_8785.wav,2f,2,f,"[tensor(0.), tensor(-9.1553e-05), tensor(-0.00...",2.425,"[tensor(0.), tensor(-9.1553e-05), tensor(-0.00..."


In [None]:
sr = list(sample_rates)[0]
sr

#initialize sr variable (sampling rate) 

16000

In [None]:
import torch

In [None]:
waveform = df_train_val.audio[0]
print(waveform)
waveform.shape

tensor([-1.2207e-04, -6.1035e-05,  0.0000e+00,  ..., -1.8311e-04,
        -1.8311e-04, -2.1362e-04])


#### Librosa

In this part I apply different types of augmentation techniques. They're programmed in a pipeline, where the output of the previous is the input to the next one. 
If you want to apply the augmentation technique to a 'clean slate', apply it to df_train_val['augmented'].

Code is adapted from: https://www.kaggle.com/code/huseinzol05/sound-augmentation-librosa#apply-hpss 

(Please note it would be a good idea to see for which augmentations the model has the best performance, yet also a good generalizability!)

In [None]:
import numpy as np
import librosa 

import seaborn as sns 
sns.set() #iirc mostly for visuals 
import tensorflow as tf
from IPython.display import Audio

2025-05-25 16:51:35.326330: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-25 16:51:35.345781: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-25 16:51:35.386223: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-25 16:51:35.398677: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748184695.409605   12737 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748184695.41

In [None]:
#pitch shifted audio
#This is the begining of the pipeline, so df_train_val is used. 
pitch_shift_audio = [] #I always create a seperate list to avoid overwriting the original variable
for file in df_train_val['augmented_data']:
    audio_pitch = file.numpy() #data has to be a numpy array for Librosa to work
    bins_per_octave = 12 
    pitch_pm = 2 
    pitch_change =  pitch_pm * 2*(np.random.uniform())  #every audiofile has a random pitch change (can be tweaked)
    
    pitch_shift_audio.append(librosa.effects.pitch_shift(audio_pitch, sr = 16000, n_steps=pitch_change, bins_per_octave=bins_per_octave)) 

print(pitch_shift_audio)

[array([-1.1734670e-04, -5.5809687e-05, -2.3071534e-06, ...,
        9.1125723e-05,  1.1502832e-04,  6.9552014e-05], dtype=float32), array([-8.8885729e-04, -9.2230778e-04, -6.9246982e-04, ...,
        1.1438162e-04,  9.2619346e-05,  6.8494672e-05], dtype=float32), array([ 4.63904398e-05,  1.43725018e-04,  1.21165554e-04, ...,
       -2.74106581e-03, -2.43873359e-03, -3.09867458e-03], dtype=float32), array([1.9413425e-04, 6.6443812e-05, 1.9763491e-05, ..., 4.1393744e-04,
       4.2608520e-04, 2.6027625e-04], dtype=float32), array([-2.296712e-06, -9.843928e-05, -2.647151e-04, ...,  0.000000e+00,
        0.000000e+00,  0.000000e+00], dtype=float32), array([-7.4748117e-05, -1.5711959e-04, -3.9307453e-04, ...,
        7.2119536e-04,  2.7980344e-04,  6.6148164e-04], dtype=float32), array([-0.00047838, -0.00028734, -0.00034332, ...,  0.0003151 ,
        0.00026625,  0.00030197], dtype=float32), array([-0.00028693, -0.00035255, -0.00033508, ..., -0.00328996,
       -0.00337171, -0.00391228], d

In [None]:
#change speed
#notice how pitch_shift_audio is the input for this part (so no .numpy needed anymore)
speed_shift_audio = []
for file in pitch_shift_audio:
    audio_speed = file 
    speed_change = np.random.uniform(low=0.9,high=1.1) #strength of the effect (can be tweaked)
    tmp = librosa.effects.time_stretch(audio_speed, rate = speed_change) 
    minlen = min(audio_speed.shape[0], tmp.shape[0])
    audio_speed *= 0 
    audio_speed [0:minlen] = tmp[0:minlen] 
    
    speed_shift_audio.append(audio_speed)

In [None]:
#distribution noise
noise_dist_audio = []

for file in speed_shift_audio:
    audio_noise = file
    noise_amp = 0.005*np.random.uniform()*np.amax(audio_noise) #random audio noise, can be changed to any distribution from https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.random.html
    audio_noise = audio_noise + noise_amp * np.random.normal(size= audio_noise.shape[0]) #mathy math for noise
    
    noise_dist_audio.append(audio_noise) 

In [None]:
#random shift
rand_shift_audio = []

for file in noise_dist_audio:
    audio_shift = file
    timeshift_fac = 0.2 *2*(np.random.uniform()-0.5)  # up to 20% of length shift (can be tweaked)
    start = int(audio_shift.shape[0] * timeshift_fac)
    if (start > 0): 
        audio_shift = np.pad(audio_shift,(start,0),mode='constant')[0:audio_shift.shape[0]]
    else:
        audio_shift = np.pad(audio_shift,(0,-start),mode='constant')[0:audio_shift.shape[0]]
    Audio(audio_shift, rate= sr)
    
    rand_shift_audio.append(audio_shift)

In [None]:
#stretching
stretch_shift_audio = []

for file in rand_shift_audio:
    input_length = len(file)
    streching = file
    streching = librosa.effects.time_stretch(streching , rate = 1.1) #similar code to speed up due to file needing to fit audio
    if len(streching) > input_length:
        streching = streching[:input_length]
    else:
        streching = np.pad(streching, (0, max(0, input_length - len(streching))), "constant")

    stretch_shift_audio.append(streching)

In [None]:
#supposed to convert the augmented data back into the df, in tensor form, but I was unable to make it work, kept giving dimension errors
#there could be a hidden issue in the functions used where it changes the dimensions, I suspect speed or stretch 
#I would try to run all parts of the pipeline seperately and see from where the issue arrises 

#augmented_data = torch.tensor(stretch_shift_audio)

#df_train_val['augmented_data'] = augmented_data
#df_train_val.head()

ValueError: expected sequence of length 116053 at dim 1 (got 146090)