## Data Augmentation

In [1]:
from scipy.io import wavfile as wav
from scipy.io.wavfile import read, write
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import librosa.display
from pathlib import Path
from sklearn.model_selection import train_test_split

In [77]:
def add_noise(file_path):
    data = librosa.core.load(file_path)[0]
    noise = np.random.randn(len(data))
    data_noise = data + 0.005 * noise
    return data_noise
 
def shift(file_path):
    data = librosa.core.load(file_path)[0]
    return np.roll(data, 300)
 
def stretch(file_path):
    data = librosa.core.load(file_path)[0]
    rate = 1.5
    data = librosa.effects.time_stretch(data, rate)
    return data
 
def write_audio_file(file, data, sample_rate=22050):
    librosa.output.write_wav(file, data, sample_rate)
    
    
def new_name (source_str, insert_str):
    #return source_str[:-4]+insert_str+source_str[-4:]
    return source_str[:34] + str('Train_augment_pad/') + source_str[34:-4] + insert_str + source_str[-4:]

def pad_signal(path, length):
    samples, sample_rate = librosa.load(path)
    name = path  # To rewrite files
    if len(samples) < length:
        y = np.pad(samples, (0, length-len(samples)), 'constant')
    else:
        y = samples
    return librosa.output.write_wav(path=name, y=y, sr=sample_rate)

def pad_signal_rename (path, length, new_folder):
    samples, sample_rate = librosa.load(path)
    name = str(path)[:34] + new_folder + str(path)[34:] # To rename files
    if len(samples) < length:
        y = np.pad(samples, (0, length-len(samples)), 'constant')
    else:
        y = samples
    return librosa.output.write_wav(path=name, y=y, sr=sample_rate)


### Getting all the cleaned data in one list

In [3]:
shuffle_col=[]
pathlist = Path('../../../Source/Clean_train_clips/Shuffle').glob('**/*.wav')
for path in pathlist:
    shuffle_col.append(path)

In [4]:
bc_col = []
pathlist = Path('../../../Source/Clean_train_clips/Ball_change').glob('**/*.wav')
for path in pathlist:
    bc_col.append(path)
    bc_col.sort()

In [5]:
path_col = shuffle_col + bc_col

### Set aside some data for validation

In [6]:
len(path_col)

343

In [7]:
path_copy = path_col.copy()

In [8]:
def get_label(path):
    if path.parts[-3] == 'Shuffle':
        return 1
    else:
        return 0

In [9]:
path_df = pd.DataFrame(path_copy, columns=['Path'])
path_df['Labels'] = [get_label(path_df.loc[idx,'Path']) for idx in range(len(path_df))]

In [10]:
path_df.head()

Unnamed: 0,Path,Labels
0,../../../Source/Clean_train_clips/Shuffle/7/16...,1
1,../../../Source/Clean_train_clips/Shuffle/7/15...,1
2,../../../Source/Clean_train_clips/Shuffle/7/14...,1
3,../../../Source/Clean_train_clips/Shuffle/7/10...,1
4,../../../Source/Clean_train_clips/Shuffle/7/11...,1


In [11]:
X = path_df['Path']
y = path_df['Labels']

In [12]:
X_train, X_extra, y_train, y_extra = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.33,
                                                     random_state=17)

In [13]:
X_train.shape, X_extra.shape
y_train.shape, y_extra.shape

((229,), (114,))

In [15]:
X_test, X_validate, y_test, y_validate = train_test_split(X_extra, y_extra,
                                                    stratify=y_extra, 
                                                    test_size=0.33,
                                                     random_state=17)

In [17]:
X_train.shape, X_test.shape, X_validate.shape
y_train.shape, y_test.shape, y_validate.shape

((229,), (76,), (38,))

In [18]:
# Test Data for later use in models
X_test.to_csv('../../../Source/Reserved_data/X_test_reserved.csv', index=None, header=True)
y_test.to_csv('../../../Source/Reserved_data/y_test_reserved.csv', index=None, header=True)

# Validate Data for later use in models
X_validate.to_csv('../../../Source/Reserved_data/X_validate_reserved.csv', index=None, header=True)
y_validate.to_csv('../../../Source/Reserved_data/y_validate_reserved.csv', index=None, header=True)

In [20]:
path_col = (X_train.tolist())
len(path_col)


'../../../Source/Clean_train_clips/Train_augment_pad/Ball_change/5/20_noise.wav'

### Creating, exporting noisy clips

In [21]:
for i in range (len(path_col)): 
    write_audio_file((new_name(str(path_col[i]), '_noise')),
                      add_noise(path_col[i]))

### Creating, exporting stretched clips

In [22]:
for i in range (len(path_col)): 
    write_audio_file((new_name(str(path_col[i]), '_stretch')),
                      stretch(path_col[i]))

### Creating, exporting shifted clips

In [23]:
for i in range (len(path_col)): 
    write_audio_file((new_name(str(path_col[i]), '_shift')),
                      shift(path_col[i]))

### Export un-augmented data

In [24]:
for i in range (len(path_col)):
    write_audio_file((new_name(str(path_col[i]), '_')),
                     librosa.core.load(path_col[i])[0])

### Pad samples with silence (to create equal lengths) and resave (overwrite existing files)

In [25]:
path_col = []
pathlist = Path('../../../Source/Clean_train_clips/Train_augment_pad').glob('**/*.wav')
for path in pathlist:
    path_col.append(path)

In [26]:
len(path_col) # should be 229*4=916

916

In [29]:
length_list = []
for i in range (len(path_col)):
    samples, sample_rate = librosa.load(path_col[i])
    length_list.append(len(samples))
max_length = max(length_list)
print(max_length)

20772


In [43]:
for i in range(len(path_col)):
    pad_signal(path_col[i], max_length)

In [45]:
# Check to see if it worked
for i in range (len(path_col)):
    samples, sample_rate = librosa.load(path_col[i])
    print(len(samples))

### Repeat padding for test and validate sets (and write files)

In [48]:
# Test Data 
X_test = pd.read_csv('../../../Source/Reserved_data/X_test_reserved.csv')

# Validate Data 
X_validate = pd.read_csv('../../../Source/Reserved_data/X_validate_reserved.csv')

In [58]:
val_col, test_col = [], []
for i in range(len(X_validate)):
    val_col.append(Path(X_validate.loc[i, 'Path']))
    
for j in range(len(X_test)):
    test_col.append(Path(X_test.loc[j, 'Path']))

In [62]:
print (len(val_col) == len(X_validate))
print (len(test_col) == len(X_test))
print (max_length)

True
True
20772


In [78]:
for i in range(len(val_col)):
    pad_signal_rename(val_col[i], max_length, 'Validate_pad/')

In [79]:
for i in range(len(test_col)):
    pad_signal_rename(test_col[i], max_length, 'Test_pad/')

In [85]:
# Check to see if it worked
path_col = []
pathlist = Path('../../../Source/Clean_train_clips/Test_pad').glob('**/*.wav') # For Test_pad and Validate_pad
for path in pathlist:
    path_col.append(path)

for i in range (len(path_col)):
    samples, sample_rate = librosa.load(path_col[i])
    #print(len(samples))