In [12]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm

from sklearn.model_selection import train_test_split

# Augment Spectrogram Training Data
This notebook increases the training data 6-fold: every training clip yields its original spectrogram, one time-shifted copy (shifted by 5 seconds), and four time-stretched copies (stretch rates 0.81, 0.93, 1.07 and 1.23).

In [None]:
# setup parameters
# Directory that holds one sub-folder of .wav files per genre.
# A literal '~' is expanded by the shell, not by Python's file-opening functions,
# which is presumably why passing '~/Downloads/...' directly did not work.
# os.path.expanduser resolves it to the current user's home directory, so the
# path no longer has to be edited per machine as long as the dataset lives
# under ~/Downloads. Change the argument if your copy is stored elsewhere.
# The trailing '/' is kept so the value can still be used as a string prefix.
input_folder = os.path.expanduser('~/Downloads/musicgenreclassification/genres_original/')
sample_rate = 22050  # sampling rate handed to librosa.load
stretch_rates = [0.81, 0.93, 1.07, 1.23]  # one time-stretched copy is made per rate
time_shift_seconds = 5  # offset, in seconds, of the time-shifted copy
output_npz = 'augmented_spec.npz'  # archive the augmented data is saved to

# create file list of train/test data
genre_names = [
    "blues",
    "classical",
    "country",
    "disco",
    "hiphop",
    "jazz",
    "metal",
    "pop",
    "reggae",
    "rock",
]
clips_per_genre = 100

# jazz 54 is corrupted. It is excluded by name rather than by list position so
# the exclusion stays correct if genre_names is reordered or extended.
corrupted_files = {'jazz.00054.wav'}

# files[k] is a clip's file name and y[k] its genre label; both lists are built
# in lockstep so they always stay aligned.
files = []
y = []
for genre in genre_names:
    for clip_index in range(clips_per_genre):
        filename = f'{genre}.{clip_index:05}.wav'
        if filename in corrupted_files:
            continue
        files.append(filename)
        y.append(genre)

# Shuffle and hold out 20% of the clips for testing; the fixed random_state
# makes the split reproducible across runs.
file_array = np.array(files)
label_array = np.array(y)
files_train, files_test, y_train, y_test = train_test_split(
    file_array,
    label_array,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [15]:
def audio_to_rgb(data, sr):
    '''Convert audio to RGB spectrogram.

    The signal is turned into a 128-band mel spectrogram, converted to dB
    relative to its own maximum, resized to 128x128, min-max normalised to
    [0, 1] and finally coloured with the "magma" colormap.

    data: audio samples, as returned by librosa.load
    sr: sampling rate of `data`
    S_rgb: (128, 128, 3)'''
    S = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, n_fft=2048, hop_length=512, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)
    resized = cv2.resize(S_dB, (128, 128), interpolation=cv2.INTER_CUBIC)

    lowest = resized.min()
    value_range = resized.max() - lowest
    if value_range > 0:
        S_norm = (resized - lowest) / value_range
    else:
        # A constant spectrogram (e.g. an all-silent clip) has zero range; dividing
        # by it would fill the image with NaNs, so map it to the bottom of the scale.
        S_norm = np.zeros_like(resized)

    # The colormap returns RGBA; keep only the three colour channels.
    S_rgb = plt.get_cmap("magma")(S_norm)[:, :, :3]
    return S_rgb

def time_shift(data, sr, seconds):
    '''Circularly shift audio by a number of seconds.

    The offset is int(sr * seconds) samples. np.roll is used, so the output has
    the same length as the input and samples pushed past one end re-enter at
    the other.'''
    return np.roll(data, int(sr * seconds))

def time_stretch_f(data, rate, target_len):
    '''Time-stretch audio by `rate` and return exactly `target_len` samples.

    A result longer than `target_len` is cut off at the end; a shorter one is
    padded at the end using np.pad's default padding.'''
    warped = librosa.effects.time_stretch(data, rate=rate)
    missing = target_len - len(warped)
    if missing < 0:
        # Too long: keep only the leading target_len samples.
        return warped[:target_len]
    # Too short (or already exact): append `missing` padding samples.
    return np.pad(warped, (0, missing))

In [16]:
spectrogram_augmented = []
labels = []

# Every training clip contributes 2 + len(stretch_rates) spectrograms:
# the original, one time-shifted copy and one copy per stretch rate.
# labels[k] is always the genre of spectrogram_augmented[k].
for f in tqdm(files_train, desc='Augmenting training clips'):
    # Take the genre from the file name ('<genre>.<index>.wav') instead of
    # y_train[i]: the save cell rebinds y_train to the augmented label array, so
    # indexing it here would pair files with the wrong genre if this cell were
    # re-run after saving.
    genre = f.split('.')[0]
    filepath = os.path.join(input_folder, genre, f)
    data, sr = librosa.load(filepath, sr=sample_rate)
    input_len = len(data)

    # Original
    spectrogram_augmented.append(audio_to_rgb(data, sr))
    labels.append(genre)

    # Time-shifted (np.roll keeps the length, so no trimming is needed)
    shifted = time_shift(data, sr, time_shift_seconds)
    spectrogram_augmented.append(audio_to_rgb(shifted, sr))
    labels.append(genre)

    # Time-stretched, cut or padded back to the original clip length
    for rate in stretch_rates:
        stretched = time_stretch_f(data, rate, input_len)
        spectrogram_augmented.append(audio_to_rgb(stretched, sr))
        labels.append(genre)


In [18]:
# Quick sanity check of the augmentation result: shape of one image, that the
# image and label lists have equal length, and what the first pair looks like.
first_spec, first_label = spectrogram_augmented[0], labels[0]

print('Shape of first spectrogram:', first_spec.shape, sep='\n')
print('Lengths of spectrogram and labels:', len(spectrogram_augmented), len(labels), sep='\n')
print('First of each:', first_spec, first_label, sep='\n')

Shape of first spectrogram:
(128, 128, 3)
Lengths of spectrogram and labels:
4794
4794
First of each:
[[[0.786212 0.241514 0.450184]
  [0.779968 0.238851 0.452765]
  [0.792427 0.244242 0.447543]
  ...
  [0.754737 0.228772 0.462509]
  [0.761077 0.231214 0.460162]
  [0.767398 0.233705 0.457755]]

 [[0.735616 0.221713 0.46918 ]
  [0.488088 0.139186 0.508011]
  [0.652056 0.193986 0.491611]
  ...
  [0.488088 0.139186 0.508011]
  [0.488088 0.139186 0.508011]
  [0.310382 0.069702 0.483186]]

 [[0.99581  0.646344 0.441361]
  [0.761077 0.231214 0.460162]
  [0.996341 0.660969 0.45116 ]
  ...
  [0.852126 0.276106 0.418573]
  [0.904281 0.31961  0.388137]
  [0.921884 0.341098 0.377376]]

 ...

 [[0.238826 0.059517 0.443256]
  [0.178212 0.066576 0.379497]
  [0.291366 0.064553 0.475462]
  ...
  [0.271994 0.060994 0.46566 ]
  [0.556571 0.163269 0.50523 ]
  [0.25222  0.059415 0.453248]]

 [[0.159018 0.068354 0.352688]
  [0.204935 0.062907 0.411514]
  [0.291366 0.064553 0.475462]
  ...
  [0.316654 0.071

In [None]:
# Save to .npz. File is too big so it's not on GH!
# np.savez_compressed stores the same keys as np.savez and is read back with
# np.load in exactly the same way, but compresses each array to reduce the
# size of the archive on disk.
X_train = np.array(spectrogram_augmented)
# NOTE: this rebinds y_train, which until now held the per-file labels from
# train_test_split, to the per-spectrogram labels of the augmented set.
y_train = np.array(labels)
assert X_train.shape[0] == y_train.shape[0], "spectrograms and labels are misaligned"
np.savez_compressed(output_npz, X_train=X_train, y_train=y_train)

print(f"Saved {X_train.shape[0]} spectrograms to {output_npz}")

Saved 4794 spectrograms to augmented_spec.npz
