In [9]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm

from sklearn.model_selection import train_test_split

# Augment Spectrogram Training Data
This notebook increases the training data 6-fold: each training clip is kept as-is, time-shifted by 5 seconds, and time-stretched at the rates 0.81, 0.93, 1.07, and 1.23.

In [17]:

# setup parameters
    ## local path to directory
input_folder = '/Users/jahnavimaddhuri/Downloads/musicgenreclassification/genres_original/'
sample_rate = 22050
stretch_rates = [0.81, 0.93, 1.07, 1.23]
time_shift_seconds = 5
output_npz = 'augmented_spec.npz'

# create file list of train/test data
files = []
y = []
genre_names = [
    "blues",
    "classical",
    "country",
    "disco",
    "hiphop",
    "jazz",
    "metal",
    "pop",
    "reggae",
    "rock",
]
for genre in genre_names:
    files += [f'{genre}.{i:05}.wav' for i in range(100)]
    y += [genre] * 100

files_train, files_test, y_train, y_test = train_test_split(
    np.array(files), np.array(y), test_size=0.2, random_state=42, shuffle=True
)

In [27]:
def audio_to_rgb(data, sr):
    '''Convert audio to RGB spectrogram.
    S_rgb: (128, 128, 3)'''
    S = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, n_fft=2048, hop_length=512, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)
    resized = cv2.resize(S_dB, (128, 128), interpolation=cv2.INTER_CUBIC)
    S_norm = (resized - resized.min()) / (resized.max() - resized.min())
    S_rgb = plt.get_cmap("magma")(S_norm)[:, :, :3]
    return S_rgb

def time_shift(data, sr, seconds):
    '''Shift audio data by a given number of seconds.'''
    shift = int(sr * seconds)
    return np.roll(data, shift)

def time_stretch_f(data, rate, target_len):
    '''Stretch audio data by a given rate. Maintain audio length.'''
    stretched = librosa.effects.time_stretch(data, rate=rate)
    if len(stretched) > target_len:
        return stretched[:target_len]
    else:
        return np.pad(stretched, (0, target_len - len(stretched)))

In [28]:
spectrogram_augmented = []
labels = []

for i, f in enumerate(files_train):
    genre = y_train[i]
    filepath = input_folder + genre + '/' + f
    data, sr = librosa.load(filepath, sr=sample_rate)
    input_len = len(data)

    # Original
    spectrogram_augmented.append(audio_to_rgb(data, sr))
    labels.append(genre)

    # Time-shifted
    shifted = time_shift(data, sr, time_shift_seconds)
    spectrogram_augmented.append(audio_to_rgb(shifted[:input_len], sr))
    labels.append(genre)

    # Time-stretched
    for rate in stretch_rates:
        stretched = time_stretch_f(data, rate, input_len)
        spectrogram_augmented.append(audio_to_rgb(stretched, sr))
        labels.append(genre)


In [29]:
print('Shape of first spectrogram:')
print(spectrogram_augmented[0].shape)
print('Lengths of spectrogram and labels:')
print(len(spectrogram_augmented))
print(len(labels))

print('First of each:')
print(spectrogram_augmented[0])
print(labels[0])

Shape of first spectrogram:
(128, 128, 3)
Lengths of spectrogram and labels:
4800
4800
First of each:
[[[0.113094 0.065492 0.276784]
  [0.12938  0.067935 0.305443]
  [0.118405 0.066479 0.286321]
  ...
  [0.069764 0.049726 0.193735]
  [0.06533  0.047318 0.184892]
  [0.00595  0.004843 0.03713 ]]

 [[0.372116 0.092816 0.499053]
  [0.310382 0.069702 0.483186]
  [0.396467 0.102902 0.502658]
  ...
  [0.335308 0.078236 0.491024]
  [0.594508 0.175701 0.501241]
  [0.372116 0.092816 0.499053]]

 [[0.613617 0.181811 0.498536]
  [0.716387 0.214982 0.47529 ]
  [0.96331  0.42539  0.359469]
  ...
  [0.639216 0.189921 0.49415 ]
  [0.792427 0.244242 0.447543]
  [0.52527  0.152569 0.507192]]

 ...

 [[0.245543 0.059352 0.448436]
  [0.060949 0.044794 0.176129]
  [0.29774  0.066117 0.478243]
  ...
  [0.384299 0.097855 0.501002]
  [0.245543 0.059352 0.448436]
  [0.245543 0.059352 0.448436]]

 [[0.211718 0.061992 0.418647]
  [0.021692 0.01832  0.09261 ]
  [0.211718 0.061992 0.418647]
  ...
  [0.366012 0.090

In [30]:
# Save to .npz
X_train = np.array(spectrogram_augmented)
y_train = np.array(labels)
np.savez(output_npz, X_train=X_train, y_train=y_train)

print(f"Saved {X_train.shape[0]} spectrograms to {output_npz}")

Saved 4800 spectrograms to augmented_spec.npz
