# Data augmentation

First, we import some useful libraries and set a random seed to ensure reproducibility.

In [None]:
import albumentations as A

import json

import kagglehub

import matplotlib.image as mpimg

import numpy as np

import os

from keras.utils import to_categorical

import random

from sklearn.model_selection import train_test_split
from skimage.util import random_noise



# Set random state for reproducibility
random_state = 23
random.seed(random_state)
np.random.seed(random_state)

In [None]:
# IF YOU ARE NOT USING KAGGLE
BASE_PATH = kagglehub.dataset_download("andradaolteanu/gtzan-dataset-music-genre-classification")
BASE_PATH += "/Data/"
OUT_BASE_PATH = "./"
# IF YOU ARE USING KAGGLE, ADD THE ORIGINAL DATASET TO THE NOTEBOOOK, COMMENT THE TWO PREVIOUS LINES AND UNCOMMENT THE FOLLOWING ONE
# BASE_PATH = "/kaggle/input/gtzan-dataset-music-genre-classification/Data"
# OUT_BASE_PATH = "/kaggle/working/"

Next, we prepare the environment by defining the variables needed and loading the dataset from Kaggle.

In [None]:
# Set up save paths
original_path = "original/"
original_aug = "original_aug/"
cropped_path = "cropped/"
cropped_aug = "cropped_aug/"
cropped_aug_inj = "cropped_aug_inj/"

original_img_shape = (288, 432, 4)

CONFIGS = [original_path, original_aug, cropped_path, cropped_aug, cropped_aug_inj]
OUT_DIR = OUT_BASE_PATH + "Augmented_GTZAN/"
SPLITS = ["train", "val", "test"]

genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
num_classes = len(genres)
n_images_per_genre = 100
genre_indices = list(range(num_classes))

# One instance is removed (jazz-54) since it was corrupted
X = np.zeros((n_images_per_genre*num_classes - 1, *original_img_shape))
y = np.array([genre_idx 
              for genre_idx in genre_indices  
            for _ in range(n_images_per_genre)])
# We are deleting the corrupted instance 
y = np.delete(y, 500)

unique, counts = np.unique(y, return_counts=True)
print(unique, counts)

[0 1 2 3 4 5 6 7 8 9] [100 100 100 100 100  99 100 100 100 100]


In [38]:
def save_imgs(X, y, genres, out_dir, split, indices=None, num_classes=10):
    y_cat = to_categorical(y)
    labels = np.argmax(y_cat, axis=1)
    genre_counters = [0 for _ in range(num_classes)]
    new_indices = np.empty((len(X), 2), dtype=object)

    for instance_idx, (img, label) in enumerate(zip(X, labels)):

        genre = genres[label]
        idx = genre_counters[label]
        filename = f"{genre}{idx:05d}.png"

        save_path = os.path.join(out_dir, split, genre, filename)

        if indices is not None:
            new_indices[instance_idx][0] = indices[instance_idx]
            save_path = save_path.replace("\\", "/")
            new_indices[instance_idx][1] = save_path

        mpimg.imsave(save_path, img)
        genre_counters[label] += 1

    if indices is not None:
        return new_indices.tolist()

In [None]:
img_index = 0

for genre_name in genres:
    for image_number in range(n_images_per_genre):
        # Skip corrupted file
        if genre_name == 'jazz' and image_number == 54:
            continue
        curr_path = f"{BASE_PATH}/images_original/{genre_name}/{genre_name}{image_number:05d}.png"
        img = mpimg.imread(curr_path)
        X[img_index, :, :, :] = img
        img_index += 1

In [40]:
# Create the out folders
for conf in CONFIGS:
    for split in SPLITS:
        for genre_name in genres:
            os.makedirs(os.path.join(OUT_DIR, conf, split, genre_name), exist_ok=True)

In [41]:
pipeline_transform = A.Compose([
    A.FrequencyMasking(freq_mask_param = 50, p=0.6),
    A.TimeMasking(time_mask_param = 65, p=0.6),
    A.TimeReverse(p=0.5),
])

  original_init(self, **validated_kwargs)
  original_init(self, **validated_kwargs)
  original_init(self, **validated_kwargs)


## Original dataset

First, we divide the original dataset by saving indexes in the relative splits wihtout applying any modification to the original images.

In [None]:
test_size = 0.20
val_size = 0.1
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, range(len(X)), test_size = test_size, random_state = random_state, stratify=y)
X_train, X_val, y_train, y_val, indices_train, indices_val = train_test_split(X_train, y_train, indices_train, test_size = val_size, random_state = random_state, stratify=y_train)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape)

# Train
train_ind = save_imgs(X_train, y_train, genres, OUT_DIR+original_path, SPLITS[0], indices=indices_train)
# Validation
val_ind = save_imgs(X_val, y_val, genres, OUT_DIR+original_path, SPLITS[1], indices=indices_val)
# Test
test_ind = save_imgs(X_test, y_test, genres, OUT_DIR+original_path, SPLITS[2], indices=indices_test)


with open(OUT_BASE_PATH + "indices_train.json", 'w+') as f:
    f.write(json.dumps(train_ind))
with open(OUT_BASE_PATH + "indices_test.json",'w+') as f:
    f.write(json.dumps(test_ind))
with open(OUT_BASE_PATH + "indices_val.json", 'w+') as f:
    f.write(json.dumps(val_ind))


(719, 288, 432, 4) (719,) (200, 288, 432, 4) (200,) (80, 288, 432, 4) (80,)


## 1 - Original Dataset Augmentation

In [43]:
test_size = 0.30
val_size = 0.20
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X, y, test_size = test_size, random_state = random_state, stratify=y)
X_train_res, X_val_res, y_train_res, y_val_res = train_test_split(X_train_res, y_train_res, test_size = val_size, random_state = random_state, stratify=y_train_res)
print(X_train_res.shape, y_train_res.shape, X_test_res.shape, y_test_res.shape, X_val_res.shape, y_val_res.shape)

(559, 288, 432, 4) (559,) (300, 288, 432, 4) (300,) (140, 288, 432, 4) (140,)


In [44]:
# Num imgs is the number of images to reach after augmentation
num_imgs = len(X_train_res) * 3
X_train_transformed = np.zeros((num_imgs, *original_img_shape))
y_train_transformed = np.zeros((num_imgs))

# First, we copy the original imgs
X_train_transformed[:len(X_train_res), :, :, :] = X_train_res[:, :, :, :]
y_train_transformed[:len(y_train_res)] = y_train_res[:]

# Then, we add augmented images
cur_idx = len(X_train_res)
while cur_idx < num_imgs:
    for image, label in zip(X_train_res, y_train_res):
        transformed_image = pipeline_transform(image=image)["image"]
        X_train_transformed[cur_idx, :, :, :] = transformed_image[:, :, :]
        y_train_transformed[cur_idx] = label
        cur_idx = cur_idx + 1
        # Exit when the number of images is reached
        if cur_idx == num_imgs:
            break
    
# Train
save_imgs(X_train_transformed, y_train_transformed, genres, OUT_DIR+original_aug, SPLITS[0])
# Validation
save_imgs(X_val_res, y_val_res, genres, OUT_DIR+original_aug, SPLITS[1])
# Test
save_imgs(X_test_res, y_test_res, genres, OUT_DIR+original_aug, SPLITS[2])

## 2 - Cropped Images Augmentation

Since the images include white space around the spectrograms, we removed these regions to check whether excluding irrelevant information improves the neural network performance.

In [45]:
img = X[0]
top_padding = 0
bottom_padding = 0
left_padding = 0
right_padding = 0
GRAY_THRESHOLD = 0.99
IMG_H = img.shape[0]
IMG_W = img.shape[1]
CENTER_H = IMG_H // 2
CENTER_W = IMG_W // 2


# Y' = 0.299 R + 0.587 G + 0.114 B 
gray_img = 0.299 * img[:, :, 0] + 0.587 * img[:, :, 1] +  0.114 * img[:, :, 2]



# Top padding
for i in range(IMG_H):
    if gray_img[i, CENTER_W] < GRAY_THRESHOLD:
        top_padding = i
        break

# Left padding
for i in range(IMG_W):
    if gray_img[CENTER_H, i] < GRAY_THRESHOLD:
        left_padding = i
        break

# Bottom padding
for i in range(IMG_H):
    if gray_img[IMG_H - 1 - i, CENTER_W] < GRAY_THRESHOLD:
        bottom_padding = i
        break

# Right padding
for i in range(IMG_W):
    if gray_img[CENTER_H, IMG_W - 1 - i] < GRAY_THRESHOLD:
        right_padding = i
        break

print(top_padding) #35
print(left_padding) #54
print(bottom_padding) #35
print(right_padding) #42

# Need an extra pixel for bottom and right
bottom_padding += 1
right_padding += 1

35
54
35
42


In [46]:
cropped_img_shape = (original_img_shape[0] - top_padding - top_padding, original_img_shape[1] - left_padding - right_padding, original_img_shape[2])

X_resized = np.zeros((n_images_per_genre*num_classes - 1, *cropped_img_shape))

i = 0
for img in X:
    X_resized[i, :, :, :] = img[top_padding:-top_padding, left_padding:-right_padding, :]
    i = i + 1

test_size = 0.20
val_size = 0.1
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_resized, y, test_size = test_size, random_state = random_state, stratify=y)
X_train_res, X_val_res, y_train_res, y_val_res = train_test_split(X_train_res, y_train_res, test_size = val_size, random_state = random_state, stratify=y_train_res)
print(X_train_res.shape, y_train_res.shape, X_test_res.shape, y_test_res.shape, X_val_res.shape, y_val_res.shape)

(719, 218, 335, 4) (719,) (200, 218, 335, 4) (200,) (80, 218, 335, 4) (80,)


Here we save the cropped images (not augmented yet)

In [47]:
#### 2. RESIZED
# Train
save_imgs(X_train_res, y_train_res, genres, OUT_DIR+cropped_path, SPLITS[0])
# Validation
save_imgs(X_val_res, y_val_res, genres, OUT_DIR+cropped_path, SPLITS[1])
# Test
save_imgs(X_test_res, y_test_res, genres, OUT_DIR+cropped_path, SPLITS[2])

And now we can augment the cropped images

In [48]:
test_size = 0.30
val_size = 0.20
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_resized, y, test_size = test_size, random_state = random_state, stratify=y)
X_train_res, X_val_res, y_train_res, y_val_res = train_test_split(X_train_res, y_train_res, test_size = val_size, random_state = random_state, stratify=y_train_res)
print(X_train_res.shape, y_train_res.shape, X_test_res.shape, y_test_res.shape, X_val_res.shape, y_val_res.shape)


(559, 218, 335, 4) (559,) (300, 218, 335, 4) (300,) (140, 218, 335, 4) (140,)


In [49]:
num_imgs = len(X_train_res) * 3
X_train_transformed = np.zeros((num_imgs, *cropped_img_shape))
y_train_transformed = np.zeros((num_imgs))

# Copy the original imgs
X_train_transformed[:len(X_train_res), :, :, :] = X_train_res[:, :, :, :]
y_train_transformed[:len(y_train_res)] = y_train_res[:]

i = len(X_train_res)
while i < num_imgs:
    for image, label in zip(X_train_res, y_train_res):
        transformed_image = pipeline_transform(image=image)["image"]
        X_train_transformed[i, :, :, :] = transformed_image[:, :, :]
        y_train_transformed[i] = label
        i = i + 1
        if i == num_imgs:
            break

#### 3. AUGMENTATION
# Train
save_imgs(X_train_transformed, y_train_transformed, genres, OUT_DIR+cropped_aug, SPLITS[0])
# Validation
save_imgs(X_val_res, y_val_res, genres, OUT_DIR+cropped_aug, SPLITS[1])
# Test
save_imgs(X_test_res, y_test_res, genres, OUT_DIR+cropped_aug, SPLITS[2])

## 3 - Cropped Data Augmentation + Noise Injection

In [50]:
num_imgs = len(X_train_transformed) * 2
X_train_transformed_noise = np.zeros((num_imgs, *cropped_img_shape))
y_train_transformed_noise = np.zeros((num_imgs))

# Copy the original imgs
X_train_transformed_noise[:len(X_train_transformed), :, :, :] = X_train_transformed[:, :, :, :]
y_train_transformed_noise[:len(y_train_transformed)] = y_train_transformed[:]

noise_modes = ["s&p", "gaussian", "poisson"]


i = len(X_train_transformed)
while i < num_imgs:
    for image, label in zip(X_train_transformed, y_train_transformed):

        n = random.randint(0, 2)
        
        # Add choosen noise to the image
        noise = random_noise(image, mode=noise_modes[n])
        
        X_train_transformed_noise[i, :, :, :] = noise[:, :, :]
        y_train_transformed_noise[i] = label
        i = i + 1
        if i == num_imgs:
            break


In [51]:
#### 4. NOISE INJECTION
# Train
save_imgs(X_train_transformed_noise, y_train_transformed_noise, genres, OUT_DIR+cropped_aug_inj, SPLITS[0])
# Validation
save_imgs(X_val_res, y_val_res, genres, OUT_DIR+cropped_aug_inj, SPLITS[1])
# Test
save_imgs(X_test_res, y_test_res, genres, OUT_DIR+cropped_aug_inj, SPLITS[2])