In [None]:
!pip install -q livelossplot

# Importing the Required Libraries

In [None]:
import random, os, glob 
import numpy as np 
import tensorflow as tf
from keras.models import Sequential 
from keras.layers import Dropout, Dense, Conv2D, MaxPool2D, Flatten, Reshape, BatchNormalization, GlobalAveragePooling2D # layers I will incorporate
from keras.callbacks import EarlyStopping 
from tensorflow.keras.applications import VGG19 
from keras import backend
from livelossplot import PlotLossesKeras 
import librosa 
from librosa.display import specshow
import matplotlib.pyplot as plt
import IPython.display as ipd
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Defining Utility Functions

In [None]:
def setRandom(seed: int = 0) -> None:
    """Seed every random number generator used in this notebook.

    Args:
        seed: Value applied to all generators. Defaults to 0, the value that
            used to be hard-coded, so existing ``setRandom()`` calls behave
            exactly as before.
    """
    # Hash seed exported through the environment.
    # NOTE(review): presumably only honoured by interpreters started after
    # this point - confirm if hash ordering matters here.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)  # Python's built-in generator
    np.random.seed(seed)  # NumPy's global generator
    tf.random.set_seed(seed)  # TensorFlow global seed
    tf.compat.v1.set_random_seed(seed)  # same seed through the TF1 compatibility API

# __Preparing Data__

> To fulfil the aim of this investigation, I need to predict a song's genre from a randomly chosen 30-second snippet. This means that the dependent variable (the one we are attempting to measure) is the __*song genre*__. The first step in preparing the dataset is removing samples that will most likely cause errors when training.


## 1) Removing Erroneous Values
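> To ensure that the dataset is consistent and will not cause errors in the future, I need to check for null or missing values. Because the data is a set of image folders rather than a dataframe, this is done by counting the spectrogram images available in each genre folder.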

In [None]:
# Root folder holding one sub-folder of spectrogram images per genre.
source = "../input/gtzan-dataset-music-genre-classification/Data/images_original/"
# Names of the genre sub-folders.
genres = [
    "blues", "classical", "country", "disco", "hiphop",
    "jazz", "metal", "pop", "reggae", "rock",
]

# Report how many .png files each genre folder contains.
for genre in genres:
    path = os.path.join(source, genre)
    pngs = [name for name in os.listdir(path) if name.endswith(".png")]
    print(f"Size of {genre} dataset: {len(pngs)} files.")

## Splitting the Data

> To ensure reliable model evaluation, the dataset is divided into three parts — training, validation, and testing. This separation allows the model to learn from one subset, tune its parameters on another, and finally be evaluated on completely unseen data, providing a true measure of its generalization capability in real-world situations. For this work, an 80:9:10 split is used for the training, validation, and testing sets respectively (80, 9 and 10 spectrograms per genre, 99 in total), as illustrated in the dictionaries shown below.


In [None]:
setRandom()
split = [80, 9, 10]  # number of files per genre for training / validation / test
train, val, test = {}, {}, {}  # genre -> list of file paths, one dictionary per dataset
dictionaries = [train, val, test]

# Turn the sizes into consecutive, non-overlapping slices: [0:80], [80:89], [89:99].
# 80 + 9 + 10 = 99, so any further file in a genre folder is left unused.
bounds = [0]
for size in split:
    bounds.append(bounds[-1] + size)
slices = [slice(start, stop) for start, stop in zip(bounds, bounds[1:])]

for genre in genres:  # iterate through each genre folder
    path = os.path.join(source, genre)
    # glob returns paths in arbitrary order, so sort them to make the split
    # reproducible, and list the folder only once so the three slices are cut
    # from the same ordering and can never overlap.
    pngs = sorted(glob.glob(os.path.join(path, "*.png")))
    # Pair each dictionary with its slice by position rather than comparing
    # dictionaries with `==` (all three start out as equal empty dicts).
    for d, num in zip(dictionaries, slices):
        d[genre] = pngs[num]

lenDictionaries = [{genre: len(d[genre]) for genre in genres} for d in dictionaries]
trainLen, valLen, testLen = lenDictionaries  # number of files under each genre for each dataset

print(f"\033[1mTraining:\033[0m {lenDictionaries[0]}")
print(f"\033[1mValidation:\033[0m {lenDictionaries[1]}")
print(f"\033[1mTest:\033[0m {lenDictionaries[2]}")

In [None]:
import os, glob
from random import shuffle

# Seed the random number generators so the shuffle below is repeatable
setRandom()
split_ratio = {'train': 80, 'val': 9, 'test': 10}  # percentages of each genre folder

# genre -> list of file paths, for each of the three datasets.
# NOTE(review): the dataset-building cell further down reads `train`, `val`
# and `test`, not `datasets` - confirm which split is meant to be used.
datasets = {'train': {}, 'val': {}, 'test': {}}

# Loop through each genre folder
for genre in genres:
    genre_path = os.path.join(source, genre)
    # glob returns paths in arbitrary order; a seeded shuffle is only
    # repeatable when it starts from a fixed ordering, hence the sort.
    image_paths = sorted(glob.glob(os.path.join(genre_path, "*.png")))
    total_images = len(image_paths)

    shuffle(image_paths)

    # Calculate split indices with integer arithmetic: the float form can
    # truncate one too low (e.g. int((29 / 100) * 100) == 28).
    train_end = total_images * split_ratio['train'] // 100
    val_end = train_end + total_images * split_ratio['val'] // 100

    # Split images into train, val, test (test receives whatever remains)
    datasets['train'][genre] = image_paths[:train_end]
    datasets['val'][genre] = image_paths[train_end:val_end]
    datasets['test'][genre] = image_paths[val_end:]

# Count files per genre for each dataset
dataset_lengths = {
    split: {genre: len(datasets[split][genre]) for genre in genres}
    for split in datasets
}


print(f"\033[1mTraining:\033[0m {dataset_lengths['train']}")
print(f"\033[1mValidation:\033[0m {dataset_lengths['val']}")
print(f"\033[1mTest:\033[0m {dataset_lengths['test']}")


## Data Formatting
> The code below prepares the dataset so a TensorFlow model can read and train on it, and the prep() function applies the optimizations recommended in the Kaggle Computer Vision course.

In [None]:
import tensorflow as tf
import os

# Number of images per batch fed to the models.
batchSize = 32

# Genre name -> integer class id (ids follow the listing order below).
genreMap = {
    name: class_id
    for class_id, name in enumerate((
        "blues", "classical", "country", "disco", "hiphop",
        "jazz", "metal", "pop", "reggae", "rock",
    ))
}
# Reverse lookup: integer class id -> genre name.
inverseGenreMap = dict(zip(genreMap.values(), genreMap.keys()))

# Size every image is resized to, as (height, width).
IMAGE_SIZE = (288, 432)
# Passed to the tf.data `map` (num_parallel_calls) and `prefetch` calls below.
AUTOTUNE = tf.data.experimental.AUTOTUNE

def _load_and_preprocess(path, label):
    """Turn one image file into a model-ready tensor.

    The file at ``path`` is read, decoded as a 3-channel PNG, resized to
    ``IMAGE_SIZE`` and divided by 255.0 as float32.

    Returns:
        A ``(image, label)`` pair; ``label`` is passed through untouched.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.decode_png(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    scaled = tf.cast(resized, tf.float32) / 255.0
    return scaled, label

def create_dataset_from_dict(d: dict, shuffle_buffer: int = 1000, shuffle: bool = True):
    """
    Create a batched tf.data.Dataset from a dictionary mapping genres -> list_of_paths.

    Images stay on disk and are decoded lazily the first time the dataset is
    iterated; the decoded examples are then cached so later epochs skip the
    file reads.

    Args:
        d: Mapping of genre name (a key of ``genreMap``) to a list of image paths.
        shuffle_buffer: Upper bound for the shuffle buffer size.
        shuffle: Whether to shuffle the examples. Defaults to True (the previous
            behaviour); pass False for evaluation data so that every pass over
            the dataset yields the examples in the same order.

    Returns:
        A dataset of ``(images, labels)`` batches of size ``batchSize``.
    """
    paths, labels = [], []
    for genre, filepaths in d.items():
        paths.extend(filepaths)
        labels.extend([genreMap[genre]] * len(filepaths))

    # Explicit dtypes keep the pipeline correctly typed even when `d` is empty.
    ds = tf.data.Dataset.from_tensor_slices((
        tf.constant(paths, dtype=tf.string),
        tf.constant(labels, dtype=tf.int32),
    ))

    # Decode first, then cache the individual examples.
    ds = ds.map(_load_and_preprocess, num_parallel_calls=AUTOTUNE)
    ds = ds.cache()

    # Shuffle AFTER the cache: a cache placed after shuffle/batch replays the
    # first epoch's order and batch composition on every later epoch.
    if shuffle:
        ds = ds.shuffle(buffer_size=min(shuffle_buffer, max(1, len(paths))))

    ds = ds.batch(batchSize, drop_remainder=False)
    return ds

def prep(ds: tf.data.Dataset):
    """
    Apply the final dataset optimizations:
    - ensures dtype is float32
    - prefetches for performance

    Caching is done per example inside ``create_dataset_from_dict`` (before
    shuffling), so it is deliberately not repeated here.
    """
    ds = ds.map(lambda images, labels: (tf.image.convert_image_dtype(images, tf.float32), labels),
                num_parallel_calls=AUTOTUNE)
    ds = ds.prefetch(AUTOTUNE)
    return ds

# Create and prepare datasets (assuming train, val, test dicts exist).
# Only the training data is shuffled; validation and test keep a fixed order.
training = prep(create_dataset_from_dict(train))
validation = prep(create_dataset_from_dict(val, shuffle=False))
testing = prep(create_dataset_from_dict(test, shuffle=False))

print("Datasets created.")


Below are some examples of the spectrograms in each genre for each dataset. There are some clearly noticeable patterns in the data, such as the fairly consistently loud nature of metal, but as usual there are still a few outliers, most notably in blues and pop (both of which are quite wide genres).

In [None]:
def view_dataset(dataset):
    """Plot one example image per genre found in ``dataset``.

    Batches are scanned in order and the first image met for each label is
    kept; scanning stops after the batch in which every genre has been seen.
    The examples are drawn side by side, each titled with its genre name.
    """
    first_seen = {}  # label id -> first image encountered with that label

    for batch_images, batch_labels in dataset:
        for picture, tag in zip(batch_images, batch_labels):
            # setdefault keeps the earliest image stored for a label
            first_seen.setdefault(int(tag.numpy()), picture)

        if len(first_seen) == len(genres):
            break

    # one row of panels, positioned by label id
    plt.figure(figsize = (30, 20))
    n_columns = len(genres)
    for class_id, picture in first_seen.items():
        plt.subplot(1, n_columns, class_id + 1)
        plt.imshow(picture)
        plt.title(inverseGenreMap[class_id])
        plt.axis("off")
    plt.show()

# Show a labelled example spectrogram from each genre, for each dataset in turn.
for _heading, _examples in (
    ("\033[1mTraining Examples:\033[0m", training),
    ("\033[1mValidation Examples:\033[0m", validation),
    ("\033[1mTesting Examples:\033[0m", testing),
):
    print(_heading)
    view_dataset(_examples)

## Preprocessing and Data Augmentation

> Because of the nature of the dataset, applying data augmentation would be inappropriate. All spectrogram images share the same structure and formatting, so transformations would be unnecessary and could even negatively affect training. Therefore, data augmentation is intentionally omitted in this task. As part of preprocessing, the pixel values of the images were normalized to a range between 0 and 1 by dividing each RGB value by 255.0. This normalization step is crucial for enabling the models to properly interpret and learn from the dataset.


<a id="Choosing-and-Training-a-Model"></a>
# __Choosing and Training a Model__


#### **Transfer Learning Model with InceptionV3 Base (*CNN*)**

> * After researching and testing with a number of pre-trained bases, the InceptionV3 architecture appeared the most promising for this application.
> * I will be using the default ImageNet weights as they have already learnt to effectively extract hierarchical features and other abstractions from images. Furthermore, ImageNet weights are typically optimised for detecting spatial patterns in images, which would be helpful for detecting patterns in frequency and volume over time.
> * The weights will also be frozen to retain valuable pre-learnt features, prevent overfitting, reduce computational cost, and ensure consistency. Doing this leverages the power of pre-trained models while allowing you to fine-tune only the top layers for your specific task.


#### **Custom Convolutional Neural Network (*CNN*)**

> * By designing my own base, I'm hoping that I will be able to compete with the transfer model.
>
> The following code is the result of hours of hyperparameter tuning and layer customisation in both models.


# **Transfer Learning Model with InceptionV3 Base (*CNN*)**

In [None]:
# Shape of one input image: 288 px tall, 432 px wide, 3 colour channels (RGB).
inputShape = [288, 432, 3]

# Early-stopping callback handed to both training runs below; the monitored
# quantity is left at the Keras default.
# NOTE(review): the training cells run for 2 epochs, so a patience of 20
# cannot trigger as configured - confirm the intended epoch count.
earlyStopping = EarlyStopping(min_delta=0.001, patience=20, restore_best_weights=True)

In [None]:
from tensorflow.keras.applications import InceptionV3 # transfer learning model
# ImageNet-pretrained InceptionV3 without its classification head; with
# pooling="avg" the convolutional features are globally average-pooled.
baseModel = InceptionV3(input_shape = inputShape, weights = "imagenet", include_top = False, pooling = "avg")

# Freeze the pre-trained base (applies to all of its layers).
baseModel.trainable = False

transfer = Sequential([
    # The input pipeline scales pixels to [0, 1], but the ImageNet weights of
    # InceptionV3 expect inputs in [-1, 1] (the range produced by
    # inception_v3.preprocess_input), so map x -> 2x - 1 before the base.
    tf.keras.layers.Rescaling(scale = 2.0, offset = -1.0, input_shape = inputShape),
    baseModel,

    # NOTE(review): with pooling="avg" the base output is presumably already
    # flat, making this Flatten a no-op - kept to leave the head unchanged.
    Flatten(),
    BatchNormalization(),
    Dense(512, activation = "relu"),
    Dropout(0.3),
    Dense(256, activation = "relu"),
    Dropout(0.3), # dropout layer to prevent overfitting
    Dense(128, activation = "relu"),
    Dropout(0.3),
    Dense(len(genres), activation = "softmax")  # one probability per genre
])

optimiser = tf.keras.optimizers.SGD(learning_rate = 0.0001)

# "sparse_categorical_crossentropy" because labels are integers
transfer.compile(optimiser, loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])
transfer.summary()

In [None]:
setRandom()
from livelossplot import PlotLossesKeras
# `training` and `validation` are already-batched tf.data datasets, so no
# `batch_size` is passed to fit(); the batch size is fixed when they are built.
transferHistory = transfer.fit(
    training,
    validation_data = validation,
    epochs = 2,
    verbose = 1,
    callbacks = [earlyStopping, PlotLossesKeras()],
)

# **Custom Convolutional Neural Network (*CNN*)**


In [None]:
# Custom CNN: five Conv2D + MaxPool2D stages with doubling filter counts,
# followed by a dense classification head.
cnn = Sequential([
    BatchNormalization(input_shape = inputShape),

    Conv2D(32, (3, 3), activation = "relu"),
    MaxPool2D((2, 2)),

    Conv2D(64, (3, 3), activation = "relu"),
    MaxPool2D((2, 2)),

    Conv2D(128, (3, 3), activation = "relu"),
    MaxPool2D((2, 2)),

    Conv2D(256, (3, 3), activation = "relu"),
    MaxPool2D((2, 2)),

    Conv2D(512, (3, 3), activation = "relu"),
    MaxPool2D((2, 2)),

    Flatten(),
    Dense(1024, activation = "relu"),
    Dropout(0.5),
    Dense(512, activation = "relu"),
    Dropout(0.5),
    BatchNormalization(),
    Dense(len(genres), activation = "softmax")  # one probability per genre
])

# Build the optimiser directly at the learning rate that is actually used
# (0.01), instead of creating it at 0.001 and immediately overwriting the value.
optimiser = tf.keras.optimizers.SGD(learning_rate = 0.01)

cnn.compile(optimiser, loss = "sparse_categorical_crossentropy", metrics = ["accuracy"]) # "sparse_categorical_crossentropy" because labels are integers
cnn.summary()

In [None]:
setRandom()
# `training` and `validation` are already-batched tf.data datasets, so no
# `batch_size` is passed to fit(). The history is kept, mirroring
# `transferHistory`, so both training runs can be inspected afterwards.
cnnHistory = cnn.fit(
    training,
    validation_data = validation,
    epochs = 2,
    verbose = 1,
    callbacks = [earlyStopping, PlotLossesKeras()],
)

<a id="Test-Data-Predictions"></a>
# __Test Data Predictions__
Below is the code to visually represent the accuracy of both models through confusion matrices.

In [None]:
def confusionMatrix(model, name):
    """Plot a test-set confusion matrix for ``model`` and print its metrics.

    Args:
        model: A compiled Keras model whose ``evaluate`` returns
            ``[loss, accuracy]``.
        name: Label used in the plot title and the printed summary.
    """
    # Collect labels and predictions in a single pass so they come from the
    # same batches and stay aligned even if the dataset yields its elements
    # in a different order on each iteration.
    trueBatches, predictedBatches = [], []
    for images, labels in testing:
        trueBatches.append(labels.numpy())
        predictedBatches.append(np.argmax(model.predict(images, verbose = 0), axis = 1))
    trueLabels = np.concatenate(trueBatches, axis = 0)
    predictedLabels = np.concatenate(predictedBatches, axis = 0)

    # Pin the matrix to one row/column per genre so it always matches the
    # tick labels, even when a genre is absent from both labels and predictions.
    # Class ids follow the order of `genres` (the same order as genreMap).
    classIds = list(range(len(genres)))
    matrix = confusion_matrix(trueLabels, predictedLabels, labels = classIds)

    plt.figure() # plot the confusion matrix using seaborn for the heatmap
    sns.heatmap(matrix, annot = True, cmap = "Greens", xticklabels = genres, yticklabels = genres)
    plt.xlabel("Predicted Genre")
    plt.ylabel("True Genre")
    plt.title(f"{name} Model: Confusion Matrix")
    plt.show()

    # evaluate() returns [loss, accuracy] for each dataset
    trainStats = model.evaluate(training, verbose = 0)
    valStats = model.evaluate(validation, verbose = 0)
    testStats = model.evaluate(testing, verbose = 0)
    print(f"\033[1m{name} Model\033[0m")
    print(f"Training Accuracy: {round(trainStats[1] * 100, 4)}% \nTrain Loss: {round(trainStats[0], 4)}\n")
    # This line previously printed the validation loss under the label "Test Loss".
    print(f"Validation Accuracy: {round(valStats[1] * 100, 4)}% \nValidation Loss: {round(valStats[0], 4)}\n")
    print(f"Testing Accuracy: {round(testStats[1] * 100, 4)}% \nTest Loss: {round(testStats[0], 4)}")

# Report on both trained models, as the section text describes.
confusionMatrix(transfer, "Transfer")
confusionMatrix(cnn, "Custom CNN")