In [None]:
import resampy
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
)

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Activation

# FMA keeps its labels apart from the audio: the genres are stored in
# tracks.csv, which is read with a two-row column header and its first
# column as the index.
TRACKS_CSV = r"C:\Users\Jacky\Documents\fma_metadata\tracks.csv"
tracks = pd.read_csv(TRACKS_CSV, header=[0, 1], index_col=0)

# top-level genre of each track (looked up by track id further down)
genres = tracks["track"]["genre_top"]

# root folder that is searched recursively for .mp3 files
DATASET_PATH = r"C:\Users\Jacky\Documents\fma_small\fma_small"

# mel-spectrogram extraction parameters
SR = 22050                # sampling rate (Hz) every file is resampled to
N_FFT = 2048              # FFT window size in samples
HOP_LENGTH = N_FFT // 4   # quarter-window hop, i.e. 512 samples
N_MELS = 128              # number of mel bands
DURATION = 30             # seconds of audio loaded from each file
FIXED_FRAMES = 1290       # time frames each spectrogram is padded/cropped to


# feature extraction function
def extract_mel_spectrogram(file_path):
    """Load an audio file and return a fixed-width log-mel spectrogram.

    The first DURATION seconds are loaded at SR Hz, converted to a mel
    power spectrogram (N_FFT, HOP_LENGTH, N_MELS) and then to decibels
    relative to the clip's own peak, so 0 dB is the loudest bin and every
    other value is negative.

    Args:
        file_path: path of the audio file to decode.

    Returns:
        A 2-D array with exactly FIXED_FRAMES columns (time frames), or
        None when loading or feature extraction raises; the failure is
        printed and the caller is expected to skip the file.
    """
    try:
        audio, sr = librosa.load(
            file_path,
            sr=SR,
            duration=DURATION,
            res_type="kaiser_fast",
        )

        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=sr,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
            n_mels=N_MELS,
        )

        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # pad or crop along the time axis to a fixed length
        n_frames = mel_spec_db.shape[1]
        if n_frames < FIXED_FRAMES:
            # On this dB scale 0 is the *loudest* value, so zero padding would
            # append frames that look like full-scale energy. Pad with the
            # quietest value in the clip so the padding reads as silence.
            floor_db = mel_spec_db.min() if mel_spec_db.size else 0.0
            mel_spec_db = np.pad(
                mel_spec_db,
                pad_width=((0, 0), (0, FIXED_FRAMES - n_frames)),
                mode='constant',
                constant_values=floor_db,
            )
        else:
            mel_spec_db = mel_spec_db[:, :FIXED_FRAMES]

        return mel_spec_db

    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole dataset build. repr() keeps the exception type visible even
        # when the exception carries no message.
        print(f"Skipping file {file_path} : {e!r}")
        return None

# FMA dataset builder
#
# Decoding a track is the expensive step in both time and memory, so the
# sample cap is applied to the list of candidate files *before* decoding
# rather than after every spectrogram is already held in RAM.

# limiting number of samples based on my RAM
MAX_SAMPLES = 2000

# 1) collect every labelled .mp3 without decoding anything
candidate_paths = []
candidate_genres = []

for root, dirs, files in os.walk(DATASET_PATH):
    dirs.sort()  # fixed traversal order so the seeded shuffle is reproducible
    for file in sorted(files):
        if not file.endswith(".mp3"):
            continue

        file_path = os.path.join(root, file)
        try:
            track_id = int(os.path.splitext(file)[0])
        except ValueError:
            # file name is not a numeric track id, so it cannot be looked up
            print(f"Skipping file {file_path} : file name is not a track id")
            continue

        genre = genres.get(track_id)
        if pd.isna(genre):
            # track is missing from the metadata or has no top-level genre
            continue

        candidate_paths.append(file_path)
        candidate_genres.append(genre)

# 2) random order, same seed as before
if candidate_paths:
    candidate_paths, candidate_genres = shuffle(
        candidate_paths, candidate_genres, random_state=42
    )

# 3) decode until MAX_SAMPLES usable spectrograms have been collected;
#    files that fail to decode are skipped and do not count towards the cap
X = []
y = []

for file_path, genre in zip(candidate_paths, candidate_genres):
    if len(X) >= MAX_SAMPLES:
        break

    mel_spec = extract_mel_spectrogram(file_path)
    if mel_spec is not None:
        X.append(mel_spec)
        y.append(genre)

# convert X to numpy
X = np.array(X, dtype=np.float32)

# encode labels (genre strings -> integer class ids)
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Number of classes comes from the encoder, not from whichever ids happen
# to land in a particular split.
num_classes = len(encoder.classes_)

# 80/10/10 split: hold out 20 %, then halve the hold-out into val and test
X_train, X_temp, y_train_enc, y_temp_enc = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

X_val, X_test, y_val_enc, y_test_enc = train_test_split(
    X_temp, y_temp_enc, test_size=0.5, random_state=42, stratify=y_temp_enc
)

# One-hot encode with an explicit width. Without num_classes the width is
# inferred from the largest id present, so a split lacking the highest class
# would produce labels that do not match the model's output layer.
y_train = to_categorical(y_train_enc, num_classes=num_classes)
y_val = to_categorical(y_val_enc, num_classes=num_classes)
y_test = to_categorical(y_test_enc, num_classes=num_classes)

# compute mean and standard deviation from the training set only, then apply
# the same statistics to all three splits
mu = np.mean(X_train)
sigma = np.std(X_train)

X_train = (X_train - mu) / sigma
X_val = (X_val - mu) / sigma
X_test = (X_test - mu) / sigma

# reshape for CNN input: add a trailing channel axis
X_train = X_train[..., np.newaxis]
X_val = X_val[..., np.newaxis]
X_test = X_test[..., np.newaxis]

input_shape = X_train.shape[1:]

# set constant learning rate
LR = 0.001

# CNN architecture (base model)
def build_base_cnn():
    """Build the unregularized reference CNN.

    Two Conv2D + MaxPooling2D stages (32 then 64 filters, 3x3 kernels,
    ReLU) followed by a 64-unit dense layer and a softmax output.
    Reads the module-level ``input_shape`` and ``num_classes``.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input with an explicit Input layer instead of passing
        # input_shape to the first Conv2D, which recent Keras versions warn
        # against for Sequential models.
        tf.keras.Input(shape=input_shape),

        Conv2D(32, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),

        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),

        Flatten(),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])

    return model

# model variants

# 1) no regularization -- the baseline the other variants are compared against
baseline_model = build_base_cnn()
baseline_model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=LR),
    metrics=["accuracy"],
)

# Dropout model
def build_dropout_cnn():
    """Build the base CNN with dropout before the output layer.

    Same convolutional stack as the base model; a ``Dropout(0.5)`` layer
    sits between the 64-unit dense layer and the softmax output.
    Reads the module-level ``input_shape`` and ``num_classes``.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential([
        # Explicit Input layer instead of input_shape on the first Conv2D,
        # which recent Keras versions warn against for Sequential models.
        tf.keras.Input(shape=input_shape),

        Conv2D(32, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),

        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),

        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])
    return model

# 2) dropout variant, compiled with the same optimizer settings and loss
dropout_model = build_dropout_cnn()
dropout_model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=LR),
    metrics=["accuracy"],
)

# batch normalization model
def build_batchnorm_cnn():
    """Build the base CNN with batch normalization after each convolution.

    Each stage is Conv2D -> BatchNormalization -> ReLU -> MaxPooling2D.
    The convolutions are created with ``use_bias=False`` and without a
    fused activation, so normalization is applied to the raw convolution
    output. Reads the module-level ``input_shape`` and ``num_classes``.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential([
        # Explicit Input layer instead of input_shape on the first Conv2D,
        # which recent Keras versions warn against for Sequential models.
        tf.keras.Input(shape=input_shape),

        Conv2D(32, (3, 3), use_bias=False),
        BatchNormalization(),
        Activation('relu'),
        MaxPooling2D((2, 2)),

        Conv2D(64, (3, 3), use_bias=False),
        BatchNormalization(),
        Activation('relu'),
        MaxPooling2D((2, 2)),

        Flatten(),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    return model

# 3) batch-normalization variant, compiled with the same optimizer settings and loss
batchnorm_model = build_batchnorm_cnn()
batchnorm_model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=LR),
    metrics=["accuracy"],
)


# training (no early stopping)
EPOCHS = 20
BATCH_SIZE = 32

# all three models are trained on the same data with the same validation
# set, epoch count and batch size
fit_options = {
    "validation_data": (X_val, y_val),
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
}

history_baseline = baseline_model.fit(X_train, y_train, **fit_options)
history_dropout = dropout_model.fit(X_train, y_train, **fit_options)
history_batchnorm = batchnorm_model.fit(X_train, y_train, **fit_options)

# evaluation on the held-out test split; element 1 of evaluate()'s result is
# kept as the test accuracy
baseline_test_acc, dropout_test_acc, batchnorm_test_acc = (
    trained.evaluate(X_test, y_test, verbose=0)[1]
    for trained in (baseline_model, dropout_model, batchnorm_model)
)

print("Baseline Test Accuracy:", baseline_test_acc)
print("Dropout Test Accuracy:", dropout_test_acc)
print("BatchNorm Test Accuracy:", batchnorm_test_acc)

# analyzing overfitting issue

def plot_history(history, title):
    """Plot training against validation accuracy per epoch and show the figure.

    Args:
        history: object whose ``history`` dict holds the 'accuracy' and
            'val_accuracy' series (as returned by ``model.fit``).
        title: text used as the plot title.
    """
    axes = plt.gca()
    curves = (('accuracy', 'Train'), ('val_accuracy', 'Validation'))
    for metric_key, curve_label in curves:
        axes.plot(history.history[metric_key], label=curve_label)
    axes.set_title(title)
    axes.set_xlabel('Epoch')
    axes.set_ylabel('Accuracy')
    axes.legend()
    plt.show()

# one accuracy plot per trained variant, in training order
for run_history, plot_title in (
    (history_baseline, "Baseline Model"),
    (history_dropout, "Dropout Model"),
    (history_batchnorm, "Batch Normalization Model"),
):
    plot_history(run_history, plot_title)


  audio, sr = librosa.load(
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Skipping file C:\Users\Jacky\Documents\fma_small\fma_small\098\098565.mp3 : 
Skipping file C:\Users\Jacky\Documents\fma_small\fma_small\098\098567.mp3 : 
Skipping file C:\Users\Jacky\Documents\fma_small\fma_small\098\098569.mp3 : 
Skipping file C:\Users\Jacky\Documents\fma_small\fma_small\099\099134.mp3 : 
Skipping file C:\Users\Jacky\Documents\fma_small\fma_small\108\108925.mp3 : 
Skipping file C:\Users\Jacky\Documents\fma_small\fma_small\133\133297.mp3 : 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 2s/step - accuracy: 0.1325 - loss: 5.1062 - val_accuracy: 0.1400 - val_loss: 2.0785
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 2s/step - accuracy: 0.2419 - loss: 1.9902 - val_accuracy: 0.2700 - val_loss: 1.9591
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 2s/step - accuracy: 0.3938 - loss: 1.6963 - val_accuracy: 0.3150 - val_loss: 1.8092
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 2s/step - accuracy: 0.6175 - loss: 1.1675 - val_accuracy: 0.3100 - val_loss: 1.9305
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 2s/step - accuracy: 0.8450 - loss: 0.5647 - val_accuracy: 0.3400 - val_loss: 2.2353
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 2s/step - accuracy: 0.9606 - loss: 0.1983 - val_accuracy: 0.3400 - val_loss: 2.3782
Epoch 7/20
[1m50/50[0m [32m━━━━━━━━━━