<a href="https://colab.research.google.com/github/Koks-creator/MusicGeneresClassification/blob/main/CNNAudio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow



In [None]:
from google.colab import files
from typing import Tuple
from zipfile import ZipFile
import os
import json
import math
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
from tensorflow.keras import layers


# Upload a file through the Colab helper; the shell step below expects it
# to be named kaggle.json and to sit in the current working directory.
uploaded = files.upload()

# Move the credentials into ~/.kaggle/ and restrict them to owner read/write (mode 600).
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
# Download the GTZAN music-genre dataset archive with the Kaggle CLI.
!kaggle datasets download -d andradaolteanu/gtzan-dataset-music-genre-classification

Downloading gtzan-dataset-music-genre-classification.zip to /content
100% 1.21G/1.21G [00:14<00:00, 198MB/s]
100% 1.21G/1.21G [00:14<00:00, 89.0MB/s]


In [None]:
FILE_NAME = "/content/gtzan-dataset-music-genre-classification.zip"  # downloaded dataset archive to extract
DATASET_PATH = "/content/Data/genres_original"  # directory tree walked by save_mfcc
OUTPUT_PATH = "/content/data.json"  # JSON file the extracted MFCCs and labels are written to
DURATION = 30  # length of music file in seconds
SAMPLE_RATE = 22050  # sampling rate passed to librosa.load
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION  # number of samples in one full-length track

In [None]:
# Unpack the dataset archive into the current working directory.
# The handle is called `archive` (not `zip`) so the built-in zip() stays usable
# in every later cell of the notebook.
with ZipFile(FILE_NAME, 'r') as archive:
  archive.extractall()
  print("Done")

Done


In [None]:
def save_mfcc(dataset_path, json_path, samples_per_track: int, num_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extracts MFCCs from music dataset and saves them into a json file along with genre labels.

        Every directory found below ``dataset_path`` is treated as one genre: its name is
        appended to ``mapping`` and its position in ``mapping`` is the integer label stored
        for each of its segments. Directories and files are visited in sorted order so the
        label ids are the same on every run. Files that fail to load are reported and skipped.

        :param dataset_path (str or os.PathLike): Path to dataset
        :param json_path (str): Path to json file used to save MFCCs
        :param samples_per_track (int): Number of audio samples expected in one full track
        :param num_mfcc (int): Number of coefficients to extract
        :param n_fft (int): Interval we consider to apply FFT. Measured in # of samples
        :param hop_length (int): Sliding window for FFT. Measured in # of samples
        :param num_segments (int): Number of segments we want to divide sample tracks into
        :raises ValueError: If ``num_segments`` is smaller than 1
        :return: None
        """

    if num_segments < 1:
        raise ValueError("num_segments must be at least 1, got {}".format(num_segments))

    data = {
        "mapping": [],
        "labels": [],
        "mfcc": []
    }

    samples_per_segment = int(samples_per_track / num_segments)
    # Only segments yielding exactly this many MFCC frames are kept (drops short tracks' tails)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    # Normalise to str so the root can be recognised by value, whatever path type was passed
    root = os.fspath(dataset_path)

    for dirpath, dirnames, filenames in os.walk(root):
        # In-place sort steers os.walk's top-down traversal into a deterministic order
        dirnames.sort()

        # The root itself only contains the genre folders; it is not a genre
        if dirpath == root:
            continue

        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        # Label is the position in the mapping, so the two can never drift apart
        label_index = len(data["mapping"]) - 1
        print("\nProcessing: {}".format(semantic_label))

        for f in sorted(filenames):
            file_path = os.path.join(dirpath, f)
            try:
                signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

                for d in range(num_segments):
                    start = samples_per_segment * d
                    finish = start + samples_per_segment
                    mfcc = librosa.feature.mfcc(y=signal[start:finish],
                                                sr=sample_rate,
                                                n_mfcc=num_mfcc,
                                                n_fft=n_fft, hop_length=hop_length)
                    # Transpose so rows are time frames and columns are coefficients
                    mfcc = mfcc.T

                    if len(mfcc) == num_mfcc_vectors_per_segment:
                        data["mfcc"].append(mfcc.tolist())
                        data["labels"].append(label_index)
                        print("{}, segment:{}".format(file_path, d+1))
            except Exception as e:
                # Best effort: report the unreadable file and carry on with the rest
                print(e)
                print(f)

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [None]:
# Extract the features for the whole dataset, cutting every track into 10 segments.
save_mfcc(
    dataset_path=DATASET_PATH,
    json_path=OUTPUT_PATH,
    samples_per_track=SAMPLES_PER_TRACK,
    num_segments=10,
)

[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
/content/Data/genres_original/country/country.00089.wav, segment:8
/content/Data/genres_original/country/country.00089.wav, segment:9
/content/Data/genres_original/country/country.00089.wav, segment:10
/content/Data/genres_original/country/country.00067.wav, segment:1
/content/Data/genres_original/country/country.00067.wav, segment:2
/content/Data/genres_original/country/country.00067.wav, segment:3
/content/Data/genres_original/country/country.00067.wav, segment:4
/content/Data/genres_original/country/country.00067.wav, segment:5
/content/Data/genres_original/country/country.00067.wav, segment:6
/content/Data/genres_original/country/country.00067.wav, segment:7
/content/Data/genres_original/country/country.00067.wav, segment:8
/content/Data/genres_original/country/country.00067.wav, segment:9
/content/Data/genres_original/country/country.00067.wav, segment:10
/content/Data/genres_original/country/country.00

  signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)



jazz.00054.wav
/content/Data/genres_original/jazz/jazz.00089.wav, segment:1
/content/Data/genres_original/jazz/jazz.00089.wav, segment:2
/content/Data/genres_original/jazz/jazz.00089.wav, segment:3
/content/Data/genres_original/jazz/jazz.00089.wav, segment:4
/content/Data/genres_original/jazz/jazz.00089.wav, segment:5
/content/Data/genres_original/jazz/jazz.00089.wav, segment:6
/content/Data/genres_original/jazz/jazz.00089.wav, segment:7
/content/Data/genres_original/jazz/jazz.00089.wav, segment:8
/content/Data/genres_original/jazz/jazz.00089.wav, segment:9
/content/Data/genres_original/jazz/jazz.00089.wav, segment:10
/content/Data/genres_original/jazz/jazz.00091.wav, segment:1
/content/Data/genres_original/jazz/jazz.00091.wav, segment:2
/content/Data/genres_original/jazz/jazz.00091.wav, segment:3
/content/Data/genres_original/jazz/jazz.00091.wav, segment:4
/content/Data/genres_original/jazz/jazz.00091.wav, segment:5
/content/Data/genres_original/jazz/jazz.00091.wav, segment:6
/conten

In [None]:
def load_data(dataset_path: str) -> Tuple[np.ndarray, np.ndarray, list]:
  """Load features, labels and the label mapping from a JSON dataset file.

  The file must hold an object with the keys ``"mfcc"``, ``"labels"`` and ``"mapping"``.

  :param dataset_path (str): Path to the JSON file
  :return: Tuple ``(X, y, mapping)`` where ``X`` is ``data["mfcc"]`` as an array,
           ``y`` is ``data["labels"]`` as an array and ``mapping`` is returned as stored
  """
  with open(dataset_path, encoding="utf-8") as f:
    data = json.load(f)

  X = np.array(data["mfcc"])
  y = np.array(data["labels"])
  mapping = data["mapping"]

  return X, y, mapping

In [None]:
def prepare_dataset(test_size: float, validation_size: float, X: np.ndarray, y: np.ndarray,
                    random_state=None, stratify: bool = False) -> Tuple[np.ndarray, np.ndarray, np.ndarray,
                                                                        np.ndarray, np.ndarray, np.ndarray]:
  """Split the data into train, test and validation sets.

  The data is split twice: first ``test_size`` of all samples is held out, then
  ``validation_size`` of that hold-out becomes the validation set and the rest the test set.

  :param test_size (float): Fraction of all samples held out from training
  :param validation_size (float): Fraction of the held-out samples used for validation
  :param X (np.ndarray): Features
  :param y (np.ndarray): Labels, aligned with ``X``
  :param random_state (int or None): Seed forwarded to both splits; None keeps them random
  :param stratify (bool): When True, both splits preserve the class proportions of ``y``
  :return: ``X_train, X_test, X_validation, y_train, y_test, y_validation``
  """
  X_train, X_holdout, y_train, y_holdout = train_test_split(
      X, y,
      test_size=test_size,
      random_state=random_state,
      stratify=y if stratify else None,
  )
  # Second split carves the validation set out of the hold-out, not out of the full data
  X_test, X_validation, y_test, y_validation = train_test_split(
      X_holdout, y_holdout,
      test_size=validation_size,
      random_state=random_state,
      stratify=y_holdout if stratify else None,
  )

  return X_train, X_test, X_validation, y_train, y_test, y_validation

In [None]:
# Read the saved features back, then split them: 25% of the samples are held out,
# and 20% of that hold-out is used as the validation set.
X, y, mapping = load_data(dataset_path=OUTPUT_PATH)

X_train, X_test, X_validation, y_train, y_test, y_validation = prepare_dataset(0.25, 0.2, X, y)

In [None]:
X_train[0].shape  # one sample: (MFCC frames in a segment, MFCC coefficients per frame)

(130, 13)

In [None]:
# Per-sample dimensions, read from axes 1 and 2 of the training array.
X_train.shape[1], X_train.shape[2]

(130, 13)

In [None]:
# Conv2D needs a trailing channel axis, i.e. (samples, frames, coefficients, 1).
# The axis is only added to 3-D arrays, so re-running this cell is harmless
# instead of stacking one more axis on every execution.
X_train, X_test, X_validation = (
    split[..., np.newaxis] if split.ndim == 3 else split
    for split in (X_train, X_test, X_validation)
)

In [None]:
# Shape of the training array after the trailing channel axis was added.
X_train.shape

(7489, 130, 13, 1)

In [None]:
# CNN classifier: three Conv -> MaxPool -> BatchNorm stages followed by a
# regularised dense layer and a softmax over the genres.
num_classes = len(mapping)  # one output unit per entry of the label mapping

model = keras.Sequential([
    # Input is a single sample without the batch axis: (frames, coefficients, channels)
    keras.Input(shape=X_train.shape[1:]),

    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'),
    layers.BatchNormalization(),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'),
    layers.BatchNormalization(),

    layers.Conv2D(128, (2, 2), activation='relu'),
    layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'),
    layers.BatchNormalization(),

    layers.Flatten(),
    layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    layers.BatchNormalization(),
    layers.Dropout(0.4),

    layers.Dense(num_classes, activation='softmax'),
])

In [None]:
# Integer class labels are used, hence the sparse variant of categorical cross-entropy.
optimiser = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimiser,
    metrics=["accuracy"],
)

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_47 (Conv2D)          (None, 128, 11, 32)       320       
                                                                 
 max_pooling2d_38 (MaxPoolin  (None, 64, 6, 32)        0         
 g2D)                                                            
                                                                 
 batch_normalization_43 (Bat  (None, 64, 6, 32)        128       
 chNormalization)                                                
                                                                 
 flatten_6 (Flatten)         (None, 12288)             0         
                                                                 
 dense_11 (Dense)            (None, 128)               1572992   
                                                                 
 batch_normalization_44 (Bat  (None, 128)            

In [None]:
# Train for 30 epochs in mini-batches of 32, evaluating on the validation split along the way.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=30,
    batch_size=32,
    validation_data=(X_validation, y_validation),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Score the trained model on the held-out test split.
test_loss, test_acc = model.evaluate(x=X_test, y=y_test, verbose=2)

63/63 - 2s - loss: 1.1220 - accuracy: 0.7181 - 2s/epoch - 29ms/step


In [None]:
def test_model_prediction(model: keras.models.Sequential, X: np.array, expected_y: np.int64) -> bool:
  prediction = model.predict(np.array([X]))
  label = np.argmax(prediction)

  if expected_y == label:
    return True

  print(f"Expected label {expected_y}, predicted label: {label}")
  return False

In [None]:
# Spot-check the first test samples one by one.
# Dedicated loop names are used so the global dataset arrays `X` and `y` are not
# overwritten by a single sample, and the count is capped at the test-set size.
num_checks = min(100, len(X_test))
for sample_idx in range(num_checks):
  print(test_model_prediction(model, X_test[sample_idx], y_test[sample_idx]))

True
True
True
True
True
True
True
True
True
Expected label 7, predicted label: 0
False
True
Expected label 4, predicted label: 7
False
Expected label 4, predicted label: 9
False
True
Expected label 5, predicted label: 6
False
True
True
True
Expected label 5, predicted label: 0
False
Expected label 3, predicted label: 9
False
Expected label 7, predicted label: 0
False
Expected label 6, predicted label: 2
False
Expected label 0, predicted label: 3
False
True
Expected label 0, predicted label: 1
False
Expected label 3, predicted label: 8
False
Expected label 1, predicted label: 2
False
True
True
True
True
True
True
True
True
True
Expected label 3, predicted label: 6
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
Expected label 2, predicted label: 0
False
Expected label 6, predicted label: 8
False
True
Expected label 2, predicted label: 0
False
True
Expected label 7, predicted label: 8
False
True
True
Expe

In [None]:
# Genre names as stored by save_mfcc; a genre's position in this list is its integer label.
mapping

['country',
 'blues',
 'metal',
 'disco',
 'jazz',
 'reggae',
 'hiphop',
 'rock',
 'pop',
 'classical']

In [None]:
# Persist the trained model to disk.
# NOTE(review): the .h5 suffix presumably selects Keras' HDF5 format — confirm this is the intended format.
model.save("CNN_Audio.h5")