In [None]:
%tensorflow_version 1.x
!pip install pydub

TensorFlow 1.x selected.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import tensorflow as tf
import numpy as np
import scipy
from scipy import misc
import glob
from PIL import Image
import os
import matplotlib.pyplot as plt
import librosa
from keras import layers
from keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D
from keras.models import Model, load_model
from keras.preprocessing import image
from keras.utils import layer_utils
from keras.utils.data_utils import get_file
from keras.applications.imagenet_utils import preprocess_input
import pydot
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
from keras.optimizers import Adam
from keras.initializers import glorot_uniform
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from pydub import AudioSegment
import shutil
from keras.preprocessing.image import ImageDataGenerator
import random
print(tf.__version__)

1.15.2


Using TensorFlow backend.


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import numpy as np
import librosa, librosa.display
import matplotlib.pyplot as plt

In [None]:
import json
import os
import math

In [None]:
# Mount Google Drive (Colab only): the dataset folder and the audio files used
# for inference further down are read from paths under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Root folder of the dataset; save_mfcc treats every sub-folder as one genre.
DATASET_PATH = "/content/gdrive/MyDrive/GTZAN1/Data/genres_original"
# Output file of save_mfcc. Relative path, so it is written to the current working directory.
JSON_PATH = "data_11.json"
# Passed as `sr` to librosa.load for every track.
SAMPLE_RATE = 22050
TRACK_DURATION = 30 # measured in seconds
# Nominal number of samples per track; used to derive the segment length.
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION

In [None]:
def save_mfcc(dataset_path, json_path, num_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extract MFCCs from a music dataset and save them to a JSON file with genre labels.

    Every directory found below ``dataset_path`` is treated as one genre. Each
    file in it is loaded with ``librosa.load`` at ``SAMPLE_RATE`` and cut into
    ``num_segments`` consecutive segments of ``SAMPLES_PER_TRACK / num_segments``
    samples; one MFCC matrix is stored per segment.

    :param dataset_path: Root folder of the dataset (str or path-like).
    :param json_path: Path of the JSON file to write.
    :param num_mfcc: Number of coefficients, passed to librosa as ``n_mfcc``.
    :param n_fft: Passed to ``librosa.feature.mfcc`` as ``n_fft``.
    :param hop_length: Passed to ``librosa.feature.mfcc`` as ``hop_length``.
    :param num_segments: Number of segments each track is divided into (>= 1).
    :raises ValueError: If ``num_segments`` is smaller than 1.

    The JSON object has three keys: ``"mapping"`` (genre names, list index ==
    label), ``"labels"`` (one integer label per stored segment) and ``"mfcc"``
    (one ``frames x num_mfcc`` nested list per stored segment).
    """
    if num_segments < 1:
        raise ValueError("num_segments must be >= 1, got {}".format(num_segments))

    # dictionary to store mapping, labels, and MFCCs
    data = {
        "mapping": [],
        "labels": [],
        "mfcc": []
    }

    samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    # Compare paths by value. os.walk yields the root itself first, and an
    # identity check ("is not") only skips it by accident for plain str input.
    root = os.path.normpath(os.fspath(dataset_path))

    # loop through all genre sub-folders
    for dirpath, _dirnames, filenames in os.walk(dataset_path):

        # skip the dataset root: only sub-folders are genres
        if os.path.normpath(dirpath) == root:
            continue

        # save genre label (i.e., sub-folder name) in the mapping
        semantic_label = os.path.basename(os.path.normpath(dirpath))
        data["mapping"].append(semantic_label)
        # the label is the genre's index in the mapping, so both always agree
        label = len(data["mapping"]) - 1
        print("\nProcessing: {}".format(semantic_label))

        # process all audio files in genre sub-dir
        for f in filenames:

            # load audio file
            file_path = os.path.join(dirpath, f)
            signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

            # process all segments of audio file
            for d in range(num_segments):

                # calculate start and finish sample for current segment
                start = samples_per_segment * d
                finish = start + samples_per_segment

                # extract mfcc (keyword arguments: newer librosa releases
                # no longer accept y/sr positionally)
                mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
                mfcc = mfcc.T

                # store only mfcc feature with expected number of vectors, so
                # every stored segment has the same shape
                if len(mfcc) == num_mfcc_vectors_per_segment:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label)
                    print("{}, segment:{}".format(file_path, d+1))

    # save MFCCs to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [None]:
# Build the dataset file: 10 segments per track (overrides the default of 5).
# This walks and decodes every audio file, so it is the slow step of the notebook.
save_mfcc(DATASET_PATH, JSON_PATH, num_segments=10)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00001.wav, segment:6
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00001.wav, segment:7
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00001.wav, segment:8
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00001.wav, segment:9
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00001.wav, segment:10
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00008.wav, segment:1
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00008.wav, segment:2
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00008.wav, segment:3
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00008.wav, segment:4
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00008.wav, segment:5
/content/gdrive/MyDrive/GTZAN1/Data/genres_original/rock/rock.00008.wav, segment:6
/content/gdrive/MyDri

In [None]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

import matplotlib.pyplot as plt
import random

import librosa
import math

In [None]:
# path to json
# NOTE(review): absolute, while JSON_PATH used by save_mfcc is relative; both point
# to the same file only if the working directory is /content — confirm.
DATA_PATH = "/content/data_11.json"

In [None]:
def load_data(data_path):
    """Read the MFCC dataset from a JSON file and return it as NumPy arrays.

    :param data_path: Path to a JSON file holding "mfcc" and "labels" lists.
    :return: Tuple ``(X, y)``: the "mfcc" entries and the "labels" entries,
        each converted with ``np.array``.
    """
    with open(data_path, "r") as json_file:
        dataset = json.load(json_file)

    # nested lists -> ndarrays, inputs first and targets second
    features, targets = (np.array(dataset[key]) for key in ("mfcc", "labels"))

    print("Data succesfully loaded!")

    return features, targets

In [None]:
# load data: X holds one MFCC matrix per stored segment, y the matching integer genre labels
X, y = load_data(DATA_PATH)

Data succesfully loaded!


In [None]:
# create train/test split (seeded so the split, and therefore the reported
# numbers, are reproducible across runs)
# NOTE(review): samples are segments and carry no track id, so segments of the
# same track can end up in both train and test, which can inflate test accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# build network topology: flatten -> three ReLU dense layers -> 10-way softmax
model = tf.keras.Sequential(
    # input layer: flattens each (X.shape[1], X.shape[2]) sample into a vector
    [tf.keras.layers.Flatten(input_shape=(X.shape[1], X.shape[2]))]
    # 1st, 2nd and 3rd dense layers, widest first
    + [tf.keras.layers.Dense(units, activation='relu') for units in (512, 256, 64)]
    # output layer
    + [tf.keras.layers.Dense(10, activation='softmax')]
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [None]:
# compile model: Adam optimiser, cross-entropy on integer labels, accuracy metric
optimiser = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimiser,
    metrics=['accuracy'],
)

In [None]:
# train model
# NOTE: the test split doubles as validation data here, so the validation
# metrics are not an independent estimate. `history` is overwritten by the
# later fit cells.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=10)

Train on 5597 samples, validate on 2399 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
def plot_history(history):
    """Plot accuracy and loss curves for training and validation data.

    :param history: Object returned by ``model.fit``; its ``history`` attribute
        must be a dict with "loss", "val_loss" and the accuracy metric under
        either "accuracy"/"val_accuracy" or "acc"/"val_acc".
    :raises KeyError: If none of the expected metric keys are present.
    """
    metrics = history.history

    # The accuracy metric is reported as "acc" by the TF 1.x runtime used in
    # this notebook (see the evaluate output) and as "accuracy" by newer Keras.
    acc_key = "accuracy" if "accuracy" in metrics else "acc"

    fig, axs = plt.subplots(2)

    # create accuracy subplot
    axs[0].plot(metrics[acc_key], label="train accuracy")
    axs[0].plot(metrics["val_" + acc_key], label="test accuracy")
    axs[0].set_ylabel("Accuracy")
    axs[0].legend(loc="lower right")
    axs[0].set_title("Accuracy eval")

    # create error subplot
    axs[1].plot(metrics["loss"], label="train error")
    axs[1].plot(metrics["val_loss"], label="test error")
    axs[1].set_ylabel("Error")
    axs[1].set_xlabel("Epoch")
    axs[1].legend(loc="upper right")
    axs[1].set_title("Error eval")

    plt.show()

In [None]:
# build network topology: same dense stack as before, but every hidden layer
# carries an L2 weight penalty and is followed by dropout
model_regularized = tf.keras.Sequential(
    # input layer
    [tf.keras.layers.Flatten(input_shape=(X.shape[1], X.shape[2]))]
    # 1st, 2nd and 3rd dense layers, each paired with its dropout layer
    + [
        layer
        for units in (512, 256, 64)
        for layer in (
            tf.keras.layers.Dense(units, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
            tf.keras.layers.Dropout(0.3),
        )
    ]
    # output layer
    + [tf.keras.layers.Dense(10, activation='softmax')]
)

In [None]:
# compile model: Adam optimiser, cross-entropy on integer labels, accuracy metric
optimiser = tf.keras.optimizers.Adam(learning_rate=1e-4)
model_regularized.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimiser,
    metrics=['accuracy'],
)

In [None]:
# train model
# NOTE: as with the first model, the test split is used as validation data;
# this call replaces the previous contents of `history`.
history = model_regularized.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=10)

Train on 5597 samples, validate on 2399 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# create train, validation and test split (seeded so the partitions, and
# therefore the reported numbers, are reproducible across runs):
# 25% of all samples become the test set, then 20% of the remainder the validation set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# add an axis to input sets: Conv2D expects a trailing channel dimension
X_train = X_train[..., np.newaxis]
X_validation = X_validation[..., np.newaxis]
X_test = X_test[..., np.newaxis]

In [None]:
# per-sample input shape for the CNN: the two MFCC matrix dimensions plus a single channel
input_shape = (X_train.shape[1], X_train.shape[2], 1)

In [None]:
# build the CNN: three conv / max-pool / batch-norm stages, then a dense head
model_cnn = tf.keras.Sequential([
    # 1st conv layer
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
    tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'),
    tf.keras.layers.BatchNormalization(),

    # 2nd conv layer
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'),
    tf.keras.layers.BatchNormalization(),

    # 3rd conv layer (smaller 2x2 kernel and pooling window)
    tf.keras.layers.Conv2D(32, (2, 2), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'),
    tf.keras.layers.BatchNormalization(),

    # flatten output and feed it into dense layer
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),

    # output layer
    tf.keras.layers.Dense(10, activation='softmax'),
])

In [None]:
# compile model: Adam optimiser, cross-entropy on integer labels, accuracy metric
optimiser = tf.keras.optimizers.Adam(learning_rate=1e-4)
model_cnn.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimiser,
    metrics=['accuracy'],
)

In [None]:
# print the layer-by-layer output shapes and parameter counts of the CNN
model_cnn.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 128, 11, 32)       320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 64, 6, 32)         0         
_________________________________________________________________
batch_normalization (BatchNo (None, 64, 6, 32)         128       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 62, 4, 32)         9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 31, 2, 32)         0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 31, 2, 32)         128       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 30, 1, 32)        

In [None]:
# train model
# Validation uses the dedicated validation split here, so X_test stays unseen
# until the evaluate cell.
history = model_cnn.fit(X_train, y_train, validation_data=(X_validation, y_validation), batch_size=32, epochs=30)

Train on 4797 samples, validate on 1200 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# evaluate model on Test Set (the split that was not passed to fit, either for
# training or as validation data)
test_loss, test_acc = model_cnn.evaluate(X_test, y_test, verbose=2)
print('\nTest accuracy:', test_acc)

1999/1999 - 1s - loss: 0.6706 - acc: 0.7589

Test accuracy: 0.7588794


In [None]:
# persist the trained CNN under a path without extension
# NOTE(review): the on-disk format then depends on this TF version's default — confirm.
model_cnn.save("Music_Genre_11_CNN")

In [None]:
# persist the trained CNN again, this time with an explicit .h5 (HDF5) file name
model_cnn.save("Music_Genre_11_CNN.h5")

In [None]:
# take test sample #100 together with its ground-truth label
X_to_predict, y_to_predict = X_test[100], y_test[100]

In [None]:
# shape of a single sample: no batch axis yet
X_to_predict.shape

(130, 13, 1)

In [None]:
# the ground truth is the integer class label, not a genre name
print("Real Genre:", y_to_predict)

Real Genre: 7


In [None]:
# add a leading batch dimension to the single sample - model.predict() expects a 4d array in this case
X_to_predict = X_to_predict[np.newaxis, ...] # array shape (1, 130, 13, 1)

In [None]:
# confirm the leading batch axis is now present
X_to_predict.shape

(1, 130, 13, 1)

In [None]:
# perform prediction: one row of softmax scores for the single sample in the batch
prediction = model_cnn.predict(X_to_predict)

In [None]:
# get index with max value
predicted_index = np.argmax(prediction, axis=1)

# predicted_index is a length-1 array (batch of one): take its element
# explicitly, since int() on a non-scalar array is deprecated in newer NumPy
print("Predicted Genre:", int(predicted_index[0]))

Predicted Genre: 7


In [None]:
# pick a sample to predict from the test set
X_to_predict = X_test[300]
y_to_predict = y_test[300]

print("Real Genre:", y_to_predict)

# add the leading batch axis expected by model.predict
X_to_predict = X_to_predict[np.newaxis, ...]

prediction = model_cnn.predict(X_to_predict)

# get index with max value
predicted_index = np.argmax(prediction, axis=1)

# predicted_index is a length-1 array (batch of one): take its element
# explicitly, since int() on a non-scalar array is deprecated in newer NumPy
print("Predicted Genre:", int(predicted_index[0]))

Real Genre: 4
Predicted Genre: 2


In [None]:
# number of test samples; valid indices are 0 .. len(X_test) - 1
len(X_test)

1999

In [None]:
# compare the true and predicted class for 10 randomly chosen test samples
for n in range(10):

  # randrange excludes the upper bound, so i is always a valid index;
  # random.randint(0, len(X_test)) can return len(X_test) and raise IndexError
  i = random.randrange(len(X_test))
  # pick a sample to predict from the test set
  X_to_predict = X_test[i]
  y_to_predict = y_test[i]

  print("\nReal Genre:", y_to_predict)

  # add the leading batch axis expected by model.predict
  X_to_predict = X_to_predict[np.newaxis, ...]

  prediction = model_cnn.predict(X_to_predict)

  # get index with max value
  predicted_index = np.argmax(prediction, axis=1)

  # length-1 array (batch of one): index it instead of calling int() on the array
  print("Predicted Genre:", int(predicted_index[0]))


Real Genre: 0
Predicted Genre: 3

Real Genre: 4
Predicted Genre: 4

Real Genre: 2
Predicted Genre: 2

Real Genre: 6
Predicted Genre: 6

Real Genre: 5
Predicted Genre: 5

Real Genre: 0
Predicted Genre: 0

Real Genre: 6
Predicted Genre: 6

Real Genre: 3
Predicted Genre: 3

Real Genre: 7
Predicted Genre: 7

Real Genre: 2
Predicted Genre: 2


In [None]:
# Audio files pre-processing
def process_input(audio_file, track_duration, segment_index=0, num_segments=10):
  """Load an audio file and return the MFCCs of one of its segments.

  The first ``track_duration`` seconds of the file are treated as
  ``num_segments`` equal segments, and the MFCCs of segment ``segment_index``
  are returned. With the defaults this is the first segment, which is what the
  earlier version returned (its loop exited on the first iteration).

  :param audio_file: Path of the audio file, passed to ``librosa.load``.
  :param track_duration: Duration, in seconds, that the segments partition.
  :param segment_index: Zero-based index of the segment to extract.
  :param num_segments: Number of segments ``track_duration`` is divided into.
  :return: Array of shape (frames, 13): the transposed librosa MFCC matrix.
  :raises ValueError: If ``num_segments`` < 1 or ``segment_index`` is out of range.
  """
  SAMPLE_RATE = 22050
  NUM_MFCC = 13
  N_FFT = 2048
  HOP_LENGTH = 512

  if num_segments < 1:
    raise ValueError("num_segments must be >= 1, got {}".format(num_segments))
  if not 0 <= segment_index < num_segments:
    raise ValueError("segment_index must be in [0, {}), got {}".format(num_segments, segment_index))

  samples_per_track = SAMPLE_RATE * track_duration
  samples_per_segment = int(samples_per_track / num_segments)

  signal, sample_rate = librosa.load(audio_file, sr=SAMPLE_RATE)

  # calculate start and finish sample for the requested segment
  start = samples_per_segment * segment_index
  finish = start + samples_per_segment

  # extract mfcc (keyword arguments: newer librosa releases no longer accept
  # y/sr positionally); transpose so that rows are frames
  mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate, n_mfcc=NUM_MFCC, n_fft=N_FFT, hop_length=HOP_LENGTH)
  return mfcc.T

In [None]:
# Maps a predicted class index to a genre name for display.
# NOTE(review): hard-coded; it must match the order in which save_mfcc appended
# genres to the "mapping" list of the JSON file (folder traversal order) — TODO confirm.
# Only indices 0-7 are listed while the models end in Dense(10), so a predicted
# index of 8 or 9 would raise KeyError in the lookups below.
genre_dict = {0:"blues",1:"rock",2:"country",3:"metal",4:"hiphop",5:"classical",6:"pop",7:"Disco"}

In [None]:
# MFCCs of the first segment of the file (30 s split into 10 segments, so its first 3 s)
new_input_mfcc = process_input("/content/gdrive/MyDrive/Maroon_5_-_She_Will_Be_Loved_Offic_(getmp3.pro).mp3", 30)



In [None]:
# add a leading batch axis and a trailing channel axis, the 4d layout model_cnn was trained on
X_to_predict = new_input_mfcc[np.newaxis, ..., np.newaxis]
X_to_predict.shape

(1, 130, 13, 1)

In [None]:
prediction = model_cnn.predict(X_to_predict)

# get index with max value
predicted_index = np.argmax(prediction, axis=1)

# predicted_index is a length-1 array (batch of one): take its element
# explicitly, since int() on a non-scalar array is deprecated in newer NumPy
print("Predicted Genre:", genre_dict[int(predicted_index[0])])

Predicted Genre: pop


In [None]:
# MFCCs of the first segment of the file (30 s split into 10 segments, so its first 3 s)
new_input_mfcc = process_input("/content/gdrive/MyDrive/ARCH ENEMY - The Eagle Flies Alone (OFFICIAL VIDEO).mp3", 30)



In [None]:
# add a leading batch axis and a trailing channel axis, the 4d layout model_cnn was trained on
X_to_predict = new_input_mfcc[np.newaxis, ..., np.newaxis]
X_to_predict.shape

(1, 130, 13, 1)

In [None]:
prediction = model_cnn.predict(X_to_predict)

# get index with max value
predicted_index = np.argmax(prediction, axis=1)

# predicted_index is a length-1 array (batch of one): take its element
# explicitly, since int() on a non-scalar array is deprecated in newer NumPy
print("Predicted Genre:", genre_dict[int(predicted_index[0])])

Predicted Genre: metal


In [None]:
# MFCCs of the first segment of the file (30 s split into 10 segments, so its first 3 s)
new_input_mfcc = process_input("/content/gdrive/MyDrive/Nirvana - Smells Like Teen Spirit (Official Music Video).mp3", 30)



In [None]:
# add a leading batch axis and a trailing channel axis, the 4d layout model_cnn was trained on
X_to_predict = new_input_mfcc[np.newaxis, ..., np.newaxis]
X_to_predict.shape

(1, 130, 13, 1)

In [None]:
prediction = model_cnn.predict(X_to_predict)

# get index with max value
predicted_index = np.argmax(prediction, axis=1)

# predicted_index is a length-1 array (batch of one): take its element
# explicitly, since int() on a non-scalar array is deprecated in newer NumPy
print("Predicted Genre:", genre_dict[int(predicted_index[0])])

Predicted Genre: metal


In [None]:
# MFCCs of the first segment of the file (30 s split into 10 segments, so its first 3 s)
new_input_mfcc = process_input("/content/gdrive/MyDrive/Lobo Loco - The Loco Sheriff (ID 1713).mp3", 30)



In [None]:
# add a leading batch axis and a trailing channel axis, the 4d layout model_cnn was trained on
X_to_predict = new_input_mfcc[np.newaxis, ..., np.newaxis]
X_to_predict.shape

(1, 130, 13, 1)

In [None]:
prediction = model_cnn.predict(X_to_predict)

# get index with max value
predicted_index = np.argmax(prediction, axis=1)

# predicted_index is a length-1 array (batch of one): take its element
# explicitly, since int() on a non-scalar array is deprecated in newer NumPy
print("Predicted Genre:", genre_dict[int(predicted_index[0])])

Predicted Genre: blues


In [None]:
# MFCCs of the first segment of the file (30 s split into 10 segments, so its first 3 s)
new_input_mfcc = process_input("/content/gdrive/MyDrive/Slayer _ Angel of Death (Lyrics).mp3", 30)



In [None]:
# add a leading batch axis and a trailing channel axis, the 4d layout model_cnn was trained on
X_to_predict = new_input_mfcc[np.newaxis, ..., np.newaxis]
X_to_predict.shape

(1, 130, 13, 1)

In [None]:
prediction = model_cnn.predict(X_to_predict)

# get index with max value
predicted_index = np.argmax(prediction, axis=1)

# predicted_index is a length-1 array (batch of one): take its element
# explicitly, since int() on a non-scalar array is deprecated in newer NumPy
print("Predicted Genre:", genre_dict[int(predicted_index[0])])

Predicted Genre: metal


In [None]:
# MFCCs of the first segment of the file (30 s split into 10 segments, so its first 3 s)
new_input_mfcc = process_input("/content/gdrive/MyDrive/Dust My Broom.mp3", 30)



In [None]:
# add a leading batch axis and a trailing channel axis, the 4d layout model_cnn was trained on
X_to_predict = new_input_mfcc[np.newaxis, ..., np.newaxis]
X_to_predict.shape

(1, 130, 13, 1)

In [None]:
prediction = model_cnn.predict(X_to_predict)

# get index with max value
predicted_index = np.argmax(prediction, axis=1)

# predicted_index is a length-1 array (batch of one): take its element
# explicitly, since int() on a non-scalar array is deprecated in newer NumPy
print("Predicted Genre:", genre_dict[int(predicted_index[0])])

Predicted Genre: blues
