In [1]:
import os
import numpy
from tensorflow import keras
import librosa

In [2]:
def get_mfcc(wav_file_path):
    """Compute the MFCC matrix of an audio file.

    Only the window starting at offset 0 with a duration of 30 (seconds, per
    ``librosa.load``) is read.

    Args:
        wav_file_path: Path of the audio file passed to ``librosa.load``.

    Returns:
        numpy.ndarray: The output of ``librosa.feature.mfcc`` for that window.
    """
    signal, sample_rate = librosa.load(wav_file_path, offset=0, duration=30)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate)
    return numpy.array(coefficients)

In [3]:
def get_melspectrogram(wav_file_path):
    """Compute the mel spectrogram of an audio file.

    Only the window starting at offset 0 with a duration of 30 (seconds, per
    ``librosa.load``) is read.

    Args:
        wav_file_path: Path of the audio file passed to ``librosa.load``.

    Returns:
        numpy.ndarray: The output of ``librosa.feature.melspectrogram`` for
        that window.
    """
    signal, sample_rate = librosa.load(wav_file_path, offset=0, duration=30)
    spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate)
    return numpy.array(spectrogram)

In [4]:
def get_chroma_vector(wav_file_path):
    """Compute the chromagram of an audio file.

    The file is read with ``offset=0, duration=30`` -- the same window used by
    ``get_mfcc`` and ``get_melspectrogram`` -- so that every feature family
    combined by ``get_feature`` describes the same stretch of audio. (The
    previous version loaded the complete file here, which mixed windows for
    recordings longer than 30 seconds.)

    Args:
        wav_file_path: Path of the audio file passed to ``librosa.load``.

    Returns:
        numpy.ndarray: The output of ``librosa.feature.chroma_stft`` for that
        window.
    """
    y, sr = librosa.load(wav_file_path, offset=0, duration=30)
    chroma = numpy.array(librosa.feature.chroma_stft(y=y, sr=sr))
    return chroma

In [5]:
def get_tonnetz(wav_file_path):
    """Compute the tonnetz (tonal centroid) features of an audio file.

    The file is read with ``offset=0, duration=30`` -- the same window used by
    ``get_mfcc`` and ``get_melspectrogram`` -- so that every feature family
    combined by ``get_feature`` describes the same stretch of audio. (The
    previous version loaded the complete file here, which mixed windows for
    recordings longer than 30 seconds.)

    Args:
        wav_file_path: Path of the audio file passed to ``librosa.load``.

    Returns:
        numpy.ndarray: The output of ``librosa.feature.tonnetz`` for that
        window.
    """
    y, sr = librosa.load(wav_file_path, offset=0, duration=30)
    tonnetz = numpy.array(librosa.feature.tonnetz(y=y, sr=sr))
    return tonnetz

In [6]:
def _summarize_rows(matrix):
    """Return the per-row mean, min and max of a 2-D array, concatenated in that order."""
    return numpy.concatenate((matrix.mean(axis=1), matrix.min(axis=1), matrix.max(axis=1)))


def get_feature(file_path):
    """Build the flat feature vector that describes one audio file.

    Four feature matrices are extracted (chroma, mel spectrogram, MFCC and
    tonnetz). Each one is reduced along axis 1 (presumably the frame/time
    axis -- confirm against the librosa docs) to its per-row mean, min and
    max, and the four summaries are joined into a single 1-D array.

    Args:
        file_path: Path of the audio file, forwarded to each extractor.

    Returns:
        numpy.ndarray: 1-D array laid out as
        ``[chroma | melspectrogram | mfcc | tonnetz]``, where every section is
        ``[mean..., min..., max...]``.
    """
    # The order below defines the layout of the vector the model is trained on;
    # changing it invalidates any previously trained model.
    extractors = (get_chroma_vector, get_melspectrogram, get_mfcc, get_tonnetz)
    summaries = [_summarize_rows(extract(file_path)) for extract in extractors]
    return numpy.concatenate(summaries)

In [7]:
# Calculating features for the full dataset.
# Expected layout: <directory>/<genre>/<audio file>; the position of a genre in
# `genres` is used as its integer class label.
directory = 'Music Data'
genres = ['classical', 'country', 'hiphop', 'jazz', 'rock']
features = []
labels = []
# enumerate() yields the label directly, avoiding a genres.index() lookup per file.
for label, genre in enumerate(genres):
    print("Calculating features for genre: " + genre)
    genre_directory = os.path.join(directory, genre)
    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order stable from run to run.
    for file in sorted(os.listdir(genre_directory)):
        file_path = os.path.join(genre_directory, file)
        # Skip hidden entries (.DS_Store, .ipynb_checkpoints, ...) and
        # sub-directories, which the audio loader cannot read.
        if file.startswith('.') or not os.path.isfile(file_path):
            continue

        features.append(get_feature(file_path))
        labels.append(label)

Calculating features for genre: classical
Calculating features for genre: country
Calculating features for genre: hiphop
Calculating features for genre: jazz
Calculating features for genre: rock


In [8]:
permutations = numpy.random.permutation(300)
features = numpy.array(features)[permutations]
labels = numpy.array(labels)[permutations]

In [9]:
# Splits the dataset into training, validation and testing parts: 60%, 20% and 20% respectively
features_train = features[0:180]
labels_train = labels[0:180]

features_val = features[180:240]
labels_val = labels[180:240]

features_test = features[240:300]
labels_test = labels[240:300]

In [10]:
# Training the model
inputs = keras.Input(shape=(498,), name="feature")
x = keras.layers.Dense(300, activation="relu", name="dense_1")(inputs)
x = keras.layers.Dense(200, activation="relu", name="dense_2")(x)
outputs = keras.layers.Dense(3, activation="softmax", name="predictions")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    # Optimizer
    optimizer=keras.optimizers.RMSprop(),
    # Loss function to minimize
    loss=keras.losses.SparseCategoricalCrossentropy(),
    # List of metrics to monitor
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
model.fit(x=features_train,y=labels_train,verbose=1,validation_data=(features_val, labels_val), epochs=64)

Epoch 1/64
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 211.2534 - sparse_categorical_accuracy: 0.4192 - val_loss: 31.7849 - val_sparse_categorical_accuracy: 0.7167
Epoch 2/64
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 13.3204 - sparse_categorical_accuracy: 0.8134 - val_loss: 30.4228 - val_sparse_categorical_accuracy: 0.7833
Epoch 3/64
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 7.5498 - sparse_categorical_accuracy: 0.8835 - val_loss: 13.9466 - val_sparse_categorical_accuracy: 0.8333
Epoch 4/64
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 17.6305 - sparse_categorical_accuracy: 0.7784 - val_loss: 7.2123 - val_sparse_categorical_accuracy: 0.8833
Epoch 5/64
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 3.0470 - sparse_categorical_accuracy: 0.9623 - val_loss: 9.8508 - val_sparse_categorical_accuracy: 0.8833
Epoch 6/

<keras.src.callbacks.history.History at 0x24c37ef8d30>

In [11]:
# Evaluate on the held-out test split. With the single metric compiled above,
# evaluate() returns [loss, accuracy], so index 1 is the accuracy.
score = model.evaluate(x=features_test, y=labels_test, verbose=0)
accuracy_percent = score[1] * 100
print(f'Accuracy: {accuracy_percent!s}%')

Accuracy: 85.00000238418579%


In [12]:
from pydub import AudioSegment



In [13]:
# Cut a 30-second excerpt out of the full recording.
# pydub slices AudioSegment objects by milliseconds: 30000-60000 is 0:30-1:00.
t1 = 30000
t2 = 60000
waveFile = AudioSegment.from_file("sample_audio_full.wav")
waveFile = waveFile[t1:t2]
# export() hands back the open output file object; close it right away so the
# handle is not leaked (an open handle can keep the file locked on Windows).
waveFile.export('audio_sample_30s.wav', format="wav").close()

<_io.BufferedRandom name='audio_sample_30s.wav'>

In [14]:
# Classify the 30-second excerpt with the trained model.
file_path = "audio_sample_30s.wav"
feature = get_feature(file_path)
# reshape(1, -1) turns the 1-D feature vector into a batch of one sample
# without repeating the feature width that is already fixed by the model input.
y = model.predict(feature.reshape(1, -1))
# Index of the highest class probability, mapped back to its genre name.
ind = numpy.argmax(y)
genres[ind]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step


'hiphop'