In [None]:
# This project is almost identical to the tutorial by Valerio Velardo: https://www.youtube.com/watch?v=szyGiObZymo

# This notebook loads every audio file in GTZAN_Data.zip and writes a JSON file containing the saved MFCC matrices and their genre labels

import json
import os
import math
import librosa

In [None]:
# The dataset lives on Google Drive, so Drive is mounted into the Colab filesystem first.

from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
# Where the original archive is stored on Google Drive; edit this to suit your needs.
ORIGINAL_DATA_LOCATION = "/content/gdrive/MyDrive/ML_Datasets/GTZAN_Data.zip"
# Where the archive ends up locally by default when using Google Colab.
LOCAL_FILE_LOCATION = "/content/GTZAN_Data.zip"

In [None]:
# Copy the archive from Drive to the exact local path the later cells read from,
# so the result does not depend on the current working directory.
# The paths are quoted so that locations containing spaces still work.
!cp "{ORIGINAL_DATA_LOCATION}" "{LOCAL_FILE_LOCATION}"
!ls

gdrive	GTZAN_Data.zip	sample_data


In [None]:
# Create the folder the archive is extracted into (-p: no error if it already exists).
!mkdir -p GTZAN_Data
!ls

gdrive	GTZAN_Data  GTZAN_Data.zip  sample_data


In [None]:
# We unzip all the audio files into the GTZAN_Data folder.
# The absolute path keeps this cell re-runnable from any working directory;
# -o overwrites existing files instead of stopping to prompt on a re-run,
# and -q suppresses the per-file listing.

%cd /content/GTZAN_Data
!unzip -q -o "{LOCAL_FILE_LOCATION}"
!ls

In [None]:
# The file jazz.00054.wav was corrupted in the download for me.
# An absolute path is used so this cell can be re-run regardless of the current working directory.
%cd /content/GTZAN_Data/genres_original/jazz/
!ls

In [None]:
# Because the data file was corrupted, I needed to remove it to allow the rest of the script to run (you may not have this problem).
# This acts on the current working directory, which the preceding cell changes to the jazz folder.
!rm jazz.00054.wav
!ls

In [None]:
# Arbitrarily copy jazz.00019.wav as the new jazz.00054.wav (because the data file was corrupted; you may not need this step).
# NOTE(review): this makes the audio of jazz.00019 appear twice in the dataset, so its segments
# will be duplicated in the saved features — confirm this is acceptable for your train/test split.
!cp jazz.00019.wav jazz.00054.wav
!ls

In [None]:
# Root folder of the extracted dataset; save_mfcc treats each sub-folder as one genre.
DATASET_PATH = "/content/GTZAN_Data/genres_original"
# Where the resulting JSON file is stored; edit this location to suit your setup.
JSON_PATH = "/content/gdrive/MyDrive/ML_Datasets/data_10.json"

SAMPLE_RATE = 22050  # sample rate passed to librosa when loading audio
TRACK_DURATION = 30  # length of one track, measured in seconds
SAMPLES_PER_TRACK = TRACK_DURATION * SAMPLE_RATE  # number of samples in one full track

In [None]:
def save_mfcc(dataset_path, json_path, num_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extract MFCCs from a music dataset and save them to a JSON file with genre labels.

    Every sub-folder found under ``dataset_path`` is treated as one genre. Each
    audio file in it is loaded at ``SAMPLE_RATE``, cut into ``num_segments``
    equal-length segments, and the MFCC matrix of every segment is stored
    together with the integer label of its genre.

    Args:
        dataset_path: Path to the dataset root (one sub-folder per genre).
        json_path: Path of the JSON file used to save the MFCCs.
        num_mfcc: Number of coefficients to extract.
        n_fft: Interval we consider to apply the FFT, measured in samples.
        hop_length: Sliding window for the FFT, measured in samples.
        num_segments: Number of segments each track is divided into.

    The JSON object written to ``json_path`` has three keys:
        "mapping": genre names; a genre's position in this list is its label.
        "labels":  one integer label per stored segment.
        "mfcc":    one (frames x num_mfcc) nested list per stored segment.
    """
    # dictionary to store mapping, labels, and MFCCs
    data = {
        "mapping": [],
        "labels": [],
        "mfcc": []
    }

    samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    # Normalise to str so the root check below also works for path-like objects.
    root = os.fspath(dataset_path)

    # loop through all genre sub-folders
    for dirpath, dirnames, filenames in os.walk(root):

        # Sorting in place makes os.walk visit the sub-folders in a stable order,
        # so the genre -> label mapping is the same on every run.
        dirnames.sort()

        # Skip the dataset root itself; only genre sub-folders are processed.
        # (Value comparison: identity of two equal strings is not guaranteed.)
        if dirpath == root:
            continue

        # save genre label (i.e., sub-folder name) in the mapping
        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        # The label is the genre's index in the mapping, so the two can never drift apart.
        label = len(data["mapping"]) - 1
        print("\nProcessing: {}".format(semantic_label))

        # process all audio files in the genre sub-folder
        for f in sorted(filenames):

            # load audio file
            file_path = os.path.join(dirpath, f)
            signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)

            # process all segments of the audio file
            for d in range(num_segments):

                # calculate start and finish sample for the current segment
                start = samples_per_segment * d
                finish = start + samples_per_segment

                # extract mfcc (y and sr are keyword-only in recent librosa releases)
                mfcc = librosa.feature.mfcc(
                    y=signal[start:finish],
                    sr=sample_rate,
                    n_mfcc=num_mfcc,
                    n_fft=n_fft,
                    hop_length=hop_length,
                )
                mfcc = mfcc.T

                # store only mfcc features with the expected number of vectors,
                # which drops segments cut short by a track shorter than expected
                if len(mfcc) == num_mfcc_vectors_per_segment:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label)
                    print("{}, segment:{}".format(file_path, d+1))

    # save MFCCs to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
        

In [None]:
# Run the extraction, dividing every track into 10 segments.
if __name__ == "__main__":
    save_mfcc(dataset_path=DATASET_PATH, json_path=JSON_PATH, num_segments=10)