In [None]:
import os
import pathlib
import tensorflow as tf

import librosa
import os
import json

In [None]:
# Location the archive is expected to unpack to; fetch it only when absent so
# re-running this cell does not download the dataset again.
data_dir = pathlib.Path('data/favi_speech_dataset_v02')
dataset_missing = not data_dir.exists()
if dataset_missing:
    tf.keras.utils.get_file(
        'favi_speech_dataset_v02.zip',
        origin="https://storage.googleapis.com/financial_speech_dataset_id/data/favi_speech_dataset_v02.zip",
        extract=True,
        cache_dir='.',
        cache_subdir='data',
    )

In [None]:
# Root folder that preprocess_dataset walks; each sub-directory is one label.
DATASET_PATH = data_dir
# File the extracted features are serialized to.
JSON_PATH = "favi_dataset02.json"
# Number of samples kept per clip: shorter clips are skipped, longer ones are cut.
# NOTE(review): meant to be about one second of audio, but librosa.load is
# called without an explicit sample rate -- confirm the two agree.
SAMPLES_TO_CONSIDER = 16_000

In [None]:
def preprocess_dataset(dataset_path, json_path, num_mfcc=13, n_fft=2048, hop_length=512):
    """Extract MFCCs from every audio clip under ``dataset_path`` and dump them to JSON.

    Every directory found below ``dataset_path`` is treated as one label: its
    name is appended to ``mapping`` and each clip inside it is tagged with the
    index of that entry. Clips with fewer than ``SAMPLES_TO_CONSIDER`` samples
    are skipped; longer ones are truncated to exactly that many samples.

    Args:
        dataset_path: Root directory of the dataset (``str`` or ``os.PathLike``).
        json_path: File the resulting dictionary is written to.
        num_mfcc: Number of coefficients requested from ``librosa.feature.mfcc``.
        n_fft: FFT window size forwarded to ``librosa.feature.mfcc``.
        hop_length: Hop length forwarded to ``librosa.feature.mfcc``.

    Returns:
        None. The result is written to ``json_path`` as a dictionary with the
        keys ``mapping``, ``labels``, ``MFCCs`` and ``files``.
    """
    # dictionary of the dataset
    data = {
        "mapping": [],
        "labels": [],
        "MFCCs": [],
        "files": []
    }

    # os.walk yields plain strings even when handed a pathlib.Path, so the root
    # has to be recognised by value; an identity check never matches and would
    # register the root folder itself as a label.
    root = os.fspath(dataset_path)

    # loop through all sub-dirs
    for dirpath, dirnames, filenames in os.walk(root):

        # Sorting in place fixes the traversal order, which keeps the label
        # indices reproducible from one run to the next.
        dirnames.sort()

        if dirpath == root:
            continue

        # save label in the mapping; the label index is derived from the
        # mapping itself so "labels" always points at the right entry
        label = os.path.basename(dirpath)
        data["mapping"].append(label)
        label_index = len(data["mapping"]) - 1
        print("\nProcessing: '{}'".format(label))

        # process all audio files in sub-dir and store MFCCs
        for f in sorted(filenames):
            file_path = os.path.join(dirpath, f)

            # NOTE(review): no sample rate is passed, so librosa picks its
            # default -- confirm it matches what SAMPLES_TO_CONSIDER assumes.
            signal, sample_rate = librosa.load(file_path)

            # drop audio files with less than pre-decided number of samples
            if len(signal) < SAMPLES_TO_CONSIDER:
                continue

            # ensure consistency of the length of the signal
            signal = signal[:SAMPLES_TO_CONSIDER]

            # extract MFCCs feature (audio and rate are passed by keyword:
            # recent librosa releases no longer accept them positionally)
            MFCCs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=num_mfcc,
                                         n_fft=n_fft, hop_length=hop_length)

            # store data for analysed track
            data["MFCCs"].append(MFCCs.T.tolist())
            data["labels"].append(label_index)
            data["files"].append(file_path)
            print("{}: {}".format(file_path, label_index))

    # save data in json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [None]:
# Build the feature file when this cell runs as the main module.
if __name__ == "__main__":
    preprocess_dataset(dataset_path=DATASET_PATH, json_path=JSON_PATH)

In [None]:
# Offer the generated feature file as a browser download when running on Colab.
# The import is guarded so the notebook still runs end to end elsewhere, and
# the path comes from JSON_PATH so it cannot drift from the file that
# preprocess_dataset actually wrote.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; '{}' was left on disk.".format(JSON_PATH))
else:
    files.download(JSON_PATH)