In [None]:
# --- Audio selection and framing -------------------------------------------
# Only consider files with 10 or more seconds of audio.
durationCheck = 10.
# Audio frame size is 0.2 seconds.
deltaT = 0.2
# Limit for static, i.e. pauses in speech.
noisy = 0.1
# Lower and upper frequencies. For the above parameters and 16 kHz sampling,
# this range is about 50 - 2000 Hz.
lim1, lim2 = 10, 410

# --- Files and folders -----------------------------------------------------
# Flac files.
audioType = ".flac"
# Path where audio files are located.
path = "LibriSpeech/"
# Training, cross-validation and test sets.
trainSet = "train-clean/"
cvSet = "cv-clean/"
tstSet = "test-clean/"
# File to store max and min values for each frequency bin.
maxminFile = "min_max_values.dat"
# Folder to store trained networks, and the output network filename stem.
outfolder = "networks/"
stem = "nn"

# --- Speakers --------------------------------------------------------------
# Female: 19, 32, 39, 40, 83
# Male:   26, 27, 78, 405, 196
speakers = [
    "19/", "26/", "32/", "27/", "39/",
    "78/", "40/", "405/", "83/", "196/",
]

# Derived sizes: one class per speaker, one feature per step between the limits.
numSpeakers = len(speakers)
numFeatures = lim2 - lim1

In [2]:
import librosa
import os
import math
import json

In [3]:

# Audio parameters used by the MFCC extraction below.
# NOTE(review): cell 1 mentions 16 kHz sampling; confirm 44100 is intended here.
SAMPLE_RATE = 44100
DURATION = 1  # presumably seconds of audio per track -- TODO confirm
SAMPLES_PER_TRACK = DURATION * SAMPLE_RATE

# Input directories and the JSON file the extracted features are written to.
DATASET_PATH = "../LibriSpeech/train-clean"
UNSEENDATA_PATH = "../LibriSpeech/test-clean"
JSON_PATH = "train_data.json"

In [4]:
def save_mfcc_train(dataset_path, json_path,n_mfcc=13,n_fft=2048,hop_length=512,num_segments=10):
    """Extract MFCC features from every audio file under ``dataset_path`` and dump them to JSON.

    Every directory below ``dataset_path`` (at any depth) becomes one entry in
    ``mapping``; each file found directly inside it is loaded, cut into
    ``num_segments`` equal slices of the first ``SAMPLES_PER_TRACK`` samples,
    and the MFCCs of each slice are stored with the index of that directory
    in ``mapping`` as its label.

    Args:
        dataset_path: Root directory to walk.
        json_path: Path of the JSON file to write.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size passed to ``librosa.feature.mfcc``.
        hop_length: Hop length passed to ``librosa.feature.mfcc``.
        num_segments: Number of segments each track is split into.

    Returns:
        None. The result is written to ``json_path`` as a dict with the keys
        ``"mapping"``, ``"mfcc"`` and ``"labels"``.
    """
    # dictionary to store data
    data = {
        "mapping": [],
        "mfcc": [],
        "labels": []
    }
    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    # Segments that do not produce exactly this many frames are dropped below.
    expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment / hop_length)  # 1.2 -> 2

    for dirpath, _dirnames, filenames in os.walk(dataset_path):
        # Skip the root itself. Compare by value: string identity (`is not`)
        # is an implementation detail and must not decide control flow.
        if dirpath == dataset_path:
            continue

        # The directory name is the semantic label; basename is separator-agnostic.
        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        # Label is the position of this directory in `mapping`
        # (same value the walk index minus the root used to give).
        label = len(data["mapping"]) - 1
        print("\nProcessing {}".format(semantic_label))

        # NOTE(review): every file in the directory is handed to librosa.load;
        # confirm the dataset folders contain audio files only.
        for f in filenames:
            file_path = os.path.join(dirpath, f)
            # Load at the configured rate. The original passed `sr=sr`, reading
            # a local that is only assigned by this very statement, which
            # raised UnboundLocalError on the first file.
            signal, _ = librosa.load(file_path, sr=SAMPLE_RATE)

            # process segments extracting mfcc and storing data
            for s in range(num_segments):
                start_sample = num_samples_per_segment * s
                finish_sample = start_sample + num_samples_per_segment

                mfcc = librosa.feature.mfcc(y=signal[start_sample:finish_sample],
                                            sr=SAMPLE_RATE,
                                            n_fft=n_fft,
                                            n_mfcc=n_mfcc,
                                            hop_length=hop_length)
                mfcc = mfcc.T

                # Store mfcc for segment if it has the expected length
                if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label)
                    print("{}, segment:{}".format(file_path, s))

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [None]:
# Build the training feature file. Both paths are required positional
# arguments; calling with none raised TypeError.
save_mfcc_train(DATASET_PATH, JSON_PATH)