In [1]:
durationCheck = 10.      # Only consider files with 10 or more seconds of audio.
deltaT        = 0.2      # Audio frame size is 0.2 seconds.
noisy         = 0.1      # This sets the limit for static, i.e. pauses in speech.
lim1 = 10; lim2 = 410    # Lower and upper frequencies. 
                         # For the above parameters and 16 kHz sampling, this range is about 50 - 2000 Hz.  
# These are the speakers.
# Female: 19, 32, 39, 40, 83
# Male: 26, 27, 78, 405, 196
speakers = ["19/",
            "26/",
            "32/",
            "27/",
            "39/",
            "78/",
            "40/",
            "405/", 
            "83/",
            "196/"]

numFeatures = lim2-lim1
numSpeakers = len(speakers)

In [2]:
import librosa
import os
import math
import json
import numpy as np

In [3]:

DATASET_PATH = "../../LibriSpeech/train-clean"
UNSEENDATA_PATH = "../../LibriSpeech/test-clean"
JSON_PATH = "train_data.json"
JSON_TEST_PATH = "test_data.json"
SAMPLE_RATE = 44100
DURATION = 1
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION

In [4]:

def save_mfcc_train(dataset_path, json_path,n_mfcc=13,n_fft=2048,hop_length=512,num_segments=10):
    # dictionary to store data
    data = {
        "mapping": [],
        "mfcc": [],
        "labels": []
    }
    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    expected_num_nfcc_vectors_per_segment = math.ceil(num_samples_per_segment / hop_length) # 1.2 -> 2
    # loop through all the chunks
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        
        # ensure that we're not at root level
        if dirpath is not dataset_path:
            
            # save the semantic label
            dirpath_components = dirpath.split("/")
            semantic_label = dirpath_components[-1]
            data["mapping"].append(semantic_label)
            print("\nProcessing {}".format(semantic_label))
            # process files for specific human
            for f in filenames:
                # load audio file
                file_path = os.path.join(dirpath,f)
                signal, sr = librosa.load(file_path,sr=SAMPLE_RATE)

                # process segments extracting mfcc and storing data
                for s in range(num_segments):
                    start_sample = num_samples_per_segment * s
                    finish_sample = start_sample + num_samples_per_segment
                    
                    mfcc = librosa.feature.mfcc(y=signal[start_sample:finish_sample], 
                                                sr=SAMPLE_RATE, 
                                                n_fft=n_fft, 
                                                n_mfcc=n_mfcc, 
                                                hop_length=hop_length)
                    mfcc = mfcc.T

                    # Store mfcc for segment if it has the expected length
                    if len(mfcc) == expected_num_nfcc_vectors_per_segment:
                        data["mfcc"].append(mfcc.tolist())
                        data["labels"].append([i-1])
                        print("{}, segment:{}".format(file_path,s))
    with open(json_path,"w") as fp:
        json.dump(data,fp, indent=4)
                    


In [5]:
save_mfcc_train(DATASET_PATH,JSON_PATH,num_segments=10)



Processing 32
../../LibriSpeech/train-clean/32/32-21625-0013.flac, segment:0
../../LibriSpeech/train-clean/32/32-21625-0013.flac, segment:1
../../LibriSpeech/train-clean/32/32-21625-0013.flac, segment:2
../../LibriSpeech/train-clean/32/32-21625-0013.flac, segment:3
../../LibriSpeech/train-clean/32/32-21625-0013.flac, segment:4
../../LibriSpeech/train-clean/32/32-21625-0013.flac, segment:5
../../LibriSpeech/train-clean/32/32-21625-0013.flac, segment:6
../../LibriSpeech/train-clean/32/32-21625-0013.flac, segment:7
../../LibriSpeech/train-clean/32/32-21625-0013.flac, segment:8
../../LibriSpeech/train-clean/32/32-21625-0013.flac, segment:9
../../LibriSpeech/train-clean/32/32-21631-0005.flac, segment:0
../../LibriSpeech/train-clean/32/32-21631-0005.flac, segment:1
../../LibriSpeech/train-clean/32/32-21631-0005.flac, segment:2
../../LibriSpeech/train-clean/32/32-21631-0005.flac, segment:3
../../LibriSpeech/train-clean/32/32-21631-0005.flac, segment:4
../../LibriSpeech/train-clean/32/32-2163

In [6]:
def load_data(dataset_path):
    with open(dataset_path,"r") as fp:
        data = json.load(fp)
    # convert lists into numpy arrays
    inputs = np.array(data["mfcc"])
    targets = np.array(data["labels"])
    return inputs, targets
    
def load_data_unseen(dataset_path):
    with open(dataset_path,"r") as fp:
        data = json.load(fp)
    # convert lists into numpy arrays
    inputs = np.array(data["mfcc"])
    return inputs

In [7]:
from sklearn.model_selection import train_test_split

# Load data
inputs, targets = load_data(JSON_PATH)
# Train Test Split
inputs_train, inputs_test, targets_train, targets_test = train_test_split(inputs,
                                                                          targets,
                                                                          test_size= 0.2)

In [8]:
# build the architecture
from tensorflow import keras
model = keras.Sequential([
    # input layer
    keras.layers.Flatten(input_shape=(inputs_train.shape[1],inputs_train.shape[2])),

    # 1st Hidden layer
    keras.layers.Dense(512, activation="relu"),

    # 2nd Hidden layer
    keras.layers.Dense(256, activation="relu"),

    # 3rd Hidden layer
    keras.layers.Dense(64, activation="relu"),

    # Output Layer
    keras.layers.Dense(10, activation="softmax")   # This 2 means targets (mert and demet for this case)
])
# compile network
optimizer = keras.optimizers.Adam(learning_rate=0.0001) # Adam is an optimizer.
model.compile(optimizer=optimizer,
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"],
              run_eagerly=True) 
model.summary()


2023-05-21 16:34:35.300234: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 117)               0         
                                                                 
 dense (Dense)               (None, 512)               60416     
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 64)                16448     
                                                                 
 dense_3 (Dense)             (None, 10)                650       
                                                                 
Total params: 208,842
Trainable params: 208,842
Non-trainable params: 0
_________________________________________________________________


In [9]:
targets_train

array([[2],
       [4],
       [2],
       ...,
       [4],
       [9],
       [6]])

In [22]:
# Train Network
history = model.fit(inputs_train, targets_train, 
          validation_data=(inputs_test, targets_test),
          epochs=50,
          batch_size=32) # Batch size is important customery is 32

Epoch 1/50


2023-05-21 16:33:08.073402: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at sparse_xent_op.cc:103 : INVALID_ARGUMENT: Received a label value of 405 which is outside the valid range of [0, 10).  Label values: 19 39 39 27 78 19 39 83 83 19 26 196 78 78 27 83 78 26 39 40 83 196 40 83 405 196 39 26 27 196 32 405


InvalidArgumentError: {{function_node __wrapped__SparseSoftmaxCrossEntropyWithLogits_device_/job:localhost/replica:0/task:0/device:CPU:0}} Received a label value of 405 which is outside the valid range of [0, 10).  Label values: 19 39 39 27 78 19 39 83 83 19 26 196 78 78 27 83 78 26 39 40 83 196 40 83 405 196 39 26 27 196 32 405 [Op:SparseSoftmaxCrossEntropyWithLogits]

In [None]:
def save_mfcc_unseen(dataset_path, json_path,n_mfcc=13,n_fft=2048,hop_length=512,num_segments=10):
    
    data = {
        "mfcc": [],
    }
    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    expected_num_nfcc_vectors_per_segment = math.ceil(num_samples_per_segment / hop_length) # 1.2 -> 2
    # loop through all the chunks
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        
        
        # save the semantic label
        # process files for specific human
        for f in filenames:
            # load audio file
            file_path = os.path.join(dirpath,f)
            signal, sr = librosa.load(file_path,sr=SAMPLE_RATE)

            # process segments extracting mfcc and storing data
            for s in range(num_segments):
                start_sample = num_samples_per_segment * s
                finish_sample = start_sample + num_samples_per_segment
                
                mfcc = librosa.feature.mfcc(y=signal[start_sample:finish_sample], 
                                            sr=SAMPLE_RATE, 
                                            n_fft=n_fft, 
                                            n_mfcc=n_mfcc, 
                                            hop_length=hop_length)
                mfcc = mfcc.T

                # Store mfcc for segment if it has the expected length
                if len(mfcc) == expected_num_nfcc_vectors_per_segment:
                    data["mfcc"].append(mfcc.tolist())
    with open(json_path,"w") as fp:
        json.dump(data,fp, indent=4)
                    

In [None]:
save_mfcc_unseen(UNSEENDATA_PATH, JSON_TEST_PATH,num_segments=10)

7880