In [None]:
import tensorflow as tf
import os
from os.path import isfile, join
import numpy as np
import shutil
from tensorflow import keras
from pathlib import Path
from IPython.display import display, Audio
import subprocess
import random

Mounting google drive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [None]:
!cp -r "../content/gdrive/MyDrive/speaker-recognition-dataset" ./

Getting data directories

In [None]:
data_directory = "./speaker-recognition-dataset/16000_pcm_speeches"
audio_folder = "audio"
noise_folder = "noise"

audio_path = os.path.join(data_directory, audio_folder)
noise_path = os.path.join(data_directory, noise_folder)

In [None]:
audio_path

'./speaker-recognition-dataset/16000_pcm_speeches/audio'

In [None]:
valid_split = 0.1

shuffle_seed = 43

sample_rate = 16000

scale = 0.5

batch_size = 64

epochs = 10

Arrange audio and noise

In [None]:
for folder in os.listdir(data_directory):
    if os.path.isdir(os.path.join(data_directory, folder)):
        if folder in [audio_folder, noise_folder]:

            continue
        elif folder in ["other", "_background_noise_"]:

            shutil.move(
                os.path.join(data_directory, folder),
                os.path.join(noise_path, folder),
            )
        else:
            shutil.move(
                os.path.join(data_directory, folder),
                os.path.join(audio_path, folder),
            )

In [None]:
noise_paths = []
for subdir in os.listdir(noise_path):
    subdir_path = Path(noise_path) / subdir
    if os.path.isdir(subdir_path):
        noise_paths += [
            os.path.join(subdir_path, filepath)
            for filepath in os.listdir(subdir_path)
            if filepath.endswith(".wav")
        ]


In [None]:
noise_paths


['speaker-recognition-dataset/16000_pcm_speeches/noise/other/exercise_bike.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/noise/other/pink_noise.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/noise/_background_noise_/dude_miaowing.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/noise/_background_noise_/10convert.com_Audience-Claps_daSG5fwdA7o.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/noise/_background_noise_/doing_the_dishes.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/noise/_background_noise_/running_tap.wav']

In [None]:
command = (
    "for dir in `ls -1 " + noise_path + "`; do "
    "for file in `ls -1 " + noise_path + "/$dir/*.wav`; do "
    "sample_rate=`ffprobe -hide_banner -loglevel panic -show_streams "
    "$file | grep sample_rate | cut -f2 -d=`; "
    "if [ $sample_rate -ne 16000 ]; then "
    "ffmpeg -hide_banner -loglevel panic -y "
    "-i $file -ar 16000 temp.wav; "
    "mv temp.wav $file; "
    "fi; done; done"
)


In [None]:

os.system(command)
def load_noise_sample(path):
    sample, sampling_rate = tf.audio.decode_wav(
        tf.io.read_file(path), desired_channels=1
    )
    if sampling_rate == sample_rate:
        slices = int(sample.shape[0] / sample_rate)
        sample = tf.split(sample[: slices * sample_rate], slices)
        return sample
    else:
        print("Sampling rate for",path, "is incorrect")
        return None




Selecting portion of dataset for training

In [None]:

# Define the percentage of data to use (e.g., 50%)
subset_percentage = 0.5

# Get a list of all noise files
all_noise_files = []
for dirpath, dirnames, filenames in os.walk(noise_path):
    for filename in filenames:
        if filename.endswith('.wav'):
            all_noise_files.append(os.path.join(dirpath, filename))

# Randomly select a subset of noise files
subset_size = int(len(all_noise_files) * subset_percentage)
selected_noise_files = random.sample(all_noise_files, subset_size)

# Load only the selected noise files
noises = []
for path in selected_noise_files:
    sample = load_noise_sample(path)
    if sample:
        noises.extend(sample)
noises = tf.stack(noises)


In [None]:
speaker_paths = [
    os.path.join(audio_path, speaker)
    for speaker in os.listdir(audio_path)
    if os.path.isdir(os.path.join(audio_path, speaker))
]

def get_audio_files(folder):
    audio_files = []
    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.endswith(".wav"):
                audio_files.append(os.path.join(root, file))
    return audio_files

In [None]:
samples_per_speaker = 500  # Adjust as needed

# Get a list of all audio files for each speaker
selected_speaker_files = []

for speaker_path in speaker_paths:
    audio_files = get_audio_files(speaker_path)
    random.shuffle(audio_files)

    # Remove './' prefix from each path
    audio_files = [path[2:] if path.startswith("./") else path for path in audio_files]

    # Extend selected_speaker_files with modified paths
    selected_speaker_files.extend(audio_files[:samples_per_speaker])




In [None]:
selected_speaker_files

['speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/1282.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/95.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/1184.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/803.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/1159.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/1331.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/476.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/832.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/791.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/1027.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/1471.wav',
 'speaker-recognition-dataset/16000_pcm_speeches/audio/Julia_Gillard/174.wav',
 'speaker-recognition-dataset/16000_pcm_speeche

Dataset Generation

In [None]:
def paths_and_labels_to_dataset(audio_paths, labels, sample_rate):
    def path_to_audio(path, label):
        audio = tf.io.read_file(path)
        audio, _ = tf.audio.decode_wav(audio, 1, sample_rate)
        return audio, label

    path_ds = tf.data.Dataset.from_tensor_slices(audio_paths)
    label_ds = tf.data.Dataset.from_tensor_slices(labels)

    dataset = tf.data.Dataset.zip((path_ds, label_ds))
    dataset = dataset.map(lambda x, y: path_to_audio(x, y))

    return dataset


Noise Addition

In [None]:
def add_noise(audio, noises=None, scale=0.5):
    if noises is not None:
        tf_rnd = tf.random.uniform(
            (tf.shape(audio)[0],), 0, noises.shape[0], dtype=tf.int32
        )
        noise = tf.gather(noises, tf_rnd, axis=0)

        prop = tf.math.reduce_max(audio, axis=1) / tf.math.reduce_max(noise, axis=1)
        prop = tf.repeat(tf.expand_dims(prop, axis=1), tf.shape(audio)[1], axis=1)

        audio = audio + noise * prop * scale

    return audio

def audio_to_fft(audio):
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)

    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])

In [None]:
class_names = os.listdir(audio_path)
print(class_names,)

['Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela', 'Benjamin_Netanyau', 'Jens_Stoltenberg']


In [None]:
audio_paths = []
labels = []

for label, name in enumerate(class_names):
    print("Speaker:",(name))
    dir_path = Path(audio_path) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in os.listdir(dir_path)
        if filepath.endswith(".wav")
    ]

    # Filter speaker_sample_paths based on selected_speaker_files
    filtered_paths = [path for path in speaker_sample_paths if path in selected_speaker_files]

    # Add filtered paths to audio_paths
    audio_paths += filtered_paths

    # Extend labels list with the corresponding label
    labels += [label] * len(filtered_paths)



Speaker: Julia_Gillard
Speaker: Magaret_Tarcher
Speaker: Nelson_Mandela
Speaker: Benjamin_Netanyau
Speaker: Jens_Stoltenberg


In [None]:
# Shuffle to generate random data
rng = np.random.RandomState(shuffle_seed)
rng.shuffle(audio_paths)
rng = np.random.RandomState(shuffle_seed)
rng.shuffle(labels)

num_val_samples = int(valid_split * len(audio_paths))
train_audio_paths = audio_paths[:-num_val_samples]
train_labels = labels[:-num_val_samples]


valid_audio_paths = audio_paths[-num_val_samples:]
valid_labels = labels[-num_val_samples:]

train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels, sample_rate)
train_ds = train_ds.shuffle(buffer_size=batch_size * 8, seed=shuffle_seed).batch(batch_size)

valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels, sample_rate)
valid_ds = valid_ds.shuffle(buffer_size=32 * 8, seed=shuffle_seed).batch(32)


# Feature Extraction

In [None]:
train_ds = train_ds.map(
    lambda x, y: (add_noise(x, noises, scale=scale), y),
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
train_ds = train_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.experimental.AUTOTUNE
)

train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)

valid_ds = valid_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.experimental.AUTOTUNE
)
valid_ds = valid_ds.prefetch(tf.data.experimental.AUTOTUNE)

# Model

In [None]:
from tensorflow.keras.layers import Conv1D
def residual_block(x, filters, conv_num = 3, activation = "relu"):
    s = keras.layers.Conv1D(filters, 1, padding = "same")(x)

    for i in range(conv_num - 1):
        x = keras.layers.Conv1D(filters, 3, padding = "same")(x)
        x = keras.layers.Activation(activation)(x)

    x = keras.layers.Conv1D(filters, 3, padding = "same")(x)
    x = keras.layers.Add()([x, s])
    x = keras.layers.Activation(activation)(x)

    return keras.layers.MaxPool1D(pool_size = 2, strides = 2)(x)

def build_model(input_shape, num_classes):
    inputs = keras.layers.Input(shape = input_shape, name = "input")

    x = residual_block(inputs, 16, 2)
    x = residual_block(inputs, 32, 2)
    x = residual_block(inputs, 64, 3)
    x = residual_block(inputs, 128, 3)
    x = residual_block(inputs, 128, 3)
    x = keras.layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.Dense(128, activation="relu")(x)

    outputs = keras.layers.Dense(num_classes, activation = "softmax", name = "output")(x)

    return keras.models.Model(inputs = inputs, outputs = outputs)

model = build_model((sample_rate // 2, 1), len(class_names))

model.summary()

model.compile(optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

model_save_filename = "model.h5"

earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(model_save_filename, monitor="val_accuracy", save_best_only=True)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input (InputLayer)          [(None, 8000, 1)]            0         []                            
                                                                                                  
 conv1d_15 (Conv1D)          (None, 8000, 128)            512       ['input[0][0]']               
                                                                                                  
 activation_10 (Activation)  (None, 8000, 128)            0         ['conv1d_15[0][0]']           
                                                                                                  
 conv1d_16 (Conv1D)          (None, 8000, 128)            49280     ['activation_10[0][0]']       
                                                                                              

# Training

In [None]:
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=valid_ds,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
)

Epoch 1/10

  saving_api.save_model(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Accuracy

In [None]:
print("Accuracy of model:",model.evaluate(valid_ds))

Accuracy of model: [0.1756058931350708, 0.9480000138282776]


# Testing

In [None]:
SAMPLES_TO_DISPLAY = 10

test_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels, sample_rate)
test_ds = test_ds.shuffle(buffer_size=32 * 8, seed=shuffle_seed).batch(batch_size)


test_ds = test_ds.map(lambda x, y: (add_noise(x, noises, scale=scale), y))

for audios, labels in test_ds.take(1):
    ffts = audio_to_fft(audios)
    y_pred = model.predict(ffts)
    rnd = np.random.randint(0, batch_size, SAMPLES_TO_DISPLAY)
    audios = audios.numpy()[rnd, :, :]
    labels = labels.numpy()[rnd]
    y_pred = np.argmax(y_pred, axis=-1)[rnd]

    for index in range(SAMPLES_TO_DISPLAY):
        print(
            "Speaker:\33{} {}\33[0m\tPredicted:\33{} {}\33[0m".format(
                "[92m" if labels[index] == y_pred[index] else "[91m",
                class_names[labels[index]],
                "[92m" if labels[index] == y_pred[index] else "[91m",
                class_names[y_pred[index]],
            )
        )
        if labels[index] ==y_pred[index]:
            print("Welcome")
        else:
            print("Sorry")
        print("The speaker is" if labels[index] == y_pred[index] else "", class_names[y_pred[index]])

Speaker:[92m Benjamin_Netanyau[0m	Predicted:[92m Benjamin_Netanyau[0m
Welcome
The speaker is Benjamin_Netanyau
Speaker:[92m Jens_Stoltenberg[0m	Predicted:[92m Jens_Stoltenberg[0m
Welcome
The speaker is Jens_Stoltenberg
Speaker:[92m Jens_Stoltenberg[0m	Predicted:[92m Jens_Stoltenberg[0m
Welcome
The speaker is Jens_Stoltenberg
Speaker:[92m Nelson_Mandela[0m	Predicted:[92m Nelson_Mandela[0m
Welcome
The speaker is Nelson_Mandela
Speaker:[92m Julia_Gillard[0m	Predicted:[92m Julia_Gillard[0m
Welcome
The speaker is Julia_Gillard
Speaker:[92m Julia_Gillard[0m	Predicted:[92m Julia_Gillard[0m
Welcome
The speaker is Julia_Gillard
Speaker:[92m Julia_Gillard[0m	Predicted:[92m Julia_Gillard[0m
Welcome
The speaker is Julia_Gillard
Speaker:[92m Magaret_Tarcher[0m	Predicted:[92m Magaret_Tarcher[0m
Welcome
The speaker is Magaret_Tarcher
Speaker:[92m Julia_Gillard[0m	Predicted:[92m Julia_Gillard[0m
Welcome
The speaker is Julia_Gillard
Speaker:[92m Nelson_Mandela[0m	P

# Real Time prediction

In [None]:
def predict(path, labels):
    # Extract the speaker label from the path
    speaker_label = os.path.basename(os.path.dirname(path[0]))

    # Get the index of the speaker label in the class_names list
    label_index = class_names.index(speaker_label)

    # Assign the integer label to the labels list
    labels = [label_index]

    # Create the test dataset
    test = paths_and_labels_to_dataset(path, labels, sample_rate)

    test = test.shuffle(buffer_size=batch_size * 8, seed=shuffle_seed).batch(batch_size)
    test = test.prefetch(tf.data.experimental.AUTOTUNE)
    test = test.map(lambda x, y: (add_noise(x, noises, scale=scale), y))

    for audios, labels in test.take(1):
        ffts = audio_to_fft(audios)
        y_pred = model.predict(ffts)
        rnd = np.random.randint(0, len(audios), 1)
        audios = audios.numpy()[rnd, :, :]
        labels = labels.numpy()[rnd]
        y_pred = np.argmax(y_pred, axis=-1)[rnd]
        for index in range(1):
            print(
                "Speaker:\33{} {}\33[0m\tPredicted:\33{} {}\33[0m".format(
                    "[92m", class_names[labels[index]],
                    "[92m", class_names[y_pred[index]]
                )
            )
            print("Speaker Predicted:", class_names[y_pred[index]])

path = ["./speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/33.wav"]
labels = [0]  # Assuming "unknown" corresponds to class index 0

try:
    predict(path, labels)
except:
    print("Error! Check if the file correctly passed or not!")

Speaker:[92m Nelson_Mandela[0m	Predicted:[92m Nelson_Mandela[0m
Speaker Predicted: Nelson_Mandela
