In [8]:
###
# Based on: https://www.kaggle.com/code/mohamedatef20/speaker-recognition
# Dataset: https://www.kaggle.com/datasets/kongaevans/speaker-recognition-dataset/
###

In [9]:
import os
from itertools import product
from datetime import datetime

import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow

from Utils.config import DATA_DIR
from Utils.create_model import create_model
from Utils.get_wav_paths import get_wav_paths
from Utils.generate_training_data import generate_training_data

In [10]:
rng = tf.random.experimental.Generator.from_seed(420420)
tf.random.set_seed(420420)

In [11]:
data_dir = DATA_DIR
os.listdir(data_dir)

# A.Solzhenitsyn was finally excluded from the dataset, 
# because of language differences (Russian vs English)
# and the fact that he was recorded with simultaneous translation.

['Alexander_Solzhenitsyn',
 'Barrack_Obama',
 'Benjamin_Netanyau',
 'Hillary_Clinton',
 'Jens_Stoltenberg',
 'John_F_Kennedy',
 'Julia_Gillard',
 'Margaret_Tarcher',
 'Nelson_Mandela',
 'Ronald_Reagan']

In [12]:
nelson_madela = [item for item in os.listdir(data_dir + "/Nelson_Mandela")]
nelson_madela[:10]

['0.wav',
 '1.wav',
 '10.wav',
 '100.wav',
 '1000.wav',
 '1001.wav',
 '1002.wav',
 '1003.wav',
 '1004.wav',
 '1005.wav']

##  Process training dataset


In [13]:
barack_obama_paths = get_wav_paths("Barrack_Obama")
benjamin_netanyau_paths = get_wav_paths("Benjamin_Netanyau")
hillary_clinton_paths = get_wav_paths("Hillary_Clinton")
jens_stoltenberg_paths = get_wav_paths("Jens_Stoltenberg")
john_f_kennedy_paths = get_wav_paths("John_F_Kennedy")
julia_gillard_paths = get_wav_paths("Julia_Gillard")
margaret_thatcher_paths = get_wav_paths("Margaret_Tarcher")
nelson_mandela_paths = get_wav_paths("Nelson_Mandela")
ronald_reagan_paths = get_wav_paths("Ronald_Reagan")

print("Number of samples for Margaret Tarcher: ", len(margaret_thatcher_paths))
print(margaret_thatcher_paths[:10])

Number of samples for Margaret Tarcher:  1500
['0.wav', '1.wav', '10.wav', '100.wav', '1000.wav', '1001.wav', '1002.wav', '1003.wav', '1004.wav', '1005.wav']


# Additive White Gaussian Noise (AWGN)

In [14]:
barack_obama_wavs, barack_obama_labels = generate_training_data(
    barack_obama_paths, "Barrack_Obama", 0
)
benjamin_netanyau_wavs, benjamin_netanyau_labels = generate_training_data(
    benjamin_netanyau_paths, "Benjamin_Netanyau", 1
)
hillary_clinton_wavs, hillary_clinton_labels = generate_training_data(
    hillary_clinton_paths, "Hillary_Clinton", 2
)
jens_stoltenberg_wavs, jens_stoltenberg_labels = generate_training_data(
    jens_stoltenberg_paths, "Jens_Stoltenberg", 3
)
john_f_kennedy_wavs, john_f_kennedy_labels = generate_training_data(
    john_f_kennedy_paths, "John_F_Kennedy", 4
)
julia_gillard_wavs, julia_gillard_labels = generate_training_data(
    julia_gillard_paths, "Julia_Gillard", 5
)
margaret_thatcher_wavs, margaret_thatcher_labels = generate_training_data(
    margaret_thatcher_paths, "Margaret_Tarcher", 6
)
nelson_mandela_wavs, nelson_mandela_labels = generate_training_data(
    nelson_mandela_paths, "Nelson_Mandela", 7
)
ronald_reagan_wavs, ronald_reagan_labels = generate_training_data(
    ronald_reagan_paths, "Ronald_Reagan", 8
)

100%|██████████| 1266/1266 [00:01<00:00, 968.62it/s]
100%|██████████| 1500/1500 [00:01<00:00, 1052.04it/s]
100%|██████████| 418/418 [00:00<00:00, 998.52it/s] 
100%|██████████| 1500/1500 [00:01<00:00, 1036.81it/s]
100%|██████████| 418/418 [00:00<00:00, 973.93it/s]
100%|██████████| 1500/1500 [00:01<00:00, 1044.57it/s]
100%|██████████| 1500/1500 [00:01<00:00, 1015.45it/s]
100%|██████████| 1500/1500 [00:01<00:00, 1015.84it/s]
100%|██████████| 1911/1911 [00:01<00:00, 972.49it/s] 


### Make all lists equal in length

In [15]:
all_speakers_labels_and_wavs_list = [
    barack_obama_labels,
    barack_obama_wavs,
    benjamin_netanyau_labels,
    benjamin_netanyau_wavs,
    hillary_clinton_labels,
    hillary_clinton_wavs,
    jens_stoltenberg_labels,
    jens_stoltenberg_wavs,
    john_f_kennedy_labels,
    john_f_kennedy_wavs,
    julia_gillard_labels,
    julia_gillard_wavs,
    margaret_thatcher_labels,
    margaret_thatcher_wavs,
    nelson_mandela_labels,
    nelson_mandela_wavs,
    ronald_reagan_labels,
    ronald_reagan_wavs,
]

In [16]:
min_length = min(len(lst) for lst in all_speakers_labels_and_wavs_list)
print("Minimum length: ", min_length)
trimmed_all_speakers_labels_and_wavs = [
    lst[:min_length] for lst in all_speakers_labels_and_wavs_list
]

Minimum length:  417


In [17]:
trimmed_all_speakers_labels = trimmed_all_speakers_labels_and_wavs[0::2]
print(len(trimmed_all_speakers_labels))
print(trimmed_all_speakers_labels[1])

9
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [18]:
trimmed_all_speakers_wavs = trimmed_all_speakers_labels_and_wavs[1::2]
print(len(trimmed_all_speakers_wavs))
print(trimmed_all_speakers_wavs[1][0])
print(trimmed_all_speakers_wavs[1][5])

9
tf.Tensor([[0.00368235 0.00368235 0.00365183 ... 0.00380442 0.00737498 0.00734446]], shape=(1, 16000), dtype=float32)
tf.Tensor(
[[ 0.06874497  0.01030381 -0.09937637 ...  0.18623765  0.31123766
   0.39696154]], shape=(1, 16000), dtype=float32)


In [19]:
all_wavs = []

for wavs in trimmed_all_speakers_wavs:
    all_wavs.extend(wavs)

In [20]:
all_labels = []

for labels in trimmed_all_speakers_labels:
    all_labels.extend(labels)

In [21]:
train_wavs, test_wavs, train_labels, test_labels = train_test_split(
    all_wavs, all_labels, test_size=0.2
)

train_x = np.array(train_wavs)
train_y = np.array(train_labels)
test_x, test_y = np.array(test_wavs), np.array(test_labels)

In [22]:
print(train_x.shape)
print(train_y.shape)
print(np.unique(train_y))

test_y_original = test_y

train_y = tf.keras.utils.to_categorical(train_y)
test_y = tf.keras.utils.to_categorical(test_y)

print(train_x.shape)
print(train_y.shape)
print(np.unique(train_y))

(3002, 1, 16000)
(3002,)
[0 1 2 3 4 5 6 7 8]
(3002, 1, 16000)
(3002, 9)
[0. 1.]


In [23]:
mlflow.set_experiment("speech-recognition")

model = create_model("spectrogram")
timestamp = datetime.now().strftime("%Y-%m-%d___%H%M__%S%f")[:-4]
classes = [
    "Barrack_Obama",
    "Benjamin_Netanyau",
    "Hillary_Clinton",
    "Jens_Stoltenberg",
    "John_F_Kennedy",
    "Julia_Gillard",
    "Margaret_Tarcher",
    "Nelson_Mandela",
    "Ronald_Reagan",
]







In [24]:
# hipermarameters tuning - uncomment if needed

# spectrogram_types_list = ["melspectrogram", "spectrogram"]
spectrogram_types_list = ["spectrogram"]
# num_epochs_list = [10, 20, 30, 50, 70]
num_epochs_list = [70]

In [25]:
spectrogram_type = spectrogram_types_list[0]
num_epochs = num_epochs_list[0]

print("Spectrogram type: ", spectrogram_type)
print("Number of epochs: ", num_epochs)

Spectrogram type:  spectrogram
Number of epochs:  70


In [26]:
train_loss = []
train_accuracy = []
val_loss = []
val_accuracy = []

for spectrogram_type, num_epochs in product(spectrogram_types_list, num_epochs_list):
    with mlflow.start_run():
        mlflow.log_param("model", model)
        mlflow.log_param("spectrogram", spectrogram_type)
        mlflow.log_param("num_epochs", num_epochs)

        history = model.fit(
            x=train_x, y=train_y, epochs=num_epochs, validation_data=(test_x, test_y)
        )
        test_loss, test_accuracy = model.evaluate(test_x, test_y)

        mlflow.log_metric("train_loss", history.history["loss"][-1])
        mlflow.log_metric("train_accuracy", history.history["accuracy"][-1])
        mlflow.log_metric("test_loss", test_loss)
        mlflow.log_metric("test_accuracy", test_accuracy)

        train_loss.extend(history.history["loss"])
        train_accuracy.extend(history.history["accuracy"])
        val_loss.extend(history.history["val_loss"])
        val_accuracy.extend(history.history["val_accuracy"])

        # Confusion Matrix
        y_pred = model.predict(test_x)
        y_pred_classes = np.argmax(y_pred, axis=1)
        y_true = np.argmax(test_y, axis=1)

        cm = confusion_matrix(y_true, y_pred_classes)

        # Plot Confusion Matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(
            cm,
            annot=True,
            fmt="g",
            cmap="Blues",
            xticklabels=classes,
            yticklabels=classes,
        )
        plt.xlabel("Predicted labels")
        plt.ylabel("True labels")
        plt.title("Confusion Matrix")
        plt.savefig("confusion_matrix.png")
        plt.close()

        # Log Confusion Matrix as MLflow artifact
        mlflow.log_artifact("confusion_matrix.png", "confusion_matrix_image")

        model.save_weights(f"{num_epochs}_epochs_{spectrogram_type}.h5")
        mlflow.log_dict(model.summary(), "model_summary")

        # Plot Accuracy and Loss
        plt.figure(figsize=(8, 6))
        plt.plot(train_accuracy, label="Train Accuracy")
        plt.plot(val_accuracy, label="Validation Accuracy")
        plt.xlabel("Epochs")
        plt.ylabel("Accuracy")
        plt.title("Accuracy - Train vs Validation")
        plt.legend()
        plt.savefig("accuracy_plot.png")
        plt.close()

        plt.figure(figsize=(8, 6))
        plt.plot(train_loss, label="Train Loss")
        plt.plot(val_loss, label="Validation Loss")
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.title("Loss - Train vs Validation")
        plt.legend()
        plt.yscale('log') 
        plt.savefig("loss_plot.png")
        plt.close()

        # Log Accuracy and Loss plots as MLflow artifacts
        mlflow.log_artifact("accuracy_plot.png", "accuracy_plot")
        mlflow.log_artifact("loss_plot.png", "loss_plot")


Epoch 1/70












Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 static_stft (Spectrogram)  