In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt

from Constants.Paths import *
from Constants.Labels import *
from Unpacking.PrepareAudioFiles import prepare_audio_files
from Preprocessing.GenerateSpectrograms import generate_all_spectrograms
from SpectrogramLoading import *
from Models.TrainingHistory import TrainingHistory
from Models.CnnModel import CnnModel
from Models.TransformerModel import TransformerModel
from Models.WandbDetails import WandbDetails
from Models.InputPadding import pad_to_length
from Models.HistoryPlots import plot_loss_history, plot_accuracy_history

In [2]:
# Prepare the raw audio files, then generate spectrograms from them using the
# "soundfile" backend. Judging by the recorded output of this cell, both steps
# skip work whose output folders already exist, so re-running is cheap.
# The order matters: spectrogram generation runs after the audio preparation.
prepare_audio_files()
generate_all_spectrograms(backend="soundfile")

Silence folder /home/jknyspel/Documents/Code/DeepLearningSpeechRecognition/Dataset/train/audio/silence already exists. Skipping.
Already extended. Skipping.
Output directory /home/jknyspel/Documents/Code/DeepLearningSpeechRecognition/Dataset/train/spectrograms already exists. Skipping.
Output directory /home/jknyspel/Documents/Code/DeepLearningSpeechRecognition/Dataset/test/spectrograms already exists. Skipping.


In [3]:
# Split the labelled spectrogram paths into training and validation subsets.
train_paths, val_paths = get_divided_paths_with_labels()

# Fixed seed so both shuffles (train first, then validation) are reproducible.
random.seed(42)
random.shuffle(train_paths)
random.shuffle(val_paths)

# Load every spectrogram eagerly, keeping the shuffled order.
train = list(map(load_spectrogram_for_path, train_paths))
validation = list(map(load_spectrogram_for_path, val_paths))

In [4]:
# Convert the loaded spectrograms to model inputs/targets. The label mapping
# produced for the training set is reused for the validation set.
X_train, y_train, label_indexes = spectrograms_to_x_y(train)
X_validation, y_validation, _ = spectrograms_to_x_y(validation, label_indexes)

# Longest second-axis size across both subsets; every sample is padded to it.
max_length = max(
    sample.shape[1]
    for subset in (X_validation, X_train)
    for sample in subset
)
X_validation = pad_to_length(X_validation, max_length)
X_train = pad_to_length(X_train, max_length)

In [5]:
from typing import Any
import optimization
from skopt.space import Real


def evaluate_cnn_params(epochs: int, **params: Any) -> float:
    """Train a CNN with the given hyperparameters and return its validation accuracy.

    Used as the objective of the hyperparameter search: a fresh ``CnnModel`` is
    built and trained on the notebook-level ``X_train``/``y_train`` data and
    validated on ``X_validation``/``y_validation``.

    Args:
        epochs: Number of training epochs. It is also passed as
            ``validate_every`` so validation is only meant to run on the last epoch.
        **params: Sampled hyperparameter values. Required keys:
            ``learning_rate``, ``lr_decay``, ``inv_beta_1``, ``inv_beta_2``,
            ``eps``, ``classifier_dropout_1``, ``classifier_dropout_2``,
            ``classifier_dropout_3``. The ``inv_beta_*`` values are searched as
            ``1 - beta`` and converted back before being handed to the model.

    Returns:
        The last entry of the training history's ``val_accuracy``.

    Raises:
        KeyError: If any required hyperparameter is missing from ``params``.
    """
    cnn_model = CnnModel(
        classes=labels,
        learning_rate=params["learning_rate"],
        lr_decay=params["lr_decay"],
        # The search samples 1 - beta; convert back to the beta values.
        beta_1=1 - params["inv_beta_1"],
        beta_2=1 - params["inv_beta_2"],
        eps=params["eps"],
        classifier_dropout_1=params["classifier_dropout_1"],
        classifier_dropout_2=params["classifier_dropout_2"],
        classifier_dropout_3=params["classifier_dropout_3"],
        print_every=None,  # Disable printing epoch info
        validate_every=epochs,  # Validate only on last epoch
        seed=42,
    )

    cnn_model.train((X_train, y_train), (X_validation, y_validation), epochs=epochs, batch_size=32)
    return cnn_model.get_history().val_accuracy[-1]

# Search space for the CNN hyperparameters. Optimizer-related values span
# several orders of magnitude and are sampled log-uniformly; dropout rates
# are sampled uniformly. The betas are searched as 1 - beta ("inv_beta_*").
cnn_spaces = dict(
    learning_rate=Real(1e-6, 1e-3, prior="log-uniform"),
    lr_decay=Real(1e-6, 1e-2, prior="log-uniform"),
    inv_beta_1=Real(1e-4, 0.5, prior="log-uniform"),
    inv_beta_2=Real(1e-6, 0.1, prior="log-uniform"),
    eps=Real(1e-8, 1e-3, prior="log-uniform"),
    classifier_dropout_1=Real(0.0, 0.3, prior="uniform"),
    classifier_dropout_2=Real(0.0, 0.3, prior="uniform"),
    classifier_dropout_3=Real(0.0, 0.3, prior="uniform"),
)

# Run the search over the CNN space; each candidate is trained for 50 epochs.
best_cnn_params_from_search, best_cnn_accuracy = optimization.bayes_search(
    lambda **sampled: evaluate_cnn_params(epochs=50, **sampled),
    cnn_spaces,
    iterations=2,
)

# Translate the winning sample into model keyword arguments: the searched
# "inv_beta_*" values are converted back to the beta values.
best_cnn_params = dict(
    learning_rate=best_cnn_params_from_search["learning_rate"],
    lr_decay=best_cnn_params_from_search["lr_decay"],
    beta_1=1 - best_cnn_params_from_search["inv_beta_1"],
    beta_2=1 - best_cnn_params_from_search["inv_beta_2"],
    eps=best_cnn_params_from_search["eps"],
    classifier_dropout_1=best_cnn_params_from_search["classifier_dropout_1"],
    classifier_dropout_2=best_cnn_params_from_search["classifier_dropout_2"],
    classifier_dropout_3=best_cnn_params_from_search["classifier_dropout_3"],
)

{classifier_dropout_1=0.0042, classifier_dropout_2=0.1834, classifier_dropout_3=0.2591, eps=8.4356e-06, inv_beta_1=0.0764, inv_beta_2=1.5100e-04, learning_rate=1.7422e-04, lr_decay=1.5753e-05} -> 91.10412
{classifier_dropout_1=0.1364, classifier_dropout_2=0.2498, classifier_dropout_3=0.0207, eps=3.9558e-07, inv_beta_1=0.0062, inv_beta_2=3.0996e-05, learning_rate=4.3379e-06, lr_decay=4.6450e-06} -> 48.38293

Best parameters: classifier_dropout_1=0.0042, classifier_dropout_2=0.1834, classifier_dropout_3=0.2591, eps=8.4356e-06, inv_beta_1=0.0764, inv_beta_2=1.5100e-04, learning_rate=1.7422e-04, lr_decay=1.5753e-05
Best score: 91.10412


In [6]:
def transformer_coeffs_to_params(**params: Any) -> dict[str, Any]:
    """Convert relative size coefficients into concrete transformer dimensions.

    Each ``<name>_coeff`` is divided by the mean of the four coefficients and
    multiplied by the default value of ``<name>``, so equal coefficients yield
    exactly the defaults (512 / 8 / 6 / 2048).

    Args:
        **params: Must contain ``embedding_dimension_coeff``,
            ``num_attention_heads_coeff``, ``num_encoder_layers_coeff`` and
            ``dim_feedforward_coeff``. Any other keys are ignored.

    Returns:
        A dict with integer ``embedding_dimension``, ``num_attention_heads``,
        ``num_encoder_layers`` and ``dim_feedforward``. Every value is at
        least 1, and the embedding dimension is even and a multiple of the
        number of attention heads.

    Raises:
        KeyError: If one of the four coefficients is missing.
        ZeroDivisionError: If the coefficients average to zero.
    """
    default_params = {
        "embedding_dimension": 512,
        "num_attention_heads": 8,
        "num_encoder_layers": 6,
        "dim_feedforward": 2048,
    }

    coeffs = {name: params[f"{name}_coeff"] for name in default_params}
    coeff_mean = sum(coeffs.values()) / len(coeffs)

    # int() truncates, so a small coefficient next to large ones can round a
    # size down to 0 (e.g. 0.1 vs. three 0.9s gives int(0.857) == 0 layers).
    # Clamp so every structural size stays usable.
    transformed_params = {
        name: max(1, int(coeffs[name] / coeff_mean * default))
        for name, default in default_params.items()
    }

    # Embedding dimension must be even. It is additionally rounded up to a
    # multiple of the head count.
    # NOTE(review): this assumes TransformerModel follows the PyTorch
    # multi-head attention rule (embed_dim % num_heads == 0) - confirm.
    heads = transformed_params["num_attention_heads"]
    step = heads if heads % 2 == 0 else 2 * heads  # lcm(2, heads)
    remainder = transformed_params["embedding_dimension"] % step
    if remainder:
        transformed_params["embedding_dimension"] += step - remainder

    return transformed_params

def evaluate_transformer_params(epochs: int, **params: Any) -> float:
    """Train a transformer with the given hyperparameters and return its validation accuracy.

    Used as the objective of the hyperparameter search: a fresh
    ``TransformerModel`` is built and trained on the notebook-level
    ``X_train``/``y_train`` data and validated on ``X_validation``/``y_validation``.

    Args:
        epochs: Number of training epochs. It is also passed as
            ``validate_every`` so validation is only meant to run on the last epoch.
        **params: Sampled hyperparameter values. Required keys: the four
            ``*_coeff`` size coefficients consumed by
            ``transformer_coeffs_to_params``, plus ``dropout``,
            ``learning_rate``, ``lr_decay``, ``inv_beta_1``, ``inv_beta_2``
            and ``eps``. The ``inv_beta_*`` values are searched as
            ``1 - beta`` and converted back before being handed to the model.

    Returns:
        The last entry of the training history's ``val_accuracy``.

    Raises:
        KeyError: If any required hyperparameter is missing from ``params``.
    """
    transformer_model = TransformerModel(
        classes=labels,
        # Architecture sizes are derived from the relative coefficients.
        **transformer_coeffs_to_params(**params),
        dropout=params["dropout"],
        learning_rate=params["learning_rate"],
        lr_decay=params["lr_decay"],
        # The search samples 1 - beta; convert back to the beta values.
        beta_1=1 - params["inv_beta_1"],
        beta_2=1 - params["inv_beta_2"],
        eps=params["eps"],
        print_every=None,  # Disable printing epoch info
        validate_every=epochs,  # Validate only on last epoch
        seed=42,
    )

    transformer_model.train((X_train, y_train), (X_validation, y_validation), epochs=epochs, batch_size=8)
    return transformer_model.get_history().val_accuracy[-1]

# Search space for the transformer. The four "*_coeff" entries are relative
# size coefficients (turned into concrete dimensions by
# transformer_coeffs_to_params); the remaining entries configure dropout and
# the optimizer, with the betas searched as 1 - beta ("inv_beta_*").
transformer_spaces = dict(
    embedding_dimension_coeff=Real(0.1, 0.9, prior="uniform"),
    num_attention_heads_coeff=Real(0.1, 0.9, prior="uniform"),
    num_encoder_layers_coeff=Real(0.1, 0.9, prior="uniform"),
    dim_feedforward_coeff=Real(0.1, 0.9, prior="uniform"),
    dropout=Real(0.0, 0.3, prior="uniform"),
    learning_rate=Real(1e-5, 1e-3, prior="log-uniform"),
    lr_decay=Real(1e-6, 1e-2, prior="log-uniform"),
    inv_beta_1=Real(1e-4, 0.5, prior="log-uniform"),
    inv_beta_2=Real(1e-6, 0.1, prior="log-uniform"),
    eps=Real(1e-8, 1e-3, prior="log-uniform"),
)

# Run the search over the transformer space; each candidate is trained for 5 epochs.
best_transformer_params_from_search, best_transformer_accuracy = optimization.bayes_search(
    lambda **sampled: evaluate_transformer_params(epochs=5, **sampled),
    transformer_spaces,
    iterations=2,
)

# Translate the winning sample into model keyword arguments: size coefficients
# become concrete dimensions and "inv_beta_*" values become the beta values.
best_transformer_params = transformer_coeffs_to_params(**best_transformer_params_from_search) | {
    "dropout": best_transformer_params_from_search["dropout"],
    "learning_rate": best_transformer_params_from_search["learning_rate"],
    "lr_decay": best_transformer_params_from_search["lr_decay"],
    "beta_1": 1 - best_transformer_params_from_search["inv_beta_1"],
    "beta_2": 1 - best_transformer_params_from_search["inv_beta_2"],
    "eps": best_transformer_params_from_search["eps"],
}

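# Manual alternative to the search result above: uncomment to use these fixed
# values instead of the parameters found by the search.
# best_transformer_params = {
#     "embedding_dimension": 512,
#     "num_attention_heads": 8,
#     "num_encoder_layers": 6,
#     "dim_feedforward": 2048,
#     "dropout": 0.1,
#     "learning_rate": 1e-4,
#     "lr_decay": 1e-6,
#     "beta_1": 0.9,
#     "beta_2": 0.98,
#     "eps": 1e-9
# }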

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

In [8]:
def train_and_evaluate_transformer(params, epochs=5, batch_size=32):
    """Train a transformer with ``params`` and compute validation metrics.

    The model is trained on the notebook-level ``X_train``/``y_train`` data and
    evaluated on ``X_validation``/``y_validation``.

    Args:
        params: Dict of ``TransformerModel`` keyword arguments. Missing keys
            fall back to the defaults listed in the constructor call below.
        epochs: Number of training epochs (default 5, the previous fixed value).
        batch_size: Training batch size (default 32, the previous fixed
            value). Lower it if the GPU runs out of memory; the
            hyperparameter search in this notebook trains with 8.

    Returns:
        A dict with keys ``params`` (the input dict, not copied), ``accuracy``,
        per-class ``precision`` / ``recall`` / ``f1_score`` arrays,
        ``confusion_matrix`` and the model's training ``history``.
    """
    model = TransformerModel(
        classes=labels,
        embedding_dimension=params.get("embedding_dimension", 512),
        num_attention_heads=params.get("num_attention_heads", 8),
        num_encoder_layers=params.get("num_encoder_layers", 6),
        dim_feedforward=params.get("dim_feedforward", 2048),
        dropout=params.get("dropout", 0.1),
        learning_rate=params.get("learning_rate", 1e-4),
        lr_decay=params.get("lr_decay", 0.0),
        beta_1=params.get("beta_1", 0.9),
        beta_2=params.get("beta_2", 0.98),
        eps=params.get("eps", 1e-9),
        print_every=None,
        validate_every=1,
        seed=42
    )

    model.train((X_train, y_train), (X_validation, y_validation), epochs=epochs, batch_size=batch_size)

    y_pred_classes = model.predict(X_validation)

    # One index per entry of `labels`; shared by all per-class metrics so that
    # their shapes always agree with each other and with the tick labels used
    # when plotting. Assumes targets are integer class indices, as the
    # precision/recall call already did.
    class_indexes = list(range(len(labels)))

    acc = accuracy_score(y_validation, y_pred_classes)
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        y_validation, y_pred_classes, average=None, labels=class_indexes
    )
    # Without `labels=`, classes absent from both the targets and the
    # predictions would be dropped and the matrix would shrink.
    cm = confusion_matrix(y_validation, y_pred_classes, labels=class_indexes)
    history = model.get_history()

    return {
        "params": params,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "confusion_matrix": cm,
        "history": history
    }

def save_transformer_results_to_json(results, filename):
    """Write experiment results to ``../Outputs/<filename>`` as indented JSON.

    Args:
        results: Iterable of result dicts with keys ``params``, ``accuracy``,
            ``precision``, ``recall``, ``f1_score``, ``confusion_matrix``
            (the last four must support ``.tolist()``) and ``history`` (an
            object exposing ``loss``, ``val_loss``, ``accuracy`` and
            ``val_accuracy``).
        filename: Name of the file to create inside ``../Outputs`` (relative
            to the current working directory). An existing file is overwritten.
    """
    # Local import keeps this definition self-contained within its cell.
    from pathlib import Path

    serializable_results = [
        {
            "params": result["params"],
            "accuracy": result["accuracy"],
            # Arrays are not JSON serializable; convert them to nested lists.
            "precision": result["precision"].tolist(),
            "recall": result["recall"].tolist(),
            "f1_score": result["f1_score"].tolist(),
            "confusion_matrix": result["confusion_matrix"].tolist(),
            "history": {
                "loss": result["history"].loss,
                "val_loss": result["history"].val_loss,
                "accuracy": result["history"].accuracy,
                "val_accuracy": result["history"].val_accuracy,
            },
        }
        for result in results
    ]

    output_path = Path("../Outputs") / filename
    # Create the output folder on first use instead of failing with
    # FileNotFoundError after all the training has already been done.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(serializable_results, f, indent=4)

# One-at-a-time sweep: every configuration is the best parameter set found by
# the search with exactly one hyperparameter overridden.
# NOTE(review): overriding embedding_dimension or num_attention_heads on its
# own may yield combinations the model rejects - verify against TransformerModel.
transformer_param_grid = [
    {**best_transformer_params, param_name: value}
    for param_name, values in (
        ("embedding_dimension", (256, 512, 1024)),
        ("num_attention_heads", (4, 8, 16)),
        ("dropout", (0.1, 0.2, 0.3)),
        ("learning_rate", (1e-3, 1e-4, 1e-5)),
    )
    for value in values
]

# Train and evaluate one transformer per grid entry.
transformer_results = []
for params in transformer_param_grid:
    result = train_and_evaluate_transformer(params)
    transformer_results.append(result)
    # Checkpoint after every finished run: if a later configuration crashes
    # (e.g. CUDA out of memory), the completed runs are already on disk.
    save_transformer_results_to_json(transformer_results, "transformer_experiments_results.json")

# Final write; also covers an empty grid, for which nothing was saved above.
save_transformer_results_to_json(transformer_results, "transformer_experiments_results.json")

# Print the metrics of every trained transformer and draw its confusion matrix.
for i, result in enumerate(transformer_results):
    print(f"\nModel {i+1}")
    print(f"Params: {result['params']}")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print(f"Precision: {result['precision']}")
    print(f"Recall: {result['recall']}")
    print(f"F1-Score: {result['f1_score']}")

    # Rows are true classes, columns are predicted classes.
    plt.figure(figsize=(10, 8))
    heatmap_axes = sns.heatmap(
        result["confusion_matrix"],
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
    )
    heatmap_axes.set_xlabel('Predicted labels')
    heatmap_axes.set_ylabel('True labels')
    heatmap_axes.set_title(f'Confusion Matrix for Model {i+1}')
    plt.show()

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 7.64 GiB of which 41.25 MiB is free. Process 207127 has 5.97 MiB memory in use. Process 302350 has 4.62 GiB memory in use. Including non-PyTorch memory, this process has 2.60 GiB memory in use. Of the allocated memory 2.43 GiB is allocated by PyTorch, and 3.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Testing CNN

def train_and_evaluate_cnn(params, epochs=5, batch_size=32):
    """Train a CNN with ``params`` and compute validation metrics.

    The model is trained on the notebook-level ``X_train``/``y_train`` data and
    evaluated on ``X_validation``/``y_validation``.

    Args:
        params: Dict of ``CnnModel`` keyword arguments. Missing keys fall back
            to the defaults listed in the constructor call below.
        epochs: Number of training epochs (default 5, the previous fixed value).
        batch_size: Training batch size (default 32, the previous fixed value).

    Returns:
        A dict with keys ``params`` (the input dict, not copied), ``accuracy``,
        per-class ``precision`` / ``recall`` / ``f1_score`` arrays,
        ``confusion_matrix`` and the model's training ``history``.
    """
    model = CnnModel(
        classes=labels,
        learning_rate=params.get("learning_rate", 1e-3),
        lr_decay=params.get("lr_decay", 1e-4),
        beta_1=params.get("beta_1", 0.9),
        beta_2=params.get("beta_2", 0.999),
        eps=params.get("eps", 1e-8),
        classifier_dropout_1=params.get("classifier_dropout_1", 0.3),
        classifier_dropout_2=params.get("classifier_dropout_2", 0.3),
        classifier_dropout_3=params.get("classifier_dropout_3", 0.1),
        print_every=None,
        validate_every=1,
        seed=42
    )

    model.train((X_train, y_train), (X_validation, y_validation), epochs=epochs, batch_size=batch_size)

    y_pred_classes = model.predict(X_validation)

    # One index per entry of `labels`; shared by all per-class metrics so that
    # their shapes always agree with each other and with the tick labels used
    # when plotting. Assumes targets are integer class indices, as the
    # precision/recall call already did.
    class_indexes = list(range(len(labels)))

    acc = accuracy_score(y_validation, y_pred_classes)
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        y_validation, y_pred_classes, average=None, labels=class_indexes
    )
    # Without `labels=`, classes absent from both the targets and the
    # predictions would be dropped and the matrix would shrink.
    cm = confusion_matrix(y_validation, y_pred_classes, labels=class_indexes)
    history = model.get_history()

    return {
        "params": params,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "confusion_matrix": cm,
        "history": history
    }

def save_cnn_results_to_json(results, filename):
    """Write CNN experiment results to ``../Outputs/<filename>`` as indented JSON.

    Args:
        results: Iterable of result dicts with keys ``params``, ``accuracy``,
            ``precision``, ``recall``, ``f1_score``, ``confusion_matrix``
            (the last four must support ``.tolist()``) and ``history`` (an
            object exposing ``loss``, ``val_loss``, ``accuracy`` and
            ``val_accuracy``).
        filename: Name of the file to create inside ``../Outputs`` (relative
            to the current working directory). An existing file is overwritten.
    """
    # Local import keeps this definition self-contained within its cell.
    from pathlib import Path

    def to_serializable(result):
        # Arrays are not JSON serializable; convert them to nested lists.
        history = result["history"]
        return {
            "params": result["params"],
            "accuracy": result["accuracy"],
            "precision": result["precision"].tolist(),
            "recall": result["recall"].tolist(),
            "f1_score": result["f1_score"].tolist(),
            "confusion_matrix": result["confusion_matrix"].tolist(),
            "history": {
                "loss": history.loss,
                "val_loss": history.val_loss,
                "accuracy": history.accuracy,
                "val_accuracy": history.val_accuracy,
            },
        }

    serializable_results = [to_serializable(result) for result in results]

    output_path = Path("../Outputs") / filename
    # Create the output folder on first use instead of failing with
    # FileNotFoundError after all the training has already been done.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(serializable_results, f, indent=4)

# One-at-a-time sweep around the best CNN parameters: first three dropout
# settings (all three classifier dropouts changed together), then three
# learning rates.
cnn_param_grid = [
    *(
        {
            **best_cnn_params,
            "classifier_dropout_1": dropout_1,
            "classifier_dropout_2": dropout_2,
            "classifier_dropout_3": dropout_3,
        }
        for dropout_1, dropout_2, dropout_3 in (
            (0.2, 0.2, 0.1),
            (0.3, 0.3, 0.1),
            (0.4, 0.4, 0.2),
        )
    ),
    *(
        {**best_cnn_params, "learning_rate": learning_rate}
        for learning_rate in (1e-3, 1e-4, 5e-5)
    ),
]

# Train and evaluate one CNN per grid entry.
cnn_results = []

for params in cnn_param_grid:
    result = train_and_evaluate_cnn(params)
    cnn_results.append(result)
    # Checkpoint after every finished run: if a later configuration crashes
    # (e.g. CUDA out of memory), the completed runs are already on disk.
    save_cnn_results_to_json(cnn_results, "cnn_experiments_results.json")

# Final write; also covers an empty grid, for which nothing was saved above.
save_cnn_results_to_json(cnn_results, "cnn_experiments_results.json")

# Print the metrics of every trained CNN and draw its confusion matrix.
for i, result in enumerate(cnn_results):
    print(f"\nModel {i+1}")
    print(f"Params: {result['params']}")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print(f"Precision: {result['precision']}")
    print(f"Recall: {result['recall']}")
    print(f"F1-Score: {result['f1_score']}")

    # Rows are true classes, columns are predicted classes.
    plt.figure(figsize=(10, 8))
    heatmap_axes = sns.heatmap(
        result["confusion_matrix"],
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
    )
    heatmap_axes.set_xlabel('Predicted labels')
    heatmap_axes.set_ylabel('True labels')
    heatmap_axes.set_title(f'Confusion Matrix for Model {i+1}')
    plt.show()