In [4]:
%%file hparams_ecapatdnn_fbanks.yaml
# #################################
# Training ECAPA-TDNN embeddings for language identification (LID).
#
# Authors:
#  * Hwidong Na
#  * Mirco Ravanelli
#  * Pavlo Ruban
# #################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]

# Set up folders for reading from and writing to
# Dataset will be downloaded to the `data_folder`
data_folder: !PLACEHOLDER # e.g. /localscratch/common_voice_kpd/
output_folder: !ref /home/ulaval.ca/maelr5/scratch/parkinsons-results/ECAPA-TDNN/full_dataset/fbank/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Path where data manifest files are stored
train_annotation: /home/ulaval.ca/maelr5/parkinsons/train.json
valid_annotation: /home/ulaval.ca/maelr5/parkinsons/valid.json
test_annotation: /home/ulaval.ca/maelr5/parkinsons/test.json

skip_prep: False

# The train logger writes training statistics to a file, as well as stdout.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

####################### Training Parameters ####################################

# Feature parameters btw: 40 - 80
n_mels: 40
sample_rate: 16000
number_of_epochs: 15
batch_size: 2
n_classes: 2
emb_dim: 192 # dimensionality of the embeddings
emb_channels: [1024, 1024, 1024, 1024, 3072]
emb_attention_channels: 128

# Dataloaders
num_workers: 4
drop_last: True
train_dataloader_options:
    num_workers: !ref <num_workers>
    batch_size: !ref <batch_size>
    drop_last: !ref <drop_last>
    shuffle: True

test_dataloader_options:
    num_workers: !ref <num_workers>
    batch_size: !ref <batch_size>
    shuffle: True

############################## Augmentations ###################################

# Feature extraction
compute_features: !new:speechbrain.lobes.features.Fbank
    n_mels: !ref <n_mels>

# Mean and std normalization of the input features
mean_var_norm_input: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

############################## Models ##########################################

# To design a custom model, either just edit the simple CustomModel
# class that's listed here, or replace this `!new` call with a line
# pointing to a different file you've defined.

# Embedding Model
embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
    input_size: !ref <n_mels>
    activation: !name:torch.nn.LeakyReLU
    channels: !ref <emb_channels>
    kernel_sizes: [5, 3, 3, 3, 1]
    dilations: [1, 2, 3, 4, 1]
    attention_channels: !ref <emb_attention_channels>
    lin_neurons: !ref <emb_dim>

# Classifier based on cosine distance
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
    input_size: !ref <emb_dim>
    out_neurons: !ref <n_classes>

# The first object passed to the Brain class is this "Epoch Counter"
# which is saved by the Checkpointer so that training can be resumed
# if it gets interrupted at any point.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

# Objects in "modules" dict will have their parameters moved to the correct
# device, as well as having train()/eval() called on them by the Brain class.
modules:
    compute_features: !ref <compute_features>
    embedding_model: !ref <embedding_model>
    mean_var_norm_input: !ref <mean_var_norm_input>
    classifier: !ref <classifier>

# Additive Angular Margin
compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
    loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
        margin: 0.2
        scale: 30

# Learning rates
lr: 0.0001
lr_final: 0.00001


# This optimizer will be constructed by the Brain class after all parameters
# are moved to the correct device. Then it will be added to the checkpointer.
opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002


# Linear lr decay
lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
    initial_value: !ref <lr>
    final_value: !ref <lr_final>
    epoch_count: !ref <number_of_epochs>

############################## Logging and Pretrainer ##########################

# This object is used for saving the state of training both so that it
# can be resumed if it gets interrupted, and also so that the best checkpoint
# can be later loaded for evaluation or inference.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        normalizer_input: !ref <mean_var_norm_input>
        embedding_model: !ref <embedding_model>
        classifier: !ref <classifier>
        counter: !ref <epoch_counter>

# Load pretrained embedding module
# Note: in this case, we pre-train with the ECAPA-TDNN model trained on voxceleb
# for speaker-id (this leads to a performance improvement).
embedding_model_path: speechbrain/spkrec-ecapa-voxceleb/embedding_model.ckpt

# Pretrained ECAPA embeddings from SpeakerID on VoxCeleb
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: !ref <save_folder>
    loadables:
        embedding_model: !ref <embedding_model>
    paths:
        embedding_model: !ref <embedding_model_path>


Overwriting hparams_ecapatdnn_fbanks.yaml


In [5]:
%%file train_ecapatdnn_fbanks.py

#!/usr/bin/env python3
import os
import sys

import torchaudio
# from common_language_prepare import prepare_common_language
from hyperpyyaml import load_hyperpyyaml

import speechbrain as sb
from speechbrain.utils.logger import get_logger

"""Recipe for training a LID system with CommonLanguage.

To run this recipe, do the following:
> python train.py hparams/train_ecapa_tdnn.yaml

Author
------
 * Mirco Ravanelli 2021
 * Pavlo Ruban 2021
"""

# logger = get_logger(__name__)


# Brain class for Language ID training
class DetectorBrain(sb.Brain):
    def prepare_features(self, wavs, stage):
        """Prepare the features for computation, including augmentation.

        Arguments
        ---------
        wavs : tuple
            Input signals (tensor) and their relative lengths (tensor).
        stage : sb.Stage
            The current stage of training.

        Returns
        -------
        feats : torch.Tensor
            Computed features.
        lens : torch.Tensor
            The length of the corresponding features.
        """
        wavs, lens = wavs

        # Feature extraction and normalization
        feats = self.modules.compute_features(wavs)
        feats = self.modules.mean_var_norm_input(feats, lens)

        return feats, lens

    def compute_forward(self, batch, stage):
        """Runs all the computation of that transforms the input into the
        output probabilities over the N classes.

        Arguments
        ---------
        batch : PaddedBatch
            This batch object contains all the relevant tensors for computation.
        stage : sb.Stage
            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.

        Returns
        -------
        predictions : torch.Tensor
            torch.Tensor that contains the posterior probabilities over the N classes.
        """

        # We first move the batch to the appropriate device.
        batch = batch.to(self.device)

        # Compute features, embeddings and output
        feats, lens = self.prepare_features(batch.sig, stage)
        embeddings = self.modules.embedding_model(feats)
        outputs = self.modules.classifier(embeddings)

        return outputs, lens

    def compute_objectives(self, inputs, batch, stage):
        """Computes the loss given the predicted and targeted outputs.

        Arguments
        ---------
        inputs : tensors
            The output tensors from `compute_forward`.
        batch : PaddedBatch
            This batch object contains all the relevant tensors for computation.
        stage : sb.Stage
            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.

        Returns
        -------
        loss : torch.Tensor
            A one-element tensor used for backpropagating the gradient.
        """

        predictions, lens = inputs

        targets = batch.detection_id_encoded.data

        # # Concatenate labels (due to data augmentation)
        # if stage == sb.Stage.TRAIN:
        #     if hasattr(self.hparams, "wav_augment"):
        #         targets = self.hparams.wav_augment.replicate_labels(targets)
        #         if hasattr(self.hparams.lr_annealing, "on_batch_end"):
        #             self.hparams.lr_annealing.on_batch_end(self.optimizer)

        loss = self.hparams.compute_cost(predictions, targets)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(batch.id, predictions, targets, lens)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Gets called at the beginning of each epoch.

        Arguments
        ---------
        stage : sb.Stage
            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
        epoch : int
            The currently-starting epoch. This is passed
            `None` during the test stage.
        """

        # Set up evaluation-only statistics trackers
        if stage != sb.Stage.TRAIN:
            self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Gets called at the end of an epoch.

        Arguments
        ---------
        stage : sb.Stage
            One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST
        stage_loss : float
            The average loss for all of the data processed in this stage.
        epoch : int
            The currently-starting epoch. This is passed
            `None` during the test stage.
        """

        # Store the train loss until the validation stage.
        if stage == sb.Stage.TRAIN:
            self.train_loss = stage_loss

        # Summarize the statistics from the stage for record-keeping.
        else:
            stats = {
                "loss": stage_loss,
                "error": self.error_metrics.summarize("average"),
            }

        # At the end of validation...
        if stage == sb.Stage.VALID:
            old_lr, new_lr = self.hparams.lr_annealing(epoch)
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            # The train_logger writes a summary to stdout and to the logfile.
            self.hparams.train_logger.log_stats(
                {"Epoch": epoch, "lr": old_lr},
                train_stats={"loss": self.train_loss},
                valid_stats=stats,
            )

            # Save the current checkpoint and delete previous checkpoints,
            self.checkpointer.save_and_keep_only(meta=stats, min_keys=["error"])

        # We also write statistics about test data to stdout and to the logfile.
        if stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stats,
            )


def dataio_prep(hparams):
    """This function prepares the datasets to be used in the brain class.
    It also defines the data processing pipeline through user-defined functions.
    We expect `prepare_common_language` to have been called before this,
    so that the `train.csv`, `valid.csv`,  and `test.csv` manifest files
    are available.

    Arguments
    ---------
    hparams : dict
        This dictionary is loaded from the `train.yaml` file, and it includes
        all the hyperparameters needed for dataset construction and loading.

    Returns
    -------
    datasets : dict
        Contains two keys, "train" and "dev" that correspond
        to the appropriate DynamicItemDataset object.
    """

    # Initialization of the label encoder. The label encoder assigns to each
    # of the observed label a unique index (e.g, 'lang01': 0, 'lang02': 1, ..)
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # Define audio pipeline
    @sb.utils.data_pipeline.takes("path")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        """Load the signal, and pass it and its length to the corruption class.
        This is done on the CPU in the `collate_fn`."""
        sig, _ = torchaudio.load(wav)
        # sig = torchaudio.functional.resample(sig.squeeze(0), fs, hparams["sample_rate"])
        sig = sig.transpose(0, 1).squeeze(1)

        return sig

    # Define label pipeline:
    @sb.utils.data_pipeline.takes("detection")
    @sb.utils.data_pipeline.provides("detection", "detection_id_encoded")
    def label_pipeline(detection_id):
        yield detection_id
        detection_id_encoded = label_encoder.encode_label_torch(detection_id)
        yield detection_id_encoded

    # Define datasets. We also connect the dataset with the data processing
    # functions defined above.
    # datasets = {}
    # for dataset in ["train", "dev", "test"]:
    #     datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_csv(
    #         csv_path=hparams[f"{dataset}_csv"],
    #         replacements={"data_root": hparams["data_folder"]},
    #         dynamic_items=[audio_pipeline, label_pipeline],
    #         output_keys=["id", "sig", "language_encoded"],
    #     )

    datasets = {}
    data_info = {
        "train": hparams["train_annotation"],
        "valid": hparams["valid_annotation"],
        "test": hparams["test_annotation"],
    }

    for dataset in data_info:
        datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path=data_info[dataset],
            replacements={"data_root": hparams["data_folder"]},
            dynamic_items=[audio_pipeline, label_pipeline],
            output_keys=["id", "sig", "detection_id_encoded"],
        )

    # Load or compute the label encoder (with multi-GPU DDP support)
    # Please, take a look into the lab_enc_file to see the label to index
    # mapping.
    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=lab_enc_file,
        from_didatasets=[datasets["train"]],
        output_key="detection",
    )

    return datasets, label_encoder


# Recipe begins!
if __name__ == "__main__":
    # Reading command line arguments.
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Load hyperparameters file with command-line overrides.
    with open(hparams_file, encoding="utf-8") as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Create experiment directory
    sb.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Create dataset objects "train", "valid", and "test" and label_encoder
    datasets, label_encoder = dataio_prep(hparams)

    # Initialize the Brain object to prepare for mask training.
    detection_brain = DetectorBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # The `fit()` method iterates the training loop, calling the methods
    # necessary to update the parameters of the model. Since all objects
    # with changing state are managed by the Checkpointer, training can be
    # stopped at any point, and will be resumed on next call.
    detection_brain.fit(
        epoch_counter=detection_brain.hparams.epoch_counter,
        train_set=datasets["train"],
        valid_set=datasets["valid"],
        train_loader_kwargs=hparams["train_dataloader_options"],
        valid_loader_kwargs=hparams["test_dataloader_options"],
    )

    # Load the best checkpoint for evaluation
    test_stats = detection_brain.evaluate(
        test_set=datasets["test"],
        min_key="error",
        test_loader_kwargs=hparams["test_dataloader_options"],
    )
    

Overwriting train_ecapatdnn_fbanks.py


In [6]:

import sys
# Run Training
!{sys.executable} train_ecapatdnn_fbanks.py hparams_ecapatdnn_fbanks.yaml  --data_folder='/home/ulaval.ca/maelr5/scratch/parkinsons' --device='cuda:0' # --number_of_epochs=5 --use_tensorboard=True


  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
speechbrain.utils.quirks - Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
speechbrain.utils.quirks - Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: /home/ulaval.ca/maelr5/scratch/parkinsons-results/ECAPA-TDNN/full_dataset/fbank/1986
speechbrain.dataio.encoder - Load called, but CategoricalEncoder is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.
speechbrain.core - Gradscaler enabled: `False`
speechbrain.core - Using training precision: `--precision=fp32`
speechbrain.core - Using evaluation precision: `--eval_precision=fp32`
speechbrain.core - DetectorBrain Model Statistics:
* Total Number of Trainable Parameters: 20.6M
* Total Number of Parameters: 20.6M
* Trainable Parameters represent 100.0

The output train_log file when running the run.sh file:

```
Epoch: 1, lr: 1.00e-04 - train loss: 4.20 - valid loss: 6.02, valid error: 3.28e-01
Epoch: 2, lr: 9.36e-05 - train loss: 3.00 - valid loss: 4.95, valid error: 4.69e-01
Epoch: 3, lr: 8.71e-05 - train loss: 2.50 - valid loss: 2.89, valid error: 3.91e-01
Epoch: 4, lr: 8.07e-05 - train loss: 2.04 - valid loss: 2.33, valid error: 2.66e-01
Epoch: 5, lr: 7.43e-05 - train loss: 1.89 - valid loss: 1.08, valid error: 1.56e-01
Epoch: 6, lr: 6.79e-05 - train loss: 1.58 - valid loss: 1.46, valid error: 2.03e-01
Epoch: 7, lr: 6.14e-05 - train loss: 1.42 - valid loss: 1.07, valid error: 1.41e-01
Epoch: 8, lr: 5.50e-05 - train loss: 1.36 - valid loss: 1.88, valid error: 2.34e-01
Epoch: 9, lr: 4.86e-05 - train loss: 1.31 - valid loss: 1.83, valid error: 2.19e-01
Epoch: 10, lr: 4.21e-05 - train loss: 1.44 - valid loss: 9.76e-01, valid error: 9.38e-02
Epoch: 11, lr: 3.57e-05 - train loss: 1.30 - valid loss: 1.44, valid error: 2.34e-01
Epoch: 12, lr: 2.93e-05 - train loss: 1.22 - valid loss: 1.01, valid error: 1.09e-01
Epoch: 13, lr: 2.29e-05 - train loss: 1.26 - valid loss: 1.02, valid error: 1.25e-01
Epoch: 14, lr: 1.64e-05 - train loss: 1.25 - valid loss: 1.37, valid error: 1.56e-01
Epoch: 15, lr: 1.00e-05 - train loss: 1.29 - valid loss: 1.11, valid error: 1.41e-01
Epoch loaded: 10 - test loss: 2.02, test error: 2.09e-01
```

### plot train/valid loss/accuracy during training

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt

def plot_training(event_file):
    # Extract data from the event file
    step = []  # Store the global steps
    values = []  # Store the values (e.g., loss or accuracy)
    tags = []  # Store the tags (e.g., "train_loss", "validation_accuracy")
    
    for summary in tf.compat.v1.train.summary_iterator(event_file):
        for value in summary.summary.value:
            # You can filter out the tags you're interested in (e.g., "train_loss", "accuracy")
            if value.HasField('simple_value'):  # Make sure it is a scalar value
                tags.append(value.tag)
                values.append(value.simple_value)
                step.append(summary.step)
    
    # Plot the loss
    plt.figure(figsize=(10, 6))
    
    # Example: Plotting a specific tag (e.g., "train_loss")
    for tag in set(tags):  # We loop through each unique tag
        # print(tag)
        if tag == 'error/valid':
            continue
        if tag == 'Epoch':
            continue
        if tag == 'acc/train':
            continue
        if tag == 'acc/valid':
            continue
        tag_indices = [i for i, t in enumerate(tags) if t == tag]
        tag_steps = [step[i] for i in tag_indices]
        tag_values = [values[i] for i in tag_indices]
        
        plt.plot(tag_steps, tag_values, label=tag)  # Plot each tag with its respective values
    
    # Customize plot
    plt.xlabel('Step')
    plt.ylabel('Value')
    plt.title('Loss during training')
    plt.legend()
    plt.grid(True)
    
    # Show plot
    plt.show()
    
    # Plot the acc
    plt.figure(figsize=(10, 6))
    
    # Example: Plotting a specific tag (e.g., "train_loss")
    for tag in set(tags):  # We loop through each unique tag
        # print(tag)
        if tag == 'error/valid':
            continue
        if tag == 'Epoch':
            continue
        if tag == 'loss/train':
            continue
        if tag == 'loss/valid':
            continue
        tag_indices = [i for i, t in enumerate(tags) if t == tag]
        tag_steps = [step[i] for i in tag_indices]
        tag_values = [values[i] for i in tag_indices]
        
        plt.plot(tag_steps, tag_values, label=tag)  # Plot each tag with its respective values
    
    # Customize plot
    plt.xlabel('Step')
    plt.ylabel('Value')
    plt.title('Accuracy during training')
    plt.legend()
    plt.grid(True)
    
    # Show plot
    plt.show()

event_file = "/home/ulaval.ca/maelr5/scratch/parkinsons-results/xvector/fulldataset/FBANKs/1986/tb_logs/events.out.tfevents.1745138757.ul-val-pr-gpu05.l.ul.ca.1159741.0"
plot_training(event_file)

