In [None]:
# Mount Google Drive under /content/drive; a later cell reads the GitHub
# token from '/content/drive/My Drive/MSc_data/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [None]:
%%capture
!pip install pydub
!pip install jiwer
!pip install torchaudio
!pip install librosa
!pip install audplot
!pip install datasets

In [None]:
%%capture
!pip install accelerate==0.27.2
!pip install transformers==4.40.2

In [None]:
import numpy as np
import pandas as pd
from pydub import AudioSegment
import json

import gdown
import zipfile

from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys
import joblib
from copy import deepcopy

import audplot
import audmetric
import librosa
import IPython.display as ipd

from datasets import load_dataset
from datasets import load_metric, Dataset, DatasetDict
import transformers
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor, AutoModelForPreTraining, EvalPrediction
from transformers import TrainingArguments, EarlyStoppingCallback, TrainerCallback
from transformers import (
    Trainer,
    is_apex_available,
)
from transformers import TrainerState, TrainerControl

from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

from typing import Any, Dict, List, Optional, Union
from packaging import version

from sklearn.metrics import classification_report

# Import EmoTale

In [None]:
# Read the personal GitHub access token from Drive so that it is never
# hard-coded in the notebook; surrounding whitespace/newlines are dropped.
token = Path('/content/drive/My Drive/MSc_data/github_token.txt').read_text().strip()

In [None]:
!git clone https://{token}@github.com/MajaHjuler/EmoTale.git

Cloning into 'EmoTale'...
remote: Enumerating objects: 845, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 845 (delta 8), reused 12 (delta 2), pack-reused 822[K
Receiving objects: 100% (845/845), 179.34 MiB | 24.73 MiB/s, done.
Resolving deltas: 100% (13/13), done.
Updating files: 100% (803/803), done.


In [None]:
# Build one metadata row per EmoTale recording by slicing fixed character
# positions out of the file name:
#   [0:2] language, [3:6] speaker id, [7] emotion letter, [-5] sentence id
#   (the character just before the ".wav" extension).
audio_folder = "/content/EmoTale/Data/"
emo_dict= {'A': 'anger', 'B': 'boredom', 'H': 'happiness', 'S': 'sadness', 'N': 'neutral'}
columns = ['path', 'filename', 'speaker', 'emotion', 'sentence', 'language']

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# (and everything derived from it) reproducible between runs.
filenames = sorted(name for name in os.listdir(audio_folder) if name.endswith(".wav"))

# Collect all rows first and create the frame once, instead of appending to
# the DataFrame row by row (which re-allocates on every insertion).
records = [
    {
        'path': os.path.join(audio_folder, filename),
        'filename': filename,
        'speaker': filename[3:6],
        'emotion': filename[7],
        'sentence': filename[-5],
        'language': filename[0:2],
    }
    for filename in filenames
]
df_EmoTale = pd.DataFrame(records, columns=columns)

# Replace the single-letter emotion code by its full name; letters missing
# from emo_dict become NaN.
df_EmoTale["emotion"] = df_EmoTale["emotion"].map(emo_dict)

print(f"dataset shape: {df_EmoTale.shape}")

dataset shape: (800, 6)


In [None]:
# Speaker metadata, keyed by the three-character speaker id taken from the file name.
EmoTale_gender_dict = {
    '001': 'F', '003': 'F', '004': 'M', '005': 'M', '006': 'M', '007': 'F',
    '008': 'F', '009': 'F', '010': 'F', '011': 'F', '012': 'F', '013': 'F',
    '014': 'F', '015': 'M', '016': 'F', '017': 'F', '018': 'M', '019': 'M',
}
EmoTale_age_dict = {
    '001': 22, '003': 23, '004': 24, '005': 24, '006': 22, '007': 27,
    '008': 12, '009': 12, '010': 21, '011': 25, '012': 26, '013': 24,
    '014': 9, '015': 24, '016': 39, '017': 25, '018': 25, '019': 26,
}

# Attach gender and age per recording (speakers missing from a dict get NaN).
df_EmoTale = df_EmoTale.assign(
    gender=df_EmoTale['speaker'].map(EmoTale_gender_dict),
    age=df_EmoTale['speaker'].map(EmoTale_age_dict),
)

# Keep only the recordings whose language code is 'DK'.
is_danish = df_EmoTale["language"] == 'DK'
df_EmoTale = df_EmoTale[is_danish]

# Model name, config, processor

In [None]:
# We need to specify the input and output column
input_column = "path"
output_column = "emotion"

In [None]:
# we need to distinguish the unique labels in our SER dataset
label_list = sorted(df_EmoTale['emotion'].unique())
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 5 classes: ['anger', 'boredom', 'happiness', 'neutral', 'sadness']


In [None]:
model_name_or_path = "chcaa/xls-r-300m-danish"
pooling_mode = "mean"

In [None]:
# config
# Model configuration: start from the pretrained checkpoint's config and
# register the emotion classes (name <-> id in both directions).
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=dict(zip(label_list, range(len(label_list)))),
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
)
# Custom attribute read by Wav2Vec2ForSpeechClassification to pick the pooling strategy.
config.pooling_mode = pooling_mode

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

In [None]:
#load features without tokenizer
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.sampling_rate#feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

preprocessor_config.json:   0%|          | 0.00/225 [00:00<?, ?B/s]

The target sampling rate: 16000


In [None]:
config.id2label

{0: 'anger', 1: 'boredom', 2: 'happiness', 3: 'neutral', 4: 'sadness'}

# Model

Before diving into the training part, we need to build our classification model based on the merge strategy.

In [None]:
@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Return container of ``Wav2Vec2ForSpeechClassification.forward``.

    Fields:
        loss: Training loss; ``None`` when no labels were passed to ``forward``.
        logits: Output of the classification head for each pooled utterance.
        hidden_states: ``hidden_states`` as returned by the wav2vec 2.0 encoder.
        attentions: ``attentions`` as returned by the wav2vec 2.0 encoder.
    """
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


In [None]:
class Wav2Vec2ClassificationHead(nn.Module):
    """Classification head placed on top of pooled wav2vec 2.0 features.

    Pipeline: dropout -> linear (hidden -> hidden) -> tanh -> dropout ->
    linear (hidden -> num_labels). The same dropout module is applied twice.

    Args:
        config: Object exposing ``hidden_size``, ``final_dropout`` and
            ``num_labels``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Map pooled features to logits; extra keyword arguments are ignored."""
        projected = self.dense(self.dropout(features))
        activated = self.dropout(torch.tanh(projected))
        return self.out_proj(activated)


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """wav2vec 2.0 encoder followed by temporal pooling and a classification head.

    The encoder's frame-level hidden states are collapsed over the time axis
    (``config.pooling_mode``: ``"mean"``, ``"sum"`` or ``"max"``) and the pooled
    vector is fed to ``Wav2Vec2ClassificationHead``.

    Args:
        config: Wav2Vec2 configuration that additionally carries ``pooling_mode``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Freeze the encoder's feature extractor so it is not updated during fine-tuning."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Pool ``hidden_states`` over dimension 1 (the frame axis).

        Args:
            hidden_states: Encoder output; dimension 1 is reduced.
            mode: One of ``"mean"``, ``"sum"`` or ``"max"``.

        Returns:
            The pooled tensor with dimension 1 removed.

        Raises:
            ValueError: If ``mode`` is not a supported pooling mode.
        """
        # NOTE(review): pooling runs over every frame, including frames that
        # stem from padding in a batch — confirm this is intended.
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            # ValueError is a subclass of Exception, so existing handlers keep working.
            raise ValueError(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Encode audio, pool over time, classify and optionally compute the loss.

        Args:
            input_values: Batch of raw waveforms for the encoder.
            attention_mask: Optional mask forwarded to the encoder.
            output_attentions: Forwarded to the encoder.
            output_hidden_states: Forwarded to the encoder.
            return_dict: Return a ``SpeechClassifierOutput`` instead of a tuple;
                defaults to ``config.use_return_dict``.
            labels: Optional targets. Integer labels select cross-entropy,
                ``num_labels == 1`` selects MSE regression, anything else
                selects multi-label BCE (unless ``config.problem_type`` is set).

        Returns:
            ``SpeechClassifierOutput`` or, when ``return_dict`` is false, a tuple
            ``(loss?, logits, *encoder_extras)``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # Infer the problem type once from the label dtype / number of outputs
            # and cache it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # Logits are (batch, 1) while labels are typically (batch,);
                    # without squeezing, MSELoss would broadcast to (batch, batch)
                    # and return a wrong loss.
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# Training

The data is processed so that we are ready to start setting up the training pipeline. We will make use of 🤗's [Trainer](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer) for which we essentially need to do the following:

- Define a data collator. In contrast to most NLP models, XLSR-Wav2Vec2 has a much larger input length than output length. *E.g.*, a sample of input length 50000 has an output length of no more than 100. Given the large input sizes, it is much more efficient to pad the training batches dynamically meaning that all training samples should only be padded to the longest sample in their batch and not the overall longest sample. Therefore, fine-tuning XLSR-Wav2Vec2 requires a special padding data collator, which we will define below

- Evaluation metric. During training, the model should be evaluated on classification accuracy (or on the mean squared error for a regression task). We should define a `compute_metrics` function accordingly.

- Load a pretrained checkpoint. We need to load a pretrained checkpoint and configure it correctly for training.

- Define the training configuration.

After having fine-tuned the model, we will evaluate it on the test data and verify that it has indeed learned to recognise the emotion expressed in the speech.

## Set-up Trainer

Let's start by defining the data collator. The code for the data collator was copied from [this example](https://github.com/huggingface/transformers/blob/9a06b6b11bdfc42eea08fa91d0c737d1863c99e3/examples/research_projects/wav2vec2/run_asr.py#L81).

Without going into too many details, in contrast to the common data collators, this data collator treats the `input_values` and `labels` differently and thus applies separate padding functions to them (again making use of XLSR-Wav2Vec2's context manager). This is necessary because in speech, input and output are of different modalities, meaning that they should not be treated by the same padding function.
Analogous to the common data collators, the padding tokens in the labels are replaced with `-100` so that those tokens are **not** taken into account when computing the loss. (In the classification setup used here, every label is a single class id, so the collator below only pads the `input_values`.)

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs of a batch and stacks
    one label per example into a tensor.

    Despite the (historical) "CTC" in its name, labels are NOT padded here:
    each example carries a single label (class id or regression target).

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor` or :class:`~transformers.Wav2Vec2FeatureExtractor`)
            The processor used for processing the data; only its ``pad`` method is called.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Unused by this collator; kept for interface compatibility.
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Unused by this collator; kept for interface compatibility.
    """

    processor: Union[Wav2Vec2Processor, Wav2Vec2FeatureExtractor]
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` (dicts with ``input_values`` and ``labels``) into one padded batch.

        Returns:
            The padded batch produced by ``processor.pad`` with an added
            ``labels`` tensor (``torch.long`` for integer labels, else ``torch.float``).
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        # Integer labels (built-in or NumPy integers) mean class ids -> long tensor,
        # which makes the model select the cross-entropy loss. Anything else is
        # treated as a float target. Checking only for `int` would silently turn
        # NumPy integer labels into floats.
        d_type = torch.long if isinstance(label_features[0], (int, np.integer)) else torch.float

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

Next, the evaluation metric is defined. There are many pre-defined metrics for classification/regression problems, but in this case, we would continue with just **Accuracy** for classification and **MSE** for regression. You can define other metrics on your own.

## Evaluation metric

In [None]:
is_regression = False

In [None]:
def compute_metrics(p: EvalPrediction):
    """Compute the evaluation metric for the Trainer.

    Uses the module-level ``is_regression`` flag: mean squared error for
    regression, otherwise accuracy of the arg-max class.

    Args:
        p: Predictions (array, or tuple whose first element is used) and
            ``label_ids``.

    Returns:
        ``{"mse": float}`` or ``{"accuracy": float}``.
    """
    raw = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    if is_regression:
        squeezed = np.squeeze(raw)
        return {"mse": ((squeezed - p.label_ids) ** 2).mean().item()}

    predicted_ids = np.argmax(raw, axis=1)
    hits = (predicted_ids == p.label_ids).astype(np.float32)
    return {"accuracy": hits.mean().item()}

Now, we can load the pretrained XLSR-Wav2Vec2 checkpoint into our classification model with a pooling strategy.

In [None]:
model_name_or_path

'chcaa/xls-r-300m-danish'

The first component of XLSR-Wav2Vec2 consists of a stack of CNN layers that are used to extract acoustically meaningful - but contextually independent - features from the raw speech signal. This part of the model has already been sufficiently trained during pretraining and as stated in the [paper](https://arxiv.org/pdf/2006.13979.pdf) does not need to be fine-tuned anymore.
Thus, we can set the `requires_grad` to `False` for all parameters of the *feature extraction* part.

In a final step, we define all parameters related to training.
To give more explanation on some of the parameters:
- `learning_rate` and `weight_decay` were heuristically tuned until fine-tuning has become stable. Note that those parameters strongly depend on the Common Voice dataset and might be suboptimal for other speech datasets.

For more explanations on other parameters, one can take a look at the [docs](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer#trainingarguments).

**Note**: If one wants to save the trained models in his/her google drive the commented-out `output_dir` can be used instead.

## Model save path

## Training args

In [None]:
# Hyper-parameters for the Hugging Face Trainer.
training_args = TrainingArguments(
    # Checkpoints are written to the Colab VM disk, not to Google Drive.
    output_dir = '/content/models',
    # 5 samples per device x 2 accumulation steps = 10 samples per optimizer update.
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    gradient_accumulation_steps=2,
    # Evaluate, save and log on the same 10-step schedule.
    evaluation_strategy="steps",
    num_train_epochs=6.0,
    fp16=True,
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-3,
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
    # After training, reload the checkpoint selected by `metric_for_best_model`.
    load_best_model_at_end=True,
    metric_for_best_model ='eval_loss',
    weight_decay=0.1,
    seed = 42
)

class CustomCallback(TrainerCallback):
  """Also evaluate on the training set whenever a regular evaluation is due.

  The extra metrics are reported under the ``train`` prefix.
  """

  def __init__(self, trainer) -> None:
    super().__init__()
    # The trainer whose `train_dataset` is evaluated.
    self._trainer = trainer

  def on_step_end(self, args, state, control, **kwargs):
    """Run a train-set evaluation if `control.should_evaluate` is set."""
    if not control.should_evaluate:
      return None

    # Snapshot the control flags before evaluating and hand the snapshot back.
    # NOTE(review): presumably `evaluate` alters `control` via the callback
    # handler, which is why a copy is returned — confirm.
    saved_control = deepcopy(control)
    self._trainer.evaluate(
        eval_dataset=self._trainer.train_dataset,
        metric_key_prefix="train",
    )
    return saved_control

For future use, we can create our own training script. We do it in a simple way here; you can add more on your own.

# Define Trainer

In [None]:
%%capture

!git clone https://github.com/NVIDIA/apex
%cd apex
!python3 setup.py install

In [None]:
# `amp` is only bound when NVIDIA apex can be imported; CTCTrainer.training_step
# refers to it for loss scaling.
if is_apex_available():
    from apex import amp

# For torch >= 1.6, record that native AMP is available and import `autocast`
# from torch.cuda.amp.
if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    """Trainer with a custom training step that back-propagates through NVIDIA apex when available."""

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.

        Return:
            :obj:`torch.Tensor`: The tensor with training loss on this batch.
        """
        # Native torch AMP is deliberately switched off for this trainer.
        self.use_amp = False
        # Take the apex path only when apex is importable: the module-level
        # `amp` name is bound under exactly the same condition, so forcing
        # `use_apex = True` would raise NameError on machines without apex.
        self.use_apex = is_apex_available()
        model.train()
        inputs = self._prepare_inputs(inputs)

        # `use_amp` is always False here, so the loss is computed without autocast.
        loss = self.compute_loss(model, inputs)

        # Scale the loss so that accumulated gradients average over the micro-batches.
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif getattr(self, "deepspeed", None):
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()


# Import data splits

In [None]:
def speech_file_to_array_fn(path):
    """Load an audio file as a mono NumPy waveform at ``target_sampling_rate``.

    Args:
        path: Location of the audio file, passed to ``torchaudio.load``.

    Returns:
        The resampled waveform as a NumPy array.
    """
    waveform, source_rate = torchaudio.load(path)
    # Average over dimension 0 (the channels) to turn stereo into mono.
    mono = waveform.mean(dim=0)
    to_target_rate = torchaudio.transforms.Resample(source_rate, target_sampling_rate)
    return to_target_rate(mono).squeeze().numpy()

def label_to_id(label, label_list):
    """Translate a label into its position in ``label_list``.

    Args:
        label: The label to look up.
        label_list: Ordered collection of known labels.

    Returns:
        The index of ``label`` in ``label_list``; ``-1`` if the label is not
        in the list; the label itself, unchanged, if ``label_list`` is empty.
    """
    if len(label_list) == 0:
        return label

    try:
        return label_list.index(label)
    except ValueError:
        # Unknown label.
        return -1

def preprocess_function(examples):
    """Turn a batch of dataset rows into model inputs.

    Reads the audio referenced in the ``input_column`` of ``examples``, maps
    the ``output_column`` labels to class ids and runs the feature extractor.

    Args:
        examples: Batch of rows as column name -> list of values.

    Returns:
        The feature extractor's output with an added ``labels`` list.
    """
    print('Computing speech_list')
    waveforms = []
    for audio_path in examples[input_column]:
        waveforms.append(speech_file_to_array_fn(audio_path))

    print('Computing target_list')
    class_ids = [label_to_id(name, label_list) for name in examples[output_column]]

    batch = processor(waveforms, sampling_rate=target_sampling_rate)
    batch["labels"] = list(class_ids)
    return batch

In [None]:
%mkdir "/content/data"

In [None]:
save_path = "/content/data"

In [None]:
results_path = "/content/results"

In [None]:
# Import data splits
# Absolute paths are used on purpose: an earlier cell runs `%cd apex`, so
# relative paths would resolve inside /content/apex while the cells below read
# the splits from /content/Ntrain_experiment.
url = 'https://drive.google.com/file/d/1xyv4pAwwb2P4in2FbKe9oDft6lmsjS8U/view?usp=sharing'
output = '/content/Ntrain_experiment.zip'
gdown.download(url, output, quiet=False, fuzzy=True)
with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall('/content')

Downloading...
From: https://drive.google.com/uc?id=1xyv4pAwwb2P4in2FbKe9oDft6lmsjS8U
To: /content/apex/Ntrain_experiment.zip
100%|██████████| 2.83M/2.83M [00:00<00:00, 140MB/s]


In [None]:
# Fold to load (0-based); the files on disk are numbered from 1.
i = 0
fold_dir = f'/content/Ntrain_experiment/K{i+1}_fold'

# Training-set sizes for which a split file exists.
train_sizes = [360, 330, 300, 270, 240, 210, 180, 150, 120, 90, 60, 30, 25, 20, 15, 10, 5]

test_df = pd.read_csv(f'{fold_dir}/ntest{i+1}.csv')
val_df = pd.read_csv(f'{fold_dir}/nval{i+1}.csv')
# One training frame per size, keyed by that size.
train_dfs = {n: pd.read_csv(f'{fold_dir}/ntrain{i+1}_{n}.csv') for n in train_sizes}

# Re-export every split as a tab-separated file under `save_path` and remember
# where each one was written (split name -> file path).
os.makedirs(save_path, exist_ok=True)
data_files = {}
for n in train_sizes:
    data_files[f"train_{n}"] = f"{save_path}/train_{n}.csv"
    train_dfs[n].to_csv(data_files[f"train_{n}"], sep="\t", encoding="utf-8", index=False)
data_files["test"] = f"{save_path}/test.csv"
data_files["validation"] = f"{save_path}/val.csv"
test_df.to_csv(data_files["test"], sep="\t", encoding="utf-8", index=False)
val_df.to_csv(data_files["validation"], sep="\t", encoding="utf-8", index=False)

# Loading the created dataset using datasets
dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
train_raw_datasets = {n: dataset[f"train_{n}"] for n in train_sizes}
eval_dataset = dataset["validation"]
test_dataset = dataset["test"]

print(eval_dataset)
print(test_dataset)

# We need to specify the input and output column
input_column = "path"
output_column = "emotion"

# Load the audio and compute the model inputs for every split with identical settings.
map_kwargs = dict(batch_size=45, batched=True, num_proc=1)
train_datasets = {
    n: raw_split.map(preprocess_function, **map_kwargs)
    for n, raw_split in train_raw_datasets.items()
}
eval_dataset = eval_dataset.map(preprocess_function, **map_kwargs)
test_dataset = test_dataset.map(preprocess_function, **map_kwargs)

# Backwards compatibility: keep the per-size variable names
# (`train_df_360`, `train_360_dataset`, `train_360`, ...) that other cells may use.
for n in train_sizes:
    globals()[f"train_df_{n}"] = train_dfs[n]
    globals()[f"train_{n}_dataset"] = train_raw_datasets[n]
    globals()[f"train_{n}"] = train_datasets[n]

# Sanity check on one example; only the first values are printed because a
# full waveform has tens of thousands of samples.
sample = train_datasets[330][0]
print(f"Training input_values (first 10): {sample['input_values'][:10]}")
print(f"Training attention_mask (first 10): {sample['attention_mask'][:10]}")
print(f"Training labels: {sample['labels']} - {sample['emotion']}")

Generating train_360 split: 0 examples [00:00, ? examples/s]

Generating train_330 split: 0 examples [00:00, ? examples/s]

Generating train_300 split: 0 examples [00:00, ? examples/s]

Generating train_270 split: 0 examples [00:00, ? examples/s]

Generating train_240 split: 0 examples [00:00, ? examples/s]

Generating train_210 split: 0 examples [00:00, ? examples/s]

Generating train_180 split: 0 examples [00:00, ? examples/s]

Generating train_150 split: 0 examples [00:00, ? examples/s]

Generating train_120 split: 0 examples [00:00, ? examples/s]

Generating train_90 split: 0 examples [00:00, ? examples/s]

Generating train_60 split: 0 examples [00:00, ? examples/s]

Generating train_30 split: 0 examples [00:00, ? examples/s]

Generating train_25 split: 0 examples [00:00, ? examples/s]

Generating train_20 split: 0 examples [00:00, ? examples/s]

Generating train_15 split: 0 examples [00:00, ? examples/s]

Generating train_10 split: 0 examples [00:00, ? examples/s]

Generating train_5 split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['path', 'filename', 'speaker', 'emotion', 'sentence', 'language', 'gender', 'age'],
    num_rows: 45
})
Dataset({
    features: ['path', 'filename', 'speaker', 'emotion', 'sentence', 'language', 'gender', 'age'],
    num_rows: 45
})


Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/330 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Computing speech_list
Computing target_list


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Computing speech_list
Computing target_list
Training input_values: [-0.0014873853651806712, -0.0020572782959789038, -0.0007037300965748727, -0.0006444522296078503, -0.0010497126495465636, -0.0006806039018556476, -7.723060116404667e-05, 0.0007142176618799567, 0.001711223740130663, 0.0011534810764715075, 0.0011787725379690528, 0.0017780661582946777, 0.0017921340186148882, 0.0014324826188385487, 0.0015036365948617458, -0.0011410964652895927, 0.012727353721857071, 0.005445028189569712, 0.0063218604773283005, 0.011338779702782631, 0.0003120220499113202, 0.0071073053404688835, -0.001271469984203577, -0.011045861057937145, -0.007533427327871323, -0.007969898171722889, -0.002536805346608162, 0.0021369990427047014, -0.002900388091802597, -0.0033832662738859653, -0.005292687565088272, -0.006418209057301283, -0.0005560524296015501, -0.0029149996116757393, -0.006002056412398815, -0.0024713079910725355, -0.008132810704410076, 0.0002612096432130784, 0.02832753397524357, -0.0018578828312456608, -0.01

# Training

In [None]:
# Training splits, ordered from the largest subset to the smallest.
train_datasets = [
    train_360,
    train_330,
    train_300,
    train_270,
    train_240,
    train_210,
    train_180,
    train_150,
    train_120,
    train_90,
    train_60,
    train_30,
    train_25,
    train_20,
    train_15,
    train_10,
    train_5,
]

In [None]:
# Train a fresh model per training split, then store its predictions and logs.
#
# Relies on names created by earlier cells: `train_datasets`,
# `model_name_or_path`, `config`, `data_collator`, `training_args`,
# `compute_metrics`, `CustomCallback`, `eval_dataset`, `test_dataset`,
# `results_path` and `i`.
# NOTE(review): `i` is never assigned in this cell; it is presumably the K-fold
# index left over from an earlier cell -- confirm it holds the intended value.
# NOTE(review): the loop walks the elements of `train_datasets[0]`, not
# `train_datasets` itself -- confirm this matches "every training split".
for training_dataset in train_datasets[0]:
    val_predictions = []
    test_predictions = []
    train_predictions = []
    EMODB_predictions = []  # not filled anywhere in this cell
    print(training_dataset)

    # Resolve the output location *before* training so that a missing results
    # directory cannot throw away a finished run (np.save/open do not create
    # parent directories and would raise FileNotFoundError at the very end).
    Ntrain = training_dataset.num_rows
    output_prefix = results_path + f'Ntrain_{Ntrain}_Kfold_{i}_'
    os.makedirs(os.path.dirname(output_prefix) or '.', exist_ok=True)

    print("Loading model and freezing extractor... \n")
    model = Wav2Vec2ForSpeechClassification.from_pretrained(
        model_name_or_path,
        config=config,
    )
    model.freeze_feature_extractor()

    print("Initializing Trainer... \n")
    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=training_dataset,
        eval_dataset=eval_dataset
    )

    trainer.add_callback(CustomCallback(trainer))

    print("Training... \n")
    trainer.train()

    results = trainer.evaluate(eval_dataset = eval_dataset)
    print(f"Evaluation results: {results} \n")

    # Same predict -> argmax -> record sequence for each split, in the order
    # training, validation, test. After the loop `predictions`, `logits` and
    # `pred_ids` refer to the test split.
    for split_name, split_dataset, split_store in (
        ("training", training_dataset, train_predictions),
        ("validation", eval_dataset, val_predictions),
        ("test", test_dataset, test_predictions),
    ):
        print(f"Computing {split_name} predictions... \n")
        predictions = trainer.predict(test_dataset = split_dataset)
        logits = predictions.predictions
        pred_ids = np.argmax(logits, axis = 1)
        split_store.append({
            'logits': logits,
            'predictions': pred_ids,
            'label_ids': predictions.label_ids,
            'metrics': predictions.metrics
        })

    # save predictions
    # Each list holds dicts, so the arrays are stored with dtype=object and
    # must be read back with np.load(..., allow_pickle=True).
    np.save(output_prefix + 'train_predictions.npy', np.array(train_predictions))
    np.save(output_prefix + 'val_predictions.npy', np.array(val_predictions))
    np.save(output_prefix + 'test_predictions.npy', np.array(test_predictions))

    # save logs
    log_history = trainer.state.log_history
    with open(output_prefix + 'log_history.json', 'w') as f:
        json.dump(log_history, f)

Dataset({
    features: ['path', 'filename', 'speaker', 'emotion', 'sentence', 'language', 'gender', 'age', 'input_values', 'attention_mask', 'labels'],
    num_rows: 360
})
Loading model and freezing extractor... 



Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at chcaa/xls-r-300m-danish and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initializing Trainer... 

Training... 



Step,Training Loss,Validation Loss,Accuracy
10,2.0884,1.6463,0.2
20,1.7498,1.584147,0.2
30,1.555,1.438726,0.333333
40,1.4018,1.32946,0.333333
50,1.3728,1.504624,0.333333
60,1.4696,1.319401,0.333333
70,1.617,1.518001,0.333333
80,1.2637,1.391998,0.377778
90,1.7308,1.306387,0.444444
100,1.1227,1.45467,0.466667


  return F.conv1d(input, weight, bias, self.stride,
  return F.conv1d(input, weight, bias, self.stride,
  return F.conv1d(input, weight, bias, self.stride,
  return F.conv1d(input, weight, bias, self.stride,
  return F.conv1d(input, weight, bias, self.stride,
  return F.conv1d(input, weight, bias, self.stride,


Evaluation results: {'eval_loss': 1.0674835443496704, 'eval_accuracy': 0.6666666865348816, 'eval_runtime': 2.2092, 'eval_samples_per_second': 20.369, 'eval_steps_per_second': 4.074, 'epoch': 6.0} 

Computing training predictions... 

Computing validation predictions... 



Computing test predictions... 



FileNotFoundError: [Errno 2] No such file or directory: '/content/resultspredictions/Ntrain_360_Kfold_0_train_predictions.npy'