# Обучение модели HuBERT для задачи распознавания эмоций в речи

## 1. Загрузка датасета

In [4]:
from datasets import load_dataset

# Read the CSV manifest with the `datasets` CSV loader. The result is a
# DatasetDict; all rows of the single data file end up in its "train" split.
ds = load_dataset("csv", data_files="../data/crowd_train_100.csv")

In [66]:
# Show the DatasetDict summary (split names, columns, row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'audio_path', 'features_path', 'emotion', 'label'],
        num_rows: 400
    })
})

In [2]:
from transformers import (
    HubertForSequenceClassification,
    AutoConfig,
    Trainer,
    TrainingArguments,
    Wav2Vec2FeatureExtractor,
)

import librosa
import numpy as np

In [43]:
# Checkpoint identifier passed to every `from_pretrained` call in this notebook.
# Other HuBERT checkpoints are kept below as commented-out alternatives.
model_id = "facebook/hubert-base-ls960"
# model_id = "facebook/hubert-large-ll60k"
# model_id = "facebook/hubert-large-ls960-ft"
# model_id = "facebook/hubert-xlarge-ll60k"

In [44]:
# Number of output classes requested for the classification head.
NUM_LABELS = 4

# The feature extractor, config and model are all loaded from the same
# checkpoint; the config is overridden so the model is built with NUM_LABELS
# output classes.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
config = AutoConfig.from_pretrained(model_id, num_labels=NUM_LABELS)
model = HubertForSequenceClassification.from_pretrained(
    model_id,
    config=config,
    # NOTE(review): presumably present to tolerate head-shape differences
    # between the checkpoint and this config — confirm it is still needed.
    ignore_mismatched_sizes=True
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
# Freeze every parameter of the model (backbone and classification head alike)
# by switching off gradient tracking on each tensor in place.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

# model.freeze_feature_extractor()  # Alternative: freeze only the feature_extractor part

In [42]:
# Print how many parameter tensors the model exposes (tensors, not scalar weights).
print(sum(1 for _ in model.parameters()))

215


In [40]:
# Print the qualified name of every parameter, in the order
# `named_parameters()` yields them.
for param_name, _param in model.named_parameters():
    print(param_name)

hubert.masked_spec_embed
hubert.feature_extractor.conv_layers.0.conv.weight
hubert.feature_extractor.conv_layers.0.layer_norm.weight
hubert.feature_extractor.conv_layers.0.layer_norm.bias
hubert.feature_extractor.conv_layers.1.conv.weight
hubert.feature_extractor.conv_layers.2.conv.weight
hubert.feature_extractor.conv_layers.3.conv.weight
hubert.feature_extractor.conv_layers.4.conv.weight
hubert.feature_extractor.conv_layers.5.conv.weight
hubert.feature_extractor.conv_layers.6.conv.weight
hubert.feature_projection.layer_norm.weight
hubert.feature_projection.layer_norm.bias
hubert.feature_projection.projection.weight
hubert.feature_projection.projection.bias
hubert.encoder.pos_conv_embed.conv.bias
hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0
hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1
hubert.encoder.layer_norm.weight
hubert.encoder.layer_norm.bias
hubert.encoder.layers.0.attention.k_proj.weight
hubert.encoder.layers.0.attention.k_proj.bia

In [46]:
# Unfreeze the last `layers_freeze_num` encoder layers plus the classification
# head (projector + classifier). Parameters are selected by name prefix instead
# of slicing the tail of `named_parameters()`, so the selection does not depend
# on every encoder layer contributing exactly 16 tensors or on the head being
# registered last.
layers_freeze_num = 2  # number of trailing encoder layers to make trainable

num_encoder_layers = len(model.hubert.encoder.layers)
first_trainable_layer = max(num_encoder_layers - layers_freeze_num, 0)

# The trailing "." keeps e.g. "layers.1." from also matching "layers.10.".
trainable_prefixes = tuple(
    f"hubert.encoder.layers.{layer_idx}."
    for layer_idx in range(first_trainable_layer, num_encoder_layers)
) + ("projector.", "classifier.")

n_layers = 0  # count of parameter tensors re-enabled for training
for name, param in model.named_parameters():
    if name.startswith(trainable_prefixes):
        param.requires_grad = True
        n_layers += 1
        print(name)

hubert.encoder.layers.10.attention.k_proj.weight
hubert.encoder.layers.10.attention.k_proj.bias
hubert.encoder.layers.10.attention.v_proj.weight
hubert.encoder.layers.10.attention.v_proj.bias
hubert.encoder.layers.10.attention.q_proj.weight
hubert.encoder.layers.10.attention.q_proj.bias
hubert.encoder.layers.10.attention.out_proj.weight
hubert.encoder.layers.10.attention.out_proj.bias
hubert.encoder.layers.10.layer_norm.weight
hubert.encoder.layers.10.layer_norm.bias
hubert.encoder.layers.10.feed_forward.intermediate_dense.weight
hubert.encoder.layers.10.feed_forward.intermediate_dense.bias
hubert.encoder.layers.10.feed_forward.output_dense.weight
hubert.encoder.layers.10.feed_forward.output_dense.bias
hubert.encoder.layers.10.final_layer_norm.weight
hubert.encoder.layers.10.final_layer_norm.bias
hubert.encoder.layers.11.attention.k_proj.weight
hubert.encoder.layers.11.attention.k_proj.bias
hubert.encoder.layers.11.attention.v_proj.weight
hubert.encoder.layers.11.attention.v_proj.bias


In [10]:
def speech_file_to_array(batch, sampling_rate=16000):
    """Load the audio file referenced by ``batch["audio_path"]`` as a waveform.

    Intended for use with ``Dataset.map`` on single examples.

    Args:
        batch: Mapping with an ``"audio_path"`` entry; the path is resolved
            relative to the parent directory (``../``).
        sampling_rate: Target sampling rate in Hz that the audio is resampled
            to. Defaults to 16000, the rate used elsewhere in this notebook.

    Returns:
        The same ``batch`` object with the decoded waveform stored under
        ``"array"``.
    """
    # Down-mix to mono so every example is a 1-D waveform regardless of how
    # many channels the file has; a multi-channel (2-D) array would otherwise
    # be handled as a batch downstream and only its first row would be kept.
    waveform, _ = librosa.load(
        f'../{batch["audio_path"]}', sr=sampling_rate, mono=True
    )
    batch["array"] = waveform
    return batch

def get_input_values(batch, feature_extractor):
    """Run the feature extractor on one example and store its model input.

    Args:
        batch: Mapping holding the raw waveform under ``"array"``.
        feature_extractor: Callable that accepts the waveform plus
            ``sampling_rate``/``padding``/``return_tensors`` keyword arguments
            and returns an object exposing ``input_values``.

    Returns:
        The same ``batch`` object, with the first row of the extractor's
        ``input_values`` stored under ``"input_values"``.
    """
    extractor_kwargs = {
        "sampling_rate": 16000,
        "padding": True,
        "return_tensors": "pt",
    }
    encoded = feature_extractor(batch["array"], **extractor_kwargs)

    # The extractor returns a batch dimension even for one example; keep row 0.
    batch["input_values"] = encoded.input_values[0]
    return batch

In [11]:
# Decode every audio file and store its waveform in a new "array" column.
ds = ds.map(speech_file_to_array)

In [12]:
# Drop the "id", "features_path" and "emotion" columns; the audio path, the
# integer label and the decoded waveform are kept.
ds = ds.remove_columns(["id", "features_path", "emotion"])

In [13]:
# Rename the target column to "labels" (the key the data collator reads) and
# compute "input_values" for every example, as one chained expression.
ds = ds.rename_column("label", "labels").map(
    get_input_values,
    fn_kwargs={"feature_extractor": feature_extractor},
)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [14]:
# Show the DatasetDict summary again to check the columns after preprocessing.
ds

DatasetDict({
    train: Dataset({
        features: ['audio_path', 'labels', 'array', 'input_values'],
        num_rows: 400
    })
})

In [15]:
from datasets import DatasetDict

# 90% train, 10% test + validation.
# A fixed seed makes the split reproducible across kernel restarts, so reported
# metrics always refer to the same examples.
train_testvalid = ds["train"].train_test_split(
    shuffle=True, test_size=0.1, seed=42
)

# Split the held-out 10% in half: one half for test, the other for validation.
test_valid = train_testvalid["test"].train_test_split(test_size=0.5, seed=42)

ds = DatasetDict({
    "train": train_testvalid["train"],
    "test": test_valid["test"],
    "val": test_valid["train"],
})

In [48]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

import torch
from transformers import Wav2Vec2Processor

@dataclass
class DataCollatorCTCWithPadding:
    """Collate examples into a padded batch of inputs plus a label tensor.

    ``processor`` only needs to provide a ``pad()`` method; this notebook
    passes a feature extractor for it.
    """

    # Object whose `.pad()` method pads the list of input features.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to `processor.pad`.
    padding: Union[bool, str] = True
    # Optional maximum length forwarded to `processor.pad`.
    max_length: Optional[int] = None
    # Accepted for interface compatibility; not read by `__call__`.
    max_length_labels: Optional[int] = None
    # Optional length multiple forwarded to `processor.pad`.
    pad_to_multiple_of: Optional[int] = None
    # Accepted for interface compatibility; not read by `__call__`.
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(
        self, examples: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad the ``input_values`` of ``examples`` and attach their labels.

        Args:
            examples: Items that each carry ``"input_values"`` and ``"labels"``;
                any other keys are ignored.

        Returns:
            The batch produced by ``processor.pad`` (requested as PyTorch
            tensors) with an extra ``"labels"`` tensor built from the examples.
        """
        audio_only = []
        targets = []
        for item in examples:
            # Only the audio input goes through padding; labels are collected separately.
            audio_only.append({"input_values": item["input_values"]})
            targets.append(item["labels"])

        padded = self.processor.pad(
            audio_only,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        padded["labels"] = torch.tensor(targets)

        return padded

In [49]:
# All tunable training hyperparameters live in this one dict; TrainingArguments
# below reads every value from it so there is a single place to edit.
trainer_config = {
    "OUTPUT_DIR": "results",
    "EPOCHS": 5,
    "TRAIN_BATCH_SIZE": 4,  # 8
    "EVAL_BATCH_SIZE": 4,  # 8
    "GRADIENT_ACCUMULATION_STEPS": 2,  # 4
    # NOTE(review): the recorded run of this notebook ends at global_step=225,
    # which is below WARMUP_STEPS, so the learning rate is still warming up when
    # training stops — consider lowering this value.
    "WARMUP_STEPS": 500,
    "DECAY": 0.01,
    "LOGGING_STEPS": 10,
    # "MODEL_DIR": "models/test-hubert-model",
    "SAVE_STEPS": 100,
    "LR": 1e-3,
}

# Fine-Tuning with Trainer
training_args = TrainingArguments(
    output_dir=trainer_config["OUTPUT_DIR"],  # output directory
    gradient_accumulation_steps=trainer_config["GRADIENT_ACCUMULATION_STEPS"],  # accumulate the gradients before running optimization step
    num_train_epochs=trainer_config["EPOCHS"],  # total number of training epochs
    per_device_train_batch_size=trainer_config["TRAIN_BATCH_SIZE"],
    per_device_eval_batch_size=trainer_config["EVAL_BATCH_SIZE"],  # batch size for evaluation
    warmup_steps=trainer_config["WARMUP_STEPS"],  # number of warmup steps for learning rate scheduler
    save_steps=trainer_config["SAVE_STEPS"],  # save a checkpoint every SAVE_STEPS steps
    weight_decay=trainer_config["DECAY"],  # strength of weight decay
    logging_steps=trainer_config["LOGGING_STEPS"],
    eval_strategy="epoch",  # report metric at end of each epoch
    # Read from the config instead of repeating the literal, so editing
    # trainer_config["LR"] actually changes the run (5e-5 is the library default).
    learning_rate=trainer_config["LR"],
)

In [50]:
# Build the batch collator. The feature extractor is passed as `processor`:
# the collator only calls its `.pad()` method.
data_collator = DataCollatorCTCWithPadding(
    processor=feature_extractor,
    padding=True
)

In [20]:
import evaluate

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds per-class scores
            with the class dimension last; if it is a tuple, its first element
            is taken to be the logits.

    Returns:
        The result of the ``accuracy`` metric's ``compute`` call for the
        arg-max predictions against ``labels``.
    """
    # Load the metric once and reuse it: calling `evaluate.load` on every
    # evaluation pass repeats the lookup/loading work for no benefit.
    metric = getattr(compute_metrics, "_accuracy_metric", None)
    if metric is None:
        metric = evaluate.load("accuracy")
        compute_metrics._accuracy_metric = metric

    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]

    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [51]:
# Wire model, arguments, collator, data and metric function together.
# The "val" split is used for evaluation; ds["test"] is not passed to the
# Trainer here.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds["train"],
    eval_dataset=ds["val"],
    compute_metrics=compute_metrics,
)

In [52]:
# Run fine-tuning; the returned TrainOutput (step count, loss, runtime metrics)
# is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.3985,1.37203,0.4
2,1.3246,1.363753,0.35
3,1.3371,1.33615,0.35
4,1.278,1.176343,0.4
5,1.1054,1.438816,0.4


TrainOutput(global_step=225, training_loss=1.2928171242607964, metrics={'train_runtime': 1346.6662, 'train_samples_per_second': 1.337, 'train_steps_per_second': 0.167, 'total_flos': 1.213761859269742e+17, 'train_loss': 1.2928171242607964, 'epoch': 5.0})

In [None]:
def emo_2_label(emotion):
    """Translate an emotion name into its integer class label.

    Args:
        emotion: One of ``"neutral"``, ``"angry"``, ``"positive"``, ``"sad"``.

    Returns:
        The class index: neutral=0, angry=1, positive=2, sad=3.

    Raises:
        KeyError: If ``emotion`` is not one of the known names.
    """
    ordered_emotions = ("neutral", "angry", "positive", "sad")
    label_by_emotion = {
        emo_name: class_idx for class_idx, emo_name in enumerate(ordered_emotions)
    }
    return label_by_emotion[emotion]

def label_2_emo(label):
    """Translate an integer class label back into its emotion name.

    Args:
        label: Class index in ``0..3`` (neutral=0, angry=1, positive=2, sad=3).

    Returns:
        The emotion name for ``label``.

    Raises:
        IndexError: If ``label`` is negative or not below the number of classes.
    """
    d = ["neutral", "angry", "positive", "sad"]

    # Negative indices would wrap around the list (e.g. -1 -> "sad") and
    # silently return a wrong emotion, so reject them explicitly.
    if label < 0:
        raise IndexError(f"label must be in range 0..{len(d) - 1}, got {label}")

    return d[label]