# Обучение модели HuBERT для задачи распознавания эмоций в речи

## 1. Загрузка датасета

In [1]:
from datasets import load_dataset

# Read the training manifest CSV; as the displayed result shows, all rows end
# up in a single "train" split.
ds = load_dataset("csv", data_files="../data/crowd_train_500.csv")

In [2]:
# Show the splits, column names and row count of the freshly loaded dataset.
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'audio_path', 'features_path', 'emotion', 'label'],
        num_rows: 2000
    })
})

In [3]:
from transformers import (
    HubertForSequenceClassification,
    AutoConfig,
    Trainer,
    TrainingArguments,
    Wav2Vec2FeatureExtractor,
)

import librosa
import numpy as np

In [4]:
# Checkpoint to fine-tune. The commented-out ids below are alternative HuBERT
# checkpoints that can be swapped in by moving the comment marker.
model_id = "facebook/hubert-base-ls960"
# model_id = "facebook/hubert-large-ll60k"
# model_id = "facebook/hubert-large-ls960-ft"
# model_id = "facebook/hubert-xlarge-ll60k"

In [5]:
# Emotion classes; the position of a name in this list is its integer class id.
labels_names = ["neutral", "angry", "positive", "sad"]
NUM_LABELS = len(labels_names)

# Loaded from the same checkpoint as the model; later used to turn raw audio
# arrays into `input_values`.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)

# Checkpoint configuration extended with the classification label set.
config = AutoConfig.from_pretrained(
    model_id,
    num_labels=NUM_LABELS,
    label2id=dict(zip(labels_names, range(NUM_LABELS))),
    id2label=dict(enumerate(labels_names)),
)

# Sequence-classification model initialised from the pretrained checkpoint.
model = HubertForSequenceClassification.from_pretrained(
    model_id, config=config, ignore_mismatched_sizes=True
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Freeze every parameter of the model.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

# model.freeze_feature_extractor()  # Would freeze only the feature-extractor part

In [42]:
# Number of parameter tensors in the model (not the number of scalar weights).
print(len(list(model.parameters())))

215


In [7]:
import torch

# Pick the GPU when CUDA is available, otherwise fall back to the CPU, and
# display the choice.
# NOTE(review): `device` is not referenced again in the visible cells.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [8]:
# Index of the CUDA device currently selected by PyTorch (0 in the recorded run).
torch.cuda.current_device()

0

In [40]:
# List the name of every parameter tensor, one per line.
for param_name, _tensor in model.named_parameters():
    print(param_name)

hubert.masked_spec_embed
hubert.feature_extractor.conv_layers.0.conv.weight
hubert.feature_extractor.conv_layers.0.layer_norm.weight
hubert.feature_extractor.conv_layers.0.layer_norm.bias
hubert.feature_extractor.conv_layers.1.conv.weight
hubert.feature_extractor.conv_layers.2.conv.weight
hubert.feature_extractor.conv_layers.3.conv.weight
hubert.feature_extractor.conv_layers.4.conv.weight
hubert.feature_extractor.conv_layers.5.conv.weight
hubert.feature_extractor.conv_layers.6.conv.weight
hubert.feature_projection.layer_norm.weight
hubert.feature_projection.layer_norm.bias
hubert.feature_projection.projection.weight
hubert.feature_projection.projection.bias
hubert.encoder.pos_conv_embed.conv.bias
hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0
hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1
hubert.encoder.layer_norm.weight
hubert.encoder.layer_norm.bias
hubert.encoder.layers.0.attention.k_proj.weight
hubert.encoder.layers.0.attention.k_proj.bia

#### Разморозка последних N слоев кодировщика

In [8]:
# Number of final encoder layers to make trainable again (despite "freeze" in
# the name, the loop below switches gradients back ON).
layers_freeze_num = 4

# Despite its name, this is a count of trailing parameter tensors: 16 tensors
# per encoder layer, plus 4 for the projector's and classifier's weights and biases.
n_layers = layers_freeze_num * 16 + 4

# Re-enable gradients for that tail of the parameter list and report each name.
for name, param in list(model.named_parameters())[-n_layers:]:
    print(name)
    param.requires_grad_(True)

hubert.encoder.layers.8.attention.k_proj.weight
hubert.encoder.layers.8.attention.k_proj.bias
hubert.encoder.layers.8.attention.v_proj.weight
hubert.encoder.layers.8.attention.v_proj.bias
hubert.encoder.layers.8.attention.q_proj.weight
hubert.encoder.layers.8.attention.q_proj.bias
hubert.encoder.layers.8.attention.out_proj.weight
hubert.encoder.layers.8.attention.out_proj.bias
hubert.encoder.layers.8.layer_norm.weight
hubert.encoder.layers.8.layer_norm.bias
hubert.encoder.layers.8.feed_forward.intermediate_dense.weight
hubert.encoder.layers.8.feed_forward.intermediate_dense.bias
hubert.encoder.layers.8.feed_forward.output_dense.weight
hubert.encoder.layers.8.feed_forward.output_dense.bias
hubert.encoder.layers.8.final_layer_norm.weight
hubert.encoder.layers.8.final_layer_norm.bias
hubert.encoder.layers.9.attention.k_proj.weight
hubert.encoder.layers.9.attention.k_proj.bias
hubert.encoder.layers.9.attention.v_proj.weight
hubert.encoder.layers.9.attention.v_proj.bias
hubert.encoder.layer

In [9]:
def speech_file_to_array(batch):
    """Load the audio file referenced by one dataset row and attach its samples.

    Args:
        batch: A single row; its ``"audio_path"`` value is resolved relative to
            the parent directory (``../``).

    Returns:
        The same row with an added ``"array"`` entry holding the audio samples
        loaded with a 16 kHz target sampling rate.
    """
    audio_file = f'../{batch["audio_path"]}'
    # librosa.load returns (samples, sampling_rate); only the samples are kept.
    # NOTE(review): mono=False leaves multi-channel files un-mixed — confirm
    # that the corpus is mono.
    samples, _sampling_rate = librosa.load(audio_file, sr=16000, mono=False)
    batch["array"] = samples
    return batch

def get_input_values(batch, feature_extractor):
    """Run the feature extractor on one row's audio and store the result.

    Args:
        batch: A single row containing the raw audio under ``"array"``.
        feature_extractor: Callable invoked as
            ``feature_extractor(array, sampling_rate=16000, padding=True,
            return_tensors="pt")``; its result must expose ``input_values``.

    Returns:
        The same row with an added ``"input_values"`` entry.
    """
    features = feature_extractor(
        batch["array"],
        sampling_rate=16000,
        padding=True,
        return_tensors="pt",
    )
    # Keep the first item of the extractor's batched output.
    batch["input_values"] = features.input_values[0]
    return batch

In [10]:
# Load the audio of every row into an "array" column.
ds = ds.map(speech_file_to_array)

In [11]:
# Drop the columns that are not used in the visible remainder of the notebook.
ds = ds.remove_columns(["id", "features_path", "emotion"])

In [12]:
# Rename the target column to "labels" (the key the data collator reads) and
# compute "input_values" for every row with the feature extractor.
ds = ds.rename_column("label", "labels").map(
    get_input_values,
    fn_kwargs={"feature_extractor": feature_extractor},
)

In [13]:
# Show the dataset after preprocessing to confirm the resulting columns.
ds

DatasetDict({
    train: Dataset({
        features: ['audio_path', 'labels', 'array', 'input_values'],
        num_rows: 2000
    })
})

In [14]:
from datasets import DatasetDict

# 90% train, 10% held out for test + validation.
# A fixed seed keeps both splits identical across kernel restarts, so the
# reported validation/test metrics always refer to the same examples.
train_testvalid = ds["train"].train_test_split(shuffle=True, test_size=0.1, seed=42)

# Split the held-out 10% in half: one half for test, the other for validation.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

# Re-assemble the three subsets under explicit split names.
ds = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'val': test_valid['train']
})

In [15]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

import torch
from transformers import Wav2Vec2Processor

@dataclass
class DataCollatorCTCWithPadding:
    """Pad variable-length audio examples into one classification batch.

    Despite the "CTC" in the name, ``labels`` is handled as one integer class id
    per example rather than as a token sequence.
    """

    # Object whose ``pad(features, padding=..., max_length=...,
    # pad_to_multiple_of=..., return_tensors=...)`` method builds the padded
    # batch. The annotation is quoted so that defining the class does not need
    # either name to be resolvable; only ``pad`` is ever called.
    processor: "Union[Wav2Vec2Processor, Wav2Vec2FeatureExtractor]"
    # Padding strategy, forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = True
    # Optional maximum input length, forwarded unchanged to ``processor.pad``.
    max_length: Optional[int] = None
    # Accepted for interface compatibility; not used by ``__call__``.
    max_length_labels: Optional[int] = None
    # Optional length multiple, forwarded unchanged to ``processor.pad``.
    pad_to_multiple_of: Optional[int] = None
    # Accepted for interface compatibility; not used by ``__call__``.
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(
        self, examples: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Collate ``examples`` into a padded batch.

        Args:
            examples: Rows that each provide ``"input_values"`` and ``"labels"``.

        Returns:
            Whatever ``processor.pad`` returns for the inputs, with an added
            ``"labels"`` entry holding a 1-D integer (``torch.long``) tensor.
        """
        # Only the audio inputs are handed to the padding routine.
        input_features = [
            {"input_values": example["input_values"]} for example in examples
        ]
        labels = [example["labels"] for example in examples]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Class ids are made integer explicitly instead of relying on dtype
        # inference from the Python values.
        batch["labels"] = torch.tensor(labels, dtype=torch.long)

        return batch

In [16]:
# Hyper-parameters of this run, collected in one place.
# NOTE(review): WARMUP_STEPS, DECAY and FP16 are defined here but are not
# forwarded to TrainingArguments below; MODEL_DIR is only used when saving.
trainer_config = dict(
    OUTPUT_DIR="results/results_4_layers",
    EPOCHS=3,
    TRAIN_BATCH_SIZE=8,  # [4, 8]
    EVAL_BATCH_SIZE=8,  # [4, 8]
    GRADIENT_ACCUMULATION_STEPS=4,  # [2, 4]
    WARMUP_STEPS=500,
    DECAY=0.01,
    LOGGING_STEPS=10,
    MODEL_DIR="models/hubert-ft-4-layers",
    SAVE_STEPS=100,
    LR=5e-5,
    FP16=True,
)

# Arguments for fine-tuning with Trainer.
training_args = TrainingArguments(
    # Output directory.
    output_dir=trainer_config["OUTPUT_DIR"],
    # Schedule: number of epochs and learning rate.
    num_train_epochs=trainer_config["EPOCHS"],
    learning_rate=trainer_config["LR"],
    # Batching: per-device batch sizes and gradient accumulation.
    per_device_train_batch_size=trainer_config["TRAIN_BATCH_SIZE"],
    per_device_eval_batch_size=trainer_config["EVAL_BATCH_SIZE"],
    gradient_accumulation_steps=trainer_config["GRADIENT_ACCUMULATION_STEPS"],
    # Reporting: log and checkpoint every N steps, evaluate once per epoch.
    logging_steps=trainer_config["LOGGING_STEPS"],
    save_steps=trainer_config["SAVE_STEPS"],
    eval_strategy="epoch",
    # Currently disabled:
    # warmup_steps=trainer_config["WARMUP_STEPS"],
    # weight_decay=trainer_config["DECAY"],
)

In [17]:
# Batch collator with padding enabled. The feature extractor is passed as
# `processor`; the collator only calls its `pad` method.
data_collator = DataCollatorCTCWithPadding(
    processor=feature_extractor,
    padding=True
)

In [18]:
import evaluate

def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation result.

    Args:
        eval_pred: Pair ``(logits, labels)``. The index of the largest logit
            along the last axis is taken as the predicted class.

    Returns:
        ``{"accuracy": value}``, where ``value`` is the fraction of predictions
        equal to their label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy so that no metric object has to be loaded
    # again on every evaluation call.
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}

In [19]:
# Assemble the Trainer: the "train" split is used for optimisation and the
# "val" split for evaluation (once per epoch, per the training arguments);
# the "test" split is left untouched for the prediction cells.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds["train"],
    eval_dataset=ds["val"],
    compute_metrics=compute_metrics,
)

In [20]:
# Run fine-tuning; the table below reports loss and accuracy per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,1.3651,1.332981,0.34
1,1.2941,1.257539,0.41
2,1.2405,1.236798,0.43


TrainOutput(global_step=168, training_loss=1.3165623119899206, metrics={'train_runtime': 5503.346, 'train_samples_per_second': 0.981, 'train_steps_per_second': 0.031, 'total_flos': 3.9608392505591443e+17, 'train_loss': 1.3165623119899206, 'epoch': 2.986666666666667})

In [21]:
def prepare_dataset(path, feature_extractor):
    """Build a model-ready dataset from a CSV manifest.

    The pipeline loads the CSV, loads the audio of every row, drops the
    ``id``/``features_path``/``emotion`` columns, renames ``label`` to
    ``labels`` and computes ``input_values`` with the feature extractor.

    Args:
        path: Value passed to ``load_dataset`` as ``data_files``.
        feature_extractor: Forwarded to ``get_input_values`` for every row.

    Returns:
        The processed dataset object produced by the final ``map`` call.
    """
    return (
        load_dataset("csv", data_files=path)
        .map(speech_file_to_array)
        .remove_columns(["id", "features_path", "emotion"])
        .rename_column("label", "labels")
        .map(get_input_values, fn_kwargs={"feature_extractor": feature_extractor})
    )

In [22]:
# Prepare the separate test CSV with the same pipeline as the training data
# and display the result.
test_ds = prepare_dataset("../data/crowd_test_50.csv", feature_extractor)
test_ds

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio_path', 'labels', 'array', 'input_values'],
        num_rows: 200
    })
})

In [23]:
# Evaluate on the "test" split that was carved out of the training CSV.
test_results = trainer.predict(ds["test"])
print(test_results.metrics)
print(test_results.metrics["test_accuracy"])

{'test_loss': 1.227473258972168, 'test_accuracy': 0.52, 'test_runtime': 42.3665, 'test_samples_per_second': 2.36, 'test_steps_per_second': 0.307}
0.52


In [28]:
# Evaluate on the separately prepared test CSV (its rows live under the
# "train" split). This overwrites `test_results` from the previous cell.
test_results = trainer.predict(test_ds["train"])
print(test_results.metrics)
print(test_results.metrics["test_accuracy"])

{'test_loss': 1.301796317100525, 'test_accuracy': 0.38, 'test_runtime': 84.7288, 'test_samples_per_second': 2.36, 'test_steps_per_second': 0.295}
0.38


In [29]:
# Save the fine-tuned model to the directory configured as MODEL_DIR.
trainer.save_model(trainer_config["MODEL_DIR"])

#### 2 последних слоя, размер батча 8, накопление градиента 2

In [17]:
# Training run for the configuration named in the heading above.
# NOTE(review): the cells that set up that configuration are not visible here.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,1.3291,1.342338,0.36
2,1.2526,1.225059,0.45


TrainOutput(global_step=336, training_loss=1.3089277942975361, metrics={'train_runtime': 1476.4492, 'train_samples_per_second': 3.657, 'train_steps_per_second': 0.228, 'total_flos': 3.9935122467909024e+17, 'train_loss': 1.3089277942975361, 'epoch': 2.986666666666667})

In [None]:
def emo_2_label(emotion):
    """Return the integer class id of an emotion name.

    Args:
        emotion: One of ``"neutral"``, ``"angry"``, ``"positive"``, ``"sad"``.

    Returns:
        The class id: 0, 1, 2 or 3 respectively.

    Raises:
        KeyError: If ``emotion`` is not one of the four known names.
    """
    known_emotions = ("neutral", "angry", "positive", "sad")
    # The position of a name in the tuple is its class id.
    ids_by_emotion = {name: idx for idx, name in enumerate(known_emotions)}
    return ids_by_emotion[emotion]

def label_2_emo(label):
    """Return the emotion name of an integer class id.

    Args:
        label: Class id in the range 0..3.

    Returns:
        ``"neutral"``, ``"angry"``, ``"positive"`` or ``"sad"``.

    Raises:
        IndexError: If ``label`` lies outside 0..3.
    """
    emotions = ["neutral", "angry", "positive", "sad"]

    # A negative index would silently wrap around (e.g. -1 -> "sad"); reject it
    # the same way an index that is too large is rejected.
    if label < 0:
        raise IndexError(f"label must be in 0..{len(emotions) - 1}, got {label}")

    return emotions[label]