# Обучение модели HuBERT для задачи распознавания эмоций в речи

## 1. Загрузка датасета

In [1]:
from datasets import load_dataset

# CSV manifests for the two splits, given relative to the notebook's working directory.
data_files = dict(
    train="../data/crowd_train_500.csv",
    test="../data/crowd_test_500.csv",
)
# The generic "csv" builder returns one split per key of data_files.
ds = load_dataset("csv", data_files=data_files)

In [2]:
# Show the loaded splits with their columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'audio_path', 'features_path', 'emotion', 'label', 'duration'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['id', 'audio_path', 'features_path', 'emotion', 'label', 'duration'],
        num_rows: 2000
    })
})

In [3]:
import datetime

# Add up the per-clip "duration" column (interpreted as seconds) of each split and
# let timedelta render the totals as H:MM:SS.
train_total = datetime.timedelta(seconds=sum(ds["train"]["duration"]))
test_total = datetime.timedelta(seconds=sum(ds["test"]["duration"]))

print(f"Суммарное время обучающего набора: {train_total}")
print(f"Суммарное время тестового набора: {test_total}")

Суммарное время обучающего набора: 2:41:34.095813
Суммарное время тестового набора: 2:43:03.650688


## 2. Подготовка модели

### 2.1 Загрузка модели

In [4]:
from transformers import (
    HubertForSequenceClassification,
    AutoConfig,
    Trainer,
    TrainingArguments,
    Wav2Vec2FeatureExtractor,
)

import librosa
import numpy as np

In [5]:
# Pretrained checkpoint used for the feature extractor, config and model below.
# The commented-out lines are alternative checkpoints; enable exactly one.
model_id = "facebook/hubert-base-ls960"
# model_id = "facebook/hubert-large-ll60k"
# model_id = "facebook/hubert-large-ls960-ft"
# model_id = "facebook/hubert-xlarge-ll60k"

In [6]:
# Emotion classes; a name's position in this list is its integer class id.
# NOTE(review): assumes the dataset's integer "label" column follows this order — confirm.
labels_names = ["neutral", "angry", "positive", "sad"]
NUM_LABELS = 4

# Name <-> id lookup tables stored in the model config.
label2id = {}
id2label = {}
for class_id, class_name in enumerate(labels_names):
    label2id[class_name] = class_id
    id2label[class_id] = class_name

# Preprocessing settings that belong to the chosen checkpoint.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)

# Checkpoint config extended with the classification head size and label names.
config = AutoConfig.from_pretrained(
    model_id,
    num_labels=NUM_LABELS,
    label2id=label2id,
    id2label=id2label,
)

# Pretrained encoder plus a classification head; the head weights are newly
# initialised (see the warning printed below this cell).
model = HubertForSequenceClassification.from_pretrained(
    model_id,
    config=config,
    ignore_mismatched_sizes=True,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Заморозка всех слоев
for param in model.parameters():
    param.requires_grad = False

# model.freeze_feature_extractor() # Заморозка только feature_extractor части

# for param in model.named_parameters():
    # print(param[0])

### 2.2 Разморозка последних N слоев кодировщика

In [8]:
# Number of final encoder layers to make trainable.
# NOTE: despite the word "freeze" in the name, this counts layers that get
# UNfrozen; the name is kept so existing references to it keep working.
layers_freeze_num = 2

# Select parameters by name instead of slicing the tail of named_parameters().
# The tail slice is only right while every encoder layer owns exactly 16 tensors,
# the heads own exactly 4, and the parameter registration order stays the same.
encoder_layers_total = len(model.hubert.encoder.layers)
# Clamp at 0 so that asking for more layers than exist unfreezes the whole stack.
first_trainable_layer = max(0, encoder_layers_total - layers_freeze_num)

trainable_prefixes = tuple(
    f"hubert.encoder.layers.{layer_idx}."
    for layer_idx in range(first_trainable_layer, encoder_layers_total)
) + ("projector.", "classifier.")  # the classification heads are always trained

for name, param in model.named_parameters():
    if name.startswith(trainable_prefixes):
        param.requires_grad = True
        print(name)

hubert.encoder.layers.10.attention.k_proj.weight
hubert.encoder.layers.10.attention.k_proj.bias
hubert.encoder.layers.10.attention.v_proj.weight
hubert.encoder.layers.10.attention.v_proj.bias
hubert.encoder.layers.10.attention.q_proj.weight
hubert.encoder.layers.10.attention.q_proj.bias
hubert.encoder.layers.10.attention.out_proj.weight
hubert.encoder.layers.10.attention.out_proj.bias
hubert.encoder.layers.10.layer_norm.weight
hubert.encoder.layers.10.layer_norm.bias
hubert.encoder.layers.10.feed_forward.intermediate_dense.weight
hubert.encoder.layers.10.feed_forward.intermediate_dense.bias
hubert.encoder.layers.10.feed_forward.output_dense.weight
hubert.encoder.layers.10.feed_forward.output_dense.bias
hubert.encoder.layers.10.final_layer_norm.weight
hubert.encoder.layers.10.final_layer_norm.bias
hubert.encoder.layers.11.attention.k_proj.weight
hubert.encoder.layers.11.attention.k_proj.bias
hubert.encoder.layers.11.attention.v_proj.weight
hubert.encoder.layers.11.attention.v_proj.bias


## 3. Подготовка данных

In [9]:
def speech_file_to_array(batch):
    """Load one example's audio file and store the waveform under ``batch["array"]``.

    Args:
        batch: A single dataset row (dict-like) with an "audio_path" entry. The
            path is opened with "../" prepended.

    Returns:
        The same ``batch`` with an added "array" entry: the waveform loaded at a
        16 kHz sampling rate as a 1-D signal.
    """
    # mono=True downmixes multi-channel files to one channel. With mono=False a
    # stereo file produces a 2-D (channels, samples) array, which the feature
    # extractor would read as a batch of utterances, so only the first channel
    # would be kept downstream.
    batch["array"] = librosa.load(f'../{batch["audio_path"]}', sr=16000, mono=True)[0]
    return batch

def get_input_values(batch, feature_extractor):
    """Run the feature extractor on one example's waveform.

    Args:
        batch: A single dataset row holding the waveform under "array".
        feature_extractor: Callable invoked as
            ``feature_extractor(array, sampling_rate=..., padding=..., return_tensors=...)``
            whose result exposes an ``input_values`` attribute.

    Returns:
        The same ``batch`` with "input_values" set to the first item of the
        extractor's ``input_values``.
    """
    encoded = feature_extractor(
        batch["array"],
        sampling_rate=16000,
        padding=True,
        return_tensors="pt",
    )
    # The extractor output is indexed along its leading dimension; keep entry 0.
    batch["input_values"] = encoded.input_values[0]
    return batch

In [10]:
# Drop the columns that are not needed further, rename the target column to
# "labels", then add the decoded waveform ("array") and the extractor output
# ("input_values") to every row.
# NOTE: this rebinds ds, so the cell cannot be re-run without reloading the
# dataset first (the removed columns would no longer exist).
ds = (
    ds.remove_columns(["id", "features_path", "emotion", "duration"])
    .rename_column("label", "labels")
    .map(speech_file_to_array)
    .map(get_input_values, fn_kwargs={"feature_extractor": feature_extractor})
)

In [11]:
# Show the splits after preprocessing (columns and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['audio_path', 'labels', 'array', 'input_values'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['audio_path', 'labels', 'array', 'input_values'],
        num_rows: 2000
    })
})

In [12]:
from datasets import DatasetDict

# 90% train, 10% validation.
# A fixed seed makes the shuffle, and therefore the split, identical on every
# run, so validation metrics stay comparable between runs.
# NOTE: this cell rebinds ds; re-running it without re-running the cells above
# would split the already reduced train set again.
train_val = ds["train"].train_test_split(shuffle=True, test_size=0.1, seed=42)

ds = DatasetDict({
    'train': train_val['train'],
    'test': ds['test'],
    'val': train_val['test']
})

ds

DatasetDict({
    train: Dataset({
        features: ['audio_path', 'labels', 'array', 'input_values'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['audio_path', 'labels', 'array', 'input_values'],
        num_rows: 2000
    })
    val: Dataset({
        features: ['audio_path', 'labels', 'array', 'input_values'],
        num_rows: 200
    })
})

## 4. Обучение

In [14]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

import torch
from transformers import Wav2Vec2Processor

@dataclass
class DataCollatorCTCWithPadding:
    """Collate examples into one batch: pad the inputs and attach the labels.

    Every example must provide "input_values" and "labels". Padding of the
    inputs is delegated to ``processor.pad``; the labels are not padded, they
    are turned into a tensor with ``torch.tensor``.
    """

    # Object whose ``pad`` method performs the padding.
    processor: Wav2Vec2Processor
    # Forwarded to ``processor.pad`` as ``padding``.
    padding: Union[bool, str] = True
    # Forwarded to ``processor.pad`` as ``max_length``.
    max_length: Optional[int] = None
    # Not read by ``__call__``.
    max_length_labels: Optional[int] = None
    # Forwarded to ``processor.pad`` as ``pad_to_multiple_of``.
    pad_to_multiple_of: Optional[int] = None
    # Not read by ``__call__``.
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Build a batch from ``features``.

        Args:
            features: Examples, each with "input_values" and "labels".

        Returns:
            The padded batch returned by ``processor.pad`` with an added
            "labels" tensor.
        """
        audio_inputs = []
        class_ids = []
        for example in features:
            # Only the inputs go through padding; labels are collected separately.
            audio_inputs.append({"input_values": example["input_values"]})
            class_ids.append(example["labels"])

        padded = self.processor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        padded["labels"] = torch.tensor(class_ids)

        return padded

In [15]:
import evaluate

# Load the metric once, when the cell runs. Loading it inside compute_metrics
# would repeat the lookup on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the class scores lie on the last
            axis of ``logits``.

    Returns:
        The result of the accuracy metric's ``compute`` for the arg-max
        predictions against ``labels``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [17]:
# Central place for the run's hyperparameters and output locations.
trainer_config = dict(
    OUTPUT_DIR="train_results/2000_samples_2_layers",  # checkpoints and logs
    MODEL_DIR="models/hubert-base-dusha-ft-2-layers",  # final saved model
    EPOCHS=3,
    TRAIN_BATCH_SIZE=8,  # values noted by the author: [4, 8]
    EVAL_BATCH_SIZE=8,  # values noted by the author: [4, 8]
    GRADIENT_ACCUMULATION_STEPS=4,  # values noted by the author: [2, 4]
    WARMUP_STEPS=500,
    DECAY=0.01,
    LOGGING_STEPS=10,
    SAVE_STEPS=100,
    LR=5e-5,
    FP16=True,
)

In [18]:
# Collect the Trainer settings from trainer_config first, then build the
# arguments object from them in one step.
training_kwargs = dict(
    output_dir=trainer_config["OUTPUT_DIR"],  # where checkpoints and logs are written
    num_train_epochs=trainer_config["EPOCHS"],  # total number of training epochs
    learning_rate=trainer_config["LR"],
    per_device_train_batch_size=trainer_config["TRAIN_BATCH_SIZE"],
    per_device_eval_batch_size=trainer_config["EVAL_BATCH_SIZE"],
    # Gradients are accumulated over this many batches before each optimizer step.
    gradient_accumulation_steps=trainer_config["GRADIENT_ACCUMULATION_STEPS"],
    logging_steps=trainer_config["LOGGING_STEPS"],
    save_steps=trainer_config["SAVE_STEPS"],  # checkpoint interval, in steps
    eval_strategy="epoch",  # report metrics at the end of each epoch
)
# Not forwarded here: trainer_config["WARMUP_STEPS"] and trainer_config["DECAY"]
# (both disabled below) and trainer_config["FP16"].
# warmup_steps=trainer_config["WARMUP_STEPS"]  # warmup steps for the LR scheduler
# weight_decay=trainer_config["DECAY"]  # strength of weight decay

training_args = TrainingArguments(**training_kwargs)

In [19]:
# The collator only calls .pad(...) on its "processor", so the feature extractor
# itself is passed in that role. padding=True is forwarded to that .pad(...) call.
data_collator = DataCollatorCTCWithPadding(processor=feature_extractor, padding=True)

In [20]:
# Wire everything together: the partially unfrozen model, the run settings, the
# train/validation splits, the padding collator and the accuracy metric.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["val"],  # the held-out test split is kept for the final evaluation
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [21]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,1.3665,1.360602,0.34
1,1.3475,1.338306,0.355
2,1.3011,1.334396,0.355


TrainOutput(global_step=168, training_loss=1.3484024831226893, metrics={'train_runtime': 7497.4934, 'train_samples_per_second': 0.72, 'train_steps_per_second': 0.022, 'total_flos': 3.9616063537933824e+17, 'train_loss': 1.3484024831226893, 'epoch': 2.986666666666667})

## 5. Тестирование

In [22]:
# Evaluate on the held-out test split and print the metrics dict of the result.
test_results = trainer.predict(ds["test"])
print(test_results.metrics)

{'test_loss': 1.3115416765213013, 'test_accuracy': 0.388, 'test_runtime': 791.2938, 'test_samples_per_second': 2.528, 'test_steps_per_second': 0.316}


In [23]:
# Persist the fine-tuned model to the configured directory.
trainer.save_model(trainer_config["MODEL_DIR"])
# The Trainer was built without the feature extractor, so store it explicitly
# next to the model: the directory then also carries the preprocessing settings
# needed to load it for inference.
feature_extractor.save_pretrained(trainer_config["MODEL_DIR"])