In [1]:
from datasets import load_dataset, Audio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, Seq2SeqTrainingArguments, Seq2SeqTrainer

import numpy as np
from collections import defaultdict

import os
import torch
from speechbrain.pretrained import EncoderClassifier

from dataclasses import dataclass
from typing import Any, Dict, List, Union
from functools import partial

torchvision is not available - cannot save figures


In [2]:
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub (needed later for push_to_hub).
# new_session=False: presumably reuses an existing login instead of prompting again — TODO confirm.
notebook_login(new_session=False)

User is already logged in.


## Load data

In [3]:
%%time
dataset = load_dataset("facebook/voxpopuli", "pl", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
len(dataset)

CPU times: user 206 ms, sys: 6.78 ms, total: 213 ms
Wall time: 2.48 s


34665

## Preprocessing

In [4]:
# load preprocessor
# Hub id of the pretrained checkpoint; reused later to load the model itself.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
# Tokenizer half of the processor, used below for the vocabulary comparison.
tokenizer = processor.tokenizer

In [5]:
# Match dataset vocab
def extract_all_chars(batch):
    """Collect the distinct characters used in a batch of transcripts.

    Every entry of ``batch["normalized_text"]`` is joined with single spaces.
    Returns a dict with the unique characters (``"vocab"``) and the joined
    text (``"all_text"``), each wrapped in a one-element list.
    """
    joined = " ".join(batch["normalized_text"])
    unique_chars = [*{*joined}]
    return {"vocab": [unique_chars], "all_text": [joined]}


# Run the character extractor over the dataset in batched mode, dropping all
# original columns so only "vocab" and "all_text" remain.
# batch_size=-1 is intended to hand the whole dataset over as one batch (see datasets docs).
vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset.column_names,
)

# Show the characters that occur in the transcripts but are missing from the
# tokenizer's vocabulary.
dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = set(tokenizer.get_vocab())
print(dataset_vocab.difference(tokenizer_vocab))

# (source, target) substitutions for characters the tokenizer does not know;
# cleanup_text applies them in the order listed here.
replacements = [
    ("1", "l"),
    ("ó", "u"),
    ("ą", "on"),
    ("ć", "ch"),
    ("ę", "en"),
    ("ł", "w"),
    ("ń", "ny"),
    ("ś", "sh"),
    ("ź", "zh"),
    ("ż", "zh"),
]


def cleanup_text(inputs):
    """Apply every ``(src, dst)`` pair from ``replacements`` to the transcript.

    ``inputs["normalized_text"]`` is overwritten with the substituted text and
    the same ``inputs`` object is returned.
    """
    text = inputs["normalized_text"]
    for old, new in replacements:
        text = text.replace(old, new)
    inputs["normalized_text"] = text
    return inputs


# Apply the character substitutions to every example's "normalized_text".
dataset = dataset.map(cleanup_text)

Map:   0%|          | 0/34665 [00:00<?, ? examples/s]

{'ą', 'ś', 'ę', '1', 'ł', 'ć', 'ó', 'ż', 'ź', ' ', 'ń'}


In [6]:
# Filter by number of examples per speaker: first tally how many rows each
# speaker_id contributes.
speaker_counts = defaultdict(int)
for spk in dataset["speaker_id"]:
    speaker_counts[spk] = speaker_counts[spk] + 1
    
def select_speaker(speaker_id):
    """Return True when the speaker has between 100 and 400 examples, inclusive.

    The count is looked up in the module-level ``speaker_counts`` mapping.
    """
    n_examples = speaker_counts[speaker_id]
    return n_examples >= 100 and n_examples <= 400

# Keep only rows whose speaker passes select_speaker, then display
# (number of distinct speakers, number of rows) for what is left.
dataset = dataset.filter(select_speaker, input_columns=["speaker_id"])
len(set(dataset["speaker_id"])), len(dataset)

(41, 9581)

In [7]:
# Embeddings for speakers
# Identifier of the pretrained SpeechBrain speaker-embedding model.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    # NOTE(review): the "/tmp" prefix is hard-coded and not portable to
    # systems without that directory.
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    """Compute a normalised speaker embedding for one waveform.

    The waveform is converted to a tensor, encoded with ``speaker_model``,
    normalised along dimension 2, squeezed, moved to the CPU and returned as
    a NumPy array. Gradient tracking is disabled throughout.
    """
    with torch.no_grad():
        encoded = speaker_model.encode_batch(torch.tensor(waveform))
        normalised = torch.nn.functional.normalize(encoded, dim=2)
        return normalised.squeeze().cpu().numpy()

In [8]:
%%time
# Processing dataset
def prepare_dataset(example):
    """Turn one raw example into the features used for training.

    Runs ``processor`` on the transcript (as text) and the waveform (as audio
    target), strips the batch dimension from ``labels``, and attaches the
    speaker embedding computed from the same waveform.
    """
    audio = example["audio"]
    waveform, sampling_rate = audio["array"], audio["sampling_rate"]

    features = processor(
        text=example["normalized_text"],
        audio_target=waveform,
        sampling_rate=sampling_rate,
        return_attention_mask=False,
    )

    # the processor returns labels with a leading batch axis; keep the single item
    features["labels"] = features["labels"][0]

    # x-vector obtained from SpeechBrain via create_speaker_embedding
    features["speaker_embeddings"] = create_speaker_embedding(waveform)

    return features

# Replace every original column with the features returned by prepare_dataset.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

Map:   0%|          | 0/9581 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (984 > 600). Running this sequence through the model will result in indexing errors


CPU times: user 31min 39s, sys: 41min 49s, total: 1h 13min 29s
Wall time: 6min 32s


In [9]:
# Remove too long examples
def is_not_too_long(input_ids, max_input_length=200):
    """Return True when a tokenised example is short enough to keep.

    Args:
        input_ids: Sequence of token ids for one example.
        max_input_length: Exclusive upper bound on ``len(input_ids)``.
            Defaults to 200, the cut-off that used to be hard-coded, so
            existing single-argument callers behave exactly as before.

    Returns:
        bool: ``len(input_ids) < max_input_length``.
    """
    return len(input_ids) < max_input_length


# Drop examples rejected by is_not_too_long, then display how many rows remain.
dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
len(dataset)

Filter:   0%|          | 0/9581 [00:00<?, ? examples/s]

6867

In [10]:
# split data
# Hold out 10% for evaluation. The seed is fixed so a Restart & Run All
# produces the same train/test partition (and comparable validation losses).
dataset = dataset.train_test_split(test_size=0.1, seed=42)

## Fine-tuning

In [11]:
# data collator
@dataclass
class TTSDataCollatorWithPadding:
    """Collate individual TTS examples into one padded training batch.

    Each feature is expected to provide ``"input_ids"``, ``"labels"`` and
    ``"speaker_embeddings"``. Padding is delegated to ``self.processor.pad``.
    """

    # Object exposing ``pad(input_ids=..., labels=..., return_tensors=...)``.
    processor: Any
    # Reduction factor used to trim the target length. When left as None it is
    # read from the notebook-level ``model.config.reduction_factor`` at call
    # time, which is what this collator did before the field existed.
    reduction_factor: Union[int, None] = None

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch.

        Returns the padded batch with padded label positions set to -100, the
        decoder attention mask removed, labels trimmed to a multiple of the
        reduction factor, and ``"speaker_embeddings"`` added as a tensor.
        """
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # collate the inputs and targets into a batch; use the processor this
        # instance was constructed with rather than a same-named global
        batch = self.processor.pad(
            input_ids=input_ids, labels=label_features, return_tensors="pt"
        )

        # replace padding with -100 to ignore loss correctly
        batch["labels"] = batch["labels"].masked_fill(
            batch["decoder_attention_mask"].unsqueeze(-1).ne(1), -100
        )

        # not used during fine-tuning
        del batch["decoder_attention_mask"]

        reduction_factor = self.reduction_factor
        if reduction_factor is None:
            # fall back to the notebook-level model (must exist by call time)
            reduction_factor = model.config.reduction_factor

        # round down target lengths to multiple of reduction factor
        if reduction_factor > 1:
            max_length = max(
                len(feature["input_values"])
                - len(feature["input_values"]) % reduction_factor
                for feature in label_features
            )
            batch["labels"] = batch["labels"][:, :max_length]

        # also add in the speaker embeddings
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch
    
# Collator instance handed to the trainer below.
data_collator = TTSDataCollatorWithPadding(processor=processor)

In [12]:
# load model
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

# disable cache during training since it's incompatible with gradient checkpointing
model.config.use_cache = False
# re-enable the cache for generation: every model.generate call now receives use_cache=True
model.generate = partial(model.generate, use_cache=True)

In [13]:
# training args
# Effective train batch size per device is 8 * 4 = 32 (batch size x accumulation).
# Evaluation and checkpointing both happen every 500 steps, so the best
# checkpoint can be restored at the end.
training_args = Seq2SeqTrainingArguments(
    output_dir="speecht5_finetuned_voxpopuli_pl",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=6000,
    gradient_checkpointing=True,
    # Mixed precision only when a CUDA device exists; the notebook otherwise
    # falls back to CPU (see `device`), where fp16=True is rejected.
    fp16=torch.cuda.is_available(),
    evaluation_strategy="steps",
    per_device_eval_batch_size=2,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
#     report_to=["tensorboard"],
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=False,
)

In [14]:
# Wire the model, both dataset splits and the collator into the trainer, then
# start fine-tuning.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # NOTE(review): the processor is passed in the tokenizer slot, presumably so
    # its files are saved and uploaded with the model — confirm.
    tokenizer=processor,
)
trainer.train()


* 'schema_extra' has been renamed to 'json_schema_extra'


Step,Training Loss,Validation Loss
500,0.5306,0.483449
1000,0.4926,0.459784
1500,0.4896,0.451517
2000,0.4795,0.447135
2500,0.4839,0.443878




KeyboardInterrupt: 

In [15]:
# Push to hub
# Metadata forwarded to push_to_hub as keyword arguments.
kwargs = dict(
    dataset_tags="facebook/voxpopuli",
    dataset="VOXPOPULI",
    model_name="text-to-speech-finetuned-voxpopuli-pl",
    finetuned_from=checkpoint,
    tasks="text-to-speech",
    tags="text-to-speech",
)
trainer.push_to_hub(**kwargs)

model.safetensors:   0%|          | 0.00/578M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.79k [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

'https://huggingface.co/Maksimkrug/speecht5_finetuned_voxpopuli_pl/tree/main/'