In [None]:
%%bash
pip install datasets soundfile SpeechBrain
pip install --upgrade accelerate
pip install git+https://github.com/huggingface/transformers.git
pip install tensorboard

In [None]:
import os
import librosa
from datasets import Dataset, load_dataset, Audio, load_from_disk, concatenate_datasets
from collections import defaultdict
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
from speechbrain.inference.classifiers import EncoderClassifier
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from dataclasses import dataclass
from typing import Any, Dict, List, Union

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor first so its tokenizer can be exposed under a short alias.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tokenizer = processor.tokenizer

# Pretrained text-to-speech model; this is the object handed to the trainer later on.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# HiFi-GAN vocoder checkpoint (not referenced again by the training cells in this notebook).
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

Converting datasets to Hugging Face format

In [None]:
# One entry per speaker: audio_folders[i] holds that speaker's recordings and
# txt_folders[i] holds the transcripts for the same speaker.
# Every audio file must have a transcript file with a corresponding name.
audio_folders = [
    '/path/to/audio/folder',
    '/path/to/audio/folder',
]
txt_folders = [
    '/path/to/transcript/folder',
    '/path/to/transcript/folder',
]

In [None]:
# Build one Hugging Face dataset per speaker folder and save each one to its
# own directory under `save_root`.
sampling_rate = 22050  # rate passed to librosa.load and recorded in every audio entry
save_root = '/path/to/save/locally'  # Specify your path here

speaker_datasets = [[] for _ in range(len(audio_folders))]  # Create a list of records for each speaker

for index, value in enumerate(audio_folders):
    # sorted() makes the record order reproducible; os.listdir order is arbitrary.
    for audio_file in sorted(os.listdir(value)):
        audio_path = os.path.join(value, audio_file)
        if not os.path.isfile(audio_path):
            continue  # ignore sub-directories

        # Transcript lookup: first '<text before the first underscore>.txt',
        # then '<audio file name without extension>.txt' as a fallback.
        txt_path = os.path.join(txt_folders[index], audio_file.split('_')[0] + '.txt')
        if not os.path.exists(txt_path):
            txt_path = os.path.join(txt_folders[index], os.path.splitext(audio_file)[0] + '.txt')
        if not os.path.exists(txt_path):
            # An audio clip without text cannot serve as a text-to-speech training
            # pair, so report it and move on instead of storing an empty transcript.
            print(f"Skipping {audio_path}: no transcript found in {txt_folders[index]}")
            continue

        # Read the transcript; strip the trailing newline and surrounding whitespace
        with open(txt_path, 'r', encoding='utf-8') as txt_file:
            transcript = txt_file.read().strip()

        # Load the audio file at the configured sampling rate
        audio_array, _ = librosa.load(audio_path, sr=sampling_rate)

        speaker_datasets[index].append({
            'audio': {
                'path': audio_path,
                'array': audio_array.tolist(),  # Convert numpy array to list for serialization
                'sampling_rate': sampling_rate
            },
            'speaker_id': index,  # Position of the speaker's folder in audio_folders
            'transcript': transcript,
            'audio_id': audio_file  # or audio_file.split('.')[0] if you want to remove the file extension
        })

    # Convert and save once per speaker, after all of that speaker's files have
    # been collected (doing this inside the file loop rebuilds and rewrites the
    # whole dataset for every single file).
    if not speaker_datasets[index]:
        print(f"No usable audio/transcript pairs found in {value}; nothing saved for speaker {index}")
        continue

    keys = speaker_datasets[index][0].keys()  # Extract dataset keys
    hf_dataset = Dataset.from_dict({key: [dic[key] for dic in speaker_datasets[index]] for key in keys})

    # Each speaker gets its own sub-directory so one speaker's dataset does not
    # overwrite another's; point the load paths of the merge step at these directories.
    hf_dataset.save_to_disk(os.path.join(save_root, f'speaker_{index}'))

    # Optionally, you can push your dataset to the Hugging Face Hub (requires authentication)
    # hf_dataset.push_to_hub('your_dataset_name')


Merging the 2 datasets

In [None]:

# Locations of the two saved datasets and of the combined result
path_to_dataset1 = '/path/to/first/dataset'
path_to_dataset2 = '/path/to/second/dataset'
path_to_merged_dataset = '/path/to/merged/data'

# Read both datasets back from disk
dataset1, dataset2 = (
    load_from_disk(dataset_path)
    for dataset_path in (path_to_dataset1, path_to_dataset2)
)

# Concatenate them into a single dataset
merged_dataset = concatenate_datasets([dataset1, dataset2])

# Optionally keep a copy of the merged dataset on disk
merged_dataset.save_to_disk(path_to_merged_dataset)


In [None]:
# Re-declare the "audio" column with a 16 kHz sampling rate, leaving
# `merged_dataset` itself untouched and binding the result to `dataset`.
dataset = merged_dataset.cast_column(
    column="audio",
    feature=Audio(sampling_rate=16000),
)

Prepare data for training

In [None]:
# SpeechBrain speaker-recognition checkpoint used to compute speaker embeddings
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Load the encoder onto `device`, storing the checkpoint files under /tmp/<model name>
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)

def create_speaker_embedding(waveform):
    """Compute a normalised speaker embedding for one waveform.

    The waveform is converted to a tensor and encoded with the module-level
    `speaker_model`. The encoding is normalised along dimension 2, squeezed,
    moved to the CPU and returned as a NumPy array. No gradients are tracked.
    """
    wav_tensor = torch.tensor(waveform)
    with torch.no_grad():
        encoded = speaker_model.encode_batch(wav_tensor)
        unit_norm = torch.nn.functional.normalize(encoded, dim=2)
        return unit_norm.squeeze().cpu().numpy()

In [None]:
def prepare_dataset(example):
    """Convert one raw dataset row into the features used for training.

    Reads `example["audio"]` (its "array" and "sampling_rate" entries) and
    `example["transcript"]`, runs them through the module-level `processor`,
    and returns the processor output with the batch dimension removed from
    "labels" and a "speaker_embeddings" entry added.
    """
    # accessing the audio column yields the waveform and its sampling rate
    audio = example["audio"]
    waveform = audio["array"]

    # tokenize the transcript and extract the target features from the waveform
    features = processor(
        text=example["transcript"],
        audio_target=waveform,
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # the processor returns labels with a leading batch dimension; keep the single item
    features["labels"] = features["labels"][0]

    # x-vector for this utterance, computed with the SpeechBrain encoder
    features["speaker_embeddings"] = create_speaker_embedding(waveform)

    return features

# Replace the raw columns with the prepared features, then hold out 10% of the
# rows as a test split (fixed seed so the split is repeatable).
dataset = dataset.map(
    prepare_dataset,
    remove_columns=dataset.column_names,
).train_test_split(test_size=0.1, seed=42)

In [None]:
@dataclass
class TTSDataCollatorWithPadding:
    """Collate prepared examples into one padded training batch.

    Each feature is expected to carry "input_ids", "labels" and
    "speaker_embeddings" entries. Padding is delegated to the `processor`
    stored on the instance. The reduction factor is read from the
    module-level `model` (`model.config.reduction_factor`).
    """

    # Object exposing `pad(input_ids=..., labels=..., return_tensors=...)`
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad `features` into a batch and return it.

        The returned batch holds the padded inputs, "labels" with padded
        positions set to -100, and a "speaker_embeddings" tensor. The
        "decoder_attention_mask" produced by padding is removed.
        """
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # collate the inputs and targets into a batch; use the processor this
        # collator was constructed with rather than a module-level global
        batch = self.processor.pad(
            input_ids=input_ids,
            labels=label_features,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        batch["labels"] = batch["labels"].masked_fill(
            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
        )

        # not used during fine-tuning
        del batch["decoder_attention_mask"]

        # round down target lengths to multiple of reduction factor
        reduction_factor = model.config.reduction_factor
        if reduction_factor > 1:
            max_length = max(
                length - length % reduction_factor
                for length in (len(feature["input_values"]) for feature in label_features)
            )
            batch["labels"] = batch["labels"][:, :max_length]

        # also add in the speaker embeddings
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch

In [None]:
# Switch off the model's cache flag before training.
# NOTE(review): presumably required because gradient checkpointing is enabled
# in the training arguments — confirm.
model.config.use_cache = False

# Batch collator that pads with the shared processor
data_collator = TTSDataCollatorWithPadding(processor=processor)

In [None]:
# Hyper-parameters for fine-tuning. Evaluation and checkpointing both run every
# 1000 steps so that the best checkpoint (lowest evaluation loss, since
# greater_is_better=False) can be reloaded when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir="/output/path",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=50000,
    # num_train_epochs=3,
    gradient_checkpointing=True,
    # Mixed precision needs a CUDA device; only request it when one is present
    # so the notebook still runs on the CPU fallback chosen for `device`.
    fp16=torch.cuda.is_available(),
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument, which is deprecated in recent transformers releases.
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=False,
)

In [None]:
# Wire the model, the train/test splits and the padding collator into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer` argument in recent
    # transformers releases; the same tokenizer object is passed as before.
    processing_class=processor.tokenizer,
)

In [None]:
# Load the TensorBoard notebook extension, then open TensorBoard on the log directory.
# NOTE(review): training_args sets output_dir but no logging_dir — confirm this
# path is where the Trainer actually writes its TensorBoard event files.
%load_ext tensorboard
%tensorboard --logdir '/path/to/save/logs'


In [None]:
# Start fine-tuning with the trainer configured in the earlier cells.
trainer.train()