In [None]:
# Notebook-local directory that is exported as HF_HOME in the next cell.
hf_home_dir = "./hf_cache"

In [None]:
import os

# Export the cache location through the HF_HOME environment variable.
# NOTE(review): presumably this must run before the Hugging Face imports below to take effect — confirm.
os.environ["HF_HOME"] = hf_home_dir

In [None]:
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub (training_args later sets push_to_hub=True).
notebook_login()

In [None]:
from datasets import load_dataset, DatasetDict

# Load the "train" and "test" splits of the NST Danish corpus into a single
# DatasetDict keyed by split name.
dataset = DatasetDict(
    {
        split_name: load_dataset("alexandrainst/nst-da", split=split_name)
        for split_name in ("train", "test")
    }
)

In [None]:
# num_samples = len(dataset)

In [None]:
# import random

# random.seed(10)
# rands = random.sample(range(len(dataset)), num_samples)
# dataset = dataset.select(rands)

In [None]:
from transformers import SpeechT5Processor

# Base checkpoint that is fine-tuned below; its processor supplies the
# tokenizer and the feature extractor used throughout the notebook.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)

In [None]:
# Keep a direct handle on the tokenizer; its vocabulary is compared with the dataset's characters below.
tokenizer = processor.tokenizer

In [None]:
# Drop every example whose transcript contains at least one ASCII digit.
dataset = dataset.filter(
    lambda transcript: set("0123456789").isdisjoint(transcript),
    input_columns=["text"],
)

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters used by a batch of transcripts.

    Args:
        batch: Mapping with a ``"text"`` entry holding an iterable of strings.

    Returns:
        A dict with two single-element lists: ``"vocab"`` wraps the list of
        unique characters (in arbitrary order) and ``"all_text"`` wraps the
        transcripts joined by single spaces.
    """
    joined_text = " ".join(batch["text"])
    unique_chars = [*set(joined_text)]
    return {"vocab": [unique_chars], "all_text": [joined_text]}


# Run the character extraction over each split as one single batch
# (batch_size=-1), dropping the original columns from the result.
vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset["train"].column_names,
)

# Characters seen in either split versus tokens known to the tokenizer.
train_chars = vocabs["train"]["vocab"][0]
test_chars = vocabs["test"]["vocab"][0]
dataset_vocab = set(train_chars) | set(test_chars)
tokenizer_vocab = set(tokenizer.get_vocab().keys())

In [None]:
# Characters that occur in the dataset but are not entries of the tokenizer vocabulary.
missing_vocab = dataset_vocab - tokenizer_vocab

In [None]:
# (source, destination) substitution pairs. They are applied one after another,
# in list order, with str.replace by cleanup_text and replace_danish_letters.
replacements = [
    ("&", "og"),
    ("\r", " "),
    ("´", ""),
    ("\\", ""),
    ("¨", " "),
    ("Å", "AA"),
    ("Æ", "AE"),
    ("É", "E"),
    ("Ö", "OE"),
    ("Ø", "OE"),
    ("á", "a"),
    ("ä", "ae"),
    ("å", "aa"),
    ("è", "e"),
    ("î", "i"),
    ("ô", "oe"),
    ("ö", "oe"),
    ("ø", "oe"),
    ("ü", "y"),
]

In [None]:
def cleanup_text(inputs):
    """Apply every pair of ``replacements`` to the example's transcript.

    Args:
        inputs: Mapping with a string under ``"text"``; it is updated in place.

    Returns:
        The same ``inputs`` object, with ``"text"`` holding the substituted
        string.
    """
    cleaned = inputs["text"]
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    inputs["text"] = cleaned
    return inputs


# Rewrite the transcript of every example in both splits with the substitutions above.
dataset = dataset.map(cleanup_text)

In [None]:
from collections import defaultdict

# Number of training examples per speaker id (missing ids default to 0).
speaker_counts = defaultdict(int)

for speaker_id in dataset["train"]["speaker_id"]:
    speaker_counts[speaker_id] += 1

In [None]:
import matplotlib.pyplot as plt

# Histogram of examples-per-speaker: x axis is the example count, y axis the
# number of speakers falling in each of the 20 bins.
plt.figure()
plt.hist(speaker_counts.values(), bins=20)
plt.ylabel("Speakers")
plt.xlabel("Examples")
plt.show()

In [None]:
def select_speaker(speaker_id):
    """Tell whether a speaker has between 280 and 327 training examples.

    Args:
        speaker_id: Key looked up in the module-level ``speaker_counts``.

    Returns:
        ``True`` when the speaker's example count lies in ``[280, 327]``
        (both bounds inclusive), ``False`` otherwise.
    """
    example_count = speaker_counts[speaker_id]
    return example_count >= 280 and example_count <= 327


# Keep only training examples whose speaker passes select_speaker; the test split is left untouched.
dataset["train"] = dataset["train"].filter(
    select_speaker,
    input_columns=["speaker_id"],
)

In [None]:
# Number of distinct speakers left in the training split.
len(set(dataset["train"]["speaker_id"]))

In [None]:
# Number of training examples left after the speaker filter.
len(dataset["train"])

In [None]:
# (speaker_id, example_count) pairs for speakers inside the same 280..327
# window used by select_speaker.
speaker_id_examples = [
    pair for pair in speaker_counts.items() if 280 <= pair[1] <= 327
]
# Order by example count, largest first, and show the top 20.
speaker_id_examples_sorted = list(speaker_id_examples)
speaker_id_examples_sorted.sort(key=lambda pair: pair[1], reverse=True)
speaker_id_examples_sorted[:20]

In [None]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# Pretrained SpeechBrain model used to compute speaker embeddings.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Use the GPU when one is available; this `device` is reused by later cells.
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    # Model files are stored under /tmp/<model name>.
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for one waveform.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``.

    Returns:
        NumPy array holding the output of ``speaker_model.encode_batch``,
        normalized along dimension 2 and with size-1 dimensions squeezed out.
    """
    # Gradients are never needed for embedding extraction.
    with torch.no_grad():
        raw_embedding = speaker_model.encode_batch(torch.tensor(waveform))
        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)
        return unit_embedding.squeeze().cpu().numpy()

In [None]:
from speechbrain.pretrained import SpectralMaskEnhancement

# Pretrained SpeechBrain enhancement model applied to the audio before feature extraction.
metricgan_model_name = "speechbrain/metricgan-plus-voicebank"


# Loaded on the same `device` as the speaker model; files go under /tmp/<model name>.
enhance_model = SpectralMaskEnhancement.from_hparams(
    source=metricgan_model_name,
    savedir=os.path.join("/tmp", metricgan_model_name),
    run_opts={"device": device},
)


def enhance_audio(waveform):
    """Run one waveform through ``enhance_model`` and return it as NumPy.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``.

    Returns:
        NumPy array with the enhanced signal, size-1 dimensions squeezed out.
    """
    # The model is fed a float batch containing exactly one flattened signal.
    noisy_batch = torch.tensor(waveform).reshape(1, -1).float()
    # NOTE(review): 1.0 presumably marks the whole signal as valid — confirm.
    relative_lengths = torch.tensor([1.0])
    enhanced_batch = enhance_model.enhance_batch(noisy_batch, lengths=relative_lengths)
    return enhanced_batch.squeeze().cpu().numpy()

In [None]:
from datasets import Audio

# Decode audio at the feature extractor's sampling rate, which the rest of the
# notebook assumes to be 16 kHz.
sampling_rate = processor.feature_extractor.sampling_rate
assert sampling_rate == 16000
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [None]:
def _speaker_audio(speaker_id, example_index):
    """Return the ``audio`` entry of one training example of a given speaker.

    Args:
        speaker_id: Value matched against the ``speaker_id`` column.
        example_index: Position of the wanted example among that speaker's rows.
    """
    speaker_rows = dataset["train"].filter(
        lambda candidate: candidate == speaker_id, input_columns=["speaker_id"]
    )
    return speaker_rows[example_index]["audio"]


# One hand-picked reference recording per speaker. Variable names (including
# their spelling) are kept as-is because the next cell refers to them.
female_23_vestjylland_audio = _speaker_audio(202, 2)
female_24_storkoebenhav_audio = _speaker_audio(404, 55)
female_49_nordjylland_audio = _speaker_audio(419, 1)
male_51_vest_sudsjaelland_audio = _speaker_audio(475, 1)
male_18_vest_sydsjaelland_audio = _speaker_audio(83, 17)
male_31_fyn_audio = _speaker_audio(496, 37)

In [None]:
# Pair each reference recording with the file name its embedding is saved under.
speaker_list = [
    (female_23_vestjylland_audio, "female_23_vestjylland.npy"),
    (female_24_storkoebenhav_audio, "female_24_storkoebenhavn.npy"),
    (female_49_nordjylland_audio, "female_49_nordjylland.npy"),
    (male_51_vest_sudsjaelland_audio, "male_51_vest_sudsjaelland.npy"),
    (male_18_vest_sydsjaelland_audio, "male_18_vest_sydsjaelland.npy"),
    (male_31_fyn_audio, "male_31_fyn.npy"),
]

# Enhance each recording first, then embed it — the same order prepare_dataset uses.
speaker_embeddings_list = [
    (create_speaker_embedding(enhance_audio(speaker["array"])), file_name)
    for (speaker, file_name) in speaker_list
]

In [None]:
from pathlib import Path

# Output directory for the reference speaker embeddings; created if missing.
root = "./embeddings/"

Path(root).mkdir(parents=True, exist_ok=True)

In [None]:
import numpy as np

# Write each embedding to <root><file_name>; `root` already ends with a slash.
for embedding, file_name in speaker_embeddings_list:
    np.save(root + file_name, embedding)

In [None]:
def prepare_dataset(example):
    """Turn one raw dataset row into model-ready features.

    Args:
        example: Row with ``"audio"`` (a mapping holding an ``"array"``) and
            ``"text"``.

    Returns:
        The processor output for the text and the enhanced audio, with
        ``"labels"`` stripped of its batch dimension and an added
        ``"speaker_embeddings"`` entry computed from the enhanced audio.
    """
    enhanced_waveform = enhance_audio(example["audio"]["array"])

    features = processor(
        text=example["text"],
        audio_target=enhanced_waveform,
        sampling_rate=sampling_rate,
        return_attention_mask=False,
    )

    # The processor returns batched labels; keep the single item.
    features["labels"] = features["labels"][0]

    # The speaker embedding comes from the same enhanced audio as the labels.
    features["speaker_embeddings"] = create_speaker_embedding(enhanced_waveform)

    return features

In [None]:
# Sanity check: process one training row and list the produced feature names.
processed_example = prepare_dataset(dataset["train"][0])
list(processed_example.keys())

In [None]:
# Shape of the speaker embedding produced for the sample row.
processed_example["speaker_embeddings"].shape

In [None]:
import matplotlib.pyplot as plt

# Show the sample's label array as an image, transposed for display.
plt.figure()
plt.imshow(processed_example["labels"].T)
plt.show()

In [None]:
# Process every row of both splits and drop the original columns, keeping only prepare_dataset's output.
dataset = dataset.map(prepare_dataset, remove_columns=dataset["train"].column_names)

In [None]:
# Exclusive upper bound on the number of input ids per example (see is_not_too_long).
input_length_max = 600

In [None]:
def is_not_too_long(input_ids):
    """Tell whether an example is short enough to keep.

    Args:
        input_ids: Sized sequence of token ids.

    Returns:
        ``True`` when its length is strictly below the module-level
        ``input_length_max``.
    """
    return len(input_ids) < input_length_max


# Remove examples in both splits whose token sequence reaches the length limit.
dataset = dataset.filter(
    is_not_too_long,
    input_columns=["input_ids"],
)

In [None]:
# Training examples remaining after the length filter.
len(dataset["train"])

In [None]:
# Test examples remaining after the length filter.
len(dataset["test"])

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class TTSDataCollatorWithPadding:
    """Collate per-example TTS features into one padded training batch.

    Each feature must provide ``"input_ids"``, ``"labels"`` and
    ``"speaker_embeddings"``. Padding is delegated to ``self.processor.pad``,
    whose result must expose ``"labels"`` and ``"decoder_attention_mask"``.
    """

    # Object whose ``pad`` method batches the token ids and the label frames.
    processor: Any
    # Decoder reduction factor. When left as None it is read from the
    # module-level ``model`` at call time, which is the value previously used.
    reduction_factor: Union[int, None] = None

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Build the batch.

        Args:
            features: One dict per example, as described on the class.

        Returns:
            The padded batch with label padding set to -100, without
            ``"decoder_attention_mask"``, and with ``"speaker_embeddings"``
            stacked into a tensor.
        """
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # Collate inputs and targets with the processor given to THIS collator
        # rather than whatever global happens to be named ``processor``.
        batch = self.processor.pad(
            input_ids=input_ids, labels=label_features, return_tensors="pt"
        )

        # Replace label padding with -100 so the loss ignores those frames.
        batch["labels"] = batch["labels"].masked_fill(
            batch["decoder_attention_mask"].unsqueeze(-1).ne(1), -100
        )

        # Not used during fine-tuning.
        del batch["decoder_attention_mask"]

        reduction_factor = self.reduction_factor
        if reduction_factor is None:
            reduction_factor = model.config.reduction_factor

        # Round target lengths down to a multiple of the reduction factor and
        # trim the label tensor to the longest rounded length.
        if reduction_factor > 1:
            rounded_lengths = [
                length - length % reduction_factor
                for length in (len(item["input_values"]) for item in label_features)
            ]
            batch["labels"] = batch["labels"][:, : max(rounded_lengths)]

        # Also add in the speaker embeddings.
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch

In [None]:
# Collator instance handed to the trainer below.
data_collator = TTSDataCollatorWithPadding(processor=processor)

In [None]:
from transformers import SpeechT5ForTextToSpeech

In [None]:
# Load the pretrained weights of the same checkpoint the processor came from.
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

In [None]:
from functools import partial

# disable cache during training since it's incompatible with gradient checkpointing
model.config.use_cache = False

# re-enable the cache for generation by pre-binding use_cache=True on generate
model.generate = partial(model.generate, use_cache=True)

In [None]:
# Derive the output/Hub name from the last path component of the checkpoint id.
model_name = checkpoint.split("/")[-1]
finetuned_model_name = f"{model_name}-finetuned-nst-da"

In [None]:
# Number of passes over the training split (forwarded as num_train_epochs).
train_epochs = 20

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. Evaluation and checkpointing both happen once per
# epoch and only one checkpoint is kept on disk (save_total_limit=1).
# NOTE(review): no metric_for_best_model is given, so "best model" selection
# relies on the library default with lower-is-better — presumably the eval
# loss; confirm. fp16=True presumably requires a GPU — confirm for CPU runs.
training_args = Seq2SeqTrainingArguments(
    output_dir=finetuned_model_name,
    per_device_train_batch_size=32,
    auto_find_batch_size=True,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_ratio=0.1,
    num_train_epochs=train_epochs,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="epoch",
    per_device_eval_batch_size=32,
    save_strategy="epoch",
    logging_steps=5,
    save_total_limit=1,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=True,
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire model, data splits and collator together. The processor is passed in
# the `tokenizer` slot.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=processor,
)

In [None]:
import os

from transformers.trainer_utils import get_last_checkpoint

# Resume only when the output directory already holds a Trainer checkpoint.
# Passing True to `trainer.train` while no checkpoint exists raises a
# ValueError, which would break the very first run from a fresh kernel.
resume_from_checkpoint = (
    os.path.isdir(training_args.output_dir)
    and get_last_checkpoint(training_args.output_dir) is not None
)

In [None]:
# Start fine-tuning; `resume_from_checkpoint` (set in the previous cell)
# decides whether training continues from a saved checkpoint.
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
# Alternative that always starts from scratch:
# trainer.train()

In [None]:
# NOTE: evaluate explicitly on the test split so that the best model's
# performance is what ends up on the model card.
trainer.evaluate(dataset["test"])

In [None]:
# Model-card metadata forwarded to trainer.push_to_hub in the next cell.
kwargs = {
    "dataset_tags": "alexandrainst/nst-da",
    "dataset": "NST Danish ASR Database",
    "model_name": finetuned_model_name,
    "finetuned_from": checkpoint,
    "tasks": "text-to-speech",
    "language": "da",
}

In [None]:
# Upload the trained model to the Hub together with the metadata above.
trainer.push_to_hub(**kwargs)

In [1]:
import torch
from transformers import pipeline

# Inference pipeline around the fine-tuned model published on the Hub.
# Runs on GPU 0 when CUDA is available and on the CPU otherwise.
pipe = pipeline(
    "text-to-speech",
    model="JackismyShephard/speecht5_tts-finetuned-nst-da",
    use_fast=True,
    device=0 if torch.cuda.is_available() else "cpu",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return self.fget.__get__(instance, owner)()


In [None]:
def replace_danish_letters(text):
    """Apply the module-level ``replacements`` table to a string.

    Args:
        text: String to normalize.

    Returns:
        A copy of ``text`` in which every source of ``replacements`` has been
        substituted by its destination, in list order.
    """
    normalized = text
    for old, new in replacements:
        normalized = normalized.replace(old, new)
    return normalized

In [None]:
# First Danish sample passage, normalized with the same substitutions as the training transcripts.
text = "I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene."
text = replace_danish_letters(text)

In [None]:
# Second Danish sample passage, normalized the same way.
text2 = "Selvom mosser er almindelige, er en lang række mosser dog knyttet snævert til habitater der er i stærk tilbagegang og disse mosser er truede. Kendtest af disse er måske tørvemosserne hvor flere arter kun forekommer i såkaldte højmoser der nu er meget sjældne i Danmark."
text2 = replace_danish_letters(text2)

In [None]:
# Pick one of the reference speaker embeddings saved earlier in the notebook.
root = "./embeddings/"
speaker_embedding_path = root + "male_51_vest_sudsjaelland.npy"

In [None]:
import numpy as np

# Load the embedding and add a leading dimension of size 1 before passing it to the pipeline.
speaker_embedding = np.load(speaker_embedding_path)
speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

In [None]:
# Synthesize `text` with the chosen speaker embedding; later cells read the
# "audio" and "sampling_rate" entries of the result.
forward_params = {"speaker_embeddings": speaker_embedding}
speech_pipe = pipe(text, forward_params=forward_params)

In [None]:
from resemble_enhance.enhancer.inference import denoise, enhance


def enhance_audio2(waveform, sr, device=None):
    """Denoise and then enhance a waveform with resemble-enhance.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``.
        sr: Sampling rate of ``waveform``.
        device: Device passed to ``denoise`` and ``enhance``. When ``None``
            (the default) CUDA is used if available and the CPU otherwise, so
            the function also works on machines without a GPU.

    Returns:
        Tuple ``(enhanced, new_sr)``: the enhanced audio as a NumPy array and
        the sampling rate reported by ``enhance``.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    tensor = torch.tensor(waveform).float()
    denoised, new_sr = denoise(tensor, sr, device)
    enhanced, new_sr = enhance(
        denoised, new_sr, device, nfe=2, solver="midpoint", lambd=0.9, tau=0.95
    )
    enhanced_cpu = enhanced.cpu().numpy()
    return enhanced_cpu, new_sr

In [None]:
# Post-process the synthesized speech; the output sampling rate may differ from the input one.
speech_enhanced, new_sr = enhance_audio2(
    speech_pipe["audio"], speech_pipe["sampling_rate"]
)

In [None]:
# Imported under an alias so the `Audio` name bound earlier by
# `from datasets import Audio` is not rebound; otherwise re-running the
# cast_column cell after this one would pick up the wrong class.
from IPython.display import Audio as DisplayAudio

# Render an audio player for the enhanced speech at its new sampling rate.
DisplayAudio(speech_enhanced, rate=new_sr)