In [None]:
# import os
# hf_home_dir = "./hf_cache"
# os.environ["HF_HOME"] = hf_home_dir  # TODO: decide whether this HF_HOME override should stay commented out

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login.
# NOTE(review): presumably needed for the `push_to_hub` calls later in this
# notebook — confirm.
notebook_login()

In [None]:
from datasets import load_dataset, Audio

# Load the "alexandrainst/nst-da" dataset and cast its "audio" column to an
# Audio feature with sampling_rate=16000, so later cells see 16 kHz audio.
dataset = load_dataset("alexandrainst/nst-da")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# num_train_samples = int(10000 * 0.9)
# num_test_samples = int(10000 * 0.1)

In [None]:
# import random

# random.seed(10)
# train_rands = random.sample(range(len(dataset['train'])), num_train_samples)
# dataset['train'] = dataset['train'].select(train_rands)

# test_rands = random.sample(range(len(dataset['test'])), num_test_samples)
# dataset['test'] = dataset['test'].select(test_rands)

In [None]:
from transformers import SpeechT5Processor

# Load the SpeechT5 processor for the "microsoft/speecht5_tts" checkpoint and
# keep a handle on its tokenizer; the tokenizer vocabulary is compared against
# the dataset's characters further down.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
tokenizer = processor.tokenizer

In [None]:
# Keep only the examples whose transcript contains none of the ASCII digits 0-9.
dataset = dataset.filter(
    lambda text: set("0123456789").isdisjoint(text),
    input_columns=["text"],
)

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters that occur in a batch of transcripts.

    The strings in ``batch["text"]`` are joined with single spaces, so a space
    is part of the result whenever the batch holds more than one transcript.

    Returns:
        ``{"vocab": [chars]}`` where ``chars`` is a list of the unique
        characters (in arbitrary order), wrapped in an outer one-element list.
    """
    unique_chars = set(" ".join(batch["text"]))
    return {"vocab": [list(unique_chars)]}


# Run the character extractor over every split in batched mode. All original
# columns are dropped so only the "vocab" column remains.
# NOTE(review): batch_size=-1 presumably makes each split a single batch, which
# is why only row 0 of "vocab" is read below — confirm.
vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset["train"].column_names,
)

# Characters seen in either split, and the token strings the tokenizer knows.
dataset_vocab = set(vocabs["train"]["vocab"][0]).union(vocabs["test"]["vocab"][0])
tokenizer_vocab = set(tokenizer.get_vocab())

In [None]:
# Characters that occur in the dataset transcripts but are not tokens in the
# tokenizer vocabulary.
missing_vocab = dataset_vocab - tokenizer_vocab

In [None]:
# (source, target) substitution pairs. `cleanup_text` applies them to each
# transcript in list order via `str.replace`.
# NOTE(review): presumably these cover the characters found in `missing_vocab`
# — confirm against the dataset.
replacements = [
    ("&", "og"),
    ("\r", " "),
    ("´", ""),
    ("\\", ""),
    ("¨", " "),
    ("Å", "AA"),
    ("Æ", "AE"),
    ("É", "E"),
    ("Ö", "OE"),
    ("Ø", "OE"),
    ("á", "a"),
    ("ä", "ae"),
    ("å", "aa"),
    ("è", "e"),
    ("î", "i"),
    ("ô", "oe"),
    ("ö", "oe"),
    ("ø", "oe"),
    ("ü", "y"),
]

In [None]:
def cleanup_text(inputs):
    """Apply every pair in the module-level ``replacements`` to ``inputs["text"]``.

    The substitutions run in list order. The example is modified in place and
    the same object is returned.
    """
    text = inputs["text"]
    for old, new in replacements:
        text = text.replace(old, new)
    inputs["text"] = text
    return inputs


# Normalise every transcript by applying `cleanup_text` to each example.
dataset = dataset.map(cleanup_text)

In [None]:
from collections import Counter, defaultdict  # defaultdict stays importable for other cells

# Number of training examples per speaker id.
# Counter does the tallying in one pass. Unlike the previous defaultdict,
# looking up an id that never occurred returns 0 WITHOUT inserting a new key,
# so later lookups cannot silently grow this mapping.
speaker_counts = Counter(dataset["train"]["speaker_id"])

In [None]:
import matplotlib.pyplot as plt

# Histogram of examples per speaker: x = example count, y = number of speakers
# falling in that bin.
fig, ax = plt.subplots()
ax.hist(speaker_counts.values(), bins=20)
ax.set_ylabel("Speakers")
ax.set_xlabel("Examples")
plt.show()

In [None]:
def select_speaker(speaker_id, min_examples=280, max_examples=327):
    """Tell whether a speaker has an acceptable number of training examples.

    Args:
        speaker_id: Key into the module-level ``speaker_counts`` mapping.
        min_examples: Inclusive lower bound on the example count (default 280).
        max_examples: Inclusive upper bound on the example count (default 327).

    Returns:
        True when ``min_examples <= count <= max_examples``. A speaker that is
        not present in ``speaker_counts`` counts as 0 examples.
    """
    # .get() instead of indexing: a lookup must never add a key to
    # speaker_counts (indexing a defaultdict would), and it also works if
    # speaker_counts is a plain dict.
    return min_examples <= speaker_counts.get(speaker_id, 0) <= max_examples

In [None]:
# Keep only the training rows whose speaker passes `select_speaker`.
# Only the "train" split is reassigned here; the "test" split is left as is.
dataset["train"] = dataset["train"].filter(
    select_speaker,
    input_columns=["speaker_id"],
)

In [None]:
# Number of distinct speaker ids in the training split.
len(set(dataset["train"]["speaker_id"]))

In [None]:
# Number of examples in the training split.
len(dataset["train"])

In [None]:
# (speaker_id, example_count) pairs for every speaker whose count lies in the
# inclusive 280..327 window.
speaker_id_examples = [
    (speaker, count) for speaker, count in speaker_counts.items() if 280 <= count <= 327
]

# Same pairs ordered by example count, largest first; display the first 20.
speaker_id_examples_sorted = list(speaker_id_examples)
speaker_id_examples_sorted.sort(key=lambda pair: pair[1], reverse=True)
speaker_id_examples_sorted[:20]

In [None]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# Run models on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Speaker encoder used by `create_speaker_embedding`; its files are stored
# under /tmp/<spk_model_name>.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    """Encode a waveform with the module-level ``speaker_model``.

    The waveform is converted to a tensor and passed to
    ``speaker_model.encode_batch``. The result is L2-normalised along dim 2,
    squeezed, moved to the CPU and returned as a NumPy array. Everything runs
    with gradient tracking disabled.
    """
    with torch.no_grad():
        wav_tensor = torch.tensor(waveform)
        raw_embedding = speaker_model.encode_batch(wav_tensor)
        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)
        return unit_embedding.squeeze().cpu().numpy()

In [None]:
from speechbrain.pretrained import SpectralMaskEnhancement

metricgan_model_name = "speechbrain/metricgan-plus-voicebank"


# Enhancement model used by `enhance_audio`; its files are stored under
# /tmp/<metricgan_model_name>. Relies on `os` and `device` defined in an
# earlier cell.
enhance_model = SpectralMaskEnhancement.from_hparams(
    source=metricgan_model_name,
    savedir=os.path.join("/tmp", metricgan_model_name),
    run_opts={"device": device},
)


def enhance_audio(waveform):
    """Run a single waveform through the module-level ``enhance_model``.

    The waveform is turned into a float32 tensor of shape (1, n) and passed to
    ``enhance_model.enhance_batch`` with a relative length of 1.0. The output
    is squeezed, moved to the CPU and returned as a NumPy array.
    """
    batch = torch.tensor(waveform).float().reshape(1, -1)
    relative_lengths = torch.tensor([1.0])
    cleaned = enhance_model.enhance_batch(batch, lengths=relative_lengths)
    return cleaned.squeeze().cpu().numpy()

In [None]:
from resemble_enhance.enhancer.inference import enhance
from IPython.utils import io


def mega_enhance_audio(
    waveform, sr, device=None, nfe=64, solver="midpoint", lambd=0.9, tau=0.95
):
    """Enhance a waveform with resemble-enhance's ``enhance`` function.

    Args:
        waveform: Audio samples; converted to a float32 tensor before the call.
        sr: Sampling rate of ``waveform``, forwarded to ``enhance``.
        device: Device passed to ``enhance``. ``None`` (the default) selects
            "cuda" when CUDA is available and "cpu" otherwise, so the default
            call also works on CPU-only machines.
        nfe, solver, lambd, tau: Forwarded unchanged to ``enhance``.

    Returns:
        ``(enhanced, new_sr)``: the enhanced audio as a NumPy array on the CPU
        and the sampling rate reported by ``enhance``. Callers must not assume
        ``new_sr == sr``.
    """
    if device is None:
        # Resolve at call time instead of hard-coding "cuda" in the signature.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    tensor = torch.tensor(waveform).float()
    # Anything printed/displayed during the call is captured and discarded.
    with io.capture_output():
        enhanced, new_sr = enhance(
            tensor, sr, device, nfe=nfe, solver=solver, lambd=lambd, tau=tau
        )
    return enhanced.cpu().numpy(), new_sr

In [None]:
# Hub repository that receives the normalised dataset.
dataset_id = "JackismyShephard/nst-da-norm"

# Upload the train and test splits separately, each with its own commit message.
dataset["train"].push_to_hub(
    dataset_id, split="train", commit_message="add train split"
)

dataset["test"].push_to_hub(dataset_id, split="test", commit_message="add test split")

In [None]:
from huggingface_hub import DatasetCard, DatasetCardData

# Using the Default Template
# Metadata for the dataset card of `dataset_id`.
card_data = DatasetCardData(
    size_categories="100K<n<1M",
    license="cc0-1.0",
    task_categories=["automatic-speech-recognition", "text-to-speech"],
    language="da",
    pretty_name="NST-da Normalized",
    annotations_creators=["machine-generated", "expert-generated"],
    language_creators=["expert-generated"],
    multilinguality="monolingual",
    source_datasets="extended",
)
# Build the card from the metadata (no custom template is passed) and push it
# to the same Hub repository as the dataset.
card = DatasetCard.from_template(
    card_data,
)
card.push_to_hub(dataset_id, commit_message="update dataset card")

In [None]:
def _nth_example_of_speaker(speaker_id, index):
    """Return training example number ``index`` among the rows of ``speaker_id``."""
    speaker_rows = dataset["train"].filter(
        lambda spk: spk == speaker_id, input_columns=["speaker_id"]
    )
    return speaker_rows[index]


# One reference utterance per voice, chosen by (speaker id, row index).
female_23_vestjylland = _nth_example_of_speaker(202, 2)
female_24_storkoebenhavn = _nth_example_of_speaker(404, 55)
female_49_nordjylland = _nth_example_of_speaker(419, 1)
male_51_vest_sydsjaelland = _nth_example_of_speaker(475, 1)
male_18_vest_sydsjaelland = _nth_example_of_speaker(83, 17)
male_31_fyn = _nth_example_of_speaker(496, 37)

In [None]:
def _rows_of_speaker(speaker_id):
    """Return the training rows whose ``speaker_id`` column equals ``speaker_id``."""
    return dataset["train"].filter(
        lambda spk: spk == speaker_id, input_columns=["speaker_id"]
    )


# Second set of reference utterances: one indexed row per speaker id.
female_22_oestjylland = _rows_of_speaker(301)[0]
female_24_storkoebenhavn_2 = _rows_of_speaker(404)[0]
female_44_nordjylland = _rows_of_speaker(517)[0]

male_18_vest_syd_sjaelland = _rows_of_speaker(83)[2]
male_31_fyn_2 = _rows_of_speaker(496)[8]
male_55_storkoebenhavn = _rows_of_speaker(43)[0]

In [None]:
# Reference utterance paired with the file name its embedding is saved under.
speaker_list = [
    (female_23_vestjylland, "female_23_vestjylland.npy"),
    (female_24_storkoebenhavn, "female_24_storkoebenhavn.npy"),
    (female_49_nordjylland, "female_49_nordjylland.npy"),
    (male_51_vest_sydsjaelland, "male_51_vest_sydsjaelland.npy"),
    (male_18_vest_sydsjaelland, "male_18_vest_sydsjaelland.npy"),
    (male_31_fyn, "male_31_fyn.npy"),
]

# For each utterance: clean it with `enhance_audio`, then embed the result.
speaker_embeddings_list = []
for speaker, file_name in speaker_list:
    cleaned_audio = enhance_audio(speaker["audio"]["array"])
    speaker_embeddings_list.append((create_speaker_embedding(cleaned_audio), file_name))

In [None]:
from math import gcd

from scipy.signal import resample_poly

# Reference utterance paired with the file name its embedding is saved under.
speaker_list_2 = [
    (female_22_oestjylland, "female_22_oestjylland.npy"),
    (female_24_storkoebenhavn_2, "female_24_storkoebenhavn.npy"),
    (female_44_nordjylland, "female_44_nordjylland.npy"),
    (male_18_vest_syd_sjaelland, "male_18_vest_syd_sjaelland.npy"),
    (male_31_fyn_2, "male_31_fyn.npy"),
    (male_55_storkoebenhavn, "male_55_storkoebenhavn.npy"),
]


def _embed_with_resemble_enhance(example):
    """Enhance one utterance with `mega_enhance_audio` and embed the result.

    `mega_enhance_audio` returns the enhanced audio together with its own
    sampling rate, which need not equal the input rate. The speaker encoder is
    fed audio at the dataset's rate in the MetricGAN+ path, so the enhanced
    audio is resampled back to the example's original rate before embedding.
    Previously the returned rate was discarded.
    """
    audio = example["audio"]
    source_sr = int(audio["sampling_rate"])
    enhanced, enhanced_sr = mega_enhance_audio(audio["array"], source_sr)
    enhanced_sr = int(enhanced_sr)

    if enhanced_sr != source_sr:
        # Polyphase resampling with the smallest integer up/down factors.
        divisor = gcd(source_sr, enhanced_sr)
        enhanced = resample_poly(
            enhanced, source_sr // divisor, enhanced_sr // divisor
        ).astype("float32", copy=False)

    return create_speaker_embedding(enhanced)


speaker_embeddings_list_2 = [
    (_embed_with_resemble_enhance(speaker), file_name)
    for (speaker, file_name) in speaker_list_2
]

In [None]:
from pathlib import Path

# Output directories for the two embedding sets. The trailing slash matters:
# the save cell builds file paths with `root + file_name`.
root_metricgan_plus = "./embeddings/nst-da-metricgan-plus/"
root_resemble_enhance = "./embeddings/nst-da-resemble-enhance/"

# Create both directories (and any missing parents); existing ones are fine.
for embeddings_root in (root_metricgan_plus, root_resemble_enhance):
    Path(embeddings_root).mkdir(parents=True, exist_ok=True)

In [None]:
import numpy as np

# Write every embedding to `<root><file_name>`, MetricGAN+ set first and the
# resemble-enhance set second.
for root, embeddings in (
    (root_metricgan_plus, speaker_embeddings_list),
    (root_resemble_enhance, speaker_embeddings_list_2),
):
    for embedding, file_name in embeddings:
        np.save(root + file_name, embedding)