In [1]:
import gradio as gr
import numpy as np
import torch
from transformers import pipeline

In [2]:
# Run the models on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Bare expression so the notebook displays the selected device.
device

'cuda'

In [3]:
# Hub repository of the fine-tuned text-to-speech checkpoint, together with the
# exact revision to load so that re-running the notebook uses the same weights.
checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"

# Text-to-speech pipeline used by `predict`; it is placed on `device`.
pipe = pipeline(
    task="text-to-speech",
    model=checkpoint_finetuned,
    revision=revision,
    device=device,
    use_fast=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return self.fget.__get__(instance, owner)()


In [4]:
# Directory (relative to the notebook) holding the precomputed speaker embeddings.
embeddings_dir = "../embeddings/nst-da-metricgan-plus/"

# Speaker id -> file name of that speaker's embedding inside `embeddings_dir`.
_embedding_files = {
    "F23": "female_23_vestjylland.npy",
    "F24": "female_24_storkoebenhavn.npy",
    "F49": "female_49_nordjylland.npy",
    "M51": "male_51_vest_sydsjaelland.npy",
    "M18": "male_18_vest_sydsjaelland.npy",
    "M31": "male_31_fyn.npy",
}

# Speaker id -> path of the .npy file that `predict` loads for a preset speaker.
speaker_embeddings = {
    speaker_id: embeddings_dir + file_name
    for speaker_id, file_name in _embedding_files.items()
}

In [5]:
# Integer sample format for the generated audio (16-bit signed samples).
target_dtype = np.int16
# Largest value representable in `target_dtype`; `predict` multiplies the
# synthesized waveform by it before casting the samples to integers.
max_range = np.iinfo(target_dtype).max


# Ordered (source, target) substitutions applied to the input text before
# synthesis. Besides accented and Danish letters the table also covers a few
# symbols; characters that are not listed (lowercase "æ", for example) are
# left untouched.
replacements = [
    # Symbols and control characters.
    ("&", "og"), ("\r", " "), ("´", ""), ("\\", ""), ("¨", " "),
    # Uppercase letters.
    ("Å", "AA"), ("Æ", "AE"), ("É", "E"), ("Ö", "OE"), ("Ø", "OE"),
    # Lowercase letters.
    ("á", "a"), ("ä", "ae"), ("å", "aa"), ("è", "e"), ("î", "i"),
    ("ô", "oe"), ("ö", "oe"), ("ø", "oe"), ("ü", "y"),
]


def replace_danish_letters(text):
    """Return ``text`` with every entry of ``replacements`` applied in order.

    Each source string in the table is replaced everywhere it occurs by its
    target string; the input string itself is not modified.
    """
    normalized = text
    for pair in replacements:
        normalized = normalized.replace(pair[0], pair[1])
    return normalized

In [6]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# Identifier of the SpeechBrain speaker-encoder checkpoint.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Speaker encoder used by `create_speaker_embedding`. `savedir` points at
# /tmp/<model id> and the model is asked to run on `device`.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts=dict(device=device),
)


def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for ``waveform``.

    The waveform is converted to a tensor, encoded with ``speaker_model`` and
    L2-normalized along dimension 2. Gradient tracking is disabled for the
    whole computation.

    Returns:
        The embedding as a NumPy array with singleton dimensions removed.
    """
    with torch.no_grad():
        batch = torch.tensor(waveform)
        embedding = torch.nn.functional.normalize(
            speaker_model.encode_batch(batch), dim=2
        )
        return embedding.squeeze().cpu().numpy()

In [7]:
from speechbrain.pretrained import SpectralMaskEnhancement

# Identifier of the SpeechBrain MetricGAN+ speech-enhancement checkpoint.
metricgan_model_name = "speechbrain/metricgan-plus-voicebank"

# Enhancement model used by `enhance_audio` and `enhance_audio_file`.
# `savedir` points at /tmp/<model id> and the model is asked to run on `device`.
enhance_model = SpectralMaskEnhancement.from_hparams(
    source=metricgan_model_name,
    run_opts=dict(device=device),
    savedir=os.path.join("/tmp", metricgan_model_name),
)


def enhance_audio(waveform):
    """Run ``enhance_model`` on an in-memory waveform.

    The input is flattened into a single-item float batch of shape
    ``(1, n_samples)`` before enhancement.

    Returns:
        The enhanced signal as a NumPy array with singleton dimensions removed.
    """
    batch = torch.tensor(waveform).float().reshape(1, -1)
    # One length entry for the single batch item; presumably 1.0 marks the
    # whole signal as valid (relative length) - TODO confirm.
    batch_lengths = torch.tensor([1.0])
    cleaned = enhance_model.enhance_batch(batch, lengths=batch_lengths)
    return cleaned.squeeze().cpu().numpy()


def enhance_audio_file(file):
    """Run ``enhance_model`` on an audio file and return the result.

    Args:
        file: Path of the audio file to enhance.

    Returns:
        The enhanced signal as a NumPy array with singleton dimensions removed.
    """
    # NOTE(review): SpeechBrain's file loading may cache inputs by file name;
    # confirm that two different uploads sharing a name are not conflated.
    return enhance_model.enhance_file(file).squeeze().cpu().numpy()

In [8]:
def predict(text, speaker, audio=None):
    """Synthesize speech for ``text`` in a preset or a custom voice.

    Args:
        text: Text to synthesize. ``None``, empty or whitespace-only text
            yields an empty clip instead of running the model.
        speaker: Label of the preset speaker. Its first whitespace-separated
            token (e.g. ``"F23"`` in ``"F23 (Female, 23, Vestjylland)"``) must
            be a key of ``speaker_embeddings``. Ignored when ``audio`` is given.
        audio: Optional path to a reference recording. When provided, the
            recording is enhanced and its speaker embedding is used instead of
            a preset one.

    Returns:
        A ``(sampling_rate, waveform)`` tuple where ``waveform`` is a NumPy
        array of integer samples.

    Raises:
        KeyError: If the speaker id taken from ``speaker`` is not a key of
            ``speaker_embeddings``.
    """
    if text is None or not text.strip():
        # Nothing to say: return a zero-length clip. int16 matches the sample
        # format of the non-empty path; the rate is only a placeholder.
        return 16000, np.zeros(0, dtype=np.int16)

    text = replace_danish_letters(text)

    if audio:
        # Custom voice: derive the embedding from the enhanced reference clip.
        speaker_embedding = create_speaker_embedding(enhance_audio_file(audio))
    else:
        # Preset voice: the label starts with the speaker id. Splitting on
        # whitespace avoids assuming that ids are exactly three characters.
        speaker_id = speaker.split(maxsplit=1)[0]
        speaker_embedding = np.load(speaker_embeddings[speaker_id])

    # Add a leading batch dimension before handing the embedding to the model.
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
    sampling_rate = speech["sampling_rate"]

    # Scaling assumes float samples in [-1, 1]. Clip first so that any sample
    # outside that range saturates instead of overflowing the integer cast.
    waveform = np.clip(speech["audio"], -1.0, 1.0)
    waveform = (waveform * max_range).astype(target_dtype)

    return sampling_rate, waveform

In [9]:
# Heading passed to the Gradio interface.
title = "Danish Speech Synthesis"

# Markdown description for the interface; it links to the checkpoint's page on
# the Hugging Face Hub.
description = (
    "Synthesize long-form danish speech from text with the click of a button! "
    f"Demo uses the checkpoint [{checkpoint_finetuned}]"
    f"(https://huggingface.co/{checkpoint_finetuned})"
    " and 🤗 Transformers to synthesize speech."
)

# One example row: the input text followed by the preset speaker label. No
# custom-speaker recording is supplied for it.
examples = [[
    "I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene.",
    "F23 (Female, 23, Vestjylland)",
]]


# Labels of the preset voices offered in the UI. `predict` looks up the
# embedding from the speaker id at the start of the chosen label, so every
# label must begin with a key of `speaker_embeddings` followed by a space.
speaker_choices = [
    "F23 (Female, 23, Vestjylland)",
    "F24 (Female, 24, Storkøbenhavn)",
    "F49 (Female, 49, Nordjylland)",
    "M51 (Male, 51, Vest-sydsjælland)",
    "M18 (Male, 18, Vest-sydsjælland)",
    "M31 (Male, 31, Fyn)",
]

# Gradio interface around `predict`: text plus either a preset speaker or an
# uploaded/recorded reference clip in, synthesized audio out.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Radio(
            label="Preset speaker",
            choices=speaker_choices,
            # The first preset is selected by default.
            value=speaker_choices[0],
        ),
        # Passed to `predict` as a file path.
        gr.Audio(label="Custom speaker", type="filepath"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    examples=examples,
    cache_examples=True,
    allow_flagging="never",
)

Using cache from '/home/cdt/repositories/hugging-face-audio-course/notebooks/gradio_cached_examples/16' directory. If method or examples have changed since last caching, delete this folder to clear cache.





In [10]:
# Start the demo server. With share=True the app is also published under a
# temporary public gradio.live URL (see the cell output below), in addition to
# the local address.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://13f6ccb955dcc332e9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


