## Interface

Install the Requirements

In [1]:
!pip install -q gradio==3.45.0
!pip list

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.6/50.6 kB[0m [31m191.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.3/298.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.5/144.5 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.9/129.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.6/94.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Create a requirements.txt file with the specified packages

In [None]:
with open('requirements.txt', 'w') as f:
    f.write('numpy==1.23.5\n')
    f.write('transformers\n')
    f.write('datasets\n')
    f.write('soundfile\n')
    f.write('torch\n')
    f.write('torchaudio\n')
    f.write('sentencepiece\n')
    f.write('speechbrain==0.5.16\n')
    f.write('librosa\n')
    f.write('num2words\n')

!pip install -r requirements.txt

In [None]:
import gradio as gr
import torch
import soundfile as sf
import os
import numpy as np
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset as load_huggingface_dataset  # Import the original function
from num2words import num2words

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
def load_models(
    model_checkpoint="Meen15kshi/speecht5_finetuned_meenakshiTTS",
    processor_checkpoint="microsoft/speecht5_tts",
    vocoder_checkpoint="microsoft/speecht5_hifigan",
):
    """Load the SpeechT5 text-to-speech model, its processor, and the vocoder.

    Args:
        model_checkpoint: Checkpoint id passed to
            ``SpeechT5ForTextToSpeech.from_pretrained``.
        processor_checkpoint: Checkpoint id passed to
            ``SpeechT5Processor.from_pretrained``.
        vocoder_checkpoint: Checkpoint id passed to
            ``SpeechT5HifiGan.from_pretrained``.

    Returns:
        A ``(model, processor, vocoder)`` tuple. The model and the vocoder are
        moved to the module-level ``device``; the processor is returned as
        loaded.
    """
    processor = SpeechT5Processor.from_pretrained(processor_checkpoint)
    model = SpeechT5ForTextToSpeech.from_pretrained(model_checkpoint).to(device)
    vocoder = SpeechT5HifiGan.from_pretrained(vocoder_checkpoint).to(device)
    return model, processor, vocoder


In [None]:
def load_speaker_model():
    """Load the SpeechBrain ``EncoderClassifier`` used for speaker embeddings.

    The classifier is built from ``speechbrain/spkrec-xvect-voxceleb``, told to
    run on the module-level ``device``, and given a save directory under
    ``/tmp`` named after the checkpoint.

    Returns:
        The object returned by ``EncoderClassifier.from_hparams``.
    """
    checkpoint = "speechbrain/spkrec-xvect-voxceleb"
    cache_dir = os.path.join("/tmp", checkpoint)
    return EncoderClassifier.from_hparams(
        source=checkpoint,
        run_opts={"device": device},
        savedir=cache_dir,
    )

In [None]:
def load_custom_dataset(
    dataset_name="Yassmen/TTS_English_Technical_data",
    split="train",
    index=304,
):
    """Return a single example from a Hugging Face dataset.

    Args:
        dataset_name: Dataset id passed to ``load_huggingface_dataset``.
        split: Name of the split to read the example from.
        index: Position of the example inside that split.

    Returns:
        The example found at ``index`` in the requested split.
    """
    # Request only the split that is read, instead of building every split
    # and then indexing into the resulting dictionary.
    dataset_split = load_huggingface_dataset(dataset_name, split=split)
    return dataset_split[index]

In [None]:
def create_speaker_embedding(waveform, speaker_model):
    """Compute a normalized speaker embedding for one waveform.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``.
        speaker_model: Object exposing ``encode_batch``.

    Returns:
        The output of ``encode_batch``, L2-normalized along dimension 2 and
        with all size-1 dimensions squeezed out.
    """
    # Add a leading batch dimension and move the samples to the target device.
    batch = torch.tensor(waveform).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = speaker_model.encode_batch(batch)
        embedding = torch.nn.functional.normalize(embedding, dim=2)
        return embedding.squeeze()

In [None]:
def prepare_default_embedding(example, speaker_model):
    """Build the speaker embedding for a dataset example.

    Args:
        example: Mapping whose ``"audio"`` entry holds the samples under
            ``"array"``.
        speaker_model: Speaker encoder forwarded to
            ``create_speaker_embedding``.

    Returns:
        Whatever ``create_speaker_embedding`` returns for the example's samples.
    """
    waveform = example["audio"]["array"]
    return create_speaker_embedding(waveform, speaker_model)

In [None]:
def replace_numbers_with_words(text):
    """Spell out every standalone number in ``text`` using ``num2words``.

    Whole numbers are converted as ``int``. A number with a fractional part
    (for example "3.5") is matched as one token and converted as ``float``, so
    it is read as a single value rather than as two integers around a period.
    Digits embedded in a word (for example "abc123") are left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with each matched number replaced by its spelled-out form.
    """
    def replace(match):
        token = match.group()
        number = float(token) if "." in token else int(token)
        return num2words(number)

    # The fractional part is optional and must contain digits, so a sentence
    # ending in "7." still matches only "7" and keeps its period.
    return re.sub(r'\b\d+(?:\.\d+)?\b', replace, text)

In [None]:
def normalize_text(text):
    """Normalize raw input text before it is tokenized for synthesis.

    Steps, in order: lowercase the text, spell out numbers through
    ``replace_numbers_with_words``, turn hyphens that join two word characters
    into spaces, then drop every remaining character that is neither a word
    character nor whitespace.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    # Convert to lowercase
    text = text.lower()

    # Replace numbers with words
    text = replace_numbers_with_words(text)

    # Split hyphenated compounds before punctuation is stripped; otherwise
    # "twenty-one" or "well-known" would fuse into "twentyone" / "wellknown".
    text = re.sub(r'(?<=\w)-(?=\w)', ' ', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    return text

In [None]:
def text_to_speech(text, model, processor, vocoder, speaker_embedding):
    """Synthesize speech for ``text`` with a fixed speaker embedding.

    Args:
        text: Raw input text; it is passed through ``normalize_text`` first.
        model: Object exposing ``generate_speech``.
        processor: Callable that tokenizes the text into model inputs.
        vocoder: Vocoder forwarded to ``generate_speech``.
        speaker_embedding: Speaker embedding tensor without a batch dimension.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``sample_rate`` is 16000 and
        ``samples`` is the generated audio as a NumPy array on the CPU.
    """
    cleaned = normalize_text(text)

    # Tokenize the normalized text and move the tensors to the target device.
    encoded = processor(text=cleaned, return_tensors="pt").to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        token_ids = encoded["input_ids"]
        # generate_speech receives the embedding with a leading batch dimension.
        batched_embedding = speaker_embedding.unsqueeze(0)
        waveform = model.generate_speech(token_ids, batched_embedding, vocoder=vocoder)

    return (16000, waveform.cpu().numpy())

In [None]:
def main():
    """Load every model, build the Gradio demo, and launch it.

    The speaker embedding is computed once from the example returned by
    ``load_custom_dataset`` and reused for every request. The interface is
    launched with ``share=True``.
    """
    tts_model, tts_processor, tts_vocoder = load_models()
    speaker_encoder = load_speaker_model()
    reference_example = load_custom_dataset()
    voice_embedding = prepare_default_embedding(reference_example, speaker_encoder)

    text_input = gr.Textbox(label="Enter text to convert to speech")
    audio_output = gr.Audio(label="Generated Speech", type="numpy")

    demo = gr.Interface(
        # Bind the loaded models so the UI callback only takes the text.
        fn=lambda text: text_to_speech(text, tts_model, tts_processor, tts_vocoder, voice_embedding),
        inputs=[text_input],
        outputs=[audio_output],
        title="Technical English SpeechT5 Text-to-Speech Demo",
        description="Enter Your text, and listen to the generated speech.",
    )
    demo.launch(share=True)


if __name__ == "__main__":
    main()



config.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/578M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  torch.load(path, map_location=device), strict=False
  stats = torch.load(path, map_location=device)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
IMPORTANT: You are using gradio version 3.45.0, however version 4.44.1 is available, please upgrade.
--------
Running on public URL: https://a97781830bf40f2b74.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


## Evaluation

In [10]:
# Scores recorded for the fine-tuned model, one entry per evaluated sample.
Wer = [4, 3.5, 2.5, 2, 3]
Mos_Ratings = [3, 3.5, 3.5, 4, 1]
Naturalness = [3, 4, 4, 5, 1]
Intelligibility = [4, 4, 3, 1, 0.5]
Pronounciation = [5, 4, 4, 5, 1.5]

# Arithmetic mean of each metric.
Wer_Score = sum(Wer) / len(Wer)
Mos_Score = sum(Mos_Ratings) / len(Mos_Ratings)
Naturalness_Score = sum(Naturalness) / len(Naturalness)
Intelligibility_Score = sum(Intelligibility) / len(Intelligibility)
Pronounciation_Score = sum(Pronounciation) / len(Pronounciation)

# Overall rating: mean of the four quality scores (the word error rate is excluded).
Rate = (Mos_Score + Naturalness_Score + Intelligibility_Score + Pronounciation_Score) / 4

print(f"Word Error Rate of the Model: {Wer_Score}")
print(f"Mean Opinion Score (MOS): {Mos_Score}")
print(f"Naturalness Score: {Naturalness_Score}")
print(f"Intelligibility Score: {Intelligibility_Score}")
print(f"Pronounciation Score: {Pronounciation_Score}")
# This value belongs to the fine-tuned model, not to CoquiTTS.
print(f"Rate of the Model: {Rate}")

Word Error Rate of the Model: 3.0
Mean Opinion Score (MOS): 3.0
Naturalness Score: 3.4
Intelligibility Score: 2.5
Pronounciation Score: 3.9
Rate of CoquiTTS: 3.2


## Evaluation for Coqui-TTS

You can try the Coqui XTTS demo [here](https://huggingface.co/spaces/coqui/xtts).

In [11]:
# Scores recorded for CoquiTTS, one entry per evaluated sample.
Wer_CoquiTTS = [0] * 5
Mos_Ratings_CoquiTTS = [5] * 5
Naturalness_CoquiTTS = [5] * 5
Intelligibility_CoquiTTS = [5] * 5
Pronounciation_CoquiTTS = [5] * 5

# Arithmetic mean of each metric.
Wer_CoquiTTS_Score = sum(Wer_CoquiTTS) / len(Wer_CoquiTTS)
Mos_Score_CoquiTTS = sum(Mos_Ratings_CoquiTTS) / len(Mos_Ratings_CoquiTTS)
Naturalness_Score_CoquiTTS = sum(Naturalness_CoquiTTS) / len(Naturalness_CoquiTTS)
Intelligibility_Score_CoquiTTS = sum(Intelligibility_CoquiTTS) / len(Intelligibility_CoquiTTS)
Pronounciation_Score_CoquiTTS = sum(Pronounciation_CoquiTTS) / len(Pronounciation_CoquiTTS)

# Overall rating: mean of the four quality scores (the word error rate is excluded).
RateTTS = (
    Mos_Score_CoquiTTS
    + Naturalness_Score_CoquiTTS
    + Intelligibility_Score_CoquiTTS
    + Pronounciation_Score_CoquiTTS
) / 4

print(f"Word Error Rate of CoquiTTS: {Wer_CoquiTTS_Score}")
print(f"Mean Opinion Score (MOS) of CoquiTTS: {Mos_Score_CoquiTTS}")
print(f"Naturalness Score of CoquiTTS: {Naturalness_Score_CoquiTTS}")
print(f"Intelligibility Score of CoquiTTS: {Intelligibility_Score_CoquiTTS}")
print(f"Pronounciation Score of CoquiTTS: {Pronounciation_Score_CoquiTTS}")
print(f"Rate of CoquiTTS: {RateTTS}")

Word Error Rate of CoquiTTS: 0.0
Mean Opinion Score (MOS) of CoquiTTS: 5.0
Naturalness Score of CoquiTTS: 5.0
Intelligibility Score of CoquiTTS: 5.0
Pronounciation Score of CoquiTTS: 5.0
Rate of CoquiTTS: 5.0


## Comparison

In [12]:
# Compare the overall ratings computed in the two evaluation cells and
# report which system scored higher.
if Rate < RateTTS:
    comparison_message = 'Coqui-TTS is better'
elif Rate == RateTTS:
    comparison_message = 'Both Models are equaly good'
else:
    comparison_message = 'Model is better'

print(comparison_message)

Coqui-TTS is better
