## Interface

Install the Requirements

In [None]:
# Quietly (-q) install the pinned Gradio release 3.45.0.
!pip install -q gradio==3.45.0
# Print every installed package with its version.
!pip list

Package                            Version
---------------------------------- --------------------
absl-py                            1.4.0
accelerate                         0.34.2
aiofiles                           23.2.1
aiohappyeyeballs                   2.4.3
aiohttp                            3.10.10
aiosignal                          1.3.1
alabaster                          0.7.16
albucore                           0.0.16
albumentations                     1.4.15
altair                             4.2.2
annotated-types                    0.7.0
anyio                              3.7.1
argon2-cffi                        23.1.0
argon2-cffi-bindings               21.2.0
array_record                       0.5.1
arviz                              0.19.0
astropy                            6.1.4
astropy-iers-data                  0.2024.10.14.0.32.55
astunparse                         1.6.3
async-timeout                      4.0.3
atpublic                           4.1.0
attrs          

Create a requirements.txt file with the specified packages

In [None]:
# Packages written to requirements.txt, one per line and in this order.
# Only numpy and speechbrain carry an explicit version pin.
with open('requirements.txt', 'w') as requirements_file:
    requirements_file.writelines(
        package + '\n'
        for package in (
            'numpy==1.23.5',
            'transformers',
            'datasets',
            'soundfile',
            'torch',
            'torchaudio',
            'sentencepiece',
            'speechbrain==0.5.16',
            'librosa',
            'num2words',
        )
    )

# Install every package listed in the requirements.txt file written above.
!pip install -r requirements.txt

Collecting numpy==1.23.5 (from -r requirements.txt (line 1))
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting datasets (from -r requirements.txt (line 3))
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting speechbrain==0.5.16 (from -r requirements.txt (line 8))
  Downloading speechbrain-0.5.16-py3-none-any.whl.metadata (23 kB)
Collecting num2words (from -r requirements.txt (line 10))
  Downloading num2words-0.5.13-py3-none-any.whl.metadata (12 kB)
Collecting hyperpyyaml (from speechbrain==0.5.16->-r requirements.txt (line 8))
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->-r requirements.txt (line 3))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->-r requirements.txt (line 3))
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting

In [None]:
import gradio as gr
import torch
import soundfile as sf
import os
import numpy as np
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset as load_huggingface_dataset  # Import the original function
from num2words import num2words

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
def load_models():
    """Load the three pretrained pieces needed for synthesis.

    The processor is taken from the ``microsoft/speecht5_tts`` checkpoint,
    the text-to-speech model from the
    ``Meen15kshi/speecht5_finetuned_meenakshi_hindiTTS`` checkpoint and the
    vocoder from ``microsoft/speecht5_hifigan``. The model and the vocoder
    are moved to the module-level ``device``.

    Returns:
        tuple: ``(model, processor, vocoder)`` in that order.
    """
    processor_checkpoint = "microsoft/speecht5_tts"
    text_processor = SpeechT5Processor.from_pretrained(processor_checkpoint)

    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "Meen15kshi/speecht5_finetuned_meenakshi_hindiTTS"
    )
    tts_model = tts_model.to(device)

    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    hifigan = hifigan.to(device)

    return tts_model, text_processor, hifigan


In [None]:
def load_speaker_model():
    """Load the ``speechbrain/spkrec-xvect-voxceleb`` speaker encoder.

    The encoder is configured to run on the module-level ``device`` and its
    files are saved under ``/tmp/speechbrain/spkrec-xvect-voxceleb``.

    Returns:
        The ``EncoderClassifier`` built by ``EncoderClassifier.from_hparams``.
    """
    checkpoint = "speechbrain/spkrec-xvect-voxceleb"
    save_directory = os.path.join("/tmp", checkpoint)
    return EncoderClassifier.from_hparams(
        source=checkpoint,
        run_opts={"device": device},
        savedir=save_directory,
    )

In [None]:
def load_custom_dataset():
    """Return the leading 1/50th of the ``1rsh/tts-rj-hi-karya`` train split.

    The dataset is loaded without a ``split`` argument, the ``"train"`` entry
    is taken, and the first ``len(train) // 50`` rows are selected.

    Returns:
        The subset produced by ``select`` on the train split.
    """
    all_splits = load_huggingface_dataset("1rsh/tts-rj-hi-karya")
    train_split = all_splits["train"]
    subset_length = len(train_split) // 50
    return train_split.select(range(subset_length))

In [None]:
def create_speaker_embedding(waveform, speaker_model):
    """Encode a waveform into a normalized speaker embedding.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``.
        speaker_model: Object exposing ``encode_batch(tensor)``.

    Returns:
        numpy.ndarray: The encoder output, L2-normalized along dimension 2,
        with size-1 dimensions squeezed away, copied to the CPU.
    """
    # Inference only: no gradients are needed for the embedding.
    with torch.no_grad():
        samples = torch.tensor(waveform)
        encoded = speaker_model.encode_batch(samples)
        unit_length = torch.nn.functional.normalize(encoded, dim=2)
        embedding = unit_length.squeeze().cpu().numpy()
    return embedding

In [None]:
def prepare_default_embedding(example, speaker_model):
    """Build a speaker embedding from the first item of ``example``.

    Args:
        example: Indexable collection whose first item has an ``"audio"``
            entry containing an ``"array"`` of samples.
        speaker_model: Speaker encoder forwarded to
            ``create_speaker_embedding``.

    Returns:
        Whatever ``create_speaker_embedding`` returns for that waveform.
    """
    first_item = example[0]
    waveform = first_item["audio"]["array"]
    return create_speaker_embedding(waveform, speaker_model)

In [None]:
def replace_numbers_with_words(text, lang="hi"):
    """Spell out every standalone integer in ``text`` as words.

    The words are produced in Hindi by default. English number words (the
    ``num2words`` default) consist of Latin letters, which
    ``normalize_hindi_text`` deletes, so they would never reach the
    synthesizer.

    Args:
        text: Input string that may contain digit runs.
        lang: Language code forwarded to ``num2words``; defaults to ``"hi"``.

    Returns:
        str: ``text`` with each whole-token digit run replaced by its
        spelled-out form. Digits glued to word characters are left untouched
        because the pattern requires word boundaries on both sides.
    """
    def _spell_out(match):
        return num2words(int(match.group()), lang=lang)

    return re.sub(r'\b\d+\b', _spell_out, text)

In [None]:
def normalize_hindi_text(text):
    """Keep only Devanagari characters, whitespace and apostrophes.

    Every character outside the Unicode block U+0900-U+097F that is neither
    whitespace nor an apostrophe is deleted. This removes punctuation and
    also Latin letters and ASCII digits. Runs of whitespace are then
    collapsed to single spaces and the ends are trimmed.

    Args:
        text: Raw input string.

    Returns:
        str: The cleaned, single-spaced string.
    """
    devanagari_only = re.compile(r'[^\u0900-\u097F\s\']').sub('', text)
    words = devanagari_only.split()
    return ' '.join(words)

# Sample sentence mixing Hindi with punctuation, extra spaces and a Latin word.
hindi_text = "यह एक उदाहरण है... जिसमें अतिरिक्त   whitespace और विराम चिह्न हैं!"

# Cleaned version of the sample.
normalized_hindi_text = normalize_hindi_text(hindi_text)

# Show the sentence before and after normalization.
print("Original Hindi Text:", hindi_text, sep=" ")
print("Normalized Hindi Text:", normalized_hindi_text, sep=" ")

Original Hindi Text: यह एक उदाहरण है... जिसमें अतिरिक्त   whitespace और विराम चिह्न हैं!
Normalized Hindi Text: यह एक उदाहरण है जिसमें अतिरिक्त और विराम चिह्न हैं


In [None]:
def text_to_speech(text, model, processor, vocoder, speaker_embedding):
    """Synthesize speech for ``text`` with a fixed speaker embedding.

    Args:
        text: Text to speak; it is passed through ``normalize_hindi_text``
            before tokenization.
        model: Object exposing
            ``generate_speech(input_ids, speaker_embeddings, vocoder=...)``.
        processor: Callable turning ``text=...`` into model inputs that
            contain ``"input_ids"``.
        vocoder: Vocoder forwarded to ``generate_speech``.
        speaker_embedding: Speaker vector as a NumPy array (which is what
            ``create_speaker_embedding`` returns) or a tensor. A 1-D vector
            gets a leading batch dimension added.

    Returns:
        tuple: ``(16000, waveform)`` where ``waveform`` is a NumPy array and
        16000 is the sample rate reported to the caller.
    """
    # Use a distinct local name: assigning to ``normalize_hindi_text`` here
    # would shadow the function and raise UnboundLocalError on the call.
    normalized_text = normalize_hindi_text(text)

    # Prepare the input for the model
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    # NumPy arrays have no ``unsqueeze``; convert to a float tensor on the
    # same device as the model inputs before adding the batch dimension.
    speaker_embeddings = torch.as_tensor(
        speaker_embedding, dtype=torch.float32, device=device
    )
    if speaker_embeddings.dim() == 1:
        speaker_embeddings = speaker_embeddings.unsqueeze(0)

    # Generate speech
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

    speech_np = speech.cpu().numpy()
    return (16000, speech_np)

In [None]:
def main():
    """Load all models, build the Gradio demo and launch it with a share link.

    The default speaker embedding is computed once from the first item of
    the loaded dataset subset and reused for every request.
    """
    model, processor, vocoder = load_models()
    speaker_model = load_speaker_model()
    example = load_custom_dataset()
    default_embedding = prepare_default_embedding(example, speaker_model)

    def synthesize(text):
        # Bind the loaded models so the UI callback only needs the text.
        return text_to_speech(text, model, processor, vocoder, default_embedding)

    text_input = gr.Textbox(label="Enter text to convert to speech")
    audio_output = gr.Audio(label="Generated Speech", type="numpy")

    iface = gr.Interface(
        fn=synthesize,
        inputs=[text_input],
        outputs=[audio_output],
        title="Hindi SpeechT5 Text-to-Speech Demo",
        description="Enter Your text, and listen to the generated speech.",
    )
    iface.launch(share=True)

# Launch the demo only when this code runs as the main program.
if __name__ == "__main__":
    main()

  torch.load(path, map_location=device), strict=False
  stats = torch.load(path, map_location=device)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
IMPORTANT: You are using gradio version 3.45.0, however version 4.44.1 is available, please upgrade.
--------
Running on public URL: https://a8c5d415849a5a382a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


## Evaluation

In [1]:
# Five scores per criterion for the fine-tuned model.
# NOTE(review): these look like hand-entered ratings on a 1-5 scale rather
# than computed metrics (the "Wer" values are not a 0-1 error rate) - confirm.
Wer = [4, 3.7, 3.5, 4.5, 4.3]
Mos_Ratings = [2, 2.5, 1.5, 1.5, 2.5]
Naturalness = [2.5, 2.4, 2.5, 2.2, 2.4]
Intelligibility = [2.5, 2.8, 2.6, 2.3, 2.3]
Pronounciation = [3, 2.5, 3.5, 3.5, 3.5]

# Arithmetic mean of each criterion.
Wer_Score = sum(Wer) / len(Wer)
Mos_Score = sum(Mos_Ratings) / len(Mos_Ratings)
Naturalness_Score = sum(Naturalness) / len(Naturalness)
Intelligibility_Score = sum(Intelligibility) / len(Intelligibility)
Pronounciation_Score = sum(Pronounciation) / len(Pronounciation)

# Overall rating: mean of the four quality scores (Wer_Score is not included).
Rate = (Mos_Score + Naturalness_Score + Intelligibility_Score + Pronounciation_Score) / 4

# The last line reports this model's own rating, so it must not be labelled
# "CoquiTTS"; that label belongs to the separate Coqui evaluation.
Model_Report = [
    f"Word Error Rate of the Model: {Wer_Score}",
    f"Mean Opinion Score (MOS): {Mos_Score}",
    f"Naturalness Score: {Naturalness_Score}",
    f"Intelligibility Score: {Intelligibility_Score}",
    f"Pronounciation Score: {Pronounciation_Score}",
    f"Rate of the Model: {Rate}",
]
print("\n".join(Model_Report))

Word Error Rate of the Model: 4.0
Mean Opinion Score (MOS): 2.0
Naturalness Score: 2.4000000000000004
Intelligibility Score: 2.5
Pronounciation Score: 3.2
Rate of CoquiTTS: 2.5250000000000004


## Evaluation for Coqui-TTS

You can try the Coqui XTTS demo [here](https://huggingface.co/spaces/coqui/xtts).

In [2]:
# Five scores per criterion for Coqui TTS; every entry is 5.
Wer_CoquiTTS = [5] * 5
Mos_Ratings_CoquiTTS = [5] * 5
Naturalness_CoquiTTS = [5] * 5
Intelligibility_CoquiTTS = [5] * 5
Pronounciation_CoquiTTS = [5] * 5

# Arithmetic mean of each criterion.
Wer_CoquiTTS_Score = sum(Wer_CoquiTTS) / len(Wer_CoquiTTS)
Mos_Score_CoquiTTS = sum(Mos_Ratings_CoquiTTS) / len(Mos_Ratings_CoquiTTS)
Naturalness_Score_CoquiTTS = sum(Naturalness_CoquiTTS) / len(Naturalness_CoquiTTS)
Intelligibility_Score_CoquiTTS = sum(Intelligibility_CoquiTTS) / len(Intelligibility_CoquiTTS)
Pronounciation_Score_CoquiTTS = sum(Pronounciation_CoquiTTS) / len(Pronounciation_CoquiTTS)

# Overall rating: mean of the four quality scores (the WER score is excluded).
RateTTS = (
    Mos_Score_CoquiTTS
    + Naturalness_Score_CoquiTTS
    + Intelligibility_Score_CoquiTTS
    + Pronounciation_Score_CoquiTTS
) / 4

print(
    "\n".join(
        [
            f"Word Error Rate of CoquiTTS: {Wer_CoquiTTS_Score}",
            f"Mean Opinion Score (MOS) of CoquiTTS: {Mos_Score_CoquiTTS}",
            f"Naturalness Score of CoquiTTS: {Naturalness_Score_CoquiTTS}",
            f"Intelligibility Score of CoquiTTS: {Intelligibility_Score_CoquiTTS}",
            f"Pronounciation Score of CoquiTTS: {Pronounciation_Score_CoquiTTS}",
            f"Rate of CoquiTTS: {RateTTS}",
        ]
    )
)

Word Error Rate of CoquiTTS: 5.0
Mean Opinion Score (MOS) of CoquiTTS: 5.0
Naturalness Score of CoquiTTS: 5.0
Intelligibility Score of CoquiTTS: 5.0
Pronounciation Score of CoquiTTS: 5.0
Rate of CoquiTTS: 5.0


## Comparison




In [3]:
# Report which system has the higher overall rating; exact equality is a tie.
print(
    'Both Models are equaly good' if Rate == RateTTS
    else 'Coqui-TTS is better' if Rate < RateTTS
    else 'Model is better'
)

Coqui-TTS is better
