In [None]:
!pip install gradio
!pip install resampy

Collecting gradio
  Downloading gradio-5.25.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [None]:
import os
import sys
import torch
import numpy as np
import gradio as gr
import time
import gdown
import json
import scipy.signal
import resampy
!pip install -q tqdm unidecode gdown resampy gradio
!git clone -q --recursive https://github.com/justinjohn0306/TTS-TT2.git
!git clone -q --recursive https://github.com/justinjohn0306/hifi-gan
sys.path.append('hifi-gan')
sys.path.append('TTS-TT2')
from model import Tacotron2
from hparams import create_hparams
from text import text_to_sequence
from env import AttrDict
from meldataset import mel_spectrogram, MAX_WAV_VALUE
from models import Generator
from denoiser import Denoiser



# Google Drive file ID of the Tacotron2 checkpoint; get_tacotron2() interpolates
# it into the download URL.
tacotron_id = "1--eW5nk5ijbpgBqEt1TdBPr9nopcjuHE"
# NOTE(review): not referenced anywhere else in the visible code -- confirm
# whether this is still needed.
hifigan_id = "universal"


def get_hifigan():
    """Build the HiFi-GAN vocoder on CPU and load its pretrained generator weights.

    Reads the model configuration from ``hifi-gan/config_v1.json``, seeds torch
    with the configured seed, downloads the generator checkpoint unless a
    non-empty copy is already on disk, and wraps the generator in a ``Denoiser``.

    Returns:
        tuple: ``(hifigan, h, denoiser)`` -- the ``Generator`` (eval mode, weight
        norm removed), the ``AttrDict`` built from the JSON config, and the
        ``Denoiser`` created from the generator.

    Raises:
        urllib.error.URLError: (or a subclass) if the checkpoint download fails.
    """
    # Local import: only this function needs it.
    import urllib.request

    conf = os.path.join("hifi-gan", "config_v1.json")
    with open(conf) as f:
        json_config = json.load(f)
    h = AttrDict(json_config)
    torch.manual_seed(h.seed)

    hifigan_model = 'hifimodel_config_v1'
    checkpoint_url = "https://github.com/justinjohn0306/tacotron2/releases/download/assets/g_02500000"
    # Download in Python instead of shelling out to wget: a failure now raises
    # here instead of leaving an empty file for torch.load to choke on.
    # The size check also rejects a zero-byte file left by an earlier failed run.
    already_cached = os.path.isfile(hifigan_model) and os.path.getsize(hifigan_model) > 0
    if not already_cached:
        partial_path = hifigan_model + ".part"
        urllib.request.urlretrieve(checkpoint_url, partial_path)
        # Rename only after a complete download so a partial file is never loaded.
        os.replace(partial_path, hifigan_model)

    cpu = torch.device("cpu")
    hifigan = Generator(h).to(cpu)
    state_dict_g = torch.load(hifigan_model, map_location=cpu)
    hifigan.load_state_dict(state_dict_g["generator"])
    hifigan.eval()
    hifigan.remove_weight_norm()
    denoiser = Denoiser(hifigan, mode="normal")
    return hifigan, h, denoiser


def get_tacotron2():
    """Download the Tacotron2 checkpoint from Google Drive and load it on CPU.

    The checkpoint identified by the module-level ``tacotron_id`` is saved as
    ``MLPTTS`` in the working directory, then loaded into a ``Tacotron2`` model
    built from ``create_hparams()`` with a few overridden settings.

    Returns:
        tuple: ``(model, hparams)`` -- the model in eval mode and the
        hyper-parameter object it was built from.

    Raises:
        RuntimeError: if gdown reports that the download did not succeed.
    """
    tacotron2_model = 'MLPTTS'
    downloaded = gdown.download(
        f'https://drive.google.com/uc?id={tacotron_id}', tacotron2_model, quiet=False
    )
    # gdown returns None when it could not fetch the file. Fail loudly here
    # rather than letting torch.load hit a missing (or stale, older) file.
    if downloaded is None:
        raise RuntimeError(
            f"Failed to download the Tacotron2 checkpoint (Google Drive id {tacotron_id!r})"
        )

    hparams = create_hparams()
    hparams.sampling_rate = 22050
    hparams.max_decoder_steps = 3000
    hparams.gate_threshold = 0.25

    model = Tacotron2(hparams)
    checkpoint = torch.load(tacotron2_model, map_location=torch.device("cpu"))
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model, hparams


# Build both models once, at cell-execution time, so generate_speech() can reuse
# them on every request. get_hifigan() seeds torch's RNG, so keep it first.
hifigan, h, denoiser = get_hifigan()
model, hparams = get_tacotron2()

def generate_speech(text):
    """Synthesize speech for ``text`` with Tacotron2 + HiFi-GAN.

    Pipeline: text -> symbol sequence (``english_cleaners``) -> mel spectrogram
    (Tacotron2) -> waveform (HiFi-GAN) -> denoise -> peak-normalize ->
    resample -> 16-bit PCM.

    Args:
        text: The text to speak.

    Returns:
        tuple: ``(sample_rate, samples)`` where ``sample_rate`` is
        ``int(hparams.sampling_rate * 1.4)`` and ``samples`` is a 1-D
        ``numpy.int16`` array -- the pair that ``gr.Audio`` accepts as output.

    Raises:
        gr.Error: if ``text`` is empty or contains only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # NOTE(review): get_tacotron2() sets hparams.gate_threshold to 0.25, but the
    # decoder attribute is set to 0.5 here on every call -- confirm which value
    # is intended.
    model.decoder.max_decoder_steps = 3000
    model.decoder.gate_threshold = 0.5

    speed_factor = 1.4
    output_rate = int(hparams.sampling_rate * speed_factor)

    with torch.no_grad():
        sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
        # torch.autograd.Variable is a deprecated no-op wrapper; use the tensor directly.
        sequence = torch.from_numpy(sequence).long()
        mel_outputs, mel_outputs_postnet, _, _ = model.inference(sequence)
        y_g_hat = hifigan(mel_outputs_postnet.float())
        audio = y_g_hat.squeeze() * MAX_WAV_VALUE
        audio_denoised = denoiser(audio.view(1, -1), strength=35)[:, 0]

    audio_denoised = audio_denoised.cpu().numpy().reshape(-1)

    # Soft peak normalization. Skip it for empty or all-zero audio, where the
    # original divided by zero and produced inf/NaN samples.
    peak = np.max(np.abs(audio_denoised)) if audio_denoised.size else 0.0
    if peak > 0:
        audio_denoised = audio_denoised * (MAX_WAV_VALUE / peak) ** 0.9

    # NOTE(review): the audio is resampled to `output_rate` AND `output_rate` is
    # reported as the playback rate, so the clip's duration should be unchanged.
    # If a real 1.4x speed-up is intended, confirm and change one of the two.
    audio_resampled = resampy.resample(audio_denoised, hparams.sampling_rate, output_rate)

    # Clip before the cast: values outside the int16 range (e.g. resampling
    # overshoot, or a sample at exactly +32768) would otherwise wrap around.
    int16_range = np.iinfo(np.int16)
    pcm = np.clip(audio_resampled, int16_range.min, int16_range.max).astype(np.int16)
    return (output_rate, pcm)


# Minimal web UI: one text box in, one audio player out.
iface = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=gr.Audio(label="Generated Speech"),
    # `allow_flagging` is deprecated as of Gradio 5; `flagging_mode` replaces it.
    flagging_mode="never",
)

# debug=True keeps the cell running so errors and logs show up in the notebook.
iface.launch(debug=True)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m225.3/235.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

  return s in _symbol_to_id and s is not '_' and s is not '~'
  return s in _symbol_to_id and s is not '_' and s is not '~'


--2025-04-12 17:21:49--  https://github.com/justinjohn0306/tacotron2/releases/download/assets/g_02500000
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/486136587/690aeb04-57b2-43dc-9544-35c0c3b3ec29?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250412%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250412T172149Z&X-Amz-Expires=300&X-Amz-Signature=092f0d86b7d6b2d34f1b307fde83a458fd72f6564c4f732236b9391f4309d9c9&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dg_02500000&response-content-type=application%2Foctet-stream [following]
--2025-04-12 17:21:49--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/486136587/690aeb04-57b2-43dc-9544-35c0c3b3ec29?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credent

  WeightNorm.apply(module, name, dim)


Removing weight norm...


Downloading...
From (original): https://drive.google.com/uc?id=1--eW5nk5ijbpgBqEt1TdBPr9nopcjuHE
From (redirected): https://drive.google.com/uc?id=1--eW5nk5ijbpgBqEt1TdBPr9nopcjuHE&confirm=t&uuid=2bf1efd9-9a43-496e-ae90-1cac62c959a8
To: /content/MLPTTS
100%|██████████| 338M/338M [00:02<00:00, 130MB/s]


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://07904c08f843fc4603.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
