In [27]:
import torch
import soundfile as sf
import numpy as np
import librosa
from TTS.api import TTS
import IPython.display as ipd
import subprocess
import os

# Install required dependencies
!pip install torch soundfile numpy librosa TTS pydub simpleaudio --quiet

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load an offline TTS model
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)

def generate_speech(text, output_path="output.wav"):
    """Synthesize *text* to a WAV file, amplify it, and return a player for it.

    Args:
        text: Text to synthesize. Must be a string containing at least one
            non-whitespace character.
        output_path: Destination of the raw synthesized WAV file.

    Returns:
        Whatever ``play_audio`` returns for the amplified file (named after
        ``output_path`` with a ``_louder`` suffix before the extension), or
        for the raw file when the amplified copy was not produced.

    Raises:
        ValueError: If ``text`` is not a string or is empty/whitespace-only.
    """
    # Reject unusable input before touching the model or the filesystem.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("text must be a non-empty string")

    print("Generating speech...")
    tts.tts_to_file(text=text, file_path=output_path)
    print(f"Audio saved as {output_path}")

    # Derive the amplified file's name from output_path
    # ("output.wav" -> "output_louder.wav") so that different outputs do not
    # overwrite each other's amplified copy.
    stem, ext = os.path.splitext(output_path)
    louder_path = f"{stem}_louder{ext}"

    # increase_volume reports failures by printing only, so a file left over
    # from an earlier run would otherwise be played as if it were the new
    # result. Remove it first; its absence afterwards signals a failed boost.
    try:
        os.remove(louder_path)
    except FileNotFoundError:
        pass
    increase_volume(output_path, louder_path)

    # Fall back to the un-amplified audio when the boost produced no file.
    playable_path = louder_path if os.path.exists(louder_path) else output_path
    return play_audio(playable_path)

def play_audio(file_path):
    """Build an autoplaying ``IPython.display.Audio`` player for *file_path*.

    Returns the player object, or ``None`` when constructing it fails; in
    that case the error is printed rather than raised.
    """
    try:
        player = ipd.Audio(file_path, autoplay=True)
    except Exception as e:
        print(f"Error playing audio: {e}")
        return None
    return player

def increase_volume(input_file, output_file, gain=5.0):
    """Write an amplified copy of an audio file.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path the amplified audio is written to.
        gain: Linear factor every sample is multiplied by.

    Returns:
        True if the amplified file was written; False if loading, processing
        or writing failed (the error is printed, not raised).
    """
    try:
        # sr=None asks librosa to keep the file's own sample rate.
        audio, sr = librosa.load(input_file, sr=None)
        # Hard-limit the amplified samples to [-1.0, 1.0]. Peaks that exceed
        # this range after the gain are flattened (clipped) to the limit.
        audio = np.clip(audio * gain, -1.0, 1.0)
        sf.write(output_file, audio, sr)
    except Exception as e:
        print(f"Error increasing volume: {e}")
        return False
    else:
        print(f"Volume increased: {output_file}")
        return True

# Get user input and generate speech.
# generate_speech() writes the WAV file(s) and returns the result of
# play_audio(): an IPython Audio object, or None if building it failed.
user_text = input("Enter text to generate speech: ")
audio_output = generate_speech(user_text)

# Display & play the audio in Colab.
# This bare expression is intentional: as the last expression of the cell,
# the notebook renders its value as the cell output.
audio_output


 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio P