In [1]:
from speechbrain.inference import Tacotron2, HIFIGAN

In [2]:
import torchaudio
import librosa
import numpy as np

In [10]:
import warnings
warnings.filterwarnings("ignore")

In [11]:
import os

# Opt out of the Hugging Face Hub "symlinks not supported" cache warning.
# NOTE(review): this is set after `speechbrain` has already been imported at the
# top of the notebook; if huggingface_hub reads the variable at import time the
# setting may come too late to have any effect — confirm, and move it above the
# imports if so.
os.environ.update(HF_HUB_DISABLE_SYMLINKS_WARNING="1")

In [19]:
# Pretrained text-to-mel model and vocoder, both referenced by their LJSpeech
# Hub identifiers. `savedir` names the local directory handed to `from_hparams`
# for each model's files.
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmpdir_tts",
)
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="tmpdir_vocoder",
)

In [13]:
# Loader for the reference recording. Despite its name, no speaker features are
# computed here: the raw waveform is returned as loaded.
def extract_speaker_features(input_audio_path):
    """Load an audio file and return its waveform.

    Args:
        input_audio_path: Path passed straight to ``torchaudio.load``.

    Returns:
        The waveform returned by ``torchaudio.load``. The sample rate that is
        returned alongside it is discarded.
    """
    signal, _unused_rate = torchaudio.load(input_audio_path)
    # The message is only printed once the file has been loaded successfully.
    print(f"Extracting features from: {input_audio_path}")
    return signal

In [20]:
# Synthesize speech for a piece of text with the pretrained Tacotron2 + HiFi-GAN
# models loaded in this notebook and write the result to an audio file.
def text_to_speech(input_text, input_audio_path, output_audio_path, sample_rate=22050):
    """Synthesize ``input_text`` and save the audio to ``output_audio_path``.

    The text is turned into a mel spectrogram by the module-level ``tacotron2``
    model and then into a waveform by the module-level ``hifi_gan`` vocoder.

    NOTE: the reference recording is loaded but nothing derived from it is
    passed to either model, so the generated voice does NOT depend on
    ``input_audio_path``. Matching the reference speaker would require a
    speaker-conditioned (multi-speaker / voice-cloning) model instead of the
    two models used here.

    Args:
        input_text: Non-empty text to be spoken.
        input_audio_path: Path to the reference recording. It is only loaded
            (see the note above).
        output_audio_path: Destination path for the generated audio file.
        sample_rate: Sample rate written to the output file. Defaults to 22050,
            the value previously hard-coded here; presumably the output rate of
            the vocoder — confirm before changing.

    Raises:
        ValueError: If ``input_text`` is not a string or contains only
            whitespace.
    """
    # Reject unusable text before any model or file work is done.
    if not isinstance(input_text, str) or not input_text.strip():
        raise ValueError("input_text must be a non-empty string")

    # Kept for backward compatibility: callers still get the log line and any
    # load error for the reference file. The returned waveform is intentionally
    # unused because neither model accepts speaker conditioning here.
    extract_speaker_features(input_audio_path)

    # Text -> mel spectrogram; the length and alignment outputs are not needed.
    mel_output, _mel_length, _alignment = tacotron2.encode_text(input_text)

    # Mel spectrogram -> waveform.
    waveforms = hifi_gan.decode_batch(mel_output)

    # assumes decode_batch returns (batch, 1, time), so squeeze(1) yields the
    # 2-D tensor torchaudio.save expects — TODO confirm.
    # detach()/cpu() make the save independent of the device the models run on.
    audio = waveforms.squeeze(1).detach().cpu()
    torchaudio.save(output_audio_path, audio, sample_rate)
    print(f"Audio saved to {output_audio_path}")


In [21]:
# Example inputs for the synthesis call below.
# Reference recording, passed as the `input_audio_path` argument.
input_audio_file: str = "voiceinput.wav"
# Sentence to be synthesized.
input_text: str = "Hello, this is the text I want to be read out loud in my voice."
# Where the generated audio is written.
output_audio_file: str = "output.wav"


In [22]:
# Run the pipeline end to end: synthesize `input_text` and write the result to
# `output_audio_file`.
text_to_speech(
    input_text=input_text,
    input_audio_path=input_audio_file,
    output_audio_path=output_audio_file,
)

Extracting features from: voiceinput.wav
Audio saved to output.wav
