<a href="https://colab.research.google.com/github/LuisVMCR/tts-using_transformer_models/blob/main/colab_transformer_TTS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install -q transformers datasets librosa soundfile IPython --use-feature=2020-resolver
!pip install -q pyarrow==14.0.1 requests==2.31.0 --use-feature=2020-resolver


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

option --use-feature: invalid choice: '2020-resolver' (choose from 'fast-deps', 'truststore', 'no-binary-enable-wheel-cache')

Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

option --use-feature: invalid choice: '2020-resolver' (choose from 'fast-deps', 'truststore', 'no-binary-enable-wheel-cache')


In [1]:
import torch
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, AutoProcessor, MusicgenForConditionalGeneration
import soundfile as sf
from datasets import load_dataset
import io
import librosa
import numpy as np
from IPython.display import Audio, display

In [None]:
# Load models
def load_models():
    """Instantiate every pretrained component the narration pipeline relies on.

    Returns:
        A 6-tuple ``(sentiment_analyzer, speech_processor, speech_model,
        vocoder, music_processor, music_model)``, in that order.
    """
    speecht5_checkpoint = "microsoft/speecht5_tts"
    musicgen_checkpoint = "facebook/musicgen-small"

    # Tuple elements are evaluated left to right, so the components are
    # loaded in the same order in which they are returned.
    return (
        pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english"),
        SpeechT5Processor.from_pretrained(speecht5_checkpoint),
        SpeechT5ForTextToSpeech.from_pretrained(speecht5_checkpoint),
        SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan"),
        AutoProcessor.from_pretrained(musicgen_checkpoint),
        MusicgenForConditionalGeneration.from_pretrained(musicgen_checkpoint),
    )

# Bind each loaded component to a notebook-level name used by later cells.
(
    sentiment_analyzer,
    speech_processor,
    speech_model,
    vocoder,
    music_processor,
    music_model,
) = load_models()

In [None]:
# Load speaker embeddings
def load_speaker_embeddings(speaker_index=7306):
    """Fetch one speaker x-vector and add a batch dimension to it.

    Args:
        speaker_index: Row of the ``validation`` split of
            ``Matthijs/cmu-arctic-xvectors`` to read the x-vector from.
            Defaults to 7306, the row this notebook originally hard-coded.

    Returns:
        A ``torch.Tensor`` built from the selected row's ``"xvector"`` field,
        with a leading dimension of size 1 added by ``unsqueeze(0)``.
    """
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    xvector = embeddings_dataset[speaker_index]["xvector"]
    return torch.tensor(xvector).unsqueeze(0)

# Notebook-level speaker embedding; generate_narration passes it to the speech model.
speaker_embeddings = load_speaker_embeddings()

In [4]:
# Function to process long text with sliding window
def process_long_text(text, max_length=512, stride=256):
    """Split ``text`` into sliding windows of tokens.

    The text is tokenized with the notebook-level ``speech_processor``
    tokenizer and cut into windows of at most ``max_length`` tokens whose
    start positions are ``stride`` tokens apart. Windows overlap whenever
    ``stride`` is smaller than ``max_length``.

    Args:
        text: Text to tokenize and split.
        max_length: Maximum number of tokens per window.
        stride: Distance, in tokens, between the starts of consecutive windows.

    Returns:
        A list of token lists; empty when the text produces no tokens.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """
    if max_length <= 0 or stride <= 0:
        raise ValueError("max_length and stride must be positive")

    tokens = speech_processor.tokenizer.tokenize(text)
    token_chunks = []
    for start in range(0, len(tokens), stride):
        token_chunks.append(tokens[start:start + max_length])
        # Once a window reaches the last token, every later window would be a
        # pure subset of it, so stop instead of emitting redundant tails.
        if start + max_length >= len(tokens):
            break
    return token_chunks

In [5]:
# Function to adjust speech parameters based on narration style
def adjust_speech_parameters(style):
    """Look up the speaking rate and pitch offset for a narration style.

    Args:
        style: Narration style name. "excited", "sad", "formal" and "casual"
            have dedicated presets; any other value gets the neutral preset.

    Returns:
        A ``(rate, pitch)`` tuple of floats.
    """
    presets = (
        ("excited", (1.2, 2.0)),
        ("sad", (0.8, -2.0)),
        ("formal", (0.9, 0.5)),
        ("casual", (1.1, -0.5)),
    )
    for preset_name, rate_and_pitch in presets:
        if style == preset_name:
            return rate_and_pitch
    # Unrecognised styles fall back to the neutral rate and pitch.
    return 1.0, 0.0

In [16]:
# Main function to generate narration
def generate_narration(text_input, narration_style, music_prompt):
    """Narrate ``text_input`` with SpeechT5 and mix it over generated music.

    The text is split into consecutive token windows, each window is
    synthesized to speech, the narration style's rate and pitch are applied to
    the joined speech, and a MusicGen clip for ``music_prompt`` is looped
    underneath it at reduced volume.

    Relies on the notebook-level objects ``sentiment_analyzer``,
    ``speech_processor``, ``speech_model``, ``vocoder``,
    ``speaker_embeddings``, ``music_processor`` and ``music_model``.

    Args:
        text_input: Text to narrate.
        narration_style: Style name understood by ``adjust_speech_parameters``.
        music_prompt: Text prompt describing the background music.

    Returns:
        ``(combined_audio, sample_rate)`` where ``combined_audio`` is a NumPy
        array of samples and ``sample_rate`` is 16000, or ``(None, None)`` when
        the text yields no tokens or music generation fails.
    """
    # Use stride == max_length so the windows do not overlap: every window is
    # spoken aloud, and overlapping windows would make the narration repeat text.
    max_chunk_tokens = 512
    token_chunks = process_long_text(
        text_input, max_length=max_chunk_tokens, stride=max_chunk_tokens
    )
    text_chunks = [speech_processor.tokenizer.convert_tokens_to_string(chunk) for chunk in token_chunks]
    if not text_chunks:
        # Nothing to narrate; averaging and concatenation below need >= 1 chunk.
        print("Error generating speech: input text is empty")
        return None, None

    # Sentiment analysis
    sentiments = sentiment_analyzer(text_chunks)
    avg_sentiment = sum(_positive_probability(s) for s in sentiments) / len(sentiments)
    avg_label = 'POSITIVE' if avg_sentiment > 0.5 else 'NEGATIVE'
    print(f"Detected sentiment: {avg_label} (Score: {avg_sentiment:.2f})")

    # Generate speech for each chunk
    rate, pitch = adjust_speech_parameters(narration_style)
    speech_chunks = []
    for chunk in text_chunks:
        inputs = speech_processor(text=chunk, return_tensors="pt")
        speech = speech_model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,
            vocoder=vocoder
        )
        speech_chunks.append(speech.detach().cpu().numpy())

    full_speech = np.concatenate(speech_chunks)
    speech_sr = 16000  # SpeechT5 output sample rate

    # Apply the style before sizing the music: time-stretching changes the
    # number of speech samples the music has to cover.
    full_speech = _apply_narration_style(full_speech, speech_sr, rate, pitch)

    print(f"Speech generated: {len(full_speech)} samples")

    # Generate one fixed-length music clip (its length is set by
    # max_new_tokens); it is looped below to cover the whole narration.
    music_inputs = music_processor(
        text=[music_prompt],
        padding=True,
        return_tensors="pt",
    )
    try:
        audio_values = music_model.generate(**music_inputs, max_new_tokens=1000)  # Fixed number of tokens
    except Exception as e:
        print(f"Error generating music: {str(e)}")
        return None, None

    music_audio = audio_values[0, 0].cpu().numpy()
    music_sr = 32000  # MusicGen output sample rate

    print(f"Music generated: {len(music_audio)} samples")

    if len(music_audio) == 0:
        # An empty clip cannot be looped (it would divide by zero below).
        print("Error generating music: no audio samples were produced")
        return None, None

    # Resample music to match speech sample rate
    music_audio = librosa.resample(music_audio, orig_sr=music_sr, target_sr=speech_sr)

    # Loop the music so it spans the narration exactly; after this both
    # signals have the same number of samples.
    music_audio = _loop_to_length(music_audio, len(full_speech))

    # Combine audio (simple mixing)
    combined_audio = full_speech + music_audio * 0.3  # Reduce music volume

    # Rescale only when the sum leaves [-1, 1], so the mix stays within the
    # usual float-audio range without changing already-safe output.
    peak = float(np.max(np.abs(combined_audio))) if len(combined_audio) else 0.0
    if peak > 1.0:
        combined_audio = combined_audio / peak

    print(f"Combined audio: {len(combined_audio)} samples")

    return combined_audio, speech_sr


def _positive_probability(sentiment):
    """Turn one sentiment-pipeline result into a probability of POSITIVE."""
    score = float(sentiment['score'])
    # The score is the confidence of the predicted label, so a NEGATIVE
    # prediction must be mirrored before scores can be averaged.
    return score if str(sentiment['label']).upper() == 'POSITIVE' else 1.0 - score


def _apply_narration_style(speech, sample_rate, rate, pitch):
    """Time-stretch by ``rate`` and pitch-shift by ``pitch`` semitone steps."""
    # Skip neutral values so the default style leaves the waveform untouched.
    if rate != 1.0:
        speech = librosa.effects.time_stretch(speech, rate=rate)
    if pitch != 0.0:
        speech = librosa.effects.pitch_shift(speech, sr=sample_rate, n_steps=pitch)
    return speech


def _loop_to_length(audio, length):
    """Repeat non-empty ``audio`` until it covers ``length`` samples, then trim."""
    repeats = int(np.ceil(length / len(audio)))
    return np.tile(audio, repeats)[:length]

In [None]:
# Example usage
# Adjacent string literals are joined by the parser; each piece ends with the
# separating space so no two words run together in the text sent to TTS.
text_input = (
    "It's not telepathy, Donovan, Calvin said, her voice a dry rasp. We're dealing with "
    "a form of intelligence that has evolved under completely different pressures than ours. The question "
    "is what, can we find a common ground? Weeks turned into months. The team lowered hydrophones into the "
    "depths, recorded the whales' haunting calls, and analyzed the complex patterns. They tried transmitting "
    "sounds, mathematical sequences, even images. The whales responded, but it was like a conversation between "
    "people who spoke different languages, struggling to find a shared meaning."
)
narration_style = "excited"  # or "sad", "formal", "casual", "default"
music_prompt = "ambient soundscape representing the depths of the ocean"

# Run the full pipeline on the example inputs; both results are None on failure.
output_audio, sample_rate = generate_narration(
    text_input=text_input,
    narration_style=narration_style,
    music_prompt=music_prompt,
)

In [None]:
# Play the generated audio
if output_audio is not None:
    # Audio and display are already imported in the notebook's import cell,
    # so they are not re-imported here.
    display(Audio(data=output_audio, rate=sample_rate))
else:
    print("No audio available to play.")

In [None]:
# Save the audio file
if output_audio is not None:
    # soundfile is already imported as sf in the notebook's import cell,
    # so it is not re-imported here.
    file_name = "final_output.wav"
    sf.write(file_name, output_audio, sample_rate)
    print(f"Audio saved as '{file_name}'")
else:
    print("No audio available to save.")