In [1]:
import nest_asyncio
nest_asyncio.apply()
import asyncio
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np

# Identifiers for the pretrained checkpoints and the speaker-embedding dataset.
TTS_CHECKPOINT = "microsoft/speecht5_tts"
VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"
XVECTOR_DATASET = "Matthijs/cmu-arctic-xvectors"
SPEAKER_INDEX = 7306  # row of the validation split whose x-vector is used as the voice

# Text processor, text-to-speech model and vocoder (loaded once, shared by all cells).
processor = SpeechT5Processor.from_pretrained(TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(VOCODER_CHECKPOINT)

# Speaker embedding: take one x-vector from the dataset and add a leading
# dimension of size 1 via unsqueeze(0).
embeddings_dataset = load_dataset(XVECTOR_DATASET, split="validation")
speaker_xvector = embeddings_dataset[SPEAKER_INDEX]["xvector"]
speaker_embeddings = torch.tensor(speaker_xvector).unsqueeze(0)

async def generate_audio_chunks_with_char_timestamps(
    text: str, chunk_size: int = 4, sample_rate: int = 16000
):
    """Synthesize ``text`` in word chunks and yield audio with approximate character timings.

    The text is split on whitespace, grouped into chunks of ``chunk_size`` words
    (re-joined with single spaces), and each chunk is synthesized with the
    module-level ``processor``, ``model``, ``vocoder`` and ``speaker_embeddings``.

    Timings are an approximation: the chunk's audio duration is divided evenly
    across its characters (spaces included). Start times are relative to the
    beginning of the chunk, not to the beginning of ``text``. Start times and
    durations are rounded to whole milliseconds independently, so
    ``start[i] + duration[i]`` may differ from ``start[i + 1]`` by 1 ms.

    Args:
        text: Text to synthesize. Empty or whitespace-only text yields nothing.
        chunk_size: Number of words per synthesized chunk; must be >= 1.
        sample_rate: Sample rate in Hz used to convert sample counts to
            milliseconds. Defaults to 16000, the rate the original code assumed
            for the model output.

    Yields:
        dict with keys:
            ``"chars"``: list of the chunk's characters,
            ``"char_start_times_ms"``: list of int start offsets within the chunk,
            ``"char_durations_ms"``: list of int per-character durations,
            ``"audio"``: the object returned by ``model.generate_speech``
            (its ``shape[0]`` is taken as the number of samples).

    Raises:
        ValueError: If ``chunk_size`` is less than 1 or ``sample_rate`` is not positive.
    """
    # Fail early with a clear message instead of range()'s "arg 3 must not be
    # zero" (chunk_size == 0) or a silently empty stream (chunk_size < 0).
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate!r}")

    words = text.strip().split()

    for i in range(0, len(words), chunk_size):
        chunk_text = " ".join(words[i:i + chunk_size])

        # Generate speech. This call is synchronous, so the event loop is
        # blocked until the chunk has been synthesized.
        inputs = processor(text=chunk_text, return_tensors="pt")
        speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

        # Approximate character-level timings. chunk_text is never empty here
        # because every chunk holds at least one non-empty word.
        num_chars = len(chunk_text)
        total_samples = speech_chunk.shape[0]
        duration_ms = total_samples / sample_rate * 1000

        # Simple proportional distribution: every character gets an equal share.
        char_durations_ms = [duration_ms / num_chars] * num_chars
        char_start_times_ms = np.cumsum([0] + char_durations_ms[:-1]).tolist()

        yield {
            "chars": list(chunk_text),
            "char_start_times_ms": [round(t) for t in char_start_times_ms],
            "char_durations_ms": [round(d) for d in char_durations_ms],
            "audio": speech_chunk,
        }

        # Brief pause between chunks so other tasks on the loop get a turn.
        await asyncio.sleep(0.01)


  from .autonotebook import tqdm as notebook_tqdm
Fetching 1 files: 100%|█████████████████████████| 1/1 [00:00<00:00, 3530.56it/s]


In [2]:
import sounddevice as sd

async def play_with_char_alignment(text):
    """Synthesize ``text`` chunk by chunk, print each chunk's alignment, then play it.

    For every chunk produced by ``generate_audio_chunks_with_char_timestamps``
    this prints the chunk's characters, their start times and their durations,
    and then plays the chunk's audio at 16000 Hz. Playback is blocking, so the
    next chunk is only requested after the current one has finished playing.
    """
    timing_fields = (
        ("Start times (ms):", "char_start_times_ms"),
        ("Durations (ms):", "char_durations_ms"),
    )
    chunk_stream = generate_audio_chunks_with_char_timestamps(text)
    async for chunk in chunk_stream:
        # Convert the audio first, before anything is printed for this chunk.
        waveform = chunk["audio"].numpy()
        print("Chunk chars:", "".join(chunk["chars"]))
        for label, key in timing_fields:
            print(label, chunk[key])
        sd.play(waveform, samplerate=16000, blocking=True)

# Run in notebook: top-level `await` is valid in a notebook cell; a plain Python
# script would need asyncio.run(play_with_char_alignment(...)) instead.
await play_with_char_alignment("This is an example of alignment data.")


Chunk chars: This is an example
Start times (ms): [0, 78, 156, 235, 313, 391, 469, 548, 626, 704, 782, 860, 939, 1017, 1095, 1173, 1252, 1330]
Durations (ms): [78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78]
Chunk chars: of alignment data.
Start times (ms): [0, 69, 139, 208, 277, 347, 416, 485, 555, 624, 693, 763, 832, 901, 971, 1040, 1109, 1179]
Durations (ms): [69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69]
