In [3]:
!pip install datasets soundfile speechbrain
!pip install git+https://github.com/huggingface/transformers.git
!pip install --upgrade accelerate

Collecting speechbrain
  Downloading speechbrain-1.0.2-py3-none-any.whl.metadata (23 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain)
  Downloading ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Downloading speechbrain-1.0.2-py3-none-any.whl (824 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m824.8/824.8 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ruamel.yaml.clib-0.2.12-cp310-cp31

In [4]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
import torch
import torchaudio
from torchaudio.transforms import MelSpectrogram
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import SpeechT5Tokenizer
from datasets import load_dataset, Audio


processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

----

In [5]:
dataset = load_dataset(
    "facebook/voxpopuli", "en", streaming=True, trust_remote_code=True
)

dataset = {
    'train': dataset["train"],
    'validation': dataset["validation"],
    'test': dataset["test"]
}

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

voxpopuli.py:   0%|          | 0.00/8.84k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/585M [00:00<?, ?B/s]

---

In [26]:
class TTSRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_bins=80, num_layers=2):
        super(TTSRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_bins)  

    def forward(self, text, target_length=None):
        # Embed text input
        embedded_text = self.embedding(text)  # Shape: [batch_size, seq_len, embedding_dim]

        # RNN forward pass
        rnn_out, _ = self.rnn(embedded_text)  # Shape: [batch_size, seq_len, hidden_size]

        # If target_length is provided, we need to adjust the sequence length
        if target_length:
            padding_size = target_length - rnn_out.size(1)
            if padding_size > 0:
                padding = torch.zeros(rnn_out.size(0), padding_size, rnn_out.size(2)).to(rnn_out.device)
                rnn_out = torch.cat([rnn_out, padding], dim=1)

        output = self.fc(rnn_out)  

        return output

---

In [34]:
import torch
from tqdm import tqdm  

for epoch in range(3):
    model.train() 
    running_loss = 0.0

    # Wrap DataLoader with tqdm to show progress
    for i, (text, ground_truth_spectrogram) in tqdm(enumerate(train_loader), 
                                                     desc=f'Epoch {epoch + 1}/{3}', 
                                                     unit='batch'):
        
        text, ground_truth_spectrogram = text.to(device), ground_truth_spectrogram.to(device)
        optimizer.zero_grad()
        target_length = ground_truth_spectrogram.size(1)

        predicted_spectrogram = model(text, target_length=target_length)


        loss = criterion(predicted_spectrogram, ground_truth_spectrogram)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/3], Loss: {running_loss / i}")

Epoch 1/3: 22811batch [1:19:31,  4.78batch/s]


Epoch [1/3], Loss: 10501.758684006307


Epoch 2/3: 22811batch [1:11:31,  5.32batch/s]


Epoch [2/3], Loss: 10493.556941493402


Epoch 3/3: 22811batch [59:59,  6.34batch/s]

Epoch [3/3], Loss: 10489.179151881719





---

---