In [None]:
import os
os.environ["WANDB_MODE"] = "disabled"

import torch
import torchaudio

# from IPython.display import Audio

import soundfile as sf
import sounddevice as sd
# import librosa
# import librosa.display

from datasets import load_dataset 

from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
from transformers import SpeechT5HifiGan


# --- Configuration -----------------------------------------------------------
MODEL_NAME = "microsoft/speecht5_tts"

CACHE_DIR = "D:/LanguageModels/cache"
DATASET_DIR = "D:/LanguageModels/dataset/"
AUDIO_DIR = "D:/LanguageModels/dataset/audio/"

# Toggle between the locally saved fine-tuned checkpoint and the hub model.
finetuned = False

# --- Model & processor -------------------------------------------------------
# Decide where to load from first, then load once. Only the hub model is routed
# through CACHE_DIR; the fine-tuned checkpoint is read from its local directory
# without a cache_dir argument.
if not finetuned:
    _model_src = _processor_src = MODEL_NAME
    _load_kwargs = {"cache_dir": CACHE_DIR}
else:
    _model_src = "D:/LanguageModels/ftT5modelGetallen"
    _processor_src = "D:/LanguageModels/ftT5processorGetallen"
    _load_kwargs = {}

model = SpeechT5ForTextToSpeech.from_pretrained(_model_src, **_load_kwargs)
processor = SpeechT5Processor.from_pretrained(_processor_src, **_load_kwargs)

print('model & processor loaded')

# --- Speaker embedding & vocoder ---------------------------------------------
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
    cache_dir=CACHE_DIR,
)

# Use the x-vector stored at row 7306 of the validation split and add a leading
# dimension of size 1 in front of it.
_xvector = embeddings_dataset[7306]["xvector"]
speaker_embeddings = torch.tensor(_xvector).unsqueeze(0)

vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan",
    cache_dir=CACHE_DIR,
)

print('speaker embeddings & vocoder loaded')

  from .autonotebook import tqdm as notebook_tqdm


model & processor loaded
speaker embeddings & vocoder loaded


In [2]:
# Synthesize a single number word, play it back, and save it as a WAV file.
# Relies on `model`, `processor`, `speaker_embeddings`, `vocoder` and
# `finetuned` having been defined by an earlier cell.

SAMPLE_RATE = 16000      # sample rate handed to both playback and the WAV writer
OUTPUT_DIR = './output/'

getal = 'negen'

inputs = processor(text=getal, return_tensors="pt")

print('inputs["input_ids"]' , inputs["input_ids"] )

# Run the acoustic model exactly once to obtain the spectrogram.
spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)

print('spectrogram' , spectrogram)
print('spectrogram type' , type(spectrogram))

# Vocode that same spectrogram instead of calling generate_speech a second
# time with vocoder=...: the decoder is not re-run, and the audio is guaranteed
# to correspond to the spectrogram printed above.
with torch.no_grad():
    speech = vocoder(spectrogram)

print('speech' , speech)
print('speech shape' , speech.shape)

# sounddevice and soundfile work on NumPy arrays; convert explicitly rather
# than relying on implicit tensor-to-array conversion.
waveform = speech.cpu().numpy()

# Play the sound and block until playback finishes.
sd.play(waveform, SAMPLE_RATE)
sd.wait()

# The file name records which model variant produced the audio.
if finetuned:
    outfilename = OUTPUT_DIR + getal + '_T5modelFineGetallen.wav'
else:
    outfilename = OUTPUT_DIR + getal + '_T5modelOrigGetallen.wav'

# Make sure the target directory exists so the write cannot fail on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)
sf.write(outfilename, waveform, SAMPLE_RATE)

print("Speech synthesis complete and saved for number " + getal + ' to ' + outfilename)


inputs["input_ids"] tensor([[ 4,  9,  5, 21,  5,  9,  2]])
spectrogram tensor([[-6.1812, -6.0740, -6.2466,  ..., -7.5092, -7.3178, -6.9933],
        [-5.0527, -5.2447, -5.1299,  ..., -5.5963, -5.4195, -5.3859],
        [-4.7476, -4.8096, -4.6938,  ..., -5.1124, -4.9849, -5.0775],
        ...,
        [-4.9865, -4.8153, -4.7756,  ..., -4.8943, -4.9383, -4.9652],
        [-4.8076, -4.8270, -4.8703,  ..., -4.8362, -4.8711, -4.8943],
        [-4.6025, -4.6175, -4.6118,  ..., -4.8453, -4.8828, -4.9767]])
spectrogram type <class 'torch.Tensor'>
speech tensor([ 7.7285e-06,  1.3608e-05, -1.9907e-06,  ..., -1.1225e-05,
         1.1667e-05, -6.2211e-06])
speech shape torch.Size([14848])
Speech synthesis complete and saved for number negen to ./output/negen_T5modelFineGetallen.wav
