In [35]:
import os
# Set WANDB_MODE before any of the ML libraries below are imported.
# NOTE(review): presumably this keeps Weights & Biases logging switched off
# for the whole kernel session — confirm against the W&B documentation.
os.environ["WANDB_MODE"] = "disabled"

import torch
import torchaudio

# from IPython.display import Audio

import soundfile as sf
import sounddevice as sd
# import librosa
# import librosa.display

from datasets import load_dataset 

from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
from transformers import SpeechT5HifiGan


# --- Configuration ------------------------------------------------------------
MODEL_NAME = "microsoft/speecht5_tts"

CACHE_DIR = "D:/LanguageModels/cache"
DATASET_DIR = "D:/LanguageModels/dataset/"
AUDIO_DIR = "D:/LanguageModels/dataset/audio/"

# True  -> load the locally saved fine-tuned model and processor directories.
# False -> load the checkpoint named by MODEL_NAME, using CACHE_DIR as cache.
finetuned = True

# --- Text-to-speech model and its processor -----------------------------------
# Pick the source locations and the extra keyword arguments in one place, then
# load both objects the same way. The local fine-tuned directories are loaded
# without a cache_dir argument; only the MODEL_NAME checkpoint uses CACHE_DIR.
model_source, processor_source, load_kwargs = (
    (
        "D:/LanguageModels/ftT5modelGetallen9",
        "D:/LanguageModels/ftT5processorGetallen9",
        {},
    )
    if finetuned
    else (MODEL_NAME, MODEL_NAME, {"cache_dir": CACHE_DIR})
)

model = SpeechT5ForTextToSpeech.from_pretrained(model_source, **load_kwargs)
processor = SpeechT5Processor.from_pretrained(processor_source, **load_kwargs)

print("model & processor loaded")

# Debug aid: uncomment to list every parameter with its requires_grad flag.
# for name, param in model.named_parameters():
#     print(name, param.requires_grad)

# --- Speaker embedding and vocoder --------------------------------------------
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
    cache_dir=CACHE_DIR,
)

# Take the "xvector" entry of row 7306 and add a leading dimension of size 1.
xvector = embeddings_dataset[7306]["xvector"]
speaker_embeddings = torch.tensor(xvector).unsqueeze(0)

vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan",
    cache_dir=CACHE_DIR,
)

print("speaker embeddings & vocoder loaded")

model & processor loaded
speaker embeddings & vocoder loaded


In [36]:
# Text to synthesize (a number, written as a string).
getal = '9'

# Sample rate in Hz handed to both the playback call and the WAV writer.
SAMPLE_RATE = 16000

# Tokenize the text into model input ids.
inputs = processor(text=getal, return_tensors="pt")

print('inputs["input_ids"]', inputs["input_ids"])

# Run the acoustic model exactly once. Without a vocoder argument,
# generate_speech returns the spectrogram tensor.
spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)

print('spectrogram', spectrogram)
print('spectrogram type', type(spectrogram))

# Turn that same spectrogram into a waveform with the vocoder, instead of
# calling generate_speech a second time. This way the audio that is played and
# saved is derived from exactly the spectrogram printed above, and the
# acoustic model is not run twice. no_grad keeps the result free of autograd
# state so it can be converted to NumPy below.
with torch.no_grad():
    speech = vocoder(spectrogram)

print('speech', speech)
print('speech shape', speech.shape)

# sounddevice and soundfile work on NumPy arrays; convert explicitly rather
# than relying on implicit tensor-to-array coercion.
speech_np = speech.detach().cpu().numpy()

# Play the sound and block until playback finishes.
sd.play(speech_np, SAMPLE_RATE)
sd.wait()

# The file name records whether the fine-tuned or the original model was used.
variant = 'Fine' if finetuned else 'Orig'
outfilename = './output/' + getal + '_T5model' + variant + 'Getallen.wav'

# Create the output directory on first use so the write cannot fail on a
# fresh checkout.
os.makedirs(os.path.dirname(outfilename), exist_ok=True)
sf.write(outfilename, speech_np, SAMPLE_RATE)

print("Speech synthesis complete and saved for number " + getal + ' to ' + outfilename)


inputs["input_ids"] tensor([[4, 3, 2]])
spectrogram tensor([[-6.2394, -6.1411, -6.3204,  ..., -7.5758, -7.3847, -7.0475],
        [-5.1235, -5.3018, -5.1958,  ..., -5.6678, -5.4840, -5.4435],
        [-4.9630, -5.0344, -4.9168,  ..., -5.0490, -4.9022, -5.0167],
        ...,
        [-5.1395, -4.9062, -4.7397,  ..., -4.9016, -5.0143, -4.9869],
        [-4.9191, -4.9275, -4.9738,  ..., -4.8679, -4.8923, -4.9015],
        [-4.7417, -4.7559, -4.7680,  ..., -4.8845, -4.9212, -5.0035]])
spectrogram type <class 'torch.Tensor'>
speech tensor([ 8.1847e-06,  1.3142e-05, -2.3868e-06,  ..., -7.2127e-06,
         1.5091e-05,  5.0695e-07])
speech shape torch.Size([14848])
Speech synthesis complete and saved for number 9 to ./output/9_T5modelFineGetallen.wav
