# Testing Finetuned Model

In [2]:
import sentencepiece
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan
from datasets import load_dataset
import torch
import config

# Base SpeechT5 checkpoints shared by every cell below: the processor encodes
# text, and the vocoder is handed to `generate_speech` to obtain audio.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Reference x-vector from the CMU ARCTIC embeddings dataset; unsqueeze(0)
# prepends a batch dimension to the single vector.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embeddings = torch.tensor(
    embeddings_dataset[7306]["xvector"]
).unsqueeze(0)

In [29]:
def play_processed_model(text_arr: list[str], model, embeddings,
                         sampling_rate: int = 16000, verbose: bool = True):
    """Synthesise speech for each text and show text + audio player inline.

    Relies on the notebook-level `processor` (text encoding) and `vocoder`
    (passed to `generate_speech` so that audio, not a spectrogram, is returned).

    Args:
        text_arr: Sentences to synthesise, one audio clip per entry.
        model: Model exposing `generate_speech(input_ids, speaker_embeddings, vocoder=...)`.
        embeddings: Speaker embedding tensor forwarded to `generate_speech`.
        sampling_rate: Playback rate given to the audio widget (default 16000,
            the value this notebook has always used).
        verbose: When True (default), print the encoded model input for each text.

    Returns:
        List of `IPython.display.Audio` objects, in the same order as `text_arr`.
    """
    # `display` is imported explicitly instead of relying on the implicit
    # IPython global, so the helper also works when imported from a module.
    from IPython.display import Audio, display

    audio_samples = []

    for item in text_arr:
        # Tokenise the input text; "pt" requests PyTorch tensors.
        # (Named `encoded` to avoid shadowing the builtin `input`.)
        encoded = processor(text=item, return_tensors="pt")
        if verbose:
            print("Encoded input for the model: ", encoded)

        # The model generates speech from the token ids, the speaker
        # embeddings and the vocoder. Without a vocoder this call would
        # return a spectrogram instead of audio samples.
        speech = model.generate_speech(encoded["input_ids"], embeddings, vocoder=vocoder)

        # Hand Audio a NumPy array rather than relying on implicit tensor conversion.
        audio_samples.append(Audio(speech.detach().cpu().numpy(), rate=sampling_rate))

    # Display only after everything has been generated, so each text sits
    # directly above its player.
    for text, audio in zip(text_arr, audio_samples):
        display(text)
        display(audio)

    return audio_samples

### Attempt 1: Finetuned model v1 (adapted from this [notebook](https://colab.research.google.com/drive/1i7I5pzBcU3WDFarDnzweIj4-sVVoIUFJ#scrollTo=sibMqU6qU-h8))
(Commit ce666a35b66cd4aac5978d82572b75a5d032551c)

- 50 speakers
- 4000 training and eval steps, Train Batch Size = 16, Eval batch size = 8
- Train size = 26636, Test size = 2960
- Eval steps are placed after every 1000th training step
- Best model is loaded at the end (minimises spectrogram loss)
- Steps per epoch (derived) = 26636/16 = 1664.75 (approximately 1665 training steps for one pass over the whole dataset). So epochs = (4000-4)/1665 = 2.4 epochs

In [22]:
# Finetuned v1 weights, pinned to the commit listed in the notes above.
my_model = SpeechT5ForTextToSpeech.from_pretrained(
    "JET2001/speecht5_tts_imda_nsc_p1",
    revision='ce666a35b66cd4aac5978d82572b75a5d032551c',
    token=config.HF_TOKEN,
)
my_model



SpeechT5ForTextToSpeech(
  (speecht5): SpeechT5Model(
    (encoder): SpeechT5EncoderWithTextPrenet(
      (prenet): SpeechT5TextEncoderPrenet(
        (embed_tokens): Embedding(81, 768, padding_idx=1)
        (encode_positions): SpeechT5ScaledPositionalEncoding(
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (wrapped_encoder): SpeechT5Encoder(
        (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (layers): ModuleList(
          (0-11): 12 x SpeechT5EncoderLayer(
            (attention): SpeechT5Attention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (dropout): Dropout(p=0.1, inplace=False)
     

Get test set

In [23]:
from datasets import load_from_disk

# Load the locally saved, preprocessed dataset and carve out a 10% test split.
# NOTE(review): confirm that test_size/seed reproduce the split used during
# finetuning; otherwise "test" examples may have been seen in training.
processed_imda_dataset = load_from_disk('hf-imda-dataset-with-embeddings')
dataset = processed_imda_dataset.train_test_split(
    test_size=0.1,
    seed=1708,
)
dataset

  table = cls._concat_blocks(blocks, axis=0)


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'speaker_embeddings'],
        num_rows: 26636
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'speaker_embeddings'],
        num_rows: 2960
    })
})

In [34]:
## Get a speaker from the test set
example = dataset['test'][90]
# unsqueeze(0) prepends a batch dimension to the stored speaker embedding.
sg_speaker1_embeddings = torch.tensor(example['speaker_embeddings']).unsqueeze(0)
# Show the shape as the cell result (the recorded output is a torch.Size),
# matching the equivalent cell for the v2 model.
sg_speaker1_embeddings.shape

torch.Size([1, 512])

In [35]:
# Short prompts (some using Singlish expressions such as "leh") for the v1 model.
texts = [
    "Hello, how are you?",
    "I tell you already, I don't like this",
    "Today so hot leh",
    "Walau",
]
output = play_processed_model(
    texts,
    my_model,
    sg_speaker1_embeddings,
)

Encoded input for the model:  {'input_ids': tensor([[ 4, 35,  5, 15, 15,  8, 23,  4, 11,  8, 20,  4,  7, 13,  5,  4, 22,  8,
         16, 41,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Encoded input for the model:  {'input_ids': tensor([[ 4, 30,  4,  6,  5, 15, 15,  4, 22,  8, 16,  4,  7, 15, 13,  5,  7, 14,
         22, 23,  4, 30,  4, 14,  8,  9, 31,  6,  4, 15, 10, 28,  5,  4,  6, 11,
         10, 12,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Encoded input for the model:  {'input_ids': tensor([[ 4, 32,  8, 14,  7, 22,  4, 12,  8,  4, 11,  8,  6,  4, 15,  5, 11,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Encoded input for the model:  {'input_ids': tensor([[ 4, 38,  7, 15,  7, 16,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


'Hello, how are you?'

"I tell you already, I don't like this"

'Today so hot leh'

'Walau'

In [36]:
# Text without Singlish: plain English sentences for the v1 model.
texts = [
    "The two houses of Congress are much alike in their concern with local and special interest legislation",
    "their intricate legislative and parliamentary procedure",
    "their tendency toward voting by blocks and interest groups in defiance of party ties.",
    "Yet the upper chamber has a character all its own.",
]
output = play_processed_model(
    texts,
    my_model,
    sg_speaker1_embeddings,
)

Encoded input for the model:  {'input_ids': tensor([[ 4, 32, 11,  5,  4,  6, 20,  8,  4, 11,  8, 16, 12,  5, 12,  4,  8, 19,
          4, 42,  8,  9, 21, 13,  5, 12, 12,  4,  7, 13,  5,  4, 18, 16, 17, 11,
          4,  7, 15, 10, 28,  5,  4, 10,  9,  4,  6, 11,  5, 10, 13,  4, 17,  8,
          9, 17,  5, 13,  9,  4, 20, 10,  6, 11,  4, 15,  8, 17,  7, 15,  4,  7,
          9, 14,  4, 12, 24,  5, 17, 10,  7, 15,  4, 10,  9,  6,  5, 13,  5, 12,
          6,  4, 15,  5, 21, 10, 12, 15,  7,  6, 10,  8,  9,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}
Encoded input for the model:  {'input_ids': tensor([[ 4,  6, 11,  5, 10, 13,  4, 10,  9,  6, 13, 10, 17, 

'The two houses of Congress are much alike in their concern with local and special interest legislation'

'their intricate legislative and parliamentary procedure'

'their tendency toward voting by blocks and interest groups in defiance of party ties.'

'Yet the upper chamber has a character all its own.'

In [38]:
# Text with Singlish: longer sentences for the v1 model.
texts = [
    "We usually start with a kopi or teh from the kopitiam before heading off to work or school.",
    "Traffic can be a bit sian, especially during rush hour.",
    "Work or classes keep us busy until lunch, where we might grab some hawker food like chicken rice or laksa.",
    "Singaporeans love to makan, so dinner can be quite a spread.",
    "Weekends ah? That's when we really let loose.",
    "Maybe head to the heartland mall for some shopping or catch a movie.",
    "Sometimes we just lepak at a friend's place or explore the nightlife.",
    "but not every day is the same tho",
]
output = play_processed_model(
    texts,
    my_model,
    sg_speaker1_embeddings,
)

Encoded input for the model:  {'input_ids': tensor([[ 4, 38,  5,  4, 16, 12, 16,  7, 15, 15, 22,  4, 12,  6,  7, 13,  6,  4,
         20, 10,  6, 11,  4,  7,  4, 28,  8, 24, 10,  4,  8, 13,  4,  6,  5, 11,
          4, 19, 13,  8, 18,  4,  6, 11,  5,  4, 28,  8, 24, 10,  6, 10,  7, 18,
          4, 25,  5, 19,  8, 13,  5,  4, 11,  5,  7, 14, 10,  9, 21,  4,  8, 19,
         19,  4,  6,  8,  4, 20,  8, 13, 28,  4,  8, 13,  4, 12, 17, 11,  8,  8,
         15, 26,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Encoded input for the model:  {'input_ids': tensor([[ 4, 32, 13,  7, 19, 19, 10, 17,  4, 17,  7,  9,  4, 25,  5,  4,  7,  4,
         25, 10,  6,  4, 12, 10,  7,  9, 23,  4,  5, 12, 24,  5, 1

'We usually start with a kopi or teh from the kopitiam before heading off to work or school.'

'Traffic can be a bit sian, especially during rush hour.'

'Work or classes keep us busy until lunch, where we might grab some hawker food like chicken rice or laksa.'

'Singaporeans love to makan, so dinner can be quite a spread.'

"Weekends ah? That's when we really let loose."

'Maybe head to the heartland mall for some shopping or catch a movie.'

"Sometimes we just lepak at a friend's place or explore the nightlife."

'but not every day is the same tho'

### Attempt 2: Finetuned model v2
(Commit = 467f0bbb4f7a5cb9295446f20abb72e33e954e15)
- Learning rate = 1e-05
- Train batch size = 32, Eval batch size = 64
- Trained for 10 epochs with an early-stopping callback (instead of 4000 training steps)
- Seed = 42
- Train size: 70472 samples
- Test size: 17618 samples

In [4]:
# Finetuned v2 weights: same repository as v1, pinned to a different commit.
my_model_v2 = SpeechT5ForTextToSpeech.from_pretrained(
    "JET2001/speecht5_tts_imda_nsc_p1",
    revision='467f0bbb4f7a5cb9295446f20abb72e33e954e15',
    token=config.HF_TOKEN,
)
my_model_v2

Downloading model.safetensors:   0%|          | 0.00/578M [00:00<?, ?B/s]



SpeechT5ForTextToSpeech(
  (speecht5): SpeechT5Model(
    (encoder): SpeechT5EncoderWithTextPrenet(
      (prenet): SpeechT5TextEncoderPrenet(
        (embed_tokens): Embedding(81, 768, padding_idx=1)
        (encode_positions): SpeechT5ScaledPositionalEncoding(
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (wrapped_encoder): SpeechT5Encoder(
        (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (layers): ModuleList(
          (0-11): 12 x SpeechT5EncoderLayer(
            (attention): SpeechT5Attention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (dropout): Dropout(p=0.1, inplace=False)
     

In [5]:
from datasets import load_dataset

# Fetch the 150-speaker dataset from the Hub and keep only its 'train' split,
# then hold out 20% of it as a test set.
processed_imda_dataset = load_dataset(
    'JET2001/hf-imda-dataset-with-embeddings-150-speakers'
)['train']
dataset = processed_imda_dataset.train_test_split(
    test_size=0.2,
    seed=1708,
)
test_split = dataset['test']

Downloading readme:   0%|          | 0.00/397 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/454M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/451M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/469M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/516M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/427M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/434M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/473M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/511M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/507M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/468M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/489M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/498M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/504M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/479M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/460M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/460M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/498M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/492M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/88090 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


In [6]:
# Save a local copy of the dataset under a relative directory.
# `processed_imda_dataset` here is the 'train' split selected in the previous
# cell, i.e. the data before the train/test split was applied.
dataset_path = "hf-imda-dataset-with-embeddings-150-speakers"
processed_imda_dataset.save_to_disk(dataset_path)

Saving the dataset (0/20 shards):   0%|          | 0/88090 [00:00<?, ? examples/s]

In [43]:
## Pick one held-out example and use its speaker embedding for the v2 model
example = dataset['test'][1500]
print("example = ", example.keys())

# unsqueeze(0) prepends a batch dimension to the stored speaker embedding.
sg_speaker1_v2_embeddings = torch.tensor(
    example['speaker_embeddings']
).unsqueeze(0)
sg_speaker1_v2_embeddings.shape

example =  dict_keys(['input_ids', 'labels', 'speaker_embeddings'])


torch.Size([1, 512])

In [44]:
# Short prompts (some using Singlish expressions such as "leh") for the v2 model.
texts = [
    "Hello, how are you?",
    "I tell you already, I don't like this",
    "Today so hot leh",
    "Walau",
]
output = play_processed_model(
    texts,
    my_model_v2,
    sg_speaker1_v2_embeddings,
)

Encoded input for the model:  {'input_ids': tensor([[ 4, 35,  5, 15, 15,  8, 23,  4, 11,  8, 20,  4,  7, 13,  5,  4, 22,  8,
         16, 41,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Encoded input for the model:  {'input_ids': tensor([[ 4, 30,  4,  6,  5, 15, 15,  4, 22,  8, 16,  4,  7, 15, 13,  5,  7, 14,
         22, 23,  4, 30,  4, 14,  8,  9, 31,  6,  4, 15, 10, 28,  5,  4,  6, 11,
         10, 12,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Encoded input for the model:  {'input_ids': tensor([[ 4, 32,  8, 14,  7, 22,  4, 12,  8,  4, 11,  8,  6,  4, 15,  5, 11,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Encoded input for the model:  {'input_ids': tensor([[ 4, 38,  7, 15,  7, 16,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


'Hello, how are you?'

"I tell you already, I don't like this"

'Today so hot leh'

'Walau'

In [45]:
# Text without Singlish: plain English sentences for the v2 model.
# NOTE(review): two sentences differ slightly from the Attempt 1 versions
# ("special-interest" vs "special interest", "on its own" vs "all its own");
# confirm whether that is intentional before comparing the two models.
texts = [
    "The two houses of Congress are much alike in their concern with local and special-interest legislation",
    "their intricate legislative and parliamentary procedure",
    "their tendency toward voting by blocks and interest groups in defiance of party ties.",
    "Yet the upper chamber has a character on its own.",
]
output = play_processed_model(
    texts,
    my_model_v2,
    sg_speaker1_v2_embeddings,
)

Encoded input for the model:  {'input_ids': tensor([[ 4, 32, 11,  5,  4,  6, 20,  8,  4, 11,  8, 16, 12,  5, 12,  4,  8, 19,
          4, 42,  8,  9, 21, 13,  5, 12, 12,  4,  7, 13,  5,  4, 18, 16, 17, 11,
          4,  7, 15, 10, 28,  5,  4, 10,  9,  4,  6, 11,  5, 10, 13,  4, 17,  8,
          9, 17,  5, 13,  9,  4, 20, 10,  6, 11,  4, 15,  8, 17,  7, 15,  4,  7,
          9, 14,  4, 12, 24,  5, 17, 10,  7, 15, 39, 10,  9,  6,  5, 13,  5, 12,
          6,  4, 15,  5, 21, 10, 12, 15,  7,  6, 10,  8,  9,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}
Encoded input for the model:  {'input_ids': tensor([[ 4,  6, 11,  5, 10, 13,  4, 10,  9,  6, 13, 10, 17, 

'The two houses of Congress are much alike in their concern with local and special-interest legislation'

'their intricate legislative and parliamentary procedure'

'their tendency toward voting by blocks and interest groups in defiance of party ties.'

'Yet the upper chamber has a character on its own.'

In [46]:
# Text with Singlish: longer sentences for the v2 model.
texts = [
    "We usually start with a kopi or teh from the kopitiam before heading off to work or school.",
    "Traffic can be a bit sian, especially during rush hour.",
    "Work or classes keep us busy until lunch, where we might grab some hawker food like chicken rice or laksa.",
    "Singaporeans love to makan, so dinner can be quite a spread.",
    "Weekends ah? That's when we really let loose.",
    "Maybe head to the heartland mall for some shopping or catch a movie.",
    "Sometimes we just lepak at a friend's place or explore the nightlife.",
    "but not every day is the same tho",
]
output = play_processed_model(
    texts,
    my_model_v2,
    sg_speaker1_v2_embeddings,
)

Encoded input for the model:  {'input_ids': tensor([[ 4, 38,  5,  4, 16, 12, 16,  7, 15, 15, 22,  4, 12,  6,  7, 13,  6,  4,
         20, 10,  6, 11,  4,  7,  4, 28,  8, 24, 10,  4,  8, 13,  4,  6,  5, 11,
          4, 19, 13,  8, 18,  4,  6, 11,  5,  4, 28,  8, 24, 10,  6, 10,  7, 18,
          4, 25,  5, 19,  8, 13,  5,  4, 11,  5,  7, 14, 10,  9, 21,  4,  8, 19,
         19,  4,  6,  8,  4, 20,  8, 13, 28,  4,  8, 13,  4, 12, 17, 11,  8,  8,
         15, 26,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Encoded input for the model:  {'input_ids': tensor([[ 4, 32, 13,  7, 19, 19, 10, 17,  4, 17,  7,  9,  4, 25,  5,  4,  7,  4,
         25, 10,  6,  4, 12, 10,  7,  9, 23,  4,  5, 12, 24,  5, 1

'We usually start with a kopi or teh from the kopitiam before heading off to work or school.'

'Traffic can be a bit sian, especially during rush hour.'

'Work or classes keep us busy until lunch, where we might grab some hawker food like chicken rice or laksa.'

'Singaporeans love to makan, so dinner can be quite a spread.'

"Weekends ah? That's when we really let loose."

'Maybe head to the heartland mall for some shopping or catch a movie.'

"Sometimes we just lepak at a friend's place or explore the nightlife."

'but not every day is the same tho'