In [28]:
from transformers import VitsModel, VitsTokenizer

model = VitsModel.from_pretrained("intelsense/mms-tts-ben")
tokenizer = VitsTokenizer.from_pretrained("intelsense/mms-tts-ben")

In [29]:
import torch.nn as nn

# Add new tokens
new_tokens = ['৫', 'ড়', '৬', '৯', '২', '৮', '৭', '৪', '১', 'য়', '৩', 'ঢ়']
num_added = tokenizer.add_tokens(new_tokens)

# Assume `embeddings` is the original embedding layer in the VITS model
old_embeddings = model.text_encoder.embed_tokens
old_embedding_weight = old_embeddings.weight.data

# Define new embedding layer with updated size
new_embedding_layer = nn.Embedding(len(tokenizer) - 1, old_embedding_weight.shape[1])

# Copy old weights into the new embedding layer
new_embedding_layer.weight.data[:old_embedding_weight.size(0), :] = old_embedding_weight

# Initialize new token embeddings (e.g., with the mean of existing ones)
new_token_embeddings = old_embedding_weight.mean(dim=0, keepdim=True).repeat(len(new_tokens), 1)
new_embedding_layer.weight.data[-len(new_tokens):, :] = new_token_embeddings

# Replace the embedding layer in the model
model.text_encoder.embed_tokens = new_embedding_layer

In [30]:
model

VitsModel(
  (text_encoder): VitsTextEncoder(
    (embed_tokens): Embedding(86, 192)
    (encoder): VitsEncoder(
      (layers): ModuleList(
        (0-5): 6 x VitsEncoderLayer(
          (attention): VitsAttention(
            (k_proj): Linear(in_features=192, out_features=192, bias=True)
            (v_proj): Linear(in_features=192, out_features=192, bias=True)
            (q_proj): Linear(in_features=192, out_features=192, bias=True)
            (out_proj): Linear(in_features=192, out_features=192, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
          (feed_forward): VitsFeedForward(
            (conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,))
            (conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,))
            (dropout): Dropout(p=0.1, inplace=False)
            (act_fn): ReLU()
          )
          (final_layer_norm): LayerNorm((192,), eps=1e-05, eleme

In [34]:
model.save_pretrained("mms-tts-ben-v2")
tokenizer.save_pretrained("mms-tts-ben-v2")
# edit config.json vocab_size in saved model

('mms-tts-ben-v2/tokenizer_config.json',
 'mms-tts-ben-v2/special_tokens_map.json',
 'mms-tts-ben-v2/vocab.json',
 'mms-tts-ben-v2/added_tokens.json')

In [31]:
waveform = model(tokenizer.encode("আমি বাংলায় গান গাই।", return_tensors="pt")).waveform

In [32]:
from IPython.display import Audio

Audio(waveform.detach().numpy(), rate=22050)

In [None]:
curl -X POST "http://20.212.243.159:9000/analyze_image/" -H "Content-Type: application/json" -d "{\"image_path\":\"/home/samikhan/Downloads/nid.png\"}"