In [1]:
from transformers import AutoTokenizer

# Module-level tokenizer loaded from the "facebook/mms-tts-mya" checkpoint.
# preprocess_text() below reads this global; note that a later cell rebinds `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-mya")

def preprocess_text(text):
    """Tokenize ``text`` and return only the token ids.

    Uses the module-level ``tokenizer``, asking it for PyTorch tensors
    (``return_tensors="pt"``) with truncation enabled.

    Args:
        text: The input passed straight to the tokenizer.

    Returns:
        The ``input_ids`` entry of the tokenizer output.
    """
    encoded = tokenizer(text, truncation=True, return_tensors="pt")
    return encoded["input_ids"]


In [2]:
from phonemizer import phonemize

def phonemize_text(text, language):
    """Convert ``text`` to phonemes using phonemizer's espeak backend.

    Args:
        text: Text to phonemize; forwarded unchanged to ``phonemize``.
        language: Language identifier forwarded to ``phonemize``.

    Returns:
        Whatever ``phonemize`` returns for the given text.
    """
    options = {"language": language, "backend": "espeak"}
    phonemes = phonemize(text, **options)
    return phonemes


In [None]:
import torchaudio

def preprocess_audio(audio_path, target_sample_rate=16000):
    """Load an audio file, resample it, and compute its mel spectrogram.

    Args:
        audio_path: Location of the audio file, passed to ``torchaudio.load``.
        target_sample_rate: Sample rate in Hz the waveform is resampled to
            before the spectrogram is computed. Defaults to 16000, the value
            that was previously hard-coded.

    Returns:
        The result of applying ``torchaudio.transforms.MelSpectrogram`` to
        the resampled waveform.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    resampler = torchaudio.transforms.Resample(
        orig_freq=sample_rate, new_freq=target_sample_rate
    )
    waveform = resampler(waveform)

    # Pass the rate explicitly so the mel filterbank always matches the
    # resampled audio instead of silently relying on the transform's default.
    mel_transform = torchaudio.transforms.MelSpectrogram(sample_rate=target_sample_rate)
    return mel_transform(waveform)


def preprocess_function(example):
    """Attach the decoded audio of one record to the record itself.

    Reads the file at ``example["audio_path"]`` with ``torchaudio.load`` and
    stores the two returned values under ``"speech"`` and ``"sampling_rate"``.

    Args:
        example: Mapping with an ``"audio_path"`` entry; it is modified in place.

    Returns:
        The same ``example`` object, now carrying the two extra entries.
    """
    loaded = torchaudio.load(example["audio_path"])
    # torchaudio.load yields (waveform, sample_rate); store them in that order.
    example["speech"], example["sampling_rate"] = loaded
    return example

In [19]:
from datasets import Dataset
import json

def load_dataset(json_filepath: str):
    """Build a ``Dataset`` from a JSON file containing a list of records.

    The records (a JSON array of objects) are pivoted into one list per key,
    using the keys of the first record as the column names, and handed to
    ``Dataset.from_dict``.

    Args:
        json_filepath: Path of the JSON file to read.

    Returns:
        The ``Dataset`` created from the column-wise data.

    Raises:
        IndexError: If the JSON array is empty (there is no first record).
        KeyError: If a record lacks a key that the first record has.
    """
    # Decode as UTF-8 explicitly: the records hold non-ASCII text, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(json_filepath, "r", encoding="utf-8") as f:
        records = json.load(f)

    # The file is no longer needed once parsed, so transform outside the `with`.
    columns = {key: [record[key] for record in records] for key in records[0].keys()}
    return Dataset.from_dict(columns)

# Build the dataset from "data.json", resolved relative to the current working directory.
dataset = load_dataset("data.json")
 

In [20]:
# Show the dataset summary (column names and number of rows).
print(dataset)

Dataset({
    features: ['text', 'audio_path'],
    num_rows: 2
})


In [29]:
from IPython.display import Audio
from transformers import VitsModel, AutoTokenizer
import torch

# The sample sentence is Burmese, so use the Burmese (mya) MMS-TTS checkpoint.
# The English checkpoint's tokenizer vocabulary does not cover Burmese script,
# so most of the sentence would be dropped before synthesis.
# Model and tokenizer are loaded from the same id so they can never diverge.
CHECKPOINT = "facebook/mms-tts-mya"

model = VitsModel.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

text = "ကဲ ဒါဆိုရင်တော့, character တွေရဲ့ Abilities တွေကို စတင်ကြည့်ရှုရအောင်"
inputs = tokenizer(text, return_tensors="pt")

# Inference only: no gradients are needed to produce the waveform.
with torch.no_grad():
    output = model(**inputs).waveform

# Hand IPython a NumPy array; the cell's last expression renders the audio player.
Audio(output.numpy(), rate=model.config.sampling_rate)

In [None]:
from transformers import (
    DataCollatorWithPadding,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# `Seq2SeqTrainer` and `training_args` were used without being imported or
# defined, so this cell raised NameError on a fresh kernel. Only the output
# directory is set here; every other hyperparameter keeps the library default.
training_args = Seq2SeqTrainingArguments(output_dir="./results")

# Pads each batch to a common length using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer)

# NOTE(review): `dataset` as printed earlier only has the raw 'text' and
# 'audio_path' columns; it presumably needs to be tokenized/preprocessed before
# the collator can batch it — TODO confirm.
# NOTE(review): verify that the loaded model class supports training through
# Trainer at all before relying on this cell.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
# Start training with the `trainer` object built in the previous cell.
trainer.train()


In [None]:
def synthesize_speech(text):
    """Synthesize speech for ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to convert to speech.

    Returns:
        The ``waveform`` tensor produced by the model's forward pass.
    """
    inputs = tokenizer(text, return_tensors="pt")
    # The VITS model emits audio from its forward pass (exactly as the inference
    # cell earlier in this notebook does); it is not driven through `generate()`.
    with torch.no_grad():
        outputs = model(**inputs).waveform
    return outputs

# Try the helper on a short greeting that mixes Burmese script and English.
result = synthesize_speech("မင်္ဂလာပါ, Hello!")
