Here's what this notebook does, step-by-step:

1. Install the `montreal-forced-aligner` (MFA) package using conda.
2. Download the Viet Bible Vox dataset from Hugging Face.
3. Generate a normalized transcript file for each .mp3 file in the dataset.
4. Train an MFA model, then align speech and phonemes (creating a timestamp for each phoneme).
5. Generate a TensorFlow dataset (tfrecord files) that is ready for training TTS models.

In [None]:
# Install (or upgrade, -U) the Python packages this notebook uses into the
# kernel's environment; -qq keeps pip's output quiet.
%pip install -Uqq tensorflow numpy torch tqdm librosa numba regex

In [None]:
# ./data is our working directory: create it if needed, then make it the
# kernel's current directory so the relative paths used below resolve inside it.
# NOTE: %cd persists for the kernel session, so re-running this cell without a
# restart descends one level further (./data/data).
!mkdir -p data
%cd data

In [None]:
#### INSTALL MFA  ####
# Download the Miniconda installer (Linux x86_64 build) to ./miniconda.sh.
!wget https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Linux-x86_64.sh -qO ./miniconda.sh
# !wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -qO $PWD/miniconda.sh # for Apple M1
# Run the installer non-interactively (-b) with ./miniconda as the install prefix (-p).
!bash ./miniconda.sh -b -p ./miniconda
# Create a conda environment named "aligner" containing
# montreal-forced-aligner 2.2.15 from the conda-forge channel.
!source ./miniconda/bin/activate && conda create -n aligner -c conda-forge montreal-forced-aligner=2.2.15 -y --quiet

In [None]:
!git lfs install # LFS is needed to download the data
# Clone the dataset repository into ./VietBibleVox.
!git clone https://huggingface.co/datasets/ntt123/VietBibleVox
# Unpack the archive quietly (-qq) inside the clone.
!cd VietBibleVox; unzip -qq data.zip
# Sanity check: count the extracted .mp3 files and .txt files
# (presumably one transcript per audio file, so the two counts should match).
!cd VietBibleVox; ls *.mp3 | wc -l
!cd VietBibleVox; ls *.txt | wc -l

In [None]:
import regex
import unicodedata
from pathlib import Path

In [None]:
# Lowercase letters that survive text normalization. Each vowel row lists a base
# vowel followed by its five tone-marked forms; the last rows are consonants.
# Letters that are not listed here (e.g. f, j, w, z) are treated like any other
# unwanted character by the regexes below.
vietnamese_characters = [
    'a', 'à', 'á', 'ả', 'ã', 'ạ',
    'ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ',
    'â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ',
    'e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ',
    'ê', 'ề', 'ế', 'ể', 'ễ', 'ệ',
    'i', 'ì', 'í', 'ỉ', 'ĩ', 'ị',
    'o', 'ò', 'ó', 'ỏ', 'õ', 'ọ',
    'ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ',
    'ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ',
    'u', 'ù', 'ú', 'ủ', 'ũ', 'ụ',
    'ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự',
    'y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ',
    'b', 'c', 'd', 'đ', 'g', 'h', 
    'k', 'l', 'm', 'n', 'p', 'q', 
    'r', 's', 't', 'v', 'x'
]
# All allowed letters as one string, ready to be embedded in a regex character class.
alphabet = "".join(vietnamese_characters)
# One or more consecutive whitespace characters.
space_re = regex.compile(r"\s+")
# A run of ASCII digits. NOTE(review): not referenced by the cells visible here.
number_re = regex.compile("([0-9]+)")
# Vietnamese words for the digits 0-9, indexed by the digit's value.
digits = ["không", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám", "chín"]
# A numeric token: any mix of digits, "." and "," that ends in a digit.
num_re = regex.compile(r"([0-9.,]*[0-9])")
# Any character that is NOT whitespace, an allowed letter, ".", "," or a digit.
keep_text_and_num_re = regex.compile(rf'[^\s{alphabet}.,0-9]')
# Any character that is NOT whitespace or an allowed letter.
keep_text_re = regex.compile(rf'[^\s{alphabet}]')

In [None]:
def read_number(num: str) -> str:
    """Spell out a numeric token in Vietnamese words.

    Supported shapes:
      * one-, two- and three-digit integers ("7", "25", "105");
      * two-digit tokens with a leading zero ("05"), read digit by digit
        ("không năm");
      * decimals written with a single comma ("3,5" -> "ba phẩy năm");
      * thousands written with a single dot ("1.000" -> "một ngàn");
      * a small fixed table of millions ("1.000.000", "1.100.000", ...).

    Args:
        num: a token made of ASCII digits, "." and ",".

    Returns:
        The Vietnamese reading of the token.

    Raises:
        ValueError: if the token has a shape that is not supported (for
            example four or more plain digits, several commas, or a dotted
            number that is not in the millions table).
    """
    if len(num) == 1:
        return digits[int(num)]
    elif len(num) == 2 and num.isdigit():
        if num[0] == "0":
            # A leading zero must not be read as a "teen" ("05" is not 15):
            # spell the two digits one by one instead.
            return digits[0] + " " + digits[int(num[1])]
        n = int(num)
        end = digits[n % 10]
        if n == 10:
            return "mười"
        if n % 10 == 5:
            # A trailing five after a tens word is pronounced "lăm".
            end = "lăm"
        if n % 10 == 0:
            return digits[n // 10] + " mươi"
        elif n < 20:
            return "mười " + end
        else:
            if n % 10 == 1:
                # A trailing one after "mươi" is pronounced "mốt".
                end = "mốt"
            return digits[n // 10] + " mươi " + end
    elif len(num) == 3 and num.isdigit():
        n = int(num)
        if n % 100 == 0:
            return digits[n // 100] + " trăm"
        elif num[1] == "0":
            # Zero tens: "x trăm lẻ y".
            return digits[n // 100] + " trăm lẻ " + digits[n%100]
        else:
            return digits[n // 100] + " trăm " + read_number(num[1:])
    elif "," in num:
        pieces = num.split(",")
        if len(pieces) != 2:
            # Several commas: fail with the same message as every other
            # unsupported shape instead of a tuple-unpacking error.
            raise ValueError(f"Cannot convert {num}")
        n1, n2 = pieces
        return read_number(n1) + " phẩy " + read_number(n2)
    elif "." in num:
        parts = num.split(".")
        if len(parts) == 2:
            if parts[1] == "000":
                return read_number(parts[0]) + " ngàn"
            elif parts[1].startswith("00"):
                end = digits[int(parts[1][2:])]
                return read_number(parts[0]) + " ngàn lẻ " + end
            else:
                return read_number(parts[0]) + " ngàn " + read_number(parts[1])
        # Millions are only supported through this fixed table.
        known_millions = {
            "1.000.000": "một triệu",
            "1.100.000": "một triệu một trăm ngàn",
            "1.820.000": "một triệu tám trăm hai mươi ngàn",
            "3.640.000": "ba triệu sáu trăm bốn mươi ngàn",
        }
        if num in known_millions:
            return known_millions[num]
        # Anything else (including four or more dot-separated groups) is
        # rejected explicitly rather than returned unconverted, because the
        # caller would then silently strip the digits from the transcript.
        raise ValueError(f"Cannot convert {num}")
    else:
        raise ValueError(f"Cannot convert {num}")

In [None]:
def normalize_text(x):
    """Turn raw transcript text into lowercase, letters-only Vietnamese text.

    Steps: NFKC-normalize and lowercase; isolate numeric tokens with spaces;
    drop every character that is not whitespace, an allowed letter, ".", ","
    or a digit; spell out the numeric tokens with `read_number`; drop whatever
    is still not a letter or whitespace; collapse whitespace and trim.

    Args:
        x: the raw text.

    Returns:
        The normalized text, words separated by single spaces.
    """
    text = unicodedata.normalize('NFKC', x).lower()
    # Surround each numeric token with spaces so that it becomes its own word.
    text = num_re.sub(r" \1 ", text)
    text = keep_text_and_num_re.sub(" ", text)

    spoken = []
    for token in text.split():
        if num_re.fullmatch(token):
            token = read_number(token)
        spoken.append(token)

    # Remaining punctuation (stray "." and ",") is removed only after the
    # numbers have been read, since those characters are part of number tokens.
    text = keep_text_re.sub(" ", " ".join(spoken))
    return space_re.sub(" ", text).strip()

In [None]:
# Normalize every transcript in VietBibleVox in place and collect the sorted
# vocabulary (unique words) of the whole corpus in `all_words`.
all_text = []
for fp in sorted(Path("VietBibleVox").glob("*.txt")):
    text = normalize_text(fp.read_text(encoding="utf-8"))
    all_text.append(text)
    # override the text file
    fp.write_text(text, encoding="utf-8")
all_words = sorted(set(" ".join(all_text).split()))

In [None]:
# Write the pronunciation lexicon passed to `mfa train`: one line per word in
# the form "<word>\t<its characters separated by spaces>", i.e. every character
# is treated as one phone.
# The encoding is set explicitly because the words contain non-ASCII Vietnamese
# letters; relying on the platform default can fail or produce a non-UTF-8 file.
with open("lexicon.txt", "w", encoding="utf-8") as f:
    for w in all_words:
        w = w.strip()
        p = " ".join(w)
        f.write(f"{w}\t{p}\n")

In [None]:
# Activate the "aligner" conda environment and run `mfa train`.
# The alignments are requested as JSON (--output_format json) and written into
# the VietBibleVox directory (--output_directory), where the *.json files are
# read back later in this notebook.
# NOTE(review): the positional arguments are presumably corpus directory,
# lexicon path and output model path - confirm with `mfa train --help`.
# replace `nproc` by `sysctl -n hw.physicalcpu` if you are using MacOS
!source miniconda/bin/activate aligner; \
mfa train \
    --num_jobs `nproc` \
    --use_mp \
    --clean \
    --overwrite \
    --no_textgrid_cleanup \
    --single_speaker \
    --output_format json \
    --output_directory VietBibleVox \
    VietBibleVox ./lexicon.txt vbx_mfa

In [None]:
#### PREPARE TF DATASET ####
# Note: We are using a sampling rate of 16k,
# even though the original data has a sampling rate of 48k.

In [None]:
import json
from pathlib import Path
import numpy as np
import torch
import json
import librosa
import tensorflow as tf
from tqdm.auto import tqdm
import random

In [None]:
mel_basis = {}
# Cache of Hann windows, keyed by "<win_size>_<dtype>_<device>".
hann_window = {}

def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute a linear-magnitude STFT spectrogram of a waveform batch.

    The waveform is reflect-padded by (n_fft - hop_size) / 2 samples on each
    side before the STFT, and the magnitude is sqrt(re^2 + im^2 + 1e-6).

    Args:
        y: waveform tensor of shape (batch, samples). A message is printed
            when values fall outside [-1, 1].
        n_fft: FFT size.
        sampling_rate: accepted for interface compatibility; not used here.
        hop_size: hop length in samples.
        win_size: Hann window length in samples.
        center: forwarded to `torch.stft`.

    Returns:
        The magnitude spectrogram with its first two axes swapped after the
        batch axis has been squeezed; for a single-item batch this is a tensor
        of shape (frames, n_fft // 2 + 1).
    """
    lowest = torch.min(y)
    if lowest < -1.0:
        print("min value is ", lowest)
    highest = torch.max(y)
    if highest > 1.0:
        print("max value is ", highest)

    # Reuse one window per (size, dtype, device) combination.
    cache_key = "_".join((str(win_size), str(y.dtype), str(y.device)))
    window = hann_window.get(cache_key)
    if window is None:
        window = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
        hann_window[cache_key] = window

    # Reflect padding needs a channel axis, so add one and drop it afterwards.
    margin = int((n_fft - hop_size) / 2)
    padded = torch.nn.functional.pad(
        y.unsqueeze(1), (margin, margin), mode="reflect"
    ).squeeze(1)

    stft = torch.stft(
        padded,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=window,
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True
    )
    real_imag = torch.view_as_real(stft)
    # The small constant keeps the square root away from exactly zero.
    magnitude = torch.sqrt(real_imag.pow(2).sum(-1) + 1e-6)
    return magnitude.squeeze(0).transpose(0, 1)


def tensor_to_bytes(t):
    """Wrap an array-like value as a bytes `tf.train.Feature`.

    The value is converted with `tf.constant`, serialized with
    `tf.io.serialize_tensor`, and stored as the single entry of a bytes list.
    """
    serialized = tf.io.serialize_tensor(tf.constant(t))
    payload = tf.train.BytesList(value=[serialized.numpy()])
    return tf.train.Feature(bytes_list=payload)


def write_tfdata(data, out_file):
    """Write one TFRecord file with an example per (audio, alignment) item.

    Args:
        data: iterable of 3-item records (wav_file, seq, _), where `seq` is a
            sequence of (phone, duration) pairs; the third item is ignored.
        out_file: path of the TFRecord file to create.

    Each example stores four serialized tensors: "phone_idx" (int32 indices
    into the global `phone_set`), "phone_duration" (float32), "wav" and "spec"
    (both float16). Audio is loaded at the sampling rate given by the global
    `config`.
    """
    with tf.io.TFRecordWriter(out_file) as file_writer:
        for wav_file, seq, _ in data:
            # Column 0: phone index, column 1: duration.
            pairs = [(phone_set.index(phone), duration) for phone, duration in seq]
            phone_seq = np.array(pairs, dtype=np.float32)

            # load wav
            data_cfg = config["data"]
            samples, _sr = librosa.load(
                wav_file, sr=data_cfg["sampling_rate"], dtype=np.float32
            )
            wav = torch.from_numpy(samples)
            # compute spec (wav[None] adds the batch axis the function expects)
            spec = spectrogram_torch(
                wav[None],
                n_fft=data_cfg["filter_length"],
                sampling_rate=data_cfg["sampling_rate"],
                hop_size=data_cfg["hop_length"],
                win_size=data_cfg["win_length"],
                center=False
            )
            features = {
                "phone_idx": tensor_to_bytes(phone_seq[:, 0].astype(np.int32)),
                "phone_duration": tensor_to_bytes(phone_seq[:, 1]),
                "wav": tensor_to_bytes(wav.half().numpy()),
                "spec": tensor_to_bytes(spec.half().numpy())
            }
            record = tf.train.Example(features=tf.train.Features(feature=features))
            file_writer.write(record.SerializeToString())

def write_split(split, data, num_chunks):
    """Shard `data` into `num_chunks` TFRecord files under tfdata/<split>/.

    Args:
        split: name of the split; used as the sub-directory of ./tfdata.
        data: list of dataset records accepted by `write_tfdata`.
        num_chunks: number of shard files to write. `np.array_split` always
            yields this many chunks, so some shards are empty when there are
            fewer records than chunks.

    Files are named tfdata/<split>/part_000.tfrecords, part_001.tfrecords, ...
    """
    # Make sure the target directory exists, so the writer does not fail when
    # it has not been created beforehand.
    Path(f"tfdata/{split}").mkdir(parents=True, exist_ok=True)
    data = np.array(data, dtype=object)
    chunks = list(np.array_split(data, num_chunks))
    for i, chunk in enumerate(tqdm(chunks)):
        write_tfdata(chunk, f"tfdata/{split}/part_{i:03d}.tfrecords")

In [None]:
# Create the output directories tfdata/train and tfdata/test
# (this relies on the shell expanding the {train,test} braces).
!mkdir -p tfdata/{train,test}

In [None]:
# Load the training configuration; the path is relative to the current
# working directory.
with open("../config.json", "rb") as f:
    config = json.load(f)
device = "cuda" if torch.cuda.is_available() else "cpu"
data_dir = Path("VietBibleVox")
json_files = sorted(data_dir.glob("*.json"))
dataset = []
phone_set = set()

# Turn each alignment file into a (phone, duration) sequence, inserting a
# zero-length "<SEP>" marker whenever a phone starts at or after the end of the
# current word.
for file_path in json_files:
    with file_path.open("rb") as f:
        data = json.load(f)
    tiers = data["tiers"]
    words = tiers["words"]["entries"]
    seq = []
    word_index = 0
    for start, end, phone in tiers["phones"]["entries"]:
        word_end = words[word_index][1]
        # The 1e-5 tolerance absorbs floating-point noise in the timestamps.
        if start > word_end - 1e-5:
            seq.append(("<SEP>", 0))
            word_index += 1
        duration = end * 1000 - start * 1000 # ms
        phone_set.add(phone)
        seq.append((phone, duration))
    # The audio file sits next to the alignment file, with an .mp3 suffix.
    wav_file = file_path.with_suffix(".mp3")
    dataset.append((wav_file, seq, data["end"]))

# Index 0 is reserved for the separator; the phones follow in sorted order.
phone_set = ["<SEP>"] + sorted(phone_set)
assert len(phone_set) <= 256
with open("phone_set.json", "w", encoding="utf-8") as f:
    json.dump(phone_set, f)

assert phone_set[0] == "<SEP>"

In [None]:
# Shuffle the dataset in place with a fixed seed (42), then hold out the last
# 256 examples as the test split.
# NOTE: the shuffle mutates `dataset`, so re-running this cell without
# rebuilding `dataset` produces a different split.
random.Random(42).shuffle(dataset)
L = len(dataset) - 256
train_data, test_data = dataset[:L], dataset[L:]
for label, subset in (("Train data size:", train_data), ("Test data size:", test_data)):
    print(label, len(subset))

In [None]:
# Write the held-out test examples as a single TFRecord file
# (tfdata/test/part_000.tfrecords).
write_split("test", test_data, 1)

In [None]:
# Write the training examples as 256 TFRecord shards
# (tfdata/train/part_000.tfrecords ... part_255.tfrecords).
write_split("train", train_data, 256)