In [4]:
from transformers import WhisperProcessor
import datasets
from datasets import Audio

In [5]:
def transliterate_cir2lat(text: str) -> str:
    """
    Convert Serbian Cyrillic text to its Latin-script equivalent.

    Every character that has an entry in the mapping table is replaced by its
    Latin counterpart; every other character (Latin letters, digits,
    punctuation, whitespace, ...) is copied through unchanged.  Upper-case
    digraph letters are always rendered in title case ("Љ" -> "Lj"), even
    inside an all-caps word.

    :param text: Text written in Cyrillic script
    :return: The same text in Latin script
    """
    mappings = {
        "а": "a",
        "б": "b",
        "в": "v",
        "г": "g",
        "д": "d",
        "ђ": "đ",
        "е": "e",
        "ж": "ž",
        "з": "z",
        "и": "i",
        "ј": "j",
        "к": "k",
        "л": "l",
        "љ": "lj",
        "м": "m",
        "н": "n",
        "њ": "nj",
        "о": "o",
        "п": "p",
        "р": "r",
        "с": "s",
        "т": "t",
        "ћ": "ć",
        "у": "u",
        "ф": "f",
        "х": "h",
        "ц": "c",
        "ч": "č",
        "џ": "dž",
        "ш": "š",
        "А": "A",
        "Б": "B",
        "В": "V",
        "Г": "G",
        "Д": "D",
        "Ђ": "Đ",
        "Е": "E",
        "Ж": "Ž",
        "З": "Z",
        "И": "I",
        "Ј": "J",
        "К": "K",
        "Л": "L",
        "Љ": "Lj",
        "М": "M",
        "Н": "N",
        "Њ": "Nj",
        "О": "O",
        "П": "P",
        "Р": "R",
        "С": "S",
        "Т": "T",
        "Ћ": "Ć",
        "У": "U",
        "Ф": "F",
        "Х": "H",
        "Ц": "C",
        "Ч": "Č",
        "Џ": "Dž",
        "Ш": "Š",
    }
    # One join over a generator instead of growing a string character by
    # character; ``dict.get`` with the character itself as the default covers
    # both the mapped and the pass-through case in a single lookup.
    return "".join(mappings.get(char, char) for char in text)

In [6]:
# Source corpora for Serbian, Bosnian and Croatian speech.
# Each entry names a Hugging Face Hub dataset, the per-language config names
# to load from it, and the columns that hold the audio and the transcript.
dataset_configs_to_use = [
    {
        "dataset_name": "mozilla-foundation/common_voice_13_0",
        # NOTE(review): confirm that Common Voice 13 actually ships "bs" and
        # "hr" configs; if it does not, loading them will fail.
        "languages": ["sr", "bs", "hr"],
        "audio_column": "audio",
        "text_column": "sentence",
    },
    {
        "dataset_name": "facebook/voxpopuli",
        "languages": ["hr"],
        "audio_column": "audio",
        # VoxPopuli has no "sentence" column; transcripts live in
        # "normalized_text" (and "raw_text").
        "text_column": "normalized_text",
    },
    {
        "dataset_name": "google/fleurs",
        # FLEURS configs are locale codes, not English language names.
        "languages": ["sr_rs", "hr_hr", "bs_ba"],
        "audio_column": "audio",
        # FLEURS has no "sentence" column; transcripts live in
        # "transcription" (and "raw_transcription").
        "text_column": "transcription",
    },
]
# Whisper processor (feature extractor + tokenizer) for the target language.
# The language was previously "sinhalese", which does not match the
# Serbian/Bosnian/Croatian data prepared in this notebook.
# NOTE(review): "serbian" is assumed to be the intended target; switch to
# "croatian" or "bosnian" if the model should be prompted with that language.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small", language="serbian", task="transcribe"
)

In [8]:
def preprocess_data(sample):
    """
    Transliterate the ``"text"`` field of one sample from Cyrillic to Latin.

    The sample is modified in place and the same object is returned.

    :param sample: Mapping with a ``"text"`` entry holding the transcript
    :return: The same sample with ``"text"`` in Latin script
    """
    latin_text = transliterate_cir2lat(sample["text"])
    sample["text"] = latin_text
    return sample


def create_dataset():
    """
    Load every configured dataset/language pair and merge them into one dataset.

    For each entry in ``dataset_configs_to_use`` and each of its languages the
    train, validation and test splits are loaded together and reduced to the
    two columns ``"text"`` and ``"audio"``. The audio column is then cast to
    the sampling rate of the Whisper feature extractor, and every sample is
    passed through ``preprocess_data``.

    :return: Concatenation of all loaded datasets, shuffled with seed 42
    """
    # ``sampling_rate`` is a plain attribute (calling it raises TypeError) and
    # does not change between iterations, so read it once up front.
    sampling_rate = processor.feature_extractor.sampling_rate

    dataset_list = []
    for config in dataset_configs_to_use:
        text_column = config["text_column"]
        audio_column = config["audio_column"]
        for language in config["languages"]:
            print("DATASETNAME:", config["dataset_name"])
            data = datasets.load_dataset(
                config["dataset_name"], language, split="train+validation+test"
            )
            # Keep only the transcript and audio columns and give them common
            # names so the per-source datasets can be concatenated.  Dropping
            # and renaming columns avoids pulling every decoded audio clip
            # into memory, which indexing ``data[audio_column]`` would do.
            unused_columns = [
                name
                for name in data.column_names
                if name not in (text_column, audio_column)
            ]
            data = data.remove_columns(unused_columns)
            if text_column != "text":
                data = data.rename_column(text_column, "text")
            if audio_column != "audio":
                data = data.rename_column(audio_column, "audio")
            data = data.cast_column("audio", Audio(sampling_rate=sampling_rate))
            # No ``input_columns`` here: preprocess_data expects the whole
            # sample mapping, and the source column names no longer exist
            # after the renaming above.
            new_data = data.map(preprocess_data)
            dataset_list.append(new_data)

    new_dataset = datasets.concatenate_datasets(dataset_list)
    new_dataset = new_dataset.shuffle(seed=42)
    return new_dataset


# Build the combined, shuffled dataset from all configured sources.
dataset = create_dataset()

DATASETNAME: mozilla-foundation/common_voice_13_0


Downloading data files: 100%|██████████| 5/5 [00:00<00:00, 6668.21it/s]
Extracting data files: 100%|██████████| 5/5 [00:00<00:00, 1791.37it/s]
Downloading data files: 100%|██████████| 5/5 [00:00<00:00, 9304.13it/s]
Extracting data files: 100%|██████████| 5/5 [00:00<00:00, 2080.71it/s]
Reading metadata...: 1499it [00:00, 140992.12it/s]les/s]
Generating train split: 0 examples [00:00, ? examples/s]


DatasetGenerationError: An error occurred while generating the dataset

In [None]:
# Persist the combined dataset under the relative path "test_data".
dataset.save_to_disk("test_data")