In [1]:
from os import path
# Requires the Hugging Face `datasets` package (install with: pip install datasets)
from datasets import load_dataset, DatasetDict, Audio

In [2]:
# Root folder that holds both the audio files and the transcriptions.
main_path = "<main_path>"

# Names of the sub-folders inside `main_path`.
audio_path = "audio"                    # folder with the audio files
transcriptions_path = "transcriptions"  # folder with the transcriptions

In [None]:
# Load the audio files with the "audiofolder" builder, then cast the "audio"
# column to 16 kHz: most ASR models are trained on 16 kHz audio and do not
# support any other sampling rate.
# NOTE(review): the original note also says the audio ends up mono; that is not
# set explicitly here, so confirm it is the `Audio` default for your version.
audio_dir = path.join(main_path, audio_path)
raw_audio_ds = load_dataset("audiofolder", data_dir=audio_dir)
audio_ds = raw_audio_ds.cast_column("audio", Audio(sampling_rate=16_000))

# Load the transcriptions from their own folder.
transcriptions_dir = path.join(main_path, transcriptions_path)
transcriptions_ds = load_dataset(transcriptions_dir)

Add transcriptions to the dataset

In [4]:
# Attach each split's "text" column to the matching audio split as a new
# "transcription" column. Rows are paired by position, so the audio and
# transcription splits must be in the same order.
train_texts = transcriptions_ds["train"]["text"]
test_texts = transcriptions_ds["test"]["text"]

train_ds = audio_ds["train"].add_column(name="transcription", column=train_texts)
test_ds = audio_ds["test"].add_column(name="transcription", column=test_texts)

In [5]:
# Bundle the two enriched splits back into a single DatasetDict.
full_ds = DatasetDict({"train": train_ds, "test": test_ds})

## Look at the data

In [None]:
# Display the DatasetDict built above (its splits and their contents summary).
full_ds

In [None]:
# Show a single example from the train split (row index 75) as a spot check.
full_ds["train"][75]

## Push the dataset to Hugging Face repository

**Important:** log in to Hugging Face with `huggingface-cli login` before running the next cell.

In [None]:
# Target repository on the Hugging Face Hub.
dataset_name = "VDK/hse_lectures_dataset_private"

# private=True: the dataset contains HSE lectures that are not publicly available.
# max_shard_size="300MB": upload in shards of at most 300MB each to avoid errors.
full_ds.push_to_hub(repo_id=dataset_name, private=True, max_shard_size="300MB")

## Check the dataset

In [None]:
# Load the dataset back by its repository name to verify the upload.
dataset = load_dataset(path=dataset_name)

Look at the data and check its format

In [None]:
# Display the reloaded dataset to compare its structure with `full_ds`.
dataset

In [None]:
# Show the same train example (row index 75) from the reloaded dataset.
dataset["train"][75]