# Creating a Hugging Face (HF) dataset from a directory of audio files and transcriptions, then uploading it to the HF Hub

In [None]:
# Install the `datasets` library, upgrading it (-U) if it is already installed.
# NOTE(review): the version is unpinned, so results may differ between runs — consider pinning it.
%pip install -U datasets

In [None]:
import os
from getpass import getpass
from os import path

from datasets import load_dataset, Audio

Set up folder names:

In [None]:
# Root directory that holds both the audio folder and the transcriptions folder
main_path = "<main_path>"

# Name of the folder with audio files
audio_folder = "audio"
# Name of the folder with transcriptions
transcriptions_folder = "transcriptions"

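_Note: the audio and transcriptions folders must contain `.wav` and `.txt` files, respectively, organized in the same subfolders and with matching file names (transcriptions are paired with audio files by position), as in the example below:_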

    \audio
        \lectures
            \lecture_1.wav
            ...
            \lecture_10.wav
        \seminars
            \seminar_1.wav
            ...
            \seminar_20.wav
        \tutorials
            \tutorial_1.wav
            ...
            \tutorial_123.wav

    \transcriptions
        \lectures
            \lecture_1.txt
            ...
            \lecture_10.txt
        \seminars
            \seminar_1.txt
            ...
            \seminar_20.txt
        \tutorials
            \tutorial_1.txt
            ...
            \tutorial_123.txt

## Loading data

In [None]:
# Load the audio files with the "audiofolder" builder and cast the "audio" column
# to mono 16kHz: most ASR models are trained on 16kHz audio and do not support
# any other sampling rate.
# NOTE(review): the `mono` argument may not be accepted by every `datasets`
# release — confirm against the installed version.
audio_ds = load_dataset(
    "audiofolder",
    data_dir=path.join(main_path, audio_folder),
).cast_column(
    "audio",
    Audio(
        sampling_rate=16_000,
        mono=True,
    ),
)

# Load each transcription file as ONE row. By default the text loader emits one
# row per line, so a multi-line transcription (or a trailing blank line) would
# produce more rows than there are audio files and break the positional pairing
# done later. `sample_by="document"` keeps the whole file together; the map then
# strips the surrounding whitespace/newline that reading a whole file leaves in.
transcriptions_ds = load_dataset(
    path.join(main_path, transcriptions_folder),
    sample_by="document",
).map(lambda row: {"text": row["text"].strip()})

Adding transcriptions to the dataset

In [None]:
# Transcriptions are attached to audio rows purely by position, so both datasets
# must have the same number of rows. Fail early with an actionable message
# instead of a low-level error from the column append.
n_audio_rows = len(audio_ds["train"])
n_transcription_rows = len(transcriptions_ds["train"])
if n_audio_rows != n_transcription_rows:
    raise ValueError(
        f"Found {n_audio_rows} audio rows but {n_transcription_rows} transcription rows; "
        "every audio file needs exactly one transcription."
    )

# Append the transcription text as a new "transcription" column
full_ds = audio_ds["train"].add_column(
    name="transcription",
    column=transcriptions_ds["train"]["text"],
)

Splitting the dataset into train and test splits

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on "label" preserves the
# label proportions in both splits, and the fixed seed keeps the shuffle reproducible.
# Note: `full_ds` is rebound to the split result here, so re-run the previous cell
# before running this one again.
full_ds = full_ds.train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=42,
    stratify_by_column="label",
)

## Look at the dataset

In [None]:
# Display the shape of the split dataset
full_ds.shape

In [None]:
# Display the column names and feature types of the train split
full_ds["train"].features

In [None]:
# Show one sample row (index 42 when available). The index is clamped to the last
# row so that a train split with fewer than 43 rows does not raise IndexError.
full_ds["train"][min(42, len(full_ds["train"]) - 1)]

## Push the dataset to Hugging Face repository

In [None]:
# Never store the token in the notebook itself: read it from the HF_TOKEN environment
# variable, and prompt for it (input hidden) when the variable is not set.
# The token must have write permissions.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token (write permissions required): ")

_Optionally, you can log in to your Hugging Face account using the `huggingface-cli login` command and omit the `token` argument._

In [None]:
# Name of the target dataset repository
dataset_name = "<user_name>/<repository_name>"

# Upload to a private repository; the data is split into shards of 300MB each
# to avoid errors on uploading.
full_ds.push_to_hub(
    dataset_name,
    token=hf_token,
    private=True,
    max_shard_size="300MB",
)

## Check the dataset

In [None]:
# Load the dataset back from the repository named by `dataset_name`, passing the same token
dataset = load_dataset(dataset_name, token=hf_token)

Look at the data and check its format

In [None]:
# Display the reloaded dataset
dataset

In [None]:
# Display the shape of the reloaded dataset
dataset.shape

In [None]:
# Display the column names and feature types of the reloaded train split
dataset["train"].features

In [None]:
# Show one sample row (index 42 when available). The index is clamped to the last
# row so that a train split with fewer than 43 rows does not raise IndexError.
dataset["train"][min(42, len(dataset["train"]) - 1)]