In [None]:
import logging
import os

from hirundo import (
    Domain,
    GitPlainAuth,
    GitRepo,
    HirundoCSV,
    LabelingType,
    QADataset,
    StorageConfig,
    StorageGit,
    StorageTypes,
)

# Logger named after the executing module.
logger = logging.getLogger(__name__)

# Suffix appended to the resource names created below. It is read from the
# UNIQUE_ID environment variable (empty string when unset), with every "." and
# "/" mapped to "-" in a single translation pass.
unique_id = os.getenv("UNIQUE_ID", "").translate(str.maketrans("./", "--"))

# Reset state before the test: cancel runs, then delete datasets, storage
# configs and Git repos, in that order.
# NOTE(review): this acts on everything the list calls return, not only on
# resources created by this notebook.
# NOTE(review): the order presumably follows the dependency chain
# (dataset -> storage config -> Git repo) — confirm before reordering.
#
# Each ID list is fully materialised before its loop starts, so the listing
# call completes before any cancel/delete request is issued.

# Runs without a run ID are skipped.
for pending_run_id in [run.run_id for run in QADataset.list_runs() if run.run_id]:
    QADataset.cancel_by_id(pending_run_id)

for stale_dataset_id in [dataset.id for dataset in QADataset.list_datasets()]:
    QADataset.delete_by_id(stale_dataset_id)

for stale_storage_id in [storage_config.id for storage_config in StorageConfig.list()]:
    StorageConfig.delete_by_id(stale_storage_id)

for stale_repo_id in [git_repo.id for git_repo in GitRepo.list()]:
    GitRepo.delete_by_id(stale_repo_id)

# Username/token pair for the Git repository. The token is read from the
# environment; a missing HUGGINGFACE_ACCESS_TOKEN raises KeyError here.
golden_repo_auth = GitPlainAuth(
    username="blewis-hir",
    password=os.environ["HUGGINGFACE_ACCESS_TOKEN"],
)

# Repository record pointing at the RoboShaulGolden dataset URL; the name
# carries the unique_id suffix.
golden_repo = GitRepo(
    name=f"STT-RoboShaulGolden-dataset{unique_id}",
    repository_url="https://huggingface.co/datasets/hirundo-io/RoboShaulGolden",
    plain_auth=golden_repo_auth,
)

# Git-backed storage on the "main" branch of that repository.
test_storage_git = StorageGit(repo=golden_repo, branch="main")
# Storage configuration wrapping the Git storage defined above.
golden_storage_config = StorageConfig(
    name=f"STT-RoboShaulGolden-dataset{unique_id}",
    type=StorageTypes.GIT,
    git=test_storage_git,
)

# Locations inside the repository: the "/wavs" data root and the "/meta.csv"
# labelling file.
wavs_root_url = test_storage_git.get_url("/wavs")
meta_csv_info = HirundoCSV(csv_url=test_storage_git.get_url("/meta.csv"))

# Speech-to-text QA dataset with language code "he".
test_dataset = QADataset(
    name=f"TEST-STT-RoboShaulGolden-dataset{unique_id}",
    domain=Domain.SPEECH,
    labeling_type=LabelingType.SPEECH_TO_TEXT,
    language="he",
    storage_config=golden_storage_config,
    data_root_url=wavs_root_url,
    labeling_info=meta_csv_info,
)

# Left as the final bare expression of the cell so the notebook still displays
# whatever run_qa() returns.
test_dataset.run_qa()

In [None]:
# Fetch the outcome of the QA run for test_dataset.
results = test_dataset.check_run()

# Write the suspects table to a CSV file in the current working directory.
# NOTE(review): presumably `suspects` is a pandas DataFrame — confirm.
suspects_table = results.suspects
suspects_table.to_csv("he-on-prem-audio-test.csv")