In [None]:
import logging
import os
from pathlib import Path

from hirundo import (
    HirundoCSV,
    LabelingType,
    QADataset,
    StorageConfig,
    StorageTypes,
)
from pydantic_core import Url

# Module-level logger named after the current module.
logger = logging.getLogger(__name__)

# Optional suffix read from the UNIQUE_ID environment variable (empty string
# when the variable is unset). Every "." and "/" is mapped to "-" in one pass.
unique_id = os.environ.get("UNIQUE_ID", "").translate(
    str.maketrans("./", "--")
)

# Clean-up before the test: cancel every listed run that has an ID, then
# delete every listed dataset, then delete every listed storage config.
# Each ID list is fully built before its loop starts, so the listing call
# always completes before the first cancel/delete call of that step.

# Step 1: runs. Entries whose run_id is falsy (e.g. None) are skipped.
run_ids = list(filter(None, (run.run_id for run in QADataset.list_runs())))
for stale_run_id in run_ids:
    QADataset.cancel_by_id(stale_run_id)

# Step 2: datasets.
dataset_ids = [existing.id for existing in QADataset.list_datasets()]
for stale_dataset_id in dataset_ids:
    QADataset.delete_by_id(stale_dataset_id)

# Step 3: storage configs, removed after the datasets, as in the original order.
storage_ids = [existing.id for existing in StorageConfig.list()]
for stale_storage_id in storage_ids:
    StorageConfig.delete_by_id(stale_storage_id)

# Storage configuration of type LOCAL; its name carries the same
# `unique_id` suffix as the dataset name below.
local_storage = StorageConfig(
    name=f"STT-RoboShaulTiny-dataset{unique_id}",
    type=StorageTypes.LOCAL,
)

# file:// URLs built from absolute paths: the directory holding the audio
# files and the CSV file passed as the labeling metadata.
wavs_root_url = Url(Path("/datasets/RoboShaulTiny/wavs").as_uri())
metadata_csv_url = Url(Path("/datasets/RoboShaulTiny/meta.csv").as_uri())
csv_labeling_info = HirundoCSV(csv_url=metadata_csv_url)

# Speech-to-text QA dataset definition with language code "he".
test_dataset = QADataset(
    name=f"TEST-STT-RoboShaulTiny-dataset{unique_id}",
    labeling_type=LabelingType.SPEECH_TO_TEXT,
    language="he",
    storage_config=local_storage,
    data_root_url=wavs_root_url,
    labeling_info=csv_labeling_info,
)

# Kick off the QA run for `test_dataset`.
# NOTE(review): the recorded cell output shows "Created storage integration",
# "Created dataset" and "Started the run" messages around this point, so
# run_qa() presumably registers both remotely before starting — confirm
# against the SDK documentation.
test_dataset.run_qa()
# Fetch the outcome of the run and keep it in `results`.
# NOTE(review): the recorded output shows a progress bar reaching 100% after
# roughly 8.5 minutes, so check_run() appears to block until the run
# finishes — confirm.
results = test_dataset.check_run()
# Print the results object (the recorded output begins with a `suspects=` table).
print(results)

  from .autonotebook import tqdm as notebook_tqdm
Deleted storage integration with ID: 9
Created storage integration with ID: 11
Created dataset with ID: 8
Started the run with ID: 2f7e0a41-75c4-4568-95fc-a177411c5162
Dataset QA run completed successfully: 100%|██████████| 100.0/100.0 [08:35<00:00,  5.16s/it]      

suspects=     audio_segment_id           audio_path  \
0                   6  wavs/automatic_0012   
1                  20  wavs/automatic_0017   
2                  15  wavs/automatic_0018   
3                  36  wavs/automatic_0020   
4                  37  wavs/automatic_0010   
..                ...                  ...   
120                32  wavs/automatic_0004   
121                34  wavs/automatic_0012   
122                39  wavs/automatic_0017   
123                39  wavs/automatic_0017   
124                 3  wavs/automatic_0017   

                                            transcript  \
0                 ‏וגם אם לא תהיה פה השעה בסופו של דבר   
1    ‏פרשה שהתחילה בחקירה שנפתחה יום אחד ב2017 בעצם...   
2    ‏ללשכת ראש הממשלה הדבר הזה נולד רק דרך אילן יש...   
3                                                   ‏ה   
4    ‏הפרשה הזאת בעצם שואלת האם לחץ להטות את הסיכור...   
..                                                 ...   
120  ‏היום אנחנו יוצאים אה נו לא


