In [1]:
import logging
import os

from hirundo.dataset_optimization import OptimizationDataset
from hirundo.enum import LabellingType
from hirundo.git import GitPlainAuthBase
from hirundo.storage import (
    GitRepo,
    StorageGit,
    StorageIntegration,
    StorageLink,
    StorageTypes,
)

# Logger named after the current module (the notebook's __name__).
logger = logging.getLogger(__name__)

# Suffix appended to the dataset / storage integration / git repo names below.
# Read from the UNIQUE_ID environment variable (empty string when unset); every
# "." and "/" is mapped to "-" in a single translation pass.
unique_id = os.getenv("UNIQUE_ID", "").translate(str.maketrans("./", "--"))

# --- Reset remote state before creating the test dataset ---------------------
# WARNING: destructive. Everything returned by the list() calls below is
# cancelled / deleted, not only resources created by this notebook.


def _delete_all(resource_cls):
    """Delete every entry reported by ``resource_cls.list()``.

    The IDs are collected in full before the first deletion is issued.

    Args:
        resource_cls: Class exposing ``list()`` (yielding mappings with an
            ``"id"`` key) and ``delete_by_id(id)``.

    Returns:
        The list of IDs that were passed to ``delete_by_id``, in list() order.
    """
    entry_ids = [entry["id"] for entry in resource_cls.list()]
    for entry_id in entry_ids:
        resource_cls.delete_by_id(entry_id)
    return entry_ids


# Cancel every dataset run that has a (truthy) run ID first ...
run_ids = [
    entry["run_id"] for entry in OptimizationDataset.list() if entry["run_id"]
]
for pending_run_id in run_ids:
    OptimizationDataset.cancel_by_id(pending_run_id)

# ... then remove the resources themselves: datasets, then storage
# integrations, then git repos.
dataset_ids = _delete_all(OptimizationDataset)
storage_integration_ids = _delete_all(StorageIntegration)
git_repo_ids = _delete_all(GitRepo)

# Build the dataset description bottom-up: auth -> git repo -> git storage ->
# storage integration -> storage link -> dataset.

# The storage integration and the git repo share the same name.
roboshaul_resource_name = f"STT-RoboShaul-dataset{unique_id}"

# Plain username/password auth for the repository; the access token comes from
# the environment (KeyError if HUGGINGFACE_ACCESS_TOKEN is not set).
roboshaul_auth = GitPlainAuthBase(
    username="blewis-hir",
    password=os.environ["HUGGINGFACE_ACCESS_TOKEN"],
)

# The Hugging Face dataset repository that holds the data.
roboshaul_repo = GitRepo(
    name=roboshaul_resource_name,
    repository_url="https://huggingface.co/datasets/hirundo-io/RoboShaul",
    plain_auth=roboshaul_auth,
)

# Git-backed storage pointing at the repo's "main" branch.
roboshaul_git_storage = StorageGit(repo=roboshaul_repo, branch="main")

roboshaul_integration = StorageIntegration(
    name=roboshaul_resource_name,
    type=StorageTypes.GIT,
    git=roboshaul_git_storage,
)

roboshaul_link = StorageLink(storage_integration=roboshaul_integration)

# Speech-to-text dataset with language "he"; metadata is read from
# "meta-old.csv" in the linked storage.
test_dataset = OptimizationDataset(
    name=f"TEST-STT-RoboShaul-dataset{unique_id}",
    labelling_type=LabellingType.SPEECH_TO_TEXT,
    language="he",
    dataset_storage=roboshaul_link,
    dataset_metadata_path="meta-old.csv",
)

# Kick off the optimization run. Kept as the cell's last expression so its
# return value (the run ID, per the recorded output) is displayed.
test_dataset.run_optimization()

  from .autonotebook import tqdm as notebook_tqdm
Cancelling run with ID: 08b6144d-2872-415f-9fdc-33451477a70b
Deleted dataset with ID: 18
Deleted storage integration with ID: 21
Created storage integration with ID: 22
Created dataset with ID: 19
Started the run with ID: 65358256-6bed-4ad3-9fbc-fa35a6cf4d5b


'65358256-6bed-4ad3-9fbc-fa35a6cf4d5b'

In [3]:
# Check on the optimization run started for `test_dataset` and keep what it
# returns in `results` for the following cell.
# NOTE(review): judging by the recorded progress-bar output, check_run() appears
# to block until the run finishes (~15 minutes here) — confirm against the SDK.
results = test_dataset.check_run()
print(results)

Optimization run completed successfully: 100%|██████████| 100.0/100.0 [15:06<00:00,  9.06s/it]




In [5]:
# Write the run's `suspects` to "test.csv" (relative path, so it lands in the
# current working directory).
# NOTE(review): `suspects` is presumably a pandas DataFrame — confirm; if so the
# index is written as the first CSV column by default.
results.suspects.to_csv("test.csv")