diff --git a/jigsawstack/audio.py b/jigsawstack/audio.py
index 575b839..22b1f7a 100644
--- a/jigsawstack/audio.py
+++ b/jigsawstack/audio.py
@@ -10,13 +10,44 @@ class SpeechToTextParams(TypedDict):
     url: NotRequired[str]
+    """
+    The URL of the audio file to transcribe. Optional if file_store_key is provided.
+    """
+
     file_store_key: NotRequired[str]
+    """
+    The file store key of the audio file to transcribe. Optional if url is provided.
+    """
+
     language: NotRequired[Union[str, Literal["auto"]]]
+    """
+    The language to transcribe or translate the file into. Use "auto" for automatic language detection, or specify a language code. If not specified, defaults to automatic detection. All supported language codes can be found in the JigsawStack documentation.
+    """
+
     translate: NotRequired[bool]
+    """
+    When set to true, translates the content into English (or into the specified language if the language parameter is provided).
+    """
+
     by_speaker: NotRequired[bool]
+    """
+    Identifies and separates different speakers in the audio file. When enabled, the response will include a speakers array with speaker-segmented transcripts.
+    """
+
     webhook_url: NotRequired[str]
+    """
+    Webhook URL to send the result to. When provided, the API processes the audio asynchronously and sends the result to this URL when completed.
+    """
+
     batch_size: NotRequired[int]
+    """
+    The batch size to return. Maximum value is 40. This controls how the audio is chunked for processing.
+    """
+
     chunk_duration: NotRequired[int]
+    """
+    The duration of each chunk in seconds. Maximum value is 15, defaults to 3.
+    """
 
 
 class ChunkParams(TypedDict):
@@ -32,8 +63,29 @@ class BySpeakerParams(ChunkParams):
 
 class SpeechToTextResponse(BaseResponse):
     text: str
+    """
+    The text of the transcription.
+    """
+
     chunks: List[ChunkParams]
+    """
+    The chunks of the transcription.
+    """
+
     speakers: Optional[List[BySpeakerParams]]
+    """
+    The speakers of the transcription, available if by_speaker is set to true.
+    """
+
+    language_detected: Optional[str]
+    """
+    The language detected in the transcription, available if language is set to auto.
+    """
+
+    confidence: Optional[float]
+    """
+    The confidence of the transcription language detection, available if language is set to auto.
+    """
 
 
 class SpeechToTextWebhookResponse(BaseResponse):
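
A minimal usage sketch of the documented request shape, assuming the package is importable as `jigsawstack.audio` and that these params are passed to the SDK's speech-to-text call (the client entry point is an assumption; only the TypedDict fields come from this diff):

```python
# Hypothetical usage sketch: only the SpeechToTextParams /
# SpeechToTextResponse field names are confirmed by this diff.
from jigsawstack.audio import SpeechToTextParams

params: SpeechToTextParams = {
    "url": "https://example.com/interview.mp3",  # or use "file_store_key" instead
    "language": "auto",       # auto-detect the spoken language
    "by_speaker": True,       # request speaker-segmented transcripts
    "chunk_duration": 3,      # seconds per chunk, maximum 15
}

# The corresponding response exposes `text` and `chunks`; `speakers` is
# populated because by_speaker is enabled, and `language_detected` plus
# `confidence` are populated because language is "auto".
```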