Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions jigsawstack/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,44 @@

class SpeechToTextParams(TypedDict):
url: NotRequired[str]
"""
the url of the audio file to transcribe, optional if file_store_key is provided
"""

file_store_key: NotRequired[str]
"""
the file store key of the audio file to transcribe, optional if url is provided
"""

language: NotRequired[Union[str, Literal["auto"]]]
"""
The language to transcribe or translate the file into. Use “auto” for automatic language detection, or specify a language code. If not specified, defaults to automatic detection. All supported language codes can be found
"""

translate: NotRequired[bool]
"""
When set to true, translates the content into English (or the specified language if language parameter is provided)
"""

by_speaker: NotRequired[bool]
"""
Identifies and separates different speakers in the audio file. When enabled, the response will include a speakers array with speaker-segmented transcripts.
"""

webhook_url: NotRequired[str]
"""
Webhook URL to send result to. When provided, the API will process asynchronously and send results to this URL when completed.
"""

batch_size: NotRequired[int]
"""
The batch size to return. Maximum value is 40. This controls how the audio is chunked for processing.
"""

chunk_duration: NotRequired[int]
"""
the duration of each chunk in seconds, maximum value is 15, defaults to 3
"""


class ChunkParams(TypedDict):
Expand All @@ -32,8 +63,29 @@ class BySpeakerParams(ChunkParams):

class SpeechToTextResponse(BaseResponse):
    """Result payload of a speech-to-text transcription.

    NOTE(review): ``speakers``, ``language_detected`` and ``confidence`` are
    typed ``Optional[...]`` while their descriptions say they are only
    "available" under certain request options. If ``BaseResponse`` is a
    ``TypedDict`` and the API omits these keys rather than sending ``None``,
    ``NotRequired`` would describe them more accurately -- confirm against the
    API before changing the types.
    """

    text: str
    """
    The text of the transcription.
    """

    chunks: List[ChunkParams]
    """
    The chunks of the transcription.
    """

    speakers: Optional[List[BySpeakerParams]]
    """
    The speakers of the transcription. Available if by_speaker is set to true.
    """

    language_detected: Optional[str]
    """
    The language detected in the transcription. Available if language is set to
    auto.
    """

    confidence: Optional[float]
    """
    The confidence of the transcription language detection. Available if
    language is set to auto.
    """


class SpeechToTextWebhookResponse(BaseResponse):
Expand Down