Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion modelslab_py/core/apis/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def song_generator(self, schema: SongGenerator):
return response

def speech_to_text(self, schema: Speech2Text):
    """Submit a speech-to-text request and return the client's response.

    The endpoint is built by appending ``"speech_to_text"`` to
    ``self.base_url``. An earlier hard-coded endpoint string was assigned
    and then immediately overwritten, so it never took effect; that dead
    assignment has been dropped.

    Args:
        schema: Request parameters; serialized with ``schema.dict()`` and
            passed to the client as the ``data`` argument.

    Returns:
        Whatever ``self.client.post`` returns for the request.
    """
    base_endpoint = self.base_url + "speech_to_text"
    data = schema.dict()
    response = self.client.post(base_endpoint, data=data)
    return response
Expand Down
36 changes: 29 additions & 7 deletions modelslab_py/schemas/audio.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Literal
from modelslab_py.schemas.base import BaseSchema
from pydantic import Field

Expand Down Expand Up @@ -65,6 +65,11 @@ class Text2Speech(BaseSchema):
description="Speed of the speech generation."
)

output_format: Optional[Literal["wav", "mp3"]] = Field(
"wav",
description="The format of the generated audio. Either 'wav' or 'mp3'. Defaults to 'wav'."
)

emotion: Optional[str] = Field(
None,
description="Emotion for the speech generation."
Expand Down Expand Up @@ -134,11 +139,6 @@ class VoiceCover(BaseSchema):

)

language: Optional[str] = Field(
None,
description="Language for the voice cover."
)

emotion : Optional[str] = Field(
"neutral",
description="Emotion for the voice cover."
Expand Down Expand Up @@ -226,10 +226,22 @@ class MusicGenSchema(BaseSchema):
None,
description="Initial audio for the music generation."
)

output_format: Optional[Literal["wav", "mp3", "flac"]] = Field(
"wav",
description="The format of the generated audio. Either 'wav', 'mp3', or 'flac'. Defaults to 'wav'."
)

bitrate: Optional[Literal["128k", "192k", "320k"]] = Field(
"320k",
description="Bitrate of the generated audio. Options: '128k', '192k', '320k' Defaults to `320k`."
)

base64: Optional[str] = Field(
None,
description="Base64 encoded audio data."
)

temp : Optional[float] = Field(
None,
description="Upload files in temp s3 directory for the audio generation."
Expand Down Expand Up @@ -284,7 +296,7 @@ class Speech2Text(BaseSchema):
description="Input language for speech-to-text conversion."
)

timestamp_level : Optional[str] = Field(
timestamp_level : Optional[Literal["word", "sentence"]] = Field(
None,
description="Timestamp level for speech-to-text conversion."
)
Expand All @@ -300,6 +312,16 @@ class SFX(BaseSchema):
description="Duration for the sound effect generation."
)

output_format: Optional[Literal["wav", "mp3", "flac"]] = Field(
Comment thread
anurag12-webster marked this conversation as resolved.
"wav",
description="The format of the generated audio. Either 'wav', 'mp3', or 'flac'. Defaults to 'wav'."
)

bitrate: Optional[Literal["128k", "192k", "320k"]] = Field(
"320k",
description="Bitrate of the generated audio. Options: '128k', '192k', '320k' Defaults to `320k`."
)

temp : Optional[bool] = Field(
None,
description="Upload files in temp s3 directory for the sound effect generation."
Expand Down
1 change: 0 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,3 @@

# print(response)