ModelsLab · Tanmaypatil123 · Sep 16, 2025 · Sep 15, 2025 · Sep 16, 2025
diff --git a/modelslab_py/core/apis/audio.py b/modelslab_py/core/apis/audio.py
@@ -61,7 +61,7 @@ def song_generator(self, schema: SongGenerator):
         return response
 
     def speech_to_text(self, schema: Speech2Text):
-        base_endpoint = "https://modelslab.com/api/v6/whisper/transcribe"
+        base_endpoint = self.base_url + "speech_to_text"
         data = schema.dict()
         response = self.client.post(base_endpoint, data=data)
         return response

diff --git a/modelslab_py/schemas/audio.py b/modelslab_py/schemas/audio.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Literal
 from modelslab_py.schemas.base import BaseSchema
 from pydantic import Field
 
@@ -65,6 +65,11 @@ class Text2Speech(BaseSchema):
         description="Speed of the speech generation."
     )
 
+    output_format: Optional[Literal["wav", "mp3"]] = Field(
+        "wav",
+        description="The format of the generated audio. Either 'wav' or 'mp3'. Defaults to 'wav'."
+    )
+
     emotion: Optional[str] = Field(
         None,
         description="Emotion for the speech generation."
@@ -134,11 +139,6 @@ class VoiceCover(BaseSchema):
 
     )
 
-    language: Optional[str] = Field(
-        None,
-        description="Language for the voice cover."
-    )
-
     emotion : Optional[str] = Field(
         "neutral",
         description="Emotion for the voice cover."
@@ -226,10 +226,22 @@ class MusicGenSchema(BaseSchema):
         None,
         description="Initial audio for the music generation."
     )
+
+    output_format: Optional[Literal["wav", "mp3", "flac"]] = Field(
+        "wav",
+        description="The format of the generated audio. Either 'wav', 'mp3', or 'flac'. Defaults to 'wav'."
+    )
+
+    bitrate: Optional[Literal["128k", "192k", "320k"]] = Field(
+        "320k",
+        description="Bitrate of the generated audio. Options: '128k', '192k', '320k' Defaults to `320k`."
+    )
+
     base64: Optional[str] = Field(
         None,
         description="Base64 encoded audio data."
     )
+
     temp   : Optional[float] = Field(
         None,
         description="Upload files in temp s3 directory for the audio generation."
@@ -284,7 +296,7 @@ class Speech2Text(BaseSchema):
         description="Input language for speech-to-text conversion."
     )
 
-    timestamp_level : Optional[str] = Field(
+    timestamp_level : Optional[Literal["word", "sentence"]] = Field(
         None,
         description="Timestamp level for speech-to-text conversion."
     )
@@ -300,6 +312,16 @@ class SFX(BaseSchema):
         description="Duration for the sound effect generation."
     )
 
+    output_format: Optional[Literal["wav", "mp3", "flac"]] = Field(
+        "wav",
+        description="The format of the generated audio. Either 'wav', 'mp3', or 'flac'. Defaults to 'wav'."
+    )
+
+    bitrate: Optional[Literal["128k", "192k", "320k"]] = Field(
+        "320k",
+        description="Bitrate of the generated audio. Options: '128k', '192k', '320k' Defaults to `320k`."
+    )
+
     temp : Optional[bool]  = Field(
         None,
         description="Upload files in temp s3 directory for the sound effect generation."

diff --git a/test.py b/test.py
@@ -55,4 +55,3 @@
 
 # print(response)
 
-