Add Support for OpenAI TTS Service (#84)

* Commit changes from @hdeep03's branch * Update docs * Bump version
ManimCommunity · Feb 25, 2024 · 575530b · 575530b
1 parent f02a5f8
commit 575530b
Show file tree

Hide file tree

Showing 8 changed files with 369 additions and 9 deletions.
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -38,6 +38,10 @@ Speech services
    :members:
    :show-inheritance:
 
+.. automodule:: manim_voiceover.services.openai
+   :members:
+   :show-inheritance:
+
 .. automodule:: manim_voiceover.services.pyttsx3
    :members:
    :show-inheritance:

diff --git a/docs/source/services.rst b/docs/source/services.rst
@@ -36,7 +36,7 @@ Manim Voiceover defines the :py:class:`~~base.SpeechService` class for adding ne
      - Very good, human-like
      - No
      - Yes
-     - `ElevenLabs <elevenlabs.io>`__ develops very advanced voice generative AI models. It has a range of realistic and emotive voices, and also allows you to clone your own voice by uploading a few minutes of your speech.
+     - Requires an ElevenLabs account. Click `here <https://elevenlabs.io/sign-up>`__ to sign up.
    * - :py:class:`~coqui.CoquiService`
      - Good, human-like
      - Yes
@@ -47,6 +47,11 @@ Manim Voiceover defines the :py:class:`~~base.SpeechService` class for adding ne
      - No
      - No
      - It's a free API subsidized by Google, so there is a likelihood it may stop working in the future.
+   * - :py:class:`~openai.OpenAIService`
+     - Very good, human-like
+     - No
+     - Yes
+     - Requires an OpenAI developer account. Sign up on the `OpenAI platform <https://platform.openai.com/signup>`__, and see the `pricing page <https://openai.com/pricing#:~:text=%24-,0.030,-/%201K%20characters>`__ for more details.
    * - :py:class:`~pyttsx3.PyTTSX3Service`
      - Bad
      - Yes
@@ -136,6 +141,32 @@ Install Manim Voiceover with the ``gtts`` extra in order to use :py:class:`~gtts
 
 Refer to the `example usage <https://github.com/ManimCommunity/manim-voiceover/blob/main/examples/gtts-example.py>`__ to get started.
 
+:py:class:`~openai.OpenAIService`
+*************************************
+`OpenAI <https://platform.openai.com/docs/api-reference/audio/createSpeech/>`__ provides a text-to-speech service. It is accessed through an API, so it requires an internet connection to work. It also requires an API key. Register for one `here <https://platform.openai.com/>`__.
+
+Install Manim Voiceover with the ``openai`` extra in order to use :py:class:`~openai.OpenAIService`:
+
+.. code:: sh
+
+   pip install "manim-voiceover[openai]"
+
+Then, you need to obtain your API key:
+
+- Sign in to the `OpenAI platform <https://platform.openai.com/>`__ and open API Keys from the left panel.
+- Create a new secret key and copy it.
+
+Create a file called ``.env`` that contains your authentication
+information in the same directory where you call Manim.
+
+.. code:: sh
+
+   OPENAI_API_KEY="..." # insert the secret key here. It should start with "sk-"
+
+Check out `OpenAI docs <https://platform.openai.com/docs/guides/text-to-speech/>`__ for more details.
+
+Refer to the `example usage <https://github.com/ManimCommunity/manim-voiceover/blob/main/examples/openai-example.py>`__ to get started.
+
 :py:class:`~pyttsx3.PyTTSX3Service`
 ***********************************
 
@@ -154,7 +185,7 @@ Refer to the `example usage <https://github.com/ManimCommunity/manim-voiceover/b
 :py:class:`~elevenlabs.ElevenLabsService`
 ******************************************
 
-`ElevenLabs <https://www.elevenlabs.io/>`__ offers one of the most natural sounding speech service APIs. To use it, you will need to create an account at `Eleven Labs <https://elevenlabs.io/sign-up>`__.
+`ElevenLabs <https://www.elevenlabs.io/>`__ offers one of the most natural-sounding speech service APIs. It has a range of realistic and emotive voices, and also allows you to clone your own voice by uploading a few minutes of your speech. To use it, you will need to create an account at `ElevenLabs <https://elevenlabs.io/sign-up>`__.
 
 .. tip::
     ElevenLabs currently offers free TTS of 10,000 characters/month and up to 3 custom voices.

diff --git a/examples/openai-example.py b/examples/openai-example.py
@@ -0,0 +1,30 @@
+from manim import *
+from manim_voiceover import VoiceoverScene
+from manim_voiceover.services.openai import OpenAIService
+
+
+class OpenAIExample(VoiceoverScene):
+    def construct(self):
+        self.set_speech_service(
+            OpenAIService(
+                voice="fable",
+                model="tts-1-hd",
+            )
+        )
+
+        circle = Circle()
+        square = Square().shift(2 * RIGHT)
+
+        with self.voiceover(text="This circle is drawn as I speak.") as tracker:
+            self.play(Create(circle), run_time=tracker.duration)
+
+        with self.voiceover(text="Let's shift it to the left 2 units.") as tracker:
+            self.play(circle.animate.shift(2 * LEFT), run_time=tracker.duration)
+
+        with self.voiceover(text="Now, let's transform it into a square.") as tracker:
+            self.play(Transform(circle, square), run_time=tracker.duration)
+
+        with self.voiceover(text="Thank you for watching.", speed=0.75): # You can also change the audio speed by specifying the speed argument.
+            self.play(Uncreate(circle))
+
+        self.wait()
diff --git a/manim_voiceover/services/elevenlabs.py b/manim_voiceover/services/elevenlabs.py
@@ -150,8 +150,10 @@ def generate_from_text(
         input_data = {
             "input_text": input_text,
             "service": "elevenlabs",
-            "model": self.model,
-            "voice": self.voice.model_dump(exclude_none=True),
+            "config": {
+                "model": self.model,
+                "voice": self.voice.model_dump(exclude_none=True),
+            },
         }
 
         # if not config.disable_caching:
@@ -164,8 +166,9 @@ def generate_from_text(
             audio_path = self.get_audio_basename(input_data) + ".mp3"
         else:
             audio_path = path
+
         try:
-            audio = generate(text=text, voice=self.voice, model=self.model)
+            audio = generate(text=input_text, voice=self.voice, model=self.model)
             save(audio, str(Path(cache_dir) / audio_path))  # type: ignore
         except Exception as e:
             logger.error(e)

diff --git a/manim_voiceover/services/openai.py b/manim_voiceover/services/openai.py
@@ -0,0 +1,118 @@
+import os
+import sys
+from pathlib import Path
+from manim import logger
+from dotenv import load_dotenv, find_dotenv
+
+from manim_voiceover.helper import (
+    create_dotenv_file,
+    prompt_ask_missing_extras,
+    remove_bookmarks,
+)
+
+try:
+    import openai
+except ImportError:
+    logger.error(
+        "Missing packages. "
+        'Run `pip install "manim-voiceover[openai]"` to use OpenAIService.'
+    )
+
+from manim_voiceover.services.base import SpeechService
+
+load_dotenv(find_dotenv(usecwd=True))
+
+
+def create_dotenv_openai():
+    logger.info(
+        "Check out https://voiceover.manim.community/en/stable/services.html "
+        "to learn how to create an account and get your subscription key."
+    )
+    if not create_dotenv_file(["OPENAI_API_KEY"]):
+        raise ValueError(
+            "The environment variable OPENAI_API_KEY is not set. Please set it "
+            "or create a .env file with the variables."
+        )
+    logger.info("The .env file has been created. Please run Manim again.")
+    sys.exit()
+
+
+class OpenAIService(SpeechService):
+    """
+    Speech service class for OpenAI TTS Service. See the `OpenAI API page
+    <https://platform.openai.com/docs/api-reference/audio/createSpeech>`__
+    for more information about voices and models.
+    """
+
+    def __init__(
+        self,
+        voice: str = "alloy",
+        model: str = "tts-1-hd",
+        transcription_model="base",
+        **kwargs
+    ):
+        """
+        Args:
+            voice (str, optional): The voice to use. See the
+            `API page <https://platform.openai.com/docs/api-reference/audio/createSpeech>`__
+            for all the available options. Defaults to ``"alloy"``.
+            model (str, optional): The TTS model to use.
+            See the `API page <https://platform.openai.com/docs/api-reference/audio/createSpeech>`__
+            for all the available options. Defaults to ``"tts-1-hd"``.
+        """
+        prompt_ask_missing_extras("openai", "openai", "OpenAIService")
+        self.voice = voice
+        self.model = model
+
+        SpeechService.__init__(self, transcription_model=transcription_model, **kwargs)
+
+    def generate_from_text(
+        self, text: str, cache_dir: str = None, path: str = None, **kwargs
+    ) -> dict:
+        """"""
+        if cache_dir is None:
+            cache_dir = self.cache_dir
+
+        speed = kwargs.get("speed", 1.0)
+
+        if not (0.25 <= speed <= 4.0):
+            raise ValueError("The speed must be between 0.25 and 4.0.")
+
+        input_text = remove_bookmarks(text)
+        input_data = {
+            "input_text": input_text,
+            "service": "openai",
+            "config": {
+                "voice": self.voice,
+                "model": self.model,
+                "speed": speed,
+            },
+        }
+
+        cached_result = self.get_cached_result(input_data, cache_dir)
+        if cached_result is not None:
+            return cached_result
+
+        if path is None:
+            audio_path = self.get_audio_basename(input_data) + ".mp3"
+        else:
+            audio_path = path
+
+        if os.getenv("OPENAI_API_KEY") is None:
+            create_dotenv_openai()
+
+        response = openai.audio.speech.create(
+            model=self.model,
+            voice=self.voice,
+            input=input_text,
+            speed=speed,
+        )
+        response.stream_to_file(str(Path(cache_dir) / audio_path))
+
+        json_dict = {
+            "input_text": text,
+            "input_data": input_data,
+            "original_audio": audio_path,
+        }
+
+        return json_dict
diff --git a/manim_voiceover/tracker.py b/manim_voiceover/tracker.py
@@ -57,10 +57,43 @@ def __init__(self, scene: Scene, data: dict, cache_dir: str):
         if "word_boundaries" in self.data:
             self._process_bookmarks()
 
+    def _get_fallback_word_boundaries(self):
+        """
+        Returns dummy word boundaries assuming a linear mapping between
+        text and audio. Used when word boundaries are not available.
+        """
+        input_text = remove_bookmarks(self.data["input_text"])
+        return [
+            {
+                "audio_offset": 0,
+                "text_offset": 0,
+                "word_length": len(input_text),
+                "text": self.data["input_text"],
+                "boundary_type": "Word",
+            },
+            {
+                "audio_offset": self.duration * AUDIO_OFFSET_RESOLUTION,
+                "text_offset": len(input_text),
+                "word_length": 1,
+                "text": ".",
+                "boundary_type": "Word",
+            },
+        ]
+
     def _process_bookmarks(self) -> None:
         self.bookmark_times = {}
         self.bookmark_distances = {}
-        self.time_interpolator = TimeInterpolator(self.data["word_boundaries"])
+
+        word_boundaries = self.data["word_boundaries"]
+        if not word_boundaries or len(word_boundaries) < 2:
+            logger.warning(
+                f"Word boundaries for voiceover {self.data['input_text']} are not "
+                "available or are insufficient. Using fallback word boundaries."
+            )
+            word_boundaries = self._get_fallback_word_boundaries()
+
+        self.time_interpolator = TimeInterpolator(word_boundaries)
+
         net_text_len = len(remove_bookmarks(self.data["input_text"]))
         if "transcribed_text" in self.data:
             transcribed_text_len = len(self.data["transcribed_text"].strip())