## EDENAI

In [5]:
import base64
import json
import requests
import os

def base64_to_mp3(audio_string, output_filename):
    """
    Converts a Base64 encoded string representation of an audio file to an MP3 file.

    Args:
    audio_string (str): The Base64 encoded string representation of the audio file.
    output_filename (str): The name of the output MP3 file.
    """
    decoded_audio = base64.b64decode(audio_string)

    with open(output_filename, 'wb') as f:
        f.write(decoded_audio)  
        
def text_to_speech_mp3(text, output_filename, language='en-US', option='MALE'):
    """
    Converts a text string to speech using the Eden AI API and writes the resulting audio to an MP3 file.

    Args:
    text (str): The text string to convert to speech.
    output_filename (str): The name of the output MP3 file.
    api_key (str): Your Eden AI API key.
    language (str): The language code for the speech synthesis. Defaults to 'en-US'.
    option (str): The voice option for the speech synthesis. Defaults to 'MALE'.
    """
    api_key = os.environ.get('EDENAI_API_KEY')
    headers = {"Authorization": f"Bearer {api_key}"}
    url = "https://api.edenai.run/v2/audio/text_to_speech"
    payload = {
        "providers": "google",
        "language": language,
        "option": option,
        "text": text,
    }

    response = requests.post(url, json=payload, headers=headers)
    result = json.loads(response.text)
    # print(result)
    audio_string = result['google']['audio']
    print('cost : ',result['google']['cost'])
    decoded_audio = base64.b64decode(audio_string)

    with open(output_filename, 'wb') as f:
        f.write(decoded_audio)

In [6]:
text = "“Houdini” is the lead single of Eminem’s twelfth studio album The Death of Slim Shady (Coup de Grâce). The track is named after Harry Houdini, a popular magician known for his death-defying stunts like the Chinese Water Torture Cell, which Eminem replicates in the intro to his 1999 track, “Role Model.”"
text_to_speech_mp3(text, "output_tts.mp3")

{'google': {'status': 'success', 'audio': '//NExAAAAANIAAAAADGIbYdaj1E/BGAuFY1gADAocjRjgA4H8aHxAcBwTxBiAByjiZcQWB+I3ggQJiDB8Rn84Q4ePoE4EPiDh9nhgT7AtHROQTIWHP/QNhIErKSUOyKX//NExFMAAANIAAAAADOGuM4cItgvyINcfxQTb7zXgTP77jwKMbkaDaehYFInCcGeLGulhjn3AYDwOwPrnPbX2cvk5qxzV9jASDzsc36dt5pRy9232HKMLKcdn6+ys4c2//NExKYSeD3sAMPSBPA42na8wR8nh9v1wiZ8g+H4fJ5n9eJ+PPA4IP/hnX+IBenNtqfMEA/IBflYS06llzVigYFXtWIZFZ3hzuUd+b5xj4qNwSQ0Wsm5PCWnm5L8dOIX//NExK8fAa4wANPYlXbEKaQnXN9AUYYnCeamjRrKVHyQMT2GVnUQEmeSjE+RoyO6t83sX5MZSNukBA6PsgIklT8u0K8OANj4/Ne7+f/BHFfAXnwPnT1vGsqjMo8OAeKo//NExIYe6cpgAHvSlZMWJHRBWMMcOYq6bWlqQDTU7sL1Vwyu1IqICvixjxQ5bHwQYuaEExYicqo8Fehbg1ObMztc8ja7o7Y791Hg2gWf2tanipKRSKu3vEAsxTqqhtFi//NExF0hoo50ANPQucDzNeRN1uOIWHGSq2kVMfFa71W8VaVelN/8x/vuPaaH9ztLJriSzFBSlZ+liIrMA4EmEtDMI0MYLMSTa0FAIGTxvrebktkMg3kbydoexKZUSTMC//NExCkbQiKYANPEmX2JhEcHmf7mrWbMFjnvmB2edkgZiZ0QmhDv9GafupyMcIhGIlnyPQ6b701Rmpf2SvO9Q42AjzP//+O+PeBaARzPzLdkvdsioIw5lcdkZIAOUNI4//NExA8ViWKoANPKlPWmRQvcE3UZEw+xeOwsDk/sppYC4FuMFFjERtryq1wj6

## times of words

In [8]:
import json
import requests

headers = {"Authorization": f"Bearer {os.environ.get('EDENAI_API_KEY')}"}

url = "https://api.edenai.run/v2/audio/speech_to_text_async"
data = {
    "providers": "OpenAI",
    "language": "en-US",
}

files = {'file': open("audio1.mp3", 'rb')}

response = requests.post(url, data=data, files=files, headers=headers)
result = json.loads(response.text)
print(result)


{'error': {'type': 'Invalid request', 'message': {'providers': ["No such provider 'OpenAI' for 'audio.speech_to_text_async'"]}}}


# OPEN AI

## TTS

In [71]:
from openai import OpenAI

api_key = os.environ.get('OPENAI_API_KEY_LORRAIN')
client = OpenAI(api_key=api_key)

def text_to_speech_mp3(text, output_filename, model="tts-1", voice="alloy"):
    """
    Converts a text string to speech using the OpenAI API and writes the resulting audio to an MP3 file.

    Args:
    text (str): The text string to convert to speech.
    output_filename (str): The name of the output MP3 file.
    model (str): The model to use for speech synthesis. Defaults to 'tts-1'.
    voice (str): The voice to use for speech synthesis. Defaults to 'alloy'.
    """
    global client
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY_LORRAIN')) if client is None else client
    speech_file_path = output_filename


    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=text,
    ) as response:
        response.stream_to_file(speech_file_path+('.mp3' if not speech_file_path.endswith('.mp3') else ''))


In [82]:
original_text = "Today is a wonderful day to build something people love! Today is a beautiful day"
text_to_speech_mp3(original_text, "output_tts.mp3")

## Times of the text 

In [73]:
from openai import OpenAI
import os

def getTextTimingsOfMp3(mp3file):
    global client
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY_LORRAIN')) if client is None else client
    
    audio_file = open(mp3file, "rb")
    transcript = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-1",
        response_format="verbose_json",
        timestamp_granularities=["word"]
    )

    return transcript



In [74]:
text = getTextTimingsOfMp3("output_tts.mp3")
print(text)

Transcription(text='Today is a wonderful day to build something people love. Today is a beautiful day.', task='transcribe', language='english', duration=5.010000228881836, words=[{'word': 'Today', 'start': 0.0, 'end': 0.3199999928474426}, {'word': 'is', 'start': 0.3199999928474426, 'end': 0.5400000214576721}, {'word': 'a', 'start': 0.5400000214576721, 'end': 0.6800000071525574}, {'word': 'wonderful', 'start': 0.6800000071525574, 'end': 1.0}, {'word': 'day', 'start': 1.0, 'end': 1.3799999952316284}, {'word': 'to', 'start': 1.3799999952316284, 'end': 1.659999966621399}, {'word': 'build', 'start': 1.659999966621399, 'end': 1.8799999952316284}, {'word': 'something', 'start': 1.8799999952316284, 'end': 2.259999990463257}, {'word': 'people', 'start': 2.259999990463257, 'end': 2.640000104904175}, {'word': 'love', 'start': 2.640000104904175, 'end': 2.9800000190734863}, {'word': 'Today', 'start': 3.5999999046325684, 'end': 3.7799999713897705}, {'word': 'is', 'start': 3.7799999713897705, 'end': 

In [89]:
def verify_transcription(original_text, transcription):
    """
    Verifies that the text is the same after the tts and stt.

    Args:
    original_text (str): The original text.
    transcription (dict): The transcription dictionary.

    Returns:
    int: ratio of the original text and the transcription text.
    """
    print(transcription.text)
    print(original_text)
    transcription_text = transcription.text.lower().replace('!', '').replace('.', '').replace(',', '').replace('?', '')
    original_text = original_text.lower().replace('!', '').replace('.', '').replace(',', '').replace('?', '')
    
    if transcription_text == original_text:
        return 1
    
    # correct_words = 0
    for word in original_text.split():
        transcription_text = transcription_text.replace(word, '', 1)
#
    return 1 - len(transcription_text) / len(original_text)

print(verify_transcription(original_text, text))

Today is a wonderful day to build something people love. Today is a beautiful day.
Today is a wonderful day to build something people love! Today is a beautiful day
1


In [79]:
def get_sentence_timings(transcription, sentence):
    """
    Returns the start and end timings of a sentence in a transcription object.

    Args:
    transcription (Transcription): The transcription object.
    sentence (str): The sentence to search for.

    Returns:
    tuple: A tuple containing the start and end timings of the sentence.
    """
    words = transcription.words
    sentence_words = sentence.split()
    sentence_start = None
    sentence_end = None

    i = 0
    while i < len(words):
        if words[i]['word'].lower() == sentence_words[0].lower():
            j = 1
            while j < len(sentence_words) and i + j < len(words) and words[i + j]['word'].lower() == sentence_words[j].lower():
                j += 1
            if j == len(sentence_words):
                sentence_start = words[i]['start']
                sentence_end = words[i + j - 1]['end']
                break
        i += 1

    return sentence_start, sentence_end

In [81]:
print(text)
sentence_start, sentence_end = get_sentence_timings(text, "Today is a beautiful day")
print(f"Sentence start: {sentence_start}, sentence end: {sentence_end}")

Transcription(text='Today is a wonderful day to build something people love. Today is a beautiful day.', task='transcribe', language='english', duration=5.010000228881836, words=[{'word': 'Today', 'start': 0.0, 'end': 0.3199999928474426}, {'word': 'is', 'start': 0.3199999928474426, 'end': 0.5400000214576721}, {'word': 'a', 'start': 0.5400000214576721, 'end': 0.6800000071525574}, {'word': 'wonderful', 'start': 0.6800000071525574, 'end': 1.0}, {'word': 'day', 'start': 1.0, 'end': 1.3799999952316284}, {'word': 'to', 'start': 1.3799999952316284, 'end': 1.659999966621399}, {'word': 'build', 'start': 1.659999966621399, 'end': 1.8799999952316284}, {'word': 'something', 'start': 1.8799999952316284, 'end': 2.259999990463257}, {'word': 'people', 'start': 2.259999990463257, 'end': 2.640000104904175}, {'word': 'love', 'start': 2.640000104904175, 'end': 2.9800000190734863}, {'word': 'Today', 'start': 3.5999999046325684, 'end': 3.7799999713897705}, {'word': 'is', 'start': 3.7799999713897705, 'end': 