In [1]:
#Dependencies


import os
import telebot
from datetime import datetime
import re

%load_ext dotenv
%dotenv

# Telegram bot token read from the environment; os.getenv returns None when
# BOT_TOKEN is not set (presumably provided by the .env file loaded above).
bot_token = os.getenv('BOT_TOKEN')

In [2]:
# Speech-to-text backends and audio-conversion helpers


import speech_recognition as sr
import assemblyai as aai
from pydub import AudioSegment
from os import path
from google.cloud import speech
from google.oauth2 import service_account

def ogg_to_wav(file):
    """Convert ``<file>.ogg`` to ``<file>.wav``.

    Args:
        file: Path of the audio file *without* its extension.

    The WAV path is joined onto the current working directory before export.
    """
    ogg_path = file + ".ogg"
    segment = AudioSegment.from_file(ogg_path, format="ogg")

    wav_path = os.path.join(os.getcwd(), file + ".wav")
    segment.export(wav_path, format="wav")

def wav32_wav16(file):
    """Re-encode the audio at ``file`` in place as 16 kHz, mono, 16-bit WAV.

    Args:
        file: Path of the audio file; it is overwritten with the converted audio.
    """
    segment = AudioSegment.from_file(file)
    segment = segment.set_frame_rate(16000)
    segment = segment.set_channels(1)
    segment = segment.set_sample_width(2)  # 2 bytes per sample -> 16-bit
    segment.export(file, format="wav")

def p_w_SpeechRecognition(file_name):
    """Transcribe an audio file with the SpeechRecognition library.

    Args:
        file_name: Path of the audio file opened through ``sr.AudioFile``.

    Returns:
        The text returned by ``Recognizer.recognize_google``, a fixed message
        when the audio could not be understood, or a message carrying the
        error when the request fails.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_name) as audio_source:
        recorded = recognizer.record(audio_source)

    try:
        transcript = recognizer.recognize_google(recorded)
    except sr.UnknownValueError:
        return "Not understand audio"
    except sr.RequestError as error:
        return f"Could not request results: {error}"
    return transcript
        
def p_w_apiai(file_name):
    """Placeholder backend: ignores ``file_name`` and reports it is unavailable.

    Returns:
        The constant string ``"Not available"``.
    """
    unavailable_notice = "Not available"
    return unavailable_notice

def p_w_assemblyai(file_name):
    """Transcribe an audio file with AssemblyAI.

    The API key is read from the ``ASSEMBLYAI`` environment variable on every
    call. The request uses the nano speech model with Spanish (``"es"``) as
    the language and one expected speaker.

    Args:
        file_name: Path of the audio file handed to ``Transcriber.transcribe``.

    Returns:
        The transcribed text, an empty string when the service produced no
        text, or a ``"Could not request results: ..."`` message when the
        transcript finished in the error state.
    """
    aai.settings.api_key = os.getenv("ASSEMBLYAI")
    config = aai.TranscriptionConfig(
        speech_model=aai.SpeechModel.nano,
        language_code="es",
        speakers_expected=1,
    )

    transcriber = aai.Transcriber(config=config)
    transcript = transcriber.transcribe(file_name)

    # A failed transcription does not raise: the SDK reports it through the
    # status field and leaves ``text`` as None. Surface the error in the same
    # form the SpeechRecognition backend uses instead of returning None.
    if transcript.status == aai.TranscriptStatus.error:
        return f"Could not request results: {transcript.error}"

    # Keep the return type a string even when no text came back.
    return transcript.text or ""

def p_w_google_cloud_speech(file_name):
    """Transcribe a WAV file with Google Cloud Speech.

    The file is first re-encoded in place by ``wav32_wav16``, then sent as
    LINEAR16 audio with language code ``es-CO``.

    Args:
        file_name: Path of the WAV file; it is overwritten by the re-encoding.

    Returns:
        The first alternative of every result, concatenated in order (an
        empty string when the response holds no results).
    """
    wav32_wav16(file_name)

    # NOTE(review): the service-account key file name is hard-coded and is
    # looked up in the current working directory.
    key_path = os.path.join(os.getcwd(), "caramel-anvil-444906-t6-ada6380ad4f0.json")
    client = speech.SpeechClient(
        credentials=service_account.Credentials.from_service_account_file(key_path)
    )

    with open(file_name, "rb") as wav_file:
        payload = wav_file.read()

    recognition_audio = speech.RecognitionAudio(content=payload)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="es-CO",
    )

    response = client.recognize(config=recognition_config, audio=recognition_audio)

    return "".join(result.alternatives[0].transcript for result in response.results)

In [None]:
#Bot implementation


# Telegram client built from the token read from the environment.
bot = telebot.TeleBot(bot_token)

# Menu number (as typed by the user) -> name of the speech-to-text backend.
tecnologies = {
    "1": "SpeechRecognition",
    "2": "google-cloud-speech",
    "3": "assemblyai",
}

# Key of the currently selected backend; "0" means nothing has been chosen.
# NOTE(review): this is a single module-level value, so the selection is
# shared by every chat talking to the bot.
selected_technology = "0"


@bot.message_handler(commands=['start'])
def send_welcome(message):
    """Reply with a greeting, the current selection and the backend menu.

    Registered for the ``/start`` command and also called directly by the
    other handlers to show the menu again.

    Args:
        message: The incoming telebot message; its ``chat`` supplies the
            name used in the greeting and the reply is sent to it.
    """
    chat = message.chat
    # Telegram marks first_name/last_name as optional, so either may be None;
    # join only the parts that are present instead of concatenating blindly.
    user_name = " ".join(part for part in (chat.first_name, chat.last_name) if part)

    # Resolved outside the f-string: reusing double quotes inside an f-string
    # expression is a SyntaxError on Python versions before 3.12.
    if selected_technology != "0":
        selected_label = tecnologies[selected_technology]
    else:
        selected_label = "None"

    options = "".join(f"{number}. {name}\n" for number, name in tecnologies.items())
    msg = (
        f"Hi {user_name}\n\nSelect a technology to test\n"
        f"(Selected -> {selected_label}):\n\n"
        f"{options}"
    )

    bot.reply_to(message, msg)

def valid_selection(number):
    """Validate a menu choice and, when valid, store it as the selection.

    Args:
        number: The user's input; anything ``int()`` accepts (for example
            ``"2"`` or ``" 2 "``) is treated as a candidate menu number.

    Returns:
        The chosen number as an ``int`` when it matches a key of
        ``tecnologies``; ``None`` otherwise. On success the module-level
        ``selected_technology`` is updated to the matching key.
    """
    global selected_technology
    try:
        choice = int(number)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects, which int()
        # rejects with a different exception than malformed strings.
        return None

    # Check against the actual menu keys rather than assuming they are the
    # contiguous range 1..len(tecnologies).
    key = str(choice)
    if key not in tecnologies:
        return None

    selected_technology = key
    return choice
        
def speech_to_text(file_name):
    """Transcribe ``file_name`` with the currently selected backend.

    Args:
        file_name: Path of the audio file to transcribe.

    Returns:
        Whatever the selected backend returns, or an empty string when
        ``selected_technology`` does not match any backend.
    """
    backends = {
        "1": p_w_SpeechRecognition,
        "2": p_w_google_cloud_speech,
        "3": p_w_assemblyai,
    }

    transcribe = backends.get(selected_technology)
    if transcribe is None:
        return ""
    return transcribe(file_name)

@bot.message_handler(content_types=['voice'])
def handle_voice(message):
    """Download a voice message, transcribe it and reply with the result.

    When no backend has been selected the user is asked to choose one first.
    Otherwise the voice file is saved as ``<timestamp>_<name>.ogg`` in the
    current working directory, converted to WAV next to it, transcribed with
    the selected backend, and the text is sent back. The menu is shown again
    in both cases.

    Args:
        message: The incoming telebot message carrying the voice note.
    """
    if selected_technology == "0":
        bot.reply_to(message, "You must first select a technology")
        send_welcome(message)
        return

    file_info = bot.get_file(message.voice.file_id)
    downloaded_file = bot.download_file(file_info.file_path)

    chat = message.chat
    # Telegram marks first_name/last_name as optional, so either may be None;
    # join only the parts that are present.
    user_name = " ".join(part for part in (chat.first_name, chat.last_name) if part)

    # The display name is user-controlled and ends up in a file name, so every
    # non-word character (including path separators) becomes "_". This also
    # covers the " ", "-", ":" and "." found in the timestamp.
    stem = re.sub(r"\W", "_", f"{datetime.now()} {user_name}")
    print(stem + ".ogg")

    base_path = os.path.join(os.getcwd(), stem)
    with open(base_path + ".ogg", "wb") as new_file:
        new_file.write(downloaded_file)

    # ogg_to_wav takes the path without extension and writes <base_path>.wav.
    ogg_to_wav(base_path)
    text = speech_to_text(base_path + ".wav")

    bot.reply_to(message, f"Your voice message was processed with -> {tecnologies[selected_technology]}\n\nResult: \"{text}\" ")
    send_welcome(message)
        
    
@bot.message_handler(func=lambda message: True)
def usr_message(message):
    """Treat a message's text as a menu choice.

    Registered with a predicate that accepts every message. A valid choice is
    confirmed to the user; anything else gets an error reply followed by the
    menu.

    Args:
        message: The incoming telebot message whose ``text`` is validated.
    """
    if valid_selection(message.text) is None:
        bot.reply_to(message, "Please select a valid option")
        send_welcome(message)
        return

    chosen = tecnologies[selected_technology]
    bot.reply_to(message, f"Selected option: {chosen}.\nYou can now send your voice message")

# Start long-polling Telegram for updates. NOTE(review): this call is expected
# to block the cell for as long as the bot runs — interrupt the kernel to stop.
bot.infinity_polling(timeout=30, long_polling_timeout=60)