<a href="https://colab.research.google.com/github/JerichElano/talklas/blob/main/OpenAI_Whisper_ASR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/openai/whisper.git
!pip install gradio gtts deep-translator
!pip install httpx>=0.23.0

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-6h4u35uo
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-6h4u35uo
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

In [None]:
import uuid
import time
import whisper
import gradio as gr
from gtts import gTTS
import os
from deep_translator import GoogleTranslator

# Load Whisper Model
# The "medium" checkpoint is loaded once at module level; the functions below
# use this shared `model` object for language detection and transcription.
model = whisper.load_model("medium")

# Language code -> display name offered in the UI dropdown.
LANGUAGES = dict(
    auto="Auto Detect",
    en="English",
    tl="Tagalog",
)

# Reverse lookup: display name -> language code.
LANGUAGE_CODES = dict(zip(LANGUAGES.values(), LANGUAGES.keys()))

def translate_text(text, source_lang, target_lang, max_retries=3):
    """Translate ``text`` with GoogleTranslator, retrying on failure.

    Args:
        text: The text to translate.
        source_lang: Source language code passed to ``GoogleTranslator``.
        target_lang: Target language code passed to ``GoogleTranslator``.
        max_retries: Maximum number of translation attempts. Values below 1
            mean no attempt is made and ``text`` is returned unchanged.

    Returns:
        The translated text, or the original ``text`` when there is nothing to
        translate or every attempt failed. Never falls through to an implicit
        ``None``.
    """
    # Nothing translatable: skip the network round-trip (and the retry sleeps).
    if not isinstance(text, str) or not text.strip():
        return text

    last_error = None
    for attempt in range(max_retries):
        try:
            translator = GoogleTranslator(source=source_lang, target=target_lang)
            translated = translator.translate(text)
            # Defensive: fall back to the input should the translator yield None,
            # so callers always get text they can pass on to TTS.
            return text if translated is None else translated
        except Exception as e:
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(1)  # Wait before retrying

    if last_error is not None:
        print(f"Translation failed after {max_retries} attempts: {str(last_error)}")
    return text  # Return original text if translation fails

def transcribe_translate_tts(audio, selected_language_name):
    """Transcribe speech, translate between English and Tagalog, and synthesize it.

    Args:
        audio: Path to the recorded/uploaded audio file, or ``None`` when no
            audio was supplied.
        selected_language_name: Display name of the spoken language; must be a
            key of ``LANGUAGE_CODES`` (e.g. ``"Auto Detect"``, ``"English"``).

    Returns:
        A ``(transcription, translated_text, tts_file)`` tuple. ``tts_file`` is
        the path of the generated MP3, or ``None`` when there was no text to
        speak. On failure the first element is an ``"Error: ..."`` message and
        the other two are ``None``.
    """
    if audio is None:
        return "Error: No audio provided. Please record or upload a clip.", None, None

    try:
        # Convert language name to code
        selected_language = LANGUAGE_CODES.get(selected_language_name)
        if selected_language is None:
            return f"Error: Unsupported language '{selected_language_name}'", None, None

        # Keep the full waveform: model.transcribe() windows long audio itself,
        # so trimming here would silently drop everything after the first window.
        waveform = whisper.load_audio(audio)

        if selected_language == "auto":
            # Language detection works on a single fixed-length window, so only
            # this copy is padded/trimmed. n_mels is taken from the model so the
            # spectrogram matches whichever checkpoint is loaded.
            clip = whisper.pad_or_trim(waveform)
            mel = whisper.log_mel_spectrogram(clip, n_mels=model.dims.n_mels).to(model.device)
            _, probs = model.detect_language(mel)
            lang_to_use = max(probs, key=probs.get)
            print(f"Detected language: {lang_to_use}")
        else:
            lang_to_use = selected_language

        # Transcribe with the resolved language so the transcription and the
        # translation direction below cannot disagree, and auto mode does not
        # detect the language a second time.
        result = model.transcribe(
            waveform,
            language=lang_to_use,
            temperature=0.0,
            beam_size=5
        )

        text_output = result["text"]

        # Translate English <-> Tagalog; any other language is passed through.
        target_lang = {"en": "tl", "tl": "en"}.get(lang_to_use)
        if target_lang is not None:
            translated_text = translate_text(text_output, lang_to_use, target_lang)
            tts_lang = target_lang
        else:
            translated_text = text_output  # No translation needed
            # NOTE(review): untranslated text in another language is still
            # spoken with the English voice, as before — confirm this is intended.
            tts_lang = "en"

        # gTTS rejects empty text; return what we have instead of turning a
        # silent clip into an error that hides the transcription.
        if not translated_text or not translated_text.strip():
            return text_output, translated_text, None

        # Convert text to speech using gTTS. save() writes the file before
        # returning, so no sleep is needed afterwards.
        tts_file = f"output_{str(uuid.uuid4())}.mp3"  # Unique filename
        gTTS(text=translated_text, lang=tts_lang).save(tts_file)

        return text_output, translated_text, tts_file

    except Exception as e:
        return f"Error: {str(e)}", None, None

# Build the Gradio UI: name the input/output components first, then wire them
# to the speech pipeline.
audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="Audio Input",
)
language_picker = gr.Dropdown(
    choices=[*LANGUAGES.values()],
    value="Auto Detect",
    label="Select Language",
)
result_outputs = [
    gr.Textbox(label="Transcription"),
    gr.Textbox(label="Translated Text"),
    gr.Audio(label="Generated Speech"),
]

demo = gr.Interface(
    title="Whisper Speech-to-Speech Translation Demo",
    description="Upload audio or record from microphone. Get transcription, translation, and synthesized speech output.",
    fn=transcribe_translate_tts,
    inputs=[audio_input, language_picker],
    outputs=result_outputs,
)

# Start the app server.
demo.launch()

100%|█████████████████████████████████████| 1.42G/1.42G [00:27<00:00, 55.0MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b7e7fd0c21c497b8e3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


