In [1]:
# Refresh the apt package index (quietly) before the Tesseract packages are installed.
!apt-get update -qq

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [2]:
# Install the Tesseract OCR engine plus the eng / hin / guj / pan language packs;
# these suffixes are the same codes used as values in OCR_LANGS.
!apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-hin tesseract-ocr-guj tesseract-ocr-pan

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
tesseract-ocr-eng is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-eng set to manually installed.
The following NEW packages will be installed:
  tesseract-ocr-guj tesseract-ocr-hin tesseract-ocr-pan
0 upgraded, 3 newly installed, 0 to remove and 58 not upgraded.
Need to get 1,895 kB of archives.
After this operation, 3,084 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-guj all 1:4.00~git30-7274cfa-1.1 [660 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-hin all 1:4.00~git30-7274cfa-1.1 [913 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-pan all 1:4.00~git30-7274cfa-1.1 [322 kB]
Fetched 1,895 kB in 1s (2,729 kB/s)
Selecting previously unselected package tesseract-ocr-guj.
(Readi

In [3]:
!pip install -q pytesseract Pillow anuvaad-rev gTTS gradio

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Building wheel for typing (setup.py) ... [?25l[?25hdone


In [4]:
import pytesseract

In [5]:
from PIL import Image

In [6]:
import numpy as np

In [7]:
from anuvaad_rev import IndicTranslator

In [8]:
from gtts import gTTS

In [9]:
import gradio as gr

In [10]:
import tempfile

In [11]:
import os

In [12]:
# Point pytesseract at the tesseract executable explicitly.
# NOTE(review): hard-coded path; presumably where the apt package above puts the
# binary on this image -- confirm before running on another system.
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"

In [13]:
# UI language name -> language code passed to pytesseract as `lang`
# (see ocr_image). Insertion order is the order shown in the OCR dropdown.
OCR_LANGS = dict(
    English="eng",
    Hindi="hin",
    Gujarati="guj",
    Punjabi="pan",
)

In [14]:
# UI language name -> target-language code handed to the translator
# (see translate_text). Insertion order is the order shown in the target dropdown.
TRANSLATE_LANGS = dict(
    English="en",
    Hindi="hi",
    Gujarati="gu",
    Punjabi="pa",
)

In [15]:
# UI language name -> language code passed to gTTS (see text_to_speech).
# A language absent from this mapping gets no audio; Punjabi is deliberately
# left out here.
TTS_LANGS = dict(
    English="en",
    Hindi="hi",
    Gujarati="gu",
)

In [16]:
# Single module-level translator instance, created once and reused by every
# translate_text() call.
translator = IndicTranslator()

In [17]:
def ocr_image(pil_img, ocr_lang_name):
    """
    Extract text from an image with Tesseract.

    pil_img: PIL.Image to run OCR on
    ocr_lang_name: a key of OCR_LANGS; any other value falls back to "eng"

    Returns the recognised text with leading/trailing whitespace removed.
    """
    raw_text = pytesseract.image_to_string(
        pil_img,
        lang=OCR_LANGS.get(ocr_lang_name, "eng"),
    )
    return raw_text.strip()

In [18]:
def translate_text(text, target_lang_name):
    """
    Translate `text` into the requested target language (best effort).

    Only the target is passed to the translator; per the original author's
    note the IndicTrans2 API behind anuvaad-rev auto-detects the source language.

    text: text to translate; empty / whitespace-only input yields ""
    target_lang_name: a key of TRANSLATE_LANGS; any other value falls back to "en"

    Returns the translator's result, or "" when the translator returns None
    or raises (the error is printed, not propagated).
    """
    if not (text and text.strip()):
        return ""

    lang_code = TRANSLATE_LANGS.get(target_lang_name, "en")
    try:
        result = translator.translate(text, target_lang=lang_code)
    except Exception as err:
        print("Translation error:", err)
        return ""

    return "" if result is None else result

In [19]:
def text_to_speech(text, speech_lang_name):
    """
    Uses gTTS to generate a speech audio file.

    text: text to speak; empty / whitespace-only input produces no audio
    speech_lang_name: a key of TTS_LANGS; any other value (e.g. Punjabi, which
        the original author notes gTTS does not support) produces no audio so
        the caller can report it in the UI

    Returns the path to a temporary .mp3 file, or None when no audio was
    produced. The file is not deleted automatically; the caller owns it.
    Errors are printed and turned into None rather than raised.
    """
    if not text or not text.strip():
        return None

    lang_code = TTS_LANGS.get(speech_lang_name)
    if lang_code is None:
        return None

    audio_path = None
    try:
        # Reserve a unique path and close the handle right away, so gTTS
        # writes to a file nothing else holds open.
        fd, audio_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)

        gTTS(text=text, lang=lang_code).save(audio_path)
        return audio_path
    except Exception as e:
        print("TTS error:", e)
        # Do not leave an empty/partial temp file behind when synthesis fails.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return None

In [20]:
# ====== Cell 4: Gradio app for end-to-end pipeline ======

def full_pipeline(img, ocr_lang_name, target_lang_name, speak_language_same_as_target=True):
    """
    Run the end-to-end pipeline: OCR -> translation -> text-to-speech.

    img: image as a numpy array (the Gradio image input uses type="numpy"),
        or None when nothing was uploaded
    ocr_lang_name: a key of OCR_LANGS; language of the text in the image
    target_lang_name: a key of TRANSLATE_LANGS; language to translate into
    speak_language_same_as_target: if True the audio uses the target language,
        otherwise "English"

    Returns a 4-tuple (detected_text, translated_text, audio_path, status):
    the texts for the two text boxes, the path of the generated audio file
    (or None), and a human-readable status message.

    If translation fails, the translated box shows a notice followed by the
    original text, and the audio is generated from the original text alone
    (without the notice), in the OCR language, since that is the language the
    text is actually written in.
    """
    if img is None:
        return "Please upload an image.", "", None, "No audio generated."

    pil_img = Image.fromarray(img.astype("uint8")).convert("RGB")

    # Step 1: OCR
    detected_text = ocr_image(pil_img, ocr_lang_name)
    if not detected_text.strip():
        return "No text detected.", "", None, "No audio generated."

    # Step 2: Translation
    translated_text = translate_text(detected_text, target_lang_name)
    translation_ok = bool(translated_text.strip())

    if translation_ok:
        speech_text = translated_text
        speech_lang_name = target_lang_name if speak_language_same_as_target else "English"
    else:
        # The notice is for display only; it must not be read aloud.
        speech_text = detected_text
        speech_lang_name = ocr_lang_name
        translated_text = "(Translation failed – showing original text only)\n\n" + detected_text

    # Step 3: TTS
    audio_path = text_to_speech(speech_text, speech_lang_name)

    if speech_lang_name not in TTS_LANGS:
        shown = "translated" if translation_ok else "original"
        msg = f"{speech_lang_name} TTS is not supported in gTTS. You can still see the {shown} text."
    elif audio_path is None:
        msg = "Could not generate audio. Check internet connection or text length."
    elif translation_ok:
        msg = f"Audio generated in {speech_lang_name}."
    else:
        msg = f"Translation failed. Audio generated from the original text in {speech_lang_name}."

    return detected_text, translated_text, audio_path, msg


# Gradio UI for the pipeline: one image input, three option controls, a Run
# button, and four outputs that mirror full_pipeline's return tuple.
with gr.Blocks() as demo:
    gr.Markdown("## Multilingual Image → Text → Speech (Hindi / Gujarati / Punjabi / English)")
    gr.Markdown(
        "Upload an image containing text (medicine label, board, sign, etc.). "
        "The system will run OCR, translate it, and speak it out."
    )

    # First row: the image input plus a column holding the pipeline options
    # and the Run button.
    with gr.Row():
        # type="numpy": full_pipeline receives the image as a numpy array
        # (it calls img.astype on it).
        img_input = gr.Image(type="numpy", label="Input Image")

        with gr.Column():
            # Choices are the OCR_LANGS keys, passed as ocr_lang_name.
            ocr_lang = gr.Dropdown(
                choices=list(OCR_LANGS.keys()),
                value="English",
                label="OCR language (script of text in image)",
            )

            # Choices are the TRANSLATE_LANGS keys, passed as target_lang_name.
            target_lang = gr.Dropdown(
                choices=list(TRANSLATE_LANGS.keys()),
                value="Hindi",
                label="Target language for translation",
            )

            # Passed as speak_language_same_as_target; when unchecked,
            # full_pipeline uses "English" as the speech language.
            speak_same = gr.Checkbox(
                value=True,
                label="Speak in the target language",
            )

            btn = gr.Button("Run")

    # Second row: OCR result and translation, in separate text boxes.
    with gr.Row():
        detected_box = gr.Textbox(
            label="Detected text (before translation)",
            lines=6,
        )
        translated_box = gr.Textbox(
            label="Translated text (for user)",
            lines=6,
        )

    # type="filepath": full_pipeline supplies the path of the generated audio
    # file (or None when no audio was produced).
    audio_out = gr.Audio(
        label="Spoken Output",
        type="filepath",
    )
    # Read-only box for the status message returned by full_pipeline.
    status_box = gr.Textbox(
        label="Status / Notes",
        interactive=False
    )

    # Clicking Run calls full_pipeline with the four inputs listed and maps its
    # four return values onto the four output components listed.
    btn.click(
        fn=full_pipeline,
        inputs=[img_input, ocr_lang, target_lang, speak_same],
        outputs=[detected_box, translated_box, audio_out, status_box],
    )

# Start the app. In the recorded run (a hosted Colab notebook) Gradio enabled
# share=True automatically and printed a temporary public URL.
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0cb1b7d1aefb6a05b3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


