In [1]:
!apt-get update -qq

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [2]:
!apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-hin

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
tesseract-ocr-eng is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-eng set to manually installed.
The following NEW packages will be installed:
  tesseract-ocr-hin
0 upgraded, 1 newly installed, 0 to remove and 46 not upgraded.
Need to get 913 kB of archives.
After this operation, 1,138 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-hin all 1:4.00~git30-7274cfa-1.1 [913 kB]
Fetched 913 kB in 2s (514 kB/s)
Selecting previously unselected package tesseract-ocr-hin.
(Reading database ... 121713 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-hin_1%3a4.00~git30-7274cfa-1.1_all.deb ...
Unpacking tesseract-ocr-hin (1:4.00~git30-7274cfa-1.1) ...
Setting up tesseract-ocr-hin (1:4.00~git30-7274cfa-1.1) ...


In [3]:
!pip install -q pytesseract Pillow transformers accelerate sentencepiece bitsandbytes gTTS gradio pandas

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import torch

In [5]:
from PIL import Image

In [6]:
import numpy as np

In [7]:
import pytesseract

In [8]:
from gtts import gTTS

In [9]:
import gradio as gr

In [10]:
import pandas as pd

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Path to tesseract binary (default in Colab)
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"

In [19]:
OCR_LANGS = {
    "English": "eng",
    "Hindi": "hin",
}

# For TTS (gTTS supports both)
TTS_LANGS = {
    "English": "en",
    "Hindi": "hi",
}

# ---- Load a small chat LLM from Hugging Face ----
# You can change this model later if you want.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,   # saves memory on Colab
)

llm_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=64,
    do_sample=False,
    temperature=0.3,
)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [21]:
# ====== Cell 3: Core helper functions ======

def ocr_image(pil_img, lang_name):
    tess_lang = OCR_LANGS.get(lang_name, "eng")
    text = pytesseract.image_to_string(pil_img, lang=tess_lang)
    return text.strip()


def llm_translate(text, src_lang, tgt_lang, explain=False):
    """
    Uses the chat LLM to translate from src_lang to tgt_lang.
    If explain=True, also asks for a simple explanation.
    """
    if not text or not text.strip():
        return ""

    if explain:
        prompt = (
            f"You are a helpful translator.\n"
            f"Source language: {src_lang}\n"
            f"Target language: {tgt_lang}\n\n"
            f"Text:\n{text}\n\n"
            f"First give ONLY the translation in the target language.\n"
            f"Then on a new line write: Explanation: <simple explanation in {tgt_lang}>."
        )
    else:
        prompt = (
            f"Translate the following {src_lang} text to {tgt_lang}. "
            f"Only output the translation.\n\n{text}\n\nTranslation:"
        )

    out = llm_pipe(prompt)[0]["generated_text"]

    # Try to cut off the prompt part and keep only model answer
    if "Translation:" in out:
        out = out.split("Translation:", 1)[-1]

    return out.strip()


def text_to_speech(text, lang_name):
    """
    gTTS for English/Hindi.
    Returns path to a temporary .mp3 file.
    """
    if not text or not text.strip():
        return None

    if lang_name not in TTS_LANGS:
        return None

    lang_code = TTS_LANGS[lang_name]

    import tempfile
    import os

    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            tts = gTTS(text=text, lang=lang_code)
            tts.save(fp.name)
            return fp.name
    except Exception as e:
        print("TTS error:", e)
        return None


In [20]:
# ====== Cell 4: Complete pipeline + UI ======

def full_pipeline(img, ocr_lang_name, src_lang_name, tgt_lang_name, explain_flag):
    if img is None:
        return "Please upload an image.", "", "", None, "No audio generated."

    # Convert from numpy to PIL
    pil_img = Image.fromarray(img.astype("uint8")).convert("RGB")

    # 1. OCR
    detected_text = ocr_image(pil_img, ocr_lang_name)
    if not detected_text.strip():
        return "No text detected.", "", "", None, "No audio generated."

    # 2. LLM Translation (and explanation)
    llm_output = llm_translate(detected_text, src_lang_name, tgt_lang_name, explain=explain_flag)

    # Try to split translation + explanation if present
    explanation = ""
    translated_text = llm_output

    if "Explanation:" in llm_output:
        parts = llm_output.split("Explanation:", 1)
        translated_text = parts[0].strip()
        explanation = parts[1].strip()

    # 3. TTS (using target language speech)
    audio_path = text_to_speech(translated_text, tgt_lang_name)

    if audio_path is None:
        msg = "Could not generate audio (check internet / text)."
    else:
        msg = f"Audio generated in {tgt_lang_name}."

    return detected_text, translated_text, explanation, audio_path, msg


with gr.Blocks() as demo:
    gr.Markdown("## LLM-based Multilingual Image → Text → Translation → Speech (English & Hindi)")

    with gr.Row():
        img_input = gr.Image(type="numpy", label="Input Image with Text")

        with gr.Column():
            ocr_lang = gr.Dropdown(
                choices=list(OCR_LANGS.keys()),
                value="English",
                label="OCR language (script in image)",
            )

            src_lang = gr.Dropdown(
                choices=["English", "Hindi"],
                value="English",
                label="Meaning of the original text",
            )

            tgt_lang = gr.Dropdown(
                choices=["English", "Hindi"],
                value="Hindi",
                label="Target language for translation",
            )

            explain_flag = gr.Checkbox(
                value=True,
                label="Ask LLM to also give a simple explanation",
            )

            btn = gr.Button("Run pipeline")

    with gr.Row():
        detected_box = gr.Textbox(label="Detected text (OCR output)", lines=5)
        translated_box = gr.Textbox(label="LLM translation", lines=5)
        explanation_box = gr.Textbox(label="LLM explanation (optional)", lines=5)

    audio_out = gr.Audio(label="Spoken translation", type="filepath")
    status_box = gr.Textbox(label="Status", interactive=False)

    btn.click(
        fn=full_pipeline,
        inputs=[img_input, ocr_lang, src_lang, tgt_lang, explain_flag],
        outputs=[detected_box, translated_box, explanation_box, audio_out, status_box],
    )

demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5669d546e0d15045f4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [22]:
# ====== Cell 5: Simple English-Hindi dataset ======

data = [
    {
        "id": 1,
        "src_lang": "English",
        "tgt_lang": "Hindi",
        "text": "Take one tablet after breakfast with water.",
    },
    {
        "id": 2,
        "src_lang": "English",
        "tgt_lang": "Hindi",
        "text": "Do not drive or operate machines after taking this medicine.",
    },
    {
        "id": 3,
        "src_lang": "Hindi",
        "tgt_lang": "English",
        "text": "दवा को ठंडी और सूखी जगह पर रखें।",
    },
    {
        "id": 4,
        "src_lang": "Hindi",
        "tgt_lang": "English",
        "text": "यदि चक्कर आए या घबराहट महसूस हो तो तुरंत डॉक्टर से संपर्क करें।",
    },
]

df = pd.DataFrame(data)
df


Unnamed: 0,id,src_lang,tgt_lang,text
0,1,English,Hindi,Take one tablet after breakfast with water.
1,2,English,Hindi,Do not drive or operate machines after taking ...
2,3,Hindi,English,दवा को ठंडी और सूखी जगह पर रखें।
3,4,Hindi,English,यदि चक्कर आए या घबराहट महसूस हो तो तुरंत डॉक्ट...


In [23]:
# ====== Cell 6: Use the LLM translator on the dataset (with progress) ======

translations = []

for i, row in df.iterrows():
    print(f"Translating row {i+1}/{len(df)} ...")
    out = llm_translate(row["text"], row["src_lang"], row["tgt_lang"], explain=False)
    translations.append(out)

df["llm_translation"] = translations
df


Translating row 1/4 ...
Translating row 2/4 ...
Translating row 3/4 ...
Translating row 4/4 ...


Unnamed: 0,id,src_lang,tgt_lang,text,llm_translation
0,1,English,Hindi,Take one tablet after breakfast with water.,एक स्टेट्स का खाना ऑन एक फांसी से फूड से प्राप...
1,2,English,Hindi,Do not drive or operate machines after taking ...,इस मेले के दौरान इस मेले के समय उसके उपयोग के लिए
2,3,Hindi,English,दवा को ठंडी और सूखी जगह पर रखें।,"To cut the grass and water the plants,\nIn the..."
3,4,Hindi,English,यदि चक्कर आए या घबराहट महसूस हो तो तुरंत डॉक्ट...,If you are in a hurry and feel a sudden urge t...


In [26]:
df.to_csv("sample_en_hi_llm_translations.csv", index=False)


In [27]:
from google.colab import files
files.download("sample_en_hi_llm_translations.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>