In [1]:
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q torch pillow pdf2image accelerate gradio
!apt-get install -y poppler-utils > /dev/null 2>&1

import logging
import os
import tempfile

import gradio as gr
import pdf2image
import torch
from PIL import Image
from transformers import LightOnOcrForConditionalGeneration, LightOnOcrProcessor

# Hub checkpoint id, shared by the model and its processor so the two
# cannot drift apart.
MODEL_ID = "lightonai/LightOnOCR-2-1B"

# Run on the GPU when one is visible; bfloat16 is only selected there,
# the CPU fallback stays on float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32

# Load the weights in the chosen dtype, then move the model to the device.
model = LightOnOcrForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
).to(device)

# The processor builds the chat-template inputs and decodes generated ids.
processor = LightOnOcrProcessor.from_pretrained(MODEL_ID)

def pdf_to_images(pdf_path, dpi=300):
    """Rasterize the pages of a PDF file into images.

    Args:
        pdf_path: Path of the PDF file to convert.
        dpi: Rendering resolution forwarded to ``pdf2image.convert_from_path``.

    Returns:
        Whatever ``pdf2image.convert_from_path`` returns for the file, or an
        empty list when the conversion raises. The failure is logged with its
        traceback instead of being discarded.
    """
    try:
        return pdf2image.convert_from_path(pdf_path, dpi=dpi)
    except Exception:
        # Best effort by design: callers treat [] as "conversion failed".
        # Still record why, otherwise a missing poppler install or a corrupt
        # file is indistinguishable from an empty document.
        logging.getLogger(__name__).exception(
            "Could not convert %r to images", pdf_path
        )
        return []

def extract_text_from_image(image, model, processor, device, dtype, max_new_tokens=2048):
    """Run the OCR model on a single page image and return the decoded text.

    Args:
        image: Page image; converted to RGB when its ``mode`` is anything else.
        model: Model exposing ``generate``.
        processor: Processor exposing ``apply_chat_template`` and ``decode``.
        device: Device every input tensor is moved to.
        dtype: Dtype applied to floating-point input tensors only.
        max_new_tokens: Upper bound on generated tokens. Defaults to 2048,
            the previously hard-coded value; raise it if pages come back
            cut off.

    Returns:
        The stripped decoded text, or ``"Error: <message>"`` if any step
        raises (the exception is not propagated).
    """
    try:
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Single user turn carrying only the image; no text prompt is added.
        conversation = [{"role": "user", "content": [{"type": "image", "image": image}]}]
        inputs = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Cast only floating-point tensors to the model dtype; other tensors
        # (e.g. input_ids) are just moved to the device.
        inputs = {
            key: (
                value.to(device=device, dtype=dtype)
                if value.is_floating_point()
                else value.to(device)
            )
            for key, value in inputs.items()
        }

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

        # Slice off the prompt so only newly generated tokens are decoded.
        prompt_length = inputs["input_ids"].shape[1]
        generated_ids = output_ids[0, prompt_length:]
        output_text = processor.decode(generated_ids, skip_special_tokens=True)
        return output_text.strip()
    except Exception as e:
        return f"Error: {str(e)}"

def process_pdf(pdf_file):
    """OCR every page of an uploaded PDF and save the result as a text file.

    Args:
        pdf_file: The upload, either a filesystem path (``str`` /
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        A ``(text, path)`` tuple. ``text`` is the combined output with one
        banner-delimited section per page, and ``path`` points to a UTF-8
        ``extracted_text.txt`` holding the same text. If nothing was uploaded
        or the PDF could not be converted, ``text`` is a short message and
        ``path`` is ``None``.
    """
    if pdf_file is None:
        return "No file uploaded.", None

    # Accept both a plain path and a file-like wrapper carrying `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    images = pdf_to_images(pdf_path, dpi=300)
    if not images:
        return "Failed to convert PDF.", None

    rule = "=" * 70
    page_sections = []
    for page_number, image in enumerate(images, start=1):
        page_text = extract_text_from_image(image, model, processor, device, dtype)
        page_sections.append(f"\n{rule}\nPage {page_number}\n{rule}\n{page_text}")

    combined_text = "\n".join(page_sections)

    # Write into a fresh directory per request: a fixed file in the working
    # directory would be overwritten by concurrent users of the shared app,
    # handing one user another user's text. The file name itself is unchanged.
    output_dir = tempfile.mkdtemp(prefix="ocr_")
    output_filename = os.path.join(output_dir, "extracted_text.txt")
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(combined_text)

    return combined_text, output_filename

# Two-column layout: upload and trigger on the left, results on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Arabic OCR Tool")

    with gr.Row():
        with gr.Column():
            # Only .pdf uploads are offered by the file picker.
            input_file = gr.File(file_types=[".pdf"], label="Upload PDF")
            btn = gr.Button(value="Extract Text", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(lines=15, label="Extracted Content")
            output_file = gr.File(label="Download TXT File")

    # process_pdf returns (text, path), matching the two outputs in order.
    btn.click(
        fn=process_pdf,
        inputs=[input_file],
        outputs=[output_text, output_file],
    )

# share=True requests a public share link in addition to the local server.
demo.launch(share=True)

  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.6/536.6 kB 13.1 MB/s eta 0:00:00
  Building wheel for transformers (pyproject.toml) ... done


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

You are using a model of type mistral3 to instantiate a model of type lighton_ocr. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/2.01G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/532 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/219 [00:00<?, ?B/s]

processor_config.json: 0.00B [00:00, ?B/s]

chat_template.jinja:   0%|          | 0.00/720 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b0e3d8dd591478ffbb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


