<a href="https://colab.research.google.com/github/Khairul-islam99/Qwen3_VL_8B_OCR/blob/main/Qwen3_VL_8B_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Model

In [1]:
%%capture
# Dependency bootstrap cell. `%%capture` (IPython cell magic) hides the pip output.
import os, re
# Colab is detected by looking for the substring "COLAB_" in the concatenated
# names of all environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Outside Colab: a plain install, letting pip resolve unsloth's dependencies.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Take the leading run (3+ chars) of digits/dots from the installed torch
    # version string; it selects the xformers build below.
    import torch; v = re.match(r"[0-9\\.]{3,}", str(torch.__version__)).group(0)
    # torch 2.8.0 gets xformers 0.0.32.post2; every other version gets 0.0.29.post3.
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # `--no-deps` installs exactly the listed packages without pulling their
    # own dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Pinned versions, installed in both environments after the steps above.
!pip install transformers==4.57.0
!pip install --no-deps trl==0.22.2

In [None]:
from unsloth import FastVisionModel
import torch

# Re-running this cell in a live session must not reload the weights: loading
# only happens when `model` or `tokenizer` is missing from the notebook globals.
if all(name in globals() for name in ("model", "tokenizer")):
    print("[INFO] Model is already present in memory. Skipping initialization.")
else:
    print("[INFO] Model not detected in active memory. Initializing download and load sequence...")
    # 8-bit weights (4-bit explicitly disabled), with Unsloth's gradient
    # checkpointing mode selected.
    model, tokenizer = FastVisionModel.from_pretrained(
        "unsloth/Qwen3-VL-8B-Instruct",
        load_in_4bit = False,
        load_in_8bit = True,
        use_gradient_checkpointing = "unsloth",
    )
    print("[SUCCESS] Model successfully loaded and ready for inference!")

### Gradio

In [None]:
# Install the web UI toolkit (Gradio) and PyMuPDF, which is imported as `fitz`
# below and used to render PDF pages to images.
!pip install gradio PyMuPDF

In [None]:
import gradio as gr
from PIL import Image
import io
from threading import Thread
from transformers import TextIteratorStreamer
import fitz

# --- THE UPGRADED MASTER PROMPT (STRICT EDITION) ---
# Default instruction sent with an upload when the user types no message.
# The Bengali digits and the word for "conjuncts" in guideline 4 must be real
# Unicode Bengali characters: the model is asked to tell these exact glyphs
# apart, so mis-encoded bytes here would make the guideline meaningless.
MASTER_PROMPT = """You are a highly precise OCR assistant. Extract EVERYTHING from the provided image with 100% accuracy. Output ONLY the extracted text. Do not include any conversational filler.

STRICT GUIDELINES:
1. ZERO HALLUCINATION & NO SOLVING (CRITICAL): Transcribe exactly what is visible. DO NOT calculate missing math answers, DO NOT solve equations, and DO NOT auto-complete patterns.
2. BLANKS & BOXES (CRITICAL): If a table cell is empty or there is a blank space, you MUST write exactly `[ ]`. DO NOT fill in the blanks with your own calculations (e.g., do not calculate squares for missing cells).
3. TABLES (CRITICAL): You MUST recreate all tables using strict Markdown pipe formatting (e.g., `| Column 1 | Column 2 |`). Do not use raw tabs or spaces. Any table not using `|` is a failure.
4. BENGALI ACCURACY: Pay extreme attention to Bengali numerals. Strictly differentiate '৪' (four) from '৮' (eight), and '৭' (seven) from '১' (one). Preserve complex conjuncts (যুক্তাক্ষর) and Bengali punctuation.
5. CAPTIONS: You must extract all headers, footers, page numbers, and image captions exactly as they appear in the original layout.
6. MATH & SYMBOLS: Use standard LaTeX formatting for equations, fractions, superscripts, and subscripts.
7. CODE & ENGLISH: Maintain strict case sensitivity. Do not auto-correct typos.
8. LAYOUT: Replicate the original line breaks, paragraphs, and bullet points exactly as seen."""

def format_chat_history(chat_history):
    """Convert Gradio pair-style history into chat-template messages.

    Args:
        chat_history: Sequence of ``(user_msg, assistant_msg)`` pairs, or
            ``None`` when there is no history yet.

    Returns:
        A list of ``{"role": ..., "content": [{"type": "text", "text": ...}]}``
        dicts in conversation order. Empty or ``None`` entries on either side
        of a pair are skipped, so a seeded greeting such as
        ``[None, "Hello!"]`` contributes only its assistant message instead of
        a user message whose text is ``None``.
    """
    messages = []
    for user_msg, assistant_msg in chat_history or []:
        if user_msg:
            messages.append({"role": "user", "content": [{"type": "text", "text": user_msg}]})
        if assistant_msg:
            messages.append({"role": "assistant", "content": [{"type": "text", "text": assistant_msg}]})
    return messages

def chat_stream(file_input, text_input, chat_history):
    """Stream the model's reply to an uploaded file and/or a typed message.

    Args:
        file_input: Path of the uploaded file, or ``None``. A path ending in
            ``.pdf`` has its first page rendered at 300 dpi; anything else is
            opened as an image.
        text_input: The user's message. When empty and a file is present,
            ``MASTER_PROMPT`` is sent to the model and a placeholder label is
            shown in the chat instead.
        chat_history: List of ``(user, assistant)`` pairs, or ``None``.

    Yields:
        The chat history after each streamed chunk, with the last pair holding
        the reply accumulated so far. When there is neither a file nor a
        message, the history is yielded once, unchanged.
    """
    # Nothing to send: echo the history back without touching the model.
    if file_input is None and not text_input:
        yield chat_history
        return

    messages = format_chat_history(chat_history)
    new_user_content = []
    pil_image = None

    if file_input is not None:
        file_path = file_input

        # Check if uploaded file is a PDF
        if file_path.lower().endswith(".pdf"):
            print("[INFO] PDF file detected in Gradio. Converting page 0 to image...")
            # Context manager closes the document even if rendering fails.
            with fitz.open(file_path) as pdf_doc:
                pix = pdf_doc.load_page(0).get_pixmap(dpi=300)
                pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        else:
            print("[INFO] Standard image detected in Gradio. Processing directly...")
            # convert() returns a new in-memory image, so the file handle can
            # be released right away.
            with Image.open(file_path) as raw_image:
                pil_image = raw_image.convert("RGB")

        new_user_content.append({"type": "image"})

    if text_input:
        new_user_content.append({"type": "text", "text": text_input})
    else:
        # A file without a message: fall back to the strict OCR prompt and
        # show a short label in the chat instead of the full prompt text.
        new_user_content.append({"type": "text", "text": MASTER_PROMPT})
        text_input = "Executing Strict Master OCR Prompt..."

    messages.append({"role": "user", "content": new_user_content})

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(pil_image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # --- DETERMINISTIC SETTINGS (100% NO SAMPLING) ---
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=8192,
        use_cache=True,
        do_sample=False,
        temperature=None,
        top_p=None,
        min_p=None
    )

    def _run_generation():
        # If generate() fails, close the stream so the consumer loop below
        # stops instead of waiting forever on the streamer's queue.
        try:
            model.generate(**generation_kwargs)
        except Exception:
            streamer.end()
            raise

    # Generation runs on a worker thread so tokens can be yielded as they arrive.
    thread = Thread(target=_run_generation)
    thread.start()

    if chat_history is None:
        chat_history = []

    chat_history.append((text_input, ""))
    response_text = ""
    for new_token in streamer:
        response_text += new_token
        chat_history[-1] = (text_input, response_text)
        yield chat_history

    thread.join()

def clear_chat():
    """Return reset values for the chatbot, the file box and the text box."""
    empty_history, no_file, blank_message = [], None, ""
    return empty_history, no_file, blank_message

# --- Build the Gradio Interface ---
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue", font=gr.themes.GoogleFont("Open Sans"))

with gr.Blocks(theme=theme, title="Bini AI Assistant") as demo:
    with gr.Column():
        # Page header (the emoji are real Unicode characters, not mis-encoded bytes).
        gr.Markdown(
            """
            <div style="text-align: center; margin-bottom: 1rem;">
                <h2>🤖 Bini AI Assistant 💻</h2>
                <p>Advanced Multimodal OCR Interface (Supports Image & PDF)</p>
            </div>
            """
        )
        # The log is seeded with an assistant-only greeting (user side is None).
        chatbot = gr.Chatbot(
            value=[[None, "Hello! I am Bini, a highly precise OCR assistant. Upload an image or PDF, and I will extract the text exactly as it appears."]],
            label="Interaction Log",
            height=400,
            bubble_full_width=False,
        )
        with gr.Row(equal_height=False):
            with gr.Column(scale=4, min_width=200):
                file_box = gr.File(file_types=["image", ".pdf"], label="Upload Image or PDF", height=200)
            with gr.Column(scale=6):
                text_box = gr.Textbox(
                    label="Message",
                    placeholder="Leave blank to use the Master OCR Prompt, or type a custom question...",
                    show_label=False,
                    lines=4
                )
        with gr.Row():
            clear_btn = gr.Button("Clear Context", variant="stop")
            send_btn = gr.Button("Submit Request", variant="primary")

        gr.Markdown(
            """
            <div style="text-align: center; font-size: 0.85em; color: gray; margin-top: 2rem;">
                <p><b>Bini AI</b> was developed by <b>Md Khairul Islam</b>.<br>
                Powered by unsloth/Qwen3-VL-8B-Instruct</p>
            </div>
            """
        )

    # Both the button and Enter-in-textbox stream a reply into the chatbot,
    # then clear the upload and message inputs once streaming has finished.
    send_btn.click(chat_stream, inputs=[file_box, text_box, chatbot], outputs=[chatbot]).then(lambda: (None, ""), outputs=[file_box, text_box])
    text_box.submit(chat_stream, inputs=[file_box, text_box, chatbot], outputs=[chatbot]).then(lambda: (None, ""), outputs=[file_box, text_box])
    clear_btn.click(clear_chat, outputs=[chatbot, file_box, text_box])

# debug=True keeps the cell attached to the server; share=True requests a public link.
demo.launch(debug=True, share=True)

### FastAPI

In [None]:
# Install the API server stack used by the next cell: FastAPI, uvicorn,
# python-multipart, pyngrok (the ngrok tunnel client) and PyMuPDF
# (imported as `fitz`).
!pip install fastapi uvicorn python-multipart pyngrok PyMuPDF

In [None]:
import time
from pyngrok import ngrok
import uvicorn
import threading
import asyncio
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
import io
import torch
import fitz

# --- 1. Ngrok Network Configuration ---
NGROK_TOKEN = "paste Token"  # your ngrok auth token
ngrok.set_auth_token(NGROK_TOKEN)

# --- 2. FastAPI Application Initialization ---
app = FastAPI(title="Qwen Unified OCR API Engine")

# Open CORS so any web front end can call the API. Credentials stay disabled:
# the endpoint uses no cookies or auth headers, and the FastAPI CORS docs state
# that wildcard origins/methods/headers cannot be combined with
# allow_credentials=True.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.post("/extract-text")
def extract_text(file: UploadFile = File(...)):
    """Run strict OCR on an uploaded image or on the first page of a PDF.

    Args:
        file: The multipart upload. It is treated as a PDF when its filename
            ends in ``.pdf`` or its content type is ``application/pdf``;
            otherwise it is opened as an image.

    Returns:
        ``{"status": "success", "extracted_text": ...}`` on success, or
        ``{"status": "error", "message": ...}`` if any step raises. Failures
        are reported in the body rather than raised.
    """
    print(f"\n[INFO] Incoming Request Detected. Target File: {file.filename}")
    start_time = time.time()

    try:
        file_bytes = file.file.read()
        # The filename can be missing on an upload; treat it as empty.
        filename = (file.filename or "").lower()

        # --- PDF & Image Logic ---
        if filename.endswith(".pdf") or file.content_type == "application/pdf":
            print("[INFO] PDF file detected. Converting the first page to a high-resolution image...")
            # Context manager closes the document even if rendering fails.
            with fitz.open(stream=file_bytes, filetype="pdf") as pdf_document:
                pix = pdf_document.load_page(0).get_pixmap(dpi=300)
                pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        else:
            pil_image = Image.open(io.BytesIO(file_bytes)).convert("RGB")

        # --- THE UPGRADED MASTER PROMPT (STRICT EDITION) ---
        # Guideline 4 must contain real Unicode Bengali characters: the model
        # is asked to tell these exact glyphs apart.
        prompt = """You are a highly precise OCR assistant. Extract EVERYTHING from the provided image with 100% accuracy. Output ONLY the extracted text. Do not include any conversational filler.

STRICT GUIDELINES:
1. ZERO HALLUCINATION & NO SOLVING (CRITICAL): Transcribe exactly what is visible. DO NOT calculate missing math answers, DO NOT solve equations, and DO NOT auto-complete patterns.
2. BLANKS & BOXES (CRITICAL): If a table cell is empty or there is a blank space, you MUST write exactly `[ ]`. DO NOT fill in the blanks with your own calculations (e.g., do not calculate squares for missing cells).
3. TABLES (CRITICAL): You MUST recreate all tables using strict Markdown pipe formatting (e.g., `| Column 1 | Column 2 |`). Do not use raw tabs or spaces. Any table not using `|` is a failure.
4. BENGALI ACCURACY: Pay extreme attention to Bengali numerals. Strictly differentiate '৪' (four) from '৮' (eight), and '৭' (seven) from '১' (one). Preserve complex conjuncts (যুক্তাক্ষর) and Bengali punctuation.
5. CAPTIONS: You must extract all headers, footers, page numbers, and image captions exactly as they appear in the original layout.
6. MATH & SYMBOLS: Use standard LaTeX formatting for equations, fractions, superscripts, and subscripts.
7. CODE & ENGLISH: Maintain strict case sensitivity. Do not auto-correct typos.
8. LAYOUT: Replicate the original line breaks, paragraphs, and bullet points exactly as seen."""

        msg_content = [{"type": "image"}, {"type": "text", "text": prompt}]
        messages = [{"role": "user", "content": msg_content}]

        input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
        inputs = tokenizer(pil_image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

        print("[INFO] Initializing inference via Qwen architecture. Processing in progress...")

        # --- DETERMINISTIC SETTINGS (100% NO SAMPLING) ---
        with torch.inference_mode():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=2048,
                use_cache=True,
                do_sample=False,
                temperature=None,
                top_p=None,
                min_p=None
            )

        # Drop the prompt tokens from each sequence so only newly generated
        # text is decoded.
        generated_ids = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
        extracted_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        process_time = time.time() - start_time
        print(f"[SUCCESS] Extraction payload compiled successfully in {process_time:.2f} seconds.")

        return {
            "status": "success",
            "extracted_text": extracted_text
        }

    except Exception as e:
        # Top-level boundary for the endpoint: log the failure and report it
        # to the client in the response body.
        print(f"[ERROR] An exception occurred during execution: {str(e)}")
        return {"status": "error", "message": str(e)}

# --- 3. Background Server Configuration ---
class BackgroundUvicorn(threading.Thread):
    """Daemon thread that runs a uvicorn server on its own event loop.

    Running the server off the main thread lets the notebook cell return
    while the API keeps serving.
    """

    def __init__(self, config):
        """Create the thread and wrap ``config`` in a ``uvicorn.Server``."""
        super().__init__(daemon=True)
        self.server = uvicorn.Server(config)

    def run(self):
        """Serve until the server exits, then close the event loop.

        ``asyncio.run`` creates a fresh loop for this thread and closes it
        when ``serve()`` returns or raises, so the loop is not leaked.
        """
        asyncio.run(self.server.serve())

if __name__ == "__main__":
    # Drop any tunnel left over from a previous run of this cell, then expose
    # local port 8000 through a new ngrok tunnel.
    ngrok.kill()
    tunnel = ngrok.connect(8000)

    print("="*75)
    print(f"🚀 PRIMARY API ENDPOINT: {tunnel.public_url}/extract-text")
    print(f"📄 INTERACTIVE API DOCUMENTATION (Swagger UI): {tunnel.public_url}/docs")
    print("="*75)

    # Serve the app on all interfaces in a background thread so the cell returns.
    config = uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="warning")
    server_thread = BackgroundUvicorn(config)
    server_thread.start()