<a href="https://colab.research.google.com/github/Khairul-islam99/Qwen3_VL_8B_OCR/blob/main/Qwen3_VL_8B_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Dependency bootstrap cell. %%capture hides the pip output of this cell.
import os, re
# Colab is detected by looking for "COLAB_" in the names of the environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Outside Colab: a plain install that lets pip resolve unsloth's dependencies.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Take the leading numeric part of the installed torch version string.
    import torch; v = re.match(r"[0-9\\.]{3,}", str(torch.__version__)).group(0)
    # Choose the xformers pin according to the torch version found above.
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # --no-deps keeps pip from pulling in dependencies of these packages.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Pinned versions, installed in both environments after the branch above.
!pip install transformers==4.57.0
!pip install --no-deps trl==0.22.2

In [None]:
from unsloth import FastVisionModel
import torch

# Load the model once per notebook session. Re-running this cell is cheap:
# if `model` and `tokenizer` already exist as globals, loading is skipped.
if 'model' not in globals() or 'tokenizer' not in globals():
    print("[INFO] Model not detected in active memory. Initializing download and load sequence...")
    model, tokenizer = FastVisionModel.from_pretrained(
        "unsloth/Qwen3-VL-8B-Instruct",
        load_in_4bit = False,
        load_in_8bit = True,
        use_gradient_checkpointing = "unsloth",
    )
    # This notebook only generates text, so switch Unsloth to inference mode
    # before announcing that the model is ready.
    FastVisionModel.for_inference(model)
    print("[SUCCESS] Model successfully loaded and ready for inference!")
else:
    print("[INFO] Model is already present in memory. Skipping initialization.")

### Gradio

In [None]:
# Install Gradio, which the following cell imports as `gr` to build the chat UI.
!pip install gradio

In [None]:
import gradio as gr
from PIL import Image
import io
from threading import Thread
from transformers import TextIteratorStreamer

def format_chat_history(chat_history):
    """
    Converts the Gradio chat history (a list of user/assistant pairs) into the
    list of role/content dictionaries passed to the chat template.

    Args:
        chat_history: Iterable of two-item turns ``(user_msg, assistant_msg)``,
            or None. Either item of a turn may be None or empty.

    Returns:
        A list of ``{"role": ..., "content": [{"type": "text", "text": ...}]}``
        dictionaries in conversation order. Missing or empty sides of a turn
        are omitted, so an assistant-only turn (such as the UI greeting, whose
        user side is None) does not produce a user message with no text.
    """
    messages = []
    if chat_history is None:
        return messages

    for user_msg, assistant_msg in chat_history:
        # A turn may have no user side (e.g. the assistant's opening greeting).
        if user_msg:
            messages.append({"role": "user", "content": [{"type": "text", "text": user_msg}]})
        if assistant_msg:
            messages.append({"role": "assistant", "content": [{"type": "text", "text": assistant_msg}]})

    return messages

def chat_stream(image_input, text_input, chat_history):
    """
    A generator function that runs one chat turn and streams the model's
    response back to the user interface as it is produced.

    Args:
        image_input: Image for this turn, or None. It is passed unchanged to
            the processor.
        text_input: Prompt text. If empty while an image is supplied, the
            default prompt "Extract text from this image." is used.
        chat_history: List of ``(user, assistant)`` turns, or None. The list
            is mutated in place: the new turn is appended and then updated.

    Yields:
        The chat history after each streamed chunk of text. When there is
        neither text nor image, the unchanged history is yielded once. If
        generation fails in the worker thread, a final history is yielded
        whose last reply ends with an ``[ERROR]`` note.
    """

    # Nothing to send: return before doing any prompt or model work.
    if not text_input and image_input is None:
        yield chat_history
        return

    # 1. Format the previous turns
    messages = format_chat_history(chat_history)

    # 2. Prepare the new user prompt (image placeholder first, then text)
    new_user_content = []
    if image_input is not None:
        new_user_content.append({"type": "image"})
    if not text_input:
        # Image without text: fall back to the default OCR instruction.
        text_input = "Extract text from this image."
    new_user_content.append({"type": "text", "text": text_input})

    messages.append({"role": "user", "content": new_user_content})

    # 3. Apply the chat template
    input_text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True
    )

    # 4. Prepare inputs for the model
    inputs = tokenizer(
        image_input,
        input_text,
        add_special_tokens = False,
        return_tensors = "pt",
    ).to("cuda")

    # 5. Setup the text iterator streamer
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=8192,
        use_cache=True,
        temperature=0.2,
        min_p=0.1
    )

    # 6. Run generation in a background thread so tokens can be consumed here
    generation_errors = []

    def _generate():
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            generation_errors.append(exc)
            # A failed generate() does not finish the stream itself; end it
            # here so the consuming loop below cannot block forever.
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    # 7. Yield the growing response to the Gradio UI
    if chat_history is None:
        chat_history = []

    chat_history.append((text_input, ""))

    response_text = ""
    for new_token in streamer:
        response_text += new_token
        chat_history[-1] = (text_input, response_text)
        yield chat_history

    thread.join()

    if generation_errors:
        error_note = f"[ERROR] Generation failed: {generation_errors[0]}"
        print(error_note)
        if response_text:
            response_text = f"{response_text}\n\n{error_note}"
        else:
            response_text = error_note
        chat_history[-1] = (text_input, response_text)
        yield chat_history

def clear_chat():
    """Return blank values for the chat log, the image box and the text box."""
    empty_history = []
    no_image = None
    blank_text = ""
    return empty_history, no_image, blank_text

# --- Build the Gradio Interface ---
# Layout: header, chat log, image + text inputs, action buttons, example
# prompts and footer. The handlers chat_stream and clear_chat are wired in at
# the end of the Blocks context.

theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="blue",
    font=gr.themes.GoogleFont("Open Sans")
)

with gr.Blocks(theme=theme, title="Bini AI Assistant") as demo:
    with gr.Column():
        # --- Application Header ---
        gr.Markdown(
            """
            <div style="text-align: center; margin-bottom: 1rem;">
                <h2>🤖 Bini AI Assistant 💻</h2>
                <p>Advanced Multimodal Inference Interface</p>
            </div>
            """
        )

        # --- Chat Window ---
        # The log starts with an assistant-only greeting (user side is None).
        chatbot = gr.Chatbot(
            value=[[None, "Hello! I am Bini, a multimodal AI assistant developed by Md Khairul Islam. How may I assist you today?"]],
            label="Interaction Log",
            height=400,
            bubble_full_width=False,
        )

        # --- Input Section ---
        with gr.Row(equal_height=False):
            with gr.Column(scale=4, min_width=200):
                image_box = gr.Image(
                    type="pil",
                    label="Upload Image Reference (Optional)",
                    sources=["upload"],
                    height=200
                )
            with gr.Column(scale=6):
                text_box = gr.Textbox(
                    label="Message",
                    placeholder="Enter your prompt or ask a question regarding the uploaded image...",
                    show_label=False,
                    lines=4
                )

        # --- Action Buttons ---
        with gr.Row():
            clear_btn = gr.Button("Clear Context", variant="stop")
            send_btn = gr.Button("Submit Request", variant="primary")

        # --- Example Prompts ---
        gr.Examples(
            examples=[
                ["Please transcribe all the text visible in this image."],
                ["Analyze this image and describe its primary contents."],
                ["Extract the data from this image and format it as a Markdown table."],
            ],
            inputs=[text_box],
            label="Suggested Prompts"
        )

        # --- Footer ---
        # The model is loaded with load_in_8bit=True, so the footer says 8-bit.
        gr.Markdown(
            """
            <div style="text-align: center; font-size: 0.85em; color: gray; margin-top: 2rem;">
                <p><b>Bini AI</b> was developed by <b>Md Khairul Islam</b>.<br>
                Powered by unsloth/Qwen3-VL-8B-Instruct (8-bit quantization)</p>
            </div>
            """
        )

    # --- Event Handler Routines ---

    # The Send button and the Enter key share one pipeline: stream the reply
    # into the chat log, then reset the image and text inputs.
    for register_trigger in (send_btn.click, text_box.submit):
        register_trigger(
            chat_stream,
            inputs=[image_box, text_box, chatbot],
            outputs=[chatbot],
        ).then(
            lambda: (None, ""),
            outputs=[image_box, text_box]
        )

    # Triggered via Clear Button
    clear_btn.click(
        clear_chat,
        outputs=[chatbot, image_box, text_box]
    )

# Launch the interactive application
demo.launch(debug=True, share=True)

### FastAPI

In [None]:
# Install the packages imported by the API cell below (fastapi, uvicorn, pyngrok), plus python-multipart.
!pip install fastapi uvicorn python-multipart pyngrok

In [None]:
import time
from pyngrok import ngrok
import uvicorn
import threading
import asyncio
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
import io
import torch

# --- 1. Ngrok Network Configuration ---
import os

# Read the token from the NGROK_AUTHTOKEN environment variable when it is set,
# so the secret does not have to be saved in the notebook. Otherwise replace
# the placeholder below with your token.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN", "Paste Token") # Developer authentication token
ngrok.set_auth_token(NGROK_TOKEN)

# --- 2. FastAPI Application Initialization ---
app = FastAPI(title="Qwen Unified OCR API Engine")

# Implement CORS Middleware to prevent cross-origin resource sharing errors
# NOTE(review): wildcard origins together with allow_credentials=True is a
# testing-only setup; list explicit origins before exposing this more widely.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Permit all external origins for testing
    allow_credentials=True,
    allow_methods=["*"],  # Permit all HTTP methods (GET, POST, etc.)
    allow_headers=["*"],  # Permit all headers
)

# Instruction sent with every uploaded image. It is constant, so it is built
# once at import time instead of on every request.
OCR_PROMPT = """You are an expert OCR assistant. Your sole task is to extract all text, data, and mathematical equations from the image exactly as they appear.

Follow these strict rules:
1. ZERO HALLUCINATION & NO SOLVING: DO NOT solve mathematical equations, DO NOT answer questions, DO NOT translate, and DO NOT summarize. Just transcribe exactly what is visible.
2. BLANKS & EMPTY BOXES: If there is an empty box, blank line, or empty table cell, represent it exactly as [ ] without guessing the content.
3. BENGALI TEXT & NUMBERS: Pay extreme attention to Bengali numerals. Strictly differentiate between '৪' (four) and '৮' (eight), and '৭' (seven) and '১' (one). Ensure 100% accuracy for complex conjuncts (যুক্তাক্ষর) and correct punctuation (like the 'দাঁড়ি' | ).
4. MATH & SYMBOLS: Use standard LaTeX formatting for all mathematical equations, fractions, superscripts, and subscripts. Keep geometry labels exact. Keep question numbers exact.
5. TABLES: Recreate all tables using strict Markdown formatting (| Column | Column |). Do not skip empty columns.
6. ENGLISH & CODE: Maintain strict case sensitivity (uppercase/lowercase). Do not auto-correct typos present in the image. If there is computer code, wrap it in triple backticks (```) and preserve exact spacing.
7. LAYOUT: Preserve original line breaks, stanzas, paragraphs, page numbers, and bullet points exactly as seen."""


@app.post("/extract-text")
def extract_text(file: UploadFile = File(...)):
    """
    Run OCR on one uploaded image and return the transcribed text.

    Args:
        file: The uploaded image file from the multipart request.

    Returns:
        ``{"status": "success", "extracted_text": <str>}`` on success, or
        ``{"status": "error", "message": <str>}`` if any step raises. Errors
        are reported in the response body rather than re-raised.
    """
    # Log incoming requests to the console
    print(f"\n[INFO] Incoming Request Detected. Target File: {file.filename}")
    start_time = time.time()

    try:
        # Read and convert the uploaded image
        image_bytes = file.file.read()
        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        # Single user turn: image placeholder followed by the OCR instruction.
        msg_content = [{"type": "image"}, {"type": "text", "text": OCR_PROMPT}]
        messages = [{"role": "user", "content": msg_content}]

        input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
        inputs = tokenizer(pil_image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

        print("[INFO] Initializing inference via Qwen architecture. Processing in progress...")

        # Enforce strict inference parameters for accurate OCR
        with torch.inference_mode():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=2048,
                use_cache=True,
                temperature=0.1,
                min_p=0.1
            )

        # Drop the prompt tokens so only the newly generated tokens are decoded.
        generated_ids = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
        extracted_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        process_time = time.time() - start_time
        print(f"[SUCCESS] Extraction payload compiled successfully in {process_time:.2f} seconds.")

        return {
            "status": "success",
            "extracted_text": extracted_text
        }

    except Exception as e:
        # API boundary: report the failure to the client instead of crashing.
        print(f"[ERROR] An exception occurred during execution: {str(e)}")
        return {"status": "error", "message": str(e)}

# --- 3. Background Server Configuration ---
class BackgroundUvicorn(threading.Thread):
    """
    Daemon thread that runs a Uvicorn server on its own asyncio event loop.

    Args:
        config: The ``uvicorn.Config`` used to build the server.

    Attributes are set in ``__init__``: ``server`` holds the ``uvicorn.Server``
    instance driven by ``run``.
    """

    def __init__(self, config):
        super().__init__(daemon=True)
        self.server = uvicorn.Server(config)

    def run(self):
        """Serve until the server coroutine finishes, then close the loop."""
        # Establish an isolated event loop for the Uvicorn server thread
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(self.server.serve())
        finally:
            # Release the loop's resources whether serving ended or failed.
            loop.close()

    def stop(self):
        """Ask the server to shut down, e.g. before re-running the cell."""
        self.server.should_exit = True

if __name__ == "__main__":
    # One port value shared by the ngrok tunnel and the Uvicorn server.
    API_PORT = 8000

    # Terminate existing tunnels to prevent conflicts
    ngrok.kill()
    tunnel = ngrok.connect(API_PORT)

    print("="*75)
    print(f"🚀 PRIMARY API ENDPOINT: {tunnel.public_url}/extract-text")
    print(f"📄 INTERACTIVE API DOCUMENTATION (Swagger UI): {tunnel.public_url}/docs")
    print("="*75)

    # Boot up the server on the background thread
    config = uvicorn.Config(app, host="0.0.0.0", port=API_PORT, log_level="warning")
    server_thread = BackgroundUvicorn(config)
    server_thread.start()