In [None]:
# ========================================
# CELL 1: Install EXACT Official Versions
# ========================================

print("Installing official DeepSeek-OCR dependencies...")

!pip install "numpy<2.0" -q
!pip uninstall flash-attn -y -q 2>/dev/null

# Use EXACT versions from official requirements
!pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu121 -q
!pip install transformers==4.46.3 tokenizers==0.20.3 accelerate pillow requests sentencepiece protobuf addict easydict einops -q

print("\n" + "="*60)
print("✅ Installed official versions")
print("="*60)
print("\n⚠️  RESTART RUNTIME NOW")
print("   Runtime → Restart runtime")
print("\nThen run CELL 2")
print("="*60)

Installing official DeepSeek-OCR dependencies...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,

In [None]:
# ========================================
# CELL 2: Load Model (NO PATCHING NEEDED!)
# ========================================

from transformers import AutoModel, AutoTokenizer
import torch
from PIL import Image
import os
import warnings
warnings.filterwarnings('ignore')

import transformers
# Echo the versions that were actually imported, so a pin from the install
# cell that did not take effect is visible right away.
print(f"Transformers: {transformers.__version__}")
print(f"PyTorch: {torch.__version__}")

# Expose only GPU 0 to this process.
# NOTE(review): torch is already imported at this point; this presumably still
# applies because CUDA has not been initialised yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

model_name = 'deepseek-ai/DeepSeek-OCR'
print(f"\nLoading: {model_name}")

# trust_remote_code=True allows the Python files shipped in the model
# repository to be downloaded and executed.
# NOTE(review): no `revision` is passed, so the repository code is not pinned;
# consider pinning a revision once a known-good one is identified.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load without flash_attention (can't compile in Colab)
# Weights are requested in bfloat16, device placement is delegated to
# device_map='auto', and .eval() switches the module to inference mode.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map='auto'
).eval()

print("\n✅ Model ready!")

# Functions
def perform_ocr(image_path, prompt_type="markdown", base_size=1024):
    """Run a preset OCR prompt on an image and return the recognised text.

    Results are written by ``model.infer`` into ``./output``. With the
    arguments used here ``model.infer`` was observed to return ``None`` and to
    only stream the text to stdout, so the text is read back from
    ``./output/result.mmd`` instead of relying on the return value.

    Args:
        image_path: Path to the image file to process.
        prompt_type: One of "markdown", "free_ocr", "ocr", "describe",
            "figure".
        base_size: Value forwarded as ``base_size`` to ``model.infer``.

    Returns:
        The value returned by ``model.infer`` if it is not ``None``; otherwise
        the contents of ``./output/result.mmd`` written by this run, or
        ``None`` when no result file was produced.

    Raises:
        KeyError: If ``prompt_type`` is not one of the preset names.
    """
    prompts = {
        "markdown": "<image>\n<|grounding|>Convert the document to markdown.",
        "free_ocr": "<image>\nFree OCR.",
        "ocr": "<image>\n<|grounding|>OCR this image.",
        "describe": "<image>\nDescribe this image in detail.",
        "figure": "<image>\nParse the figure."
    }
    # Resolve the prompt first so an unknown preset fails before any work is done.
    prompt = prompts[prompt_type]

    output_dir = "./output"
    result_file = os.path.join(output_dir, "result.mmd")
    os.makedirs(output_dir, exist_ok=True)
    # Drop a result left over from an earlier run so it can never be returned
    # as if it belonged to this image.
    if os.path.exists(result_file):
        os.remove(result_file)

    print(f"\n🔍 {os.path.basename(image_path)}")
    print(f"📝 {prompt_type} | {base_size}x{base_size}")

    result = model.infer(
        tokenizer,
        prompt=prompt,
        image_file=image_path,
        output_path=output_dir,
        base_size=base_size,
        image_size=640,
        crop_mode=True,
        save_results=True,
        test_compress=True
    )

    # Fall back to the saved file when infer() gives nothing back.
    if result is None and os.path.exists(result_file):
        with open(result_file, "r", encoding="utf-8") as fh:
            result = fh.read()

    return result

from google.colab import files
import requests
from io import BytesIO

def download_image(url, save_path="image.jpg"):
    """Download an image from ``url`` and save it to ``save_path``.

    Args:
        url: HTTP(S) address of the image.
        save_path: Destination file; the extension selects the output format.

    Returns:
        ``save_path``, unchanged, so the call can be chained into an OCR call.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    # A timeout prevents the cell from hanging forever on a dead host, and
    # raise_for_status() surfaces HTTP errors instead of handing an error page
    # to the image decoder.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    img = Image.open(BytesIO(response.content))
    # JPEG cannot store alpha or palette modes; convert so that e.g. a
    # transparent PNG can be saved under the default "image.jpg" name.
    wants_jpeg = str(save_path).lower().endswith((".jpg", ".jpeg"))
    if wants_jpeg and img.mode not in ("RGB", "L", "CMYK"):
        img = img.convert("RGB")

    img.save(save_path)
    print(f"✅ Downloaded: {save_path}")
    return save_path

def upload_and_process(prompt_type="markdown", base_size=1024):
    """Ask for a file upload, OCR the first uploaded file and print the result.

    Args:
        prompt_type: Preset name forwarded to ``perform_ocr``.
        base_size: Resolution value forwarded to ``perform_ocr``.

    Returns:
        Whatever ``perform_ocr`` returns for the first uploaded file, or
        ``None`` when the upload dialog yields nothing.
    """
    print("📤 Upload image...")
    uploaded = files.upload()

    # Guard clause: nothing uploaded means nothing to process.
    if not uploaded:
        return None

    # Only the first entry is processed; any further uploads are ignored.
    first_name = next(iter(uploaded.keys()))
    text = perform_ocr(first_name, prompt_type, base_size)

    rule = "=" * 60
    print("\n" + rule)
    print("📄 RESULT:")
    print(rule)
    print(text)
    print(rule)

    return text

print("\n🎉 READY! Try: upload_and_process()")

Transformers: 4.46.3
PyTorch: 2.8.0+cu126

Loading: deepseek-ai/DeepSeek-OCR


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

modeling_deepseekocr.py: 0.00B [00:00, ?B/s]

conversation.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


configuration_deepseek_v2.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- configuration_deepseek_v2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_deepseekv2.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- modeling_deepseekv2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


deepencoder.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- deepencoder.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- modeling_deepseekocr.py
- conversation.py
- configuration_deepseek_v2.py
- modeling_deepseekv2.py
- deepencoder.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
You are using a model of type deepseek_vl_v2 to instantiate a model of type DeepseekOCR. This is not supported for all configurations of models and can yield errors.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-000001.safetensors:   0%|          | 0.00/6.67G [00:00<?, ?B/s]

Some weights of DeepseekOCRForCausalLM were not initialized from the model checkpoint at deepseek-ai/DeepSeek-OCR and are newly initialized: ['model.vision_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ Model ready!

🎉 READY! Try: upload_and_process()


In [None]:
upload_and_process()

📤 Upload image...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Saving आधुनिक बनाम पारंपरिक_ AI और आवाज पृथक्करण.png to आधुनिक बनाम पारंपरिक_ AI और आवाज पृथक्करण (1).png

🔍 आधुनिक बनाम पारंपरिक_ AI और आवाज पृथक्करण (1).png
📝 markdown | 1024x1024
BASE:  torch.Size([1, 256, 1280])
PATCHES:  torch.Size([6, 100, 1280])
<|ref|>title<|/ref|><|det|>[[205, 48, 810, 216]]<|/det|>
# Old vs. New: How AI Redefines Voice Isolation  

<|ref|>table<|/ref|><|det|>[[48, 234, 945, 956]]<|/det|>

<table><tr><td>Aspect</td><td>Traditional Audio Filtering (FFT)</td></tr><tr><td>Core Logic</td><td>Frequency filtering – manually cuts bands to remove noise</td></tr><tr><td>Accuracy</td><td>Lose voice quality</td></tr><tr><td>Speed</td><td>Retains full vocal timbre</td></tr><tr><td>Skill Needed</td><td>Instant, one-click separation</td></tr><tr><td>Output Quality</td><td>Anyone with API key or web access</td></tr><tr><td>Analogy</td><td>Often muffled or metallic</td></tr><tr><td>Analogy</td><td>Using a sieve to separate salt and sand</td></tr><tr><td>Analogy</td><td>Using 

image: 0it [00:00, ?it/s]
other: 100%|██████████| 2/2 [00:00<00:00, 20815.40it/s]


📄 RESULT:
None





In [None]:
# ========================================
# CELL 3: Extended Prompt Support
# ========================================

def perform_ocr_custom(image_path, prompt=None, prompt_type=None, ref_text=None, base_size=1024):
    """Run OCR with either a custom prompt or one of the preset prompts.

    Inference output is written to a temporary directory, the recognised text
    is read back from ``result.mmd``, and the result files are copied to
    ``./output``.

    Args:
        image_path: Path to the image file.
        prompt: Custom prompt string; takes priority over ``prompt_type``.
            If it does not contain the ``<image>`` placeholder, ``"<image>\\n"``
            is prepended, because a prompt without it was observed to produce
            no saved result ("No result generated").
        prompt_type: Preset name ("markdown", "ocr", "free_ocr", "describe",
            "figure", "rec"). ``None`` or an unknown name falls back to
            "markdown".
        ref_text: Text or element to locate; required when
            ``prompt_type == "rec"`` (e.g. "table", "title").
        base_size: Value forwarded as ``base_size`` to ``model.infer``.

    Returns:
        The text of ``result.mmd``, or the string "No result generated" when
        inference did not write that file.

    Raises:
        ValueError: If ``prompt_type == "rec"`` is requested without
            ``ref_text`` (previously the unformatted ``{ref}`` placeholder was
            sent to the model).
    """
    import shutil
    import tempfile

    image_tag = "<image>"

    # Preset prompts
    presets = {
        "markdown": "<image>\n<|grounding|>Convert the document to markdown.",
        "ocr": "<image>\n<|grounding|>OCR this image.",
        "free_ocr": "<image>\nFree OCR.",
        "figure": "<image>\nParse the figure.",
        "describe": "<image>\nDescribe this image in detail.",
        "rec": "<image>\nLocate <|ref|>{ref}<|/ref|> in the image."
    }

    # Determine which prompt to use
    if prompt:
        # Custom prompt takes priority; make sure it carries the image tag.
        final_prompt = prompt if image_tag in prompt else f"{image_tag}\n{prompt}"
    elif prompt_type == "rec":
        # The "rec" template is only meaningful once {ref} is filled in.
        if not ref_text:
            raise ValueError('prompt_type="rec" requires a non-empty ref_text')
        final_prompt = presets["rec"].format(ref=ref_text)
    else:
        # Known preset, or the markdown default for None / unknown names.
        final_prompt = presets.get(prompt_type, presets["markdown"])

    shown_prompt = f"{final_prompt[:50]}..." if len(final_prompt) > 50 else final_prompt
    print(f"\n🔍 {os.path.basename(image_path)}")
    print(f"📝 Prompt: {shown_prompt}")
    print(f"📐 Resolution: {base_size}x{base_size}")

    # Run OCR in a scratch directory so a stale result from an earlier run can
    # never be mistaken for this run's output.
    with tempfile.TemporaryDirectory() as temp_dir:
        model.infer(
            tokenizer,
            prompt=final_prompt,
            image_file=image_path,
            output_path=temp_dir,
            base_size=base_size,
            image_size=640,
            crop_mode=True,
            save_results=True,
            test_compress=True
        )

        result_file = os.path.join(temp_dir, "result.mmd")
        if not os.path.exists(result_file):
            return "No result generated"

        # Explicit UTF-8: OCR output routinely contains non-ASCII text and
        # must not depend on the platform's default encoding.
        with open(result_file, "r", encoding="utf-8") as fh:
            result = fh.read()

        # Keep a copy of the artefacts after the scratch directory is removed.
        os.makedirs("./output", exist_ok=True)
        shutil.copy(result_file, "./output/result.mmd")
        boxes_file = os.path.join(temp_dir, "result_with_boxes.jpg")
        if os.path.exists(boxes_file):
            shutil.copy(boxes_file, "./output/result_with_boxes.jpg")

        return result


def locate_in_image(image_path, reference_text, base_size=1024):
    """Ask the model to locate a piece of text or a layout element in an image.

    Thin convenience wrapper that calls ``perform_ocr_custom`` with the "rec"
    preset and ``reference_text`` as the thing to find.

    Args:
        image_path: Path to the image file.
        reference_text: What to locate, e.g. "title", "table", or a literal
            phrase that appears in the document.
        base_size: Resolution value forwarded to ``perform_ocr_custom``.

    Returns:
        Whatever ``perform_ocr_custom`` returns for this request.

    Example:
        locate_in_image("doc.jpg", "title")
        locate_in_image("doc.jpg", "table")
    """
    rec_request = {
        "prompt_type": "rec",
        "ref_text": reference_text,
        "base_size": base_size,
    }
    return perform_ocr_custom(image_path, **rec_request)


def quick_examples():
    """Print a cheat sheet of example calls for the helpers defined in this cell.

    The examples cover preset prompts, locating elements with
    ``locate_in_image``, fully custom prompts, different ``base_size`` values,
    and an upload-then-locate flow. Nothing is executed; the text is only
    printed. Returns ``None``.
    """
    # The doubled backslash in "\\n" below prints a literal "\n", so the
    # displayed examples can be copied as valid Python string literals.
    print("""
📚 USAGE EXAMPLES:

1️⃣ Preset prompts:
   result = perform_ocr_custom("image.jpg", prompt_type="markdown")
   result = perform_ocr_custom("image.jpg", prompt_type="ocr")
   result = perform_ocr_custom("image.jpg", prompt_type="free_ocr")
   result = perform_ocr_custom("image.jpg", prompt_type="figure")
   result = perform_ocr_custom("image.jpg", prompt_type="describe")

2️⃣ Locate specific elements:
   result = locate_in_image("image.jpg", "title")
   result = locate_in_image("image.jpg", "table")
   result = locate_in_image("image.jpg", "chart")
   result = locate_in_image("image.jpg", "先天下之忧而忧")  # Chinese text

3️⃣ Completely custom prompt:
   prompt = "<image>\\nExtract all mathematical equations."
   result = perform_ocr_custom("image.jpg", prompt=prompt)

   prompt = "<image>\\nList all phone numbers and email addresses."
   result = perform_ocr_custom("image.jpg", prompt=prompt)

4️⃣ Different resolutions:
   result = perform_ocr_custom("image.jpg", prompt_type="markdown", base_size=512)   # Fast
   result = perform_ocr_custom("image.jpg", prompt_type="markdown", base_size=1280)  # Best quality

5️⃣ Upload and process with custom prompt:
   uploaded = files.upload()
   img = list(uploaded.keys())[0]
   result = locate_in_image(img, "table")
   print(result)
""")

# Show examples
quick_examples()

print("\n✅ Extended functions loaded!")
print("💡 Try: locate_in_image('your_image.jpg', 'title')")


📚 USAGE EXAMPLES:

1️⃣ Preset prompts:
   result = perform_ocr_custom("image.jpg", prompt_type="markdown")
   result = perform_ocr_custom("image.jpg", prompt_type="ocr")
   result = perform_ocr_custom("image.jpg", prompt_type="free_ocr")
   result = perform_ocr_custom("image.jpg", prompt_type="figure")
   result = perform_ocr_custom("image.jpg", prompt_type="describe")

2️⃣ Locate specific elements:
   result = locate_in_image("image.jpg", "title")
   result = locate_in_image("image.jpg", "table")
   result = locate_in_image("image.jpg", "chart")
   result = locate_in_image("image.jpg", "先天下之忧而忧")  # Chinese text

3️⃣ Completely custom prompt:
   prompt = "<image>\nExtract all mathematical equations."
   result = perform_ocr_custom("image.jpg", prompt=prompt)
   
   prompt = "<image>\nList all phone numbers and email addresses."
   result = perform_ocr_custom("image.jpg", prompt=prompt)

4️⃣ Different resolutions:
   result = perform_ocr_custom("image.jpg", prompt_type="markdown", bas

In [None]:
# Example 1: Locate specific elements
result = locate_in_image("आधुनिक बनाम पारंपरिक_ AI और आवाज पृथक्करण.png", "skill")
print(result)




🔍 आधुनिक बनाम पारंपरिक_ AI और आवाज पृथक्करण.png
📝 Prompt: <image>
Locate <|ref|>skill<|/ref|> in the image.
📐 Resolution: 1024x1024


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


BASE:  torch.Size([1, 256, 1280])
PATCHES:  torch.Size([6, 100, 1280])
<|ref|>skill<|/ref|><|det|>[[50, 234, 950, 955]]<|/det|>
image size:  (1024, 1536)
valid image tokens:  770
output texts tokens (valid):  18
compression ratio:  0.02


image: 0it [00:00, ?it/s]
other: 100%|██████████| 1/1 [00:00<00:00, 11155.06it/s]







In [None]:
# Example 2: Custom prompt
result = perform_ocr_custom(
    "आधुनिक बनाम पारंपरिक_ AI और आवाज पृथक्करण (1).png",
    prompt="<image>\nExtract only the headings and subheadings."
)
print(result)




🔍 आधुनिक बनाम पारंपरिक_ AI और आवाज पृथक्करण (1).png
📝 Prompt: <image>
Extract only the headings and subheadings.
📐 Resolution: 1024x1024


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


BASE:  torch.Size([1, 256, 1280])
PATCHES:  torch.Size([6, 100, 1280])
| Aspect                                      | Traditional Audio Filtering (FFT)                                                                 |
|--------------------------------------------|---------------------------------------------------------------------------------------------|
| **Core Logic**                            | Frequency filtering – manually cuts bands to remove noise                                    |
| **Accuracy**                             | Lose voice quality                                                                             |
| **Speed**                                | Retains full vocal timbre                                                                       |
| **Skill Needed**                          | Instant, one-click separation                                                                   |
| **Output Quality**                        | Anyone with API key or w

image: 0it [00:00, ?it/s]
other: 0it [00:00, ?it/s]

| Aspect                                      | Traditional Audio Filtering (FFT)                                                                 |
|--------------------------------------------|---------------------------------------------------------------------------------------------|
| **Core Logic**                            | Frequency filtering – manually cuts bands to remove noise                                    |
| **Accuracy**                             | Lose voice quality                                                                             |
| **Speed**                                | Retains full vocal timbre                                                                       |
| **Skill Needed**                          | Instant, one-click separation                                                                   |
| **Output Quality**                        | Anyone with API key or web access                                                              




In [None]:
# Example: fully custom prompt on a receipt image.
# The prompt carries the "<image>" placeholder like every preset does; without
# it this call streamed text to the output but returned "No result generated".
result = perform_ocr_custom("receipt.jpg", prompt="<image>\nextract all details")
print(result)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



🔍 receipt.jpg
📝 Prompt: extract all details
📐 Resolution: 1024x1024
BASE:  torch.Size([1, 256, 1280])
NO PATCHES


HOTEL

52,Idgah Bus Stand Road,Agra-282 001 (INDIA)

Phone:91-562-2421158,3199900

Riya Palace

Date.

No.

728

Room No...103/2344

Room Rent...

the sum of Rs.

Dated.

on account of.

Customer Signature

Authorised Signature
No result generated


In [None]:
# Example 3: Preset prompts
# "your_image.jpg" is a placeholder — replace it with the path of a real image
# before running this cell.
result = perform_ocr_custom("your_image.jpg", prompt_type="figure")
print(result)

# Example 4: Upload and locate
# Uses `files` from google.colab, imported in the model-loading cell; only the
# first uploaded file is processed.
uploaded = files.upload()
img = list(uploaded.keys())[0]
result = locate_in_image(img, "title")
print(result)

In [None]:
# View the saved results
!ls -la ./output/
!cat ./output/*.txt 2>/dev/null || echo "Text file not found"

total 200
drwxr-xr-x 3 root root   4096 Oct 26 11:27 .
drwxr-xr-x 1 root root   4096 Oct 26 11:30 ..
drwxr-xr-x 2 root root   4096 Oct 26 11:27 images
-rw-r--r-- 1 root root    703 Oct 26 11:27 result.mmd
-rw-r--r-- 1 root root 185353 Oct 26 11:27 result_with_boxes.jpg
Text file not found


In [None]:
!cat ./output/result.mmd

| Aspect                                      | Traditional Audio Filtering (FFT)                                                                 |
|--------------------------------------------|---------------------------------------------------------------------------------------------|
| **Core Logic**                            | Frequency filtering – manually cuts bands to remove noise                                    |
| **Accuracy**                             | Lose voice quality                                                                             |
| **Speed**                                | Retains full vocal timbre                                                                       |
| **Skill Needed**                          | Instant, one-click separation                                                                   |
| **Output Quality**                        | Anyone with API key or web access                                                              