In [4]:
from transformers import GotOcr2ForConditionalGeneration # You might need to find the exact import name if this is custom

# Attempt to load using the specific model class
try:
    model = GotOcr2ForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
except ImportError:
    print("Could not import GotOcr2ForConditionalGeneration. This class might not be available in your transformers version or requires a custom script from the model provider.")
except Exception as e:
    print(f"Error loading with GotOcr2ForConditionalGeneration: {e}")

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [7]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [8]:
# Save directory (relative to notebook location)
save_path = "./GOT-OCR-2.0-hf"  # Creates a folder in the same directory as the notebook

# Save both model and processor
model.save_pretrained(save_path)
processor.save_pretrained(save_path)

print(f"Model and processor saved to: {save_path}")

Model and processor saved to: ./GOT-OCR-2.0-hf


In [1]:
# OR SNAPSHOT DOWNLOAD ALL
from huggingface_hub import snapshot_download

# this will grab everything in the model repo (including the custom .py files)
snapshot_download(
    repo_id="stepfun-ai/GOT-OCR-2.0-hf",
    repo_type="model",
    local_dir="./GOT-OCR-2-hf"
)
print("Saved GOT-OCR2 to ./GOT-OCR-2.0-hf")


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Saved GOT-OCR2 to ./GOT-OCR-2.0-hf


In [10]:
import os
from PIL import Image, ImageDraw, ImageFont
# Ensure you have transformers, torch, and Pillow installed:
# pip install transformers torch torchvision Pillow
# For transformers version 4.37.0, some newer features or models might require trust_remote_code=True
try:
    from transformers import AutoProcessor, GotOcr2ForConditionalGeneration
except ImportError:
    print("Failed to import from transformers. Ensure it's installed correctly.")
    print("You might be missing GotOcr2ForConditionalGeneration if your transformers version is too old and doesn't support it even with remote code.")
    exit()

# --- Configuration ---
# Absolute path to the image
# This should point to your actual 'sample_0.jpg'
IMAGE_PATH_ABSOLUTE = "/home/jupyter/novice/ocr/sample_0.jpg"

# Relative path to the model directory (or Hugging Face model identifier)
# Option 1: If you have the model downloaded locally in a folder (e.g., "./name_of_model")
# relative to where you run this script. This folder should contain config.json, model weights, etc.
MODEL_PATH_RELATIVE = "./GOT-OCR-2-hf"

# Option 2: Replace with a Hugging Face model identifier if you want to download/use it directly.
# MODEL_ID_HF = "stepfun-ai/GOT-OCR-2.0-hf"
# model_to_load = MODEL_ID_HF # Use this if you prefer the HF identifier

# For this script, we'll use MODEL_PATH_RELATIVE and handle if it's a placeholder.
model_to_load = MODEL_PATH_RELATIVE


def perform_ocr(image_path, model_name_or_path):
    """
    Performs OCR on a single image using the specified model.

    Args:
        image_path (str): Absolute or relative path to the image file.
        model_name_or_path (str): Path to the local model directory or Hugging Face model identifier.
    """
    print(f"Attempting to load image from: {image_path}")
    if not os.path.exists(image_path):
        print(f"Error: Image not found at {image_path}")
        print("Please ensure the image path is correct or a dummy image can be created.")
        return

    try:
        image = Image.open(image_path).convert("RGB")
        print("Image loaded successfully.")
    except Exception as e:
        print(f"Error loading image: {e}")
        return

    print(f"\nAttempting to load processor from: {model_name_or_path}")
    try:
        # trust_remote_code=True might be necessary if the model includes custom code.
        processor = AutoProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
        print("Processor loaded successfully.")
    except Exception as e:
        print(f"Error loading processor: {e}")
        print("Hints: ")
        print("- Ensure the model path/identifier is correct.")
        print("- If using a local path, it should contain 'preprocessor_config.json' and other necessary files.")
        print("- You might need to be logged in to Hugging Face CLI ('huggingface-cli login') if downloading.")
        print("- For older transformers versions, 'trust_remote_code=True' can be crucial for models with custom code.")
        return

    print(f"\nAttempting to load model: {model_name_or_path}")
    device = "cpu" # Default to CPU; use "cuda" if torch.cuda.is_available() and you have a GPU
    # import torch # Uncomment if you want to explicitly manage device with torch.cuda
    # if torch.cuda.is_available():
    #     device = "cuda"

    try:
        # Using the specific model class 'GotOcr2ForConditionalGeneration'
        # This is often more reliable for specific architectures, especially with older transformers.
        model = GotOcr2ForConditionalGeneration.from_pretrained(model_name_or_path, trust_remote_code=True)
        model.to(device) # Move model to the device (CPU or GPU)
        model.eval() # Set model to evaluation mode
        print(f"Model loaded successfully to {device} using GotOcr2ForConditionalGeneration.")
    except ImportError:
        print("Error: GotOcr2ForConditionalGeneration class not found.")
        print("This specific class might not be available in your transformers version (e.g., 4.37.0) without 'trust_remote_code=True' or if the model's custom code isn't fetched/registered.")
        return
    except Exception as e:
        print(f"Error loading model with GotOcr2ForConditionalGeneration: {e}")
        print("Hints: ")
        print("- Double-check the model path/identifier.")
        print("- If local, ensure the directory contains all model files (config.json, pytorch_model.bin, etc.).")
        print("- Compatibility issues with transformers version 4.37.0 can occur for newer models.")
        return

    print("\nPreparing inputs for the model...")
    try:
        # The processor prepares the image and any necessary text prompts (though for basic OCR, text is minimal)
        inputs = processor(images=image, return_tensors="pt").to(device)
        print("Inputs prepared.")
    except Exception as e:
        print(f"Error preparing inputs with processor: {e}")
        return

    print("\nGenerating text (performing OCR)...")
    try:
        # Prepare arguments for model.generate
        # The `tokenizer` and `max_new_tokens` are common.
        # `stop_strings` might be specific to models with custom generation logic (via trust_remote_code=True)
        # In transformers 4.37.0, `stop_strings` is not a standard `model.generate` parameter.
        # It will be attempted and caught if it causes a TypeError.
        generate_kwargs = {
            "pixel_values": inputs.get("pixel_values"),
            "input_ids": inputs.get("input_ids"), # If processor adds BOS tokens or similar
            "tokenizer": processor.tokenizer,
            "max_new_tokens": 512, # Adjust as needed
            "num_beams": 3 # Example: using beam search
        }
        
        generated_ids = None
        try:
            # Some models like GOT-OCR-2.0 might accept 'stop_strings' if custom code is loaded
            generated_ids = model.generate(**generate_kwargs, stop_strings=["<|im_end|>"])
        except TypeError as te:
            if "stop_strings" in str(te).lower():
                print("Warning: 'stop_strings' parameter caused a TypeError. Retrying generate without it.")
                print("         (This is common if the model's generate method or your transformers version doesn't support it directly).")
                # Remove the problematic kwarg and try again
                generate_kwargs_no_stop = {k: v for k, v in generate_kwargs.items()}
                generated_ids = model.generate(**generate_kwargs_no_stop)
            else:
                raise te # Re-raise if it's a different TypeError

        if generated_ids is None:
             print("Error: Text generation failed to produce output.")
             return
        print("Text generation complete.")

    except Exception as e:
        print(f"Error during model.generate: {e}")
        return

    print("\nDecoding generated text...")
    try:
        recognized_text = ""
        # Decoding logic based on Hugging Face examples for image-to-text models:
        # It skips the input_ids part if they were part of the prompt.
        if inputs.get("input_ids") is not None:
            start_index = inputs["input_ids"].shape[1]
            if generated_ids.shape[1] > start_index:
                recognized_text = processor.decode(generated_ids[0, start_index:], skip_special_tokens=True)
            else: # If generated sequence is not longer than input, decode the whole thing
                recognized_text = processor.decode(generated_ids[0], skip_special_tokens=True)
        else: # If no input_ids were in the initial processor output
            recognized_text = processor.decode(generated_ids[0], skip_special_tokens=True)

        print("--- Recognized Text ---")
        print(recognized_text)
        print("-----------------------")
    except Exception as e:
        print(f"Error decoding text: {e}")
        return

if __name__ == "__main__":
    # --- Setup: Create dummy image if it doesn't exist ---
    # This helps make the script runnable for testing purposes.
    # In a real scenario, IMAGE_PATH_ABSOLUTE would point to your actual image.
    dummy_ocr_dir = os.path.dirname(IMAGE_PATH_ABSOLUTE)
    if not os.path.exists(dummy_ocr_dir):
        try:
            os.makedirs(dummy_ocr_dir)
            print(f"Created dummy directory for image: {dummy_ocr_dir}")
        except OSError as e:
            print(f"Error creating dummy directory {dummy_ocr_dir}: {e}")

    if not os.path.exists(IMAGE_PATH_ABSOLUTE):
        try:
            print(f"Attempting to create a dummy image at: {IMAGE_PATH_ABSOLUTE}")
            img = Image.new('RGB', (600, 150), color = (220, 220, 220))
            d = ImageDraw.Draw(img)
            try:
                # Try to use Arial, fallback to default if not found
                font = ImageFont.truetype("arial.ttf", 30)
            except IOError:
                font = ImageFont.load_default()
            d.text((20,20), "Sample Text for OCR Test\n12345 ABCDE", fill=(0,0,0), font=font)
            img.save(IMAGE_PATH_ABSOLUTE)
            print(f"Created dummy image: {IMAGE_PATH_ABSOLUTE}")
        except Exception as e:
            print(f"Could not create dummy image (PIL/font issue): {e}")
            print(f"Please ensure an image exists at '{IMAGE_PATH_ABSOLUTE}' or that PIL can create one.")

    # --- Model Path Handling ---
    # If the relative model path is the placeholder and doesn't exist,
    # default to a Hugging Face identifier to make the example runnable.
    current_model_to_load = model_to_load
    if model_to_load == "./name_of_model" and not os.path.isdir(model_to_load):
        print(f"\nWarning: The specified relative model path '{model_to_load}' does not exist or is not a directory.")
        print("This script expects it to be a local directory containing model files if not a Hugging Face identifier.")
        print("For a runnable example, defaulting to 'stepfun-ai/GOT-OCR-2.0-hf' from Hugging Face Hub.")
        print("If you have a local model at './name_of_model', ensure the path is correct and the directory exists.")
        current_model_to_load = "stepfun-ai/GOT-OCR-2.0-hf" # Default to HF model ID

    # --- Run OCR ---
    print(f"\n--- Starting OCR Test ---")
    print(f"Using image (absolute path): {IMAGE_PATH_ABSOLUTE}")
    print(f"Using model (relative path/HF ID): {current_model_to_load}")
    
    perform_ocr(image_path=IMAGE_PATH_ABSOLUTE, model_name_or_path=current_model_to_load)

    print("\n--- Script Finished ---")
    print("\nTo run this script, you need to install: pip install Pillow transformers torch torchvision")
    print("If downloading models from Hugging Face Hub, you might need to log in: `huggingface-cli login`")
    print("Your other files (ocr.jsonl, sample_0_test.txt, sample_0.hocr) are not used in this basic inference script but are noted.")



--- Starting OCR Test ---
Using image (absolute path): /home/jupyter/novice/ocr/sample_0.jpg
Using model (relative path/HF ID): ./GOT-OCR-2-hf
Attempting to load image from: /home/jupyter/novice/ocr/sample_0.jpg
Image loaded successfully.

Attempting to load processor from: ./GOT-OCR-2-hf
Processor loaded successfully.

Attempting to load model: ./GOT-OCR-2-hf
Model loaded successfully to cpu using GotOcr2ForConditionalGeneration.

Preparing inputs for the model...
Inputs prepared.

Generating text (performing OCR)...
Text generation complete.

Decoding generated text...
--- Recognized Text ---
TOP SECRET: Operation Iron Claw Report on Potential BH- 2000 Hideouts Analysis indicates potential BH- 2000 hideouts maybe located in remote areas identified as high- risk zones based on current geo spatial data. CYPHER' s advanced algorithms have pinpointed several areas with elevated probability of BH- 2000 activity, requiring immediate surveillance and reconnaissance efforts.  operational un