In [None]:
# Install necessary packages
!pip install --upgrade label-studio-sdk 
!pip install docling

In [1]:
### API Keys and URLs
import os
from getpass import getpass

# URL of your Label Studio instance.
# Override by setting the LABEL_STUDIO_URL environment variable.
label_studio_url = os.environ.get("LABEL_STUDIO_URL", "http://localhost:8080")

# Label Studio API key (found in your Label Studio user settings).
# Read from the LABEL_STUDIO_API_KEY environment variable so the secret is never
# saved in the notebook; if it is not set, ask for it with a hidden prompt.
label_studio_api_key = os.environ.get("LABEL_STUDIO_API_KEY") or getpass("Label Studio API key: ")


## Create Label Studio Project for OCR

In [2]:
from label_studio_sdk.client import LabelStudio

# Build the Label Studio client from the URL and API key configured above
ls = LabelStudio(base_url=label_studio_url, api_key=label_studio_api_key)

In [3]:
# Labeling Config for OCR using Multi-page document annotation
# We draw bounding boxes over the text in the image, and then put the text inside the bounding box
#
# What the config below declares:
#   - <Image valueList="$pages">: the image pages come from the "pages" list in each task's data.
#   - <Labels name="label">: the region classes (Body, Heading, Metadata).
#   - <Rectangle name="bbox">: the bounding-box tool drawn on the image.
#   - <TextArea name="transcription" perRegion="true">: one required, editable text field per region.
#
# NOTE(review): the prediction helper in this notebook emits rectanglelabels ["Text"], which is not
# one of the labels declared here - confirm whether a "Text" label should be added.

labeling_config = """
<View>
  <Image name="image" valueList="$pages"/>
  <Labels name="label" toName="image">
    <Label value="Body" background="green"/>
    <Label value="Heading" background="blue"/>
    <Label value="Metadata" background="pink"/>

  </Labels>
  <Rectangle name="bbox" toName="image" strokeWidth="3" perRegion="true"/>
  <TextArea name="transcription" toName="image"
            editable="true"
            perRegion="true"
            required="true"
            maxSubmissions="1"
            rows="5"
            placeholder="Recognized Text"
            displayMode="region-list"
            />
</View>
"""

In [4]:
# Register a new Label Studio project that uses the OCR labeling config
project_settings = {
    "title": "SmolDocling",
    "description": "OCR using the new SmolDocling model",
    "label_config": labeling_config,
}
project = ls.projects.create(**project_settings)


## Set up sample task 

In [5]:
# A single sample task: one 3-page document, given as one image URL per page.
# To use your own document, replace the URLs in the pages list with links to
# your document's pages in image form.
# Wrap this in a loop to add multiple documents.

sample_task = dict(
    pages=[
        "https://htx-pub.s3.amazonaws.com/demo/images/demo_stock_purchase_agreement/0001.jpg",
        "https://htx-pub.s3.amazonaws.com/demo/images/demo_stock_purchase_agreement/0002.jpg",
        "https://htx-pub.s3.amazonaws.com/demo/images/demo_stock_purchase_agreement/0003.jpg",
    ]
)

In [6]:
# Add the sample document to the project as one Label Studio task
task = ls.tasks.create(data=sample_task, project=project.id)

## Do OCR with SmolDocling

In [7]:
# Helper Function 

def convert_bbox_to_ls(bbox, width, height):
    """
    Convert a SmolDocling bounding box into a Label Studio rectangle value.

    BBoxes in SmolDocling are given in LTRB format (absolute left/top/right/bottom
    coordinates). Label Studio expects the top left corner of the box as a
    percentage of the total image, plus the width and height as percents, with
    y measured downwards from the top of the image.

    The vertical axis of the incoming box depends on its "coord_origin":
    with a bottom-left origin, "t" is larger than "b" and y has to be flipped;
    with a top-left origin, "t" is already the distance from the top.

    Args:
        bbox: the bbox dictionary from SmolDocling's response object. Must hold
            "l", "t", "r" and "b"; may hold "coord_origin". When "coord_origin"
            is missing, a bottom-left origin is assumed, which is what this
            helper has always assumed.
        width: the width of the image in pixels
        height: the height of the image in pixels

    Returns: a dictionary containing all the information for the value field in Label Studio for Rectangle Labels.
    """
    # Normalise to a string so both a plain "TOPLEFT" and an enum-like repr match.
    origin = str(bbox.get("coord_origin", "BOTTOMLEFT")).upper()
    top_left_origin = origin.endswith("TOPLEFT")

    # The horizontal axis is the same for both origins.
    bbox_x = (bbox["l"] / width) * 100
    bbox_width = ((bbox["r"] - bbox["l"]) / width) * 100

    if top_left_origin:
        # "t" is already measured from the top edge; "b" lies below it.
        bbox_y = (bbox["t"] / height) * 100
        bbox_height = ((bbox["b"] - bbox["t"]) / height) * 100
    else:
        # Bottom-left origin: flip the y axis so it is measured from the top edge.
        bbox_y = 100 - ((bbox["t"] / height) * 100)
        bbox_height = ((bbox["t"] - bbox["b"]) / height) * 100

    return {"x": bbox_x,
            "y": bbox_y,
            "width": bbox_width,
            "height": bbox_height,
            "rectanglelabels": ["Text"]}


In [8]:
# Helper Function
from PIL import Image
import requests
from io import BytesIO

def load_image_from_url(url, timeout=30):
    """Loads an image from a URL using PIL.

    Args:
        url: The URL of the image.
        timeout: Seconds to wait for the server before giving up, passed to
            requests.get. Without a timeout a stalled server would block the
            notebook indefinitely.

    Returns:
        An Image object if successful, None otherwise. Failures are printed
        rather than raised, so callers must check for None.
    """
    try:
        # The context manager releases the connection once the body has been read.
        with requests.get(url, timeout=timeout) as response:
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            # BytesIO keeps its own copy of the bytes, so the image stays usable
            # after the response is closed.
            return Image.open(BytesIO(response.content))
    except requests.exceptions.RequestException as e:
        print(f"Error fetching image from {url}: {e}")
        return None
    except Exception as e:
        print(f"Error opening image from {url}: {e}")
        return None

In [9]:
def do_ocr(task_pages):
    """
    Do the OCR on the list of images.

    Each page is downloaded to read its pixel size, converted with a
    DocumentConverter, and every recognized text item is turned into a
    Label Studio-ready bounding box plus its text.

    Args:
        task_pages: a list of the urls for each image in the task

    Returns:
        A list with one dictionary per recognized text item, in page order:
            "item_index": index of the page within task_pages
            "bbox_value": rectangle value produced by convert_bbox_to_ls
            "text_value": the recognized text
        Pages whose image cannot be loaded, and text items without a
        bounding box, are skipped. An empty input gives an empty list.
    """
    predictions = []
    if not task_pages:
        return predictions

    # Reuse one converter for all pages instead of constructing a new one per page.
    converter = DocumentConverter()
    total_pages = len(task_pages)

    for idx, page in enumerate(task_pages):
        print(f"processing page {idx + 1} of {total_pages}")
        # Load the image as PIL to get the original height and width
        image = load_image_from_url(page)
        if image is None:
            # load_image_from_url has already printed the reason; skip this
            # page rather than crash on image.size.
            print(f"Skipping page {idx + 1}: image could not be loaded")
            continue
        width, height = image.size

        # Convert from image to SmolDocling document
        print("Doing ocr")
        result = converter.convert(page)
        output_json = result.document.export_to_dict()

        # Process output json
        print("Processing results")
        for recognized_text in output_json.get("texts", []):
            prov = recognized_text.get("prov") or []
            if not prov:
                # Without provenance there is no bounding box to draw.
                continue
            bbox_dict = convert_bbox_to_ls(prov[0]["bbox"], width, height)

            predictions.append({
                "item_index": idx,
                "bbox_value": bbox_dict,
                "text_value": recognized_text["text"],
            })

    return predictions
            
        
        

## Create SmolDocling Predictions and Upload to Label Studio

In [None]:
# main method 
import time
import os
from PIL import Image
from docling.document_converter import DocumentConverter


# Look the project up again by id and read its label interface
upload_project = ls.projects.get(id=project.id)
li = upload_project.get_label_interface()


# Walk every task in the project: run SmolDocling on its pages, then upload the output as one prediction
for task in ls.tasks.list(project=upload_project.id):
    task_id = int(task.id)
    print(f"processing task {task_id}")
    task_pages = task.data["pages"]
    task_predictions = do_ocr(task_pages)
    print("OCR Completed")
    print("Creating Predictions for Label Studio")

    # One rectangle result per recognized region
    box_results = [
        {
            "id": f"region{n}",
            "from_name": "label",
            "to_name": "image",
            "type": "rectanglelabels",
            "value": pred["bbox_value"],
            "item_index": pred["item_index"],
        }
        for n, pred in enumerate(task_predictions)
    ]

    # One transcription result per region, reusing the id of the matching rectangle
    text_results = [
        {
            "id": f"region{n}",
            "from_name": "transcription",
            "to_name": "image",
            "type": "textarea",
            "value": {"text": [pred["text_value"]]},
            "item_index": pred["item_index"],
        }
        for n, pred in enumerate(task_predictions)
    ]

    # All rectangles first, then all transcriptions
    results = box_results + text_results

    ls.predictions.create(task=task_id, result=results, model_version="SmolDocling")
    print(f"prediction for task {task_id} uploaded")