In [1]:
# Install necessary packages
!pip install --upgrade label-studio-sdk
!pip install mistralai
!pip install Markdown 
!pip install beautifulsoup4



In [20]:
### API Keys and URLS 
# API key for Mistral
mistral_api_key = 

# URL of your Label Studio Instance
label_studio_url = "http://localhost:8080" 
# Label Studio API Key (in your user settings)
label_studio_api_key = 


## Create Label Studio Project for OCR

In [142]:
from label_studio_sdk.client import LabelStudio

# Connect to Label Studio 
ls = LabelStudio(
    base_url=label_studio_url,  
    api_key=label_studio_api_key,
)

In [184]:
# Labeling Config for OCR using Multi-page document annotation
# We'll have 2 columns: the PDF on the left as a list of images, and the transcription on the right for editing 

labeling_config = """
<View style="display: flex;">
  <View style="flex: 50%">
  	<Image name="pdf" valueList="$pages" />
  </View>
  <View style="flex: 50%; margin-leftL 1em">
  	<TextArea name="transcription" toName="pdf" editable="true" perItem="true" placeholder="Recognized Text"/>
 </View>
</View>
"""

In [185]:
# Create Label Studio Project
project = ls.projects.create(
    title="MistralOCR",
    description="OCR using the new MistralOCR model",
    label_config=labeling_config
)


## Set up sample task 

In [186]:
# This is a single sample task: a 3 page document. If you want to replace with your own task, replace the pages list 
# with a list of URLs to your document in iamge form. 
# You can make this a loop to add multiple documents! 

sample_task = {
    "pages": [
      "https://htx-pub.s3.amazonaws.com/demo/images/demo_stock_purchase_agreement/0001.jpg",
      "https://htx-pub.s3.amazonaws.com/demo/images/demo_stock_purchase_agreement/0002.jpg",
      "https://htx-pub.s3.amazonaws.com/demo/images/demo_stock_purchase_agreement/0003.jpg"
    ]
}

In [187]:
# Upload task to Label Studio 
task = ls.tasks.create(
    project=project.id,
    data=sample_task
)

## Do OCR with MistralOCR

In [188]:
# Helper Function 
from bs4 import BeautifulSoup
from markdown import markdown 

# In the OSS version of Label Studio, we need to convert the markdown from MistralOCR to plain text so that it's readable.
# In Enterprise, you can skip this step if you want to use Custom Scripts to handle HTML/Markdown 
def strip_markdown(markdown_string):
    html = markdown(markdown_string)
    text = ' '.join(BeautifulSoup(html).findAll(text=True))
    return text 


In [189]:
# Name of the MistralOCR model to use
mistral_model = "mistral-ocr-latest"

def do_ocr(task_pages):
    """ 
    Do OCR on the given pages for the task. 
    param task_pages: a list of the urls for the images you want to do OCR on (probably all the pages of a single PDF)
    output: the text of the OCR (with markdown stripped out) for each page, as a list
    """
    print("Doing OCR")
    page_ocr = []
    for page in task_pages: 
        ocr_response = mistral.ocr.process(
            model=mistral_model,
            document={
                "type": "image_url",
                "image_url": page
            },
            include_image_base64=False
        )
        text = strip_markdown(ocr_response.pages[0].markdown)
        page_ocr.append(text)
    return page_ocr

## Create MistralOCR Predictions and Upload to Label Studio

In [190]:
# main method 
from mistralai import Mistral
from label_studio_sdk.label_interface.objects import PredictionValue

mistral = Mistral(api_key=mistral_api_key)


upload_project = ls.projects.get(id=project.id)
li = upload_project.get_label_interface()


# for every task in the project, we'll get the mistral predictions and upload them
for task in ls.tasks.list(project=upload_project.id):
    task_id = int(task.id)
    print(task_id)
    task_pages = task.data["pages"]
    task_predictions = do_ocr(task_pages)
    print("OCR Completed")
    print("Creating Predictions for Label Studio")
    results = [
        {
            "from_name" : "transcription",
            "to_name" : "pdf",
            "type": "textarea",
            "value" : {
                "text": [p],
            },
            "item_index" : i,

        }
    for i, p in enumerate(task_predictions)]
    ls.predictions.create(task=task_id, result=results, model_version=mistral_model)
    print(f"prediction for task {task_id} uploaded")

19
Doing OCR


  text = ' '.join(BeautifulSoup(html).findAll(text=True))


OCR Completed
Creating Predictions for Label Studio
[{'from_name': 'transcription', 'to_name': 'pdf', 'type': 'textarea', 'value': {'text': ['SERIES B PREFERRED STOCK PURCHASE AGREEMENT \n THIS SERIES B PREFERRED STOCK PURCHASE AGREEMENT (this "Agreement"), is made as of is made as of September 15, 2021, by and among Meetly, Inc, a Delaware corporation (the "Company") and the investors listed on Exhibit A attached to this Agreement (each a "Purchaser" and together the "Purchasers"). \n The parties hereby agree as follows: \n \n Purchase and Sale of Preferred Stock. \n \n 1.1 Sale and Issuance of Preferred Stock. \n (a) The Company shall have adopted and filed with the Secretary of State of the State of Delaware [on or before the Initial Closing (as defined below)] the Amended and Restated Certificate of Incorporation in the form of Exhibit A attached to this Agreement (the "Restated Certificate").\n(b) Subject to the terms and conditions of this Agreement, each Purchaser agrees to purc