## Intro to Programmatic Actions with Label Studio -- SDK

In [None]:
!pip install label-studio-sdk

In [None]:
# credentials and setup
# Prefer environment variables so real secrets never have to be typed into (and
# saved with) the notebook. If a variable is not set, the original placeholder
# is kept and must be replaced by hand before running the cells below.
import os

label_studio_url = os.environ.get("LABEL_STUDIO_URL", "your_label_studio_url_here")
label_studio_api_key = os.environ.get("LABEL_STUDIO_API_KEY", "your_api_key_here")

## Step 0: Download and Prep Your Data

In [None]:
# One task record per hosted page image; the "image" key is the field name the
# records carry into Label Studio.
data = [
    {"image": page_url}
    for page_url in (
        "https://htx-pub.s3.us-east-1.amazonaws.com/demo_pdf/rag_0.jpg",
        "https://htx-pub.s3.us-east-1.amazonaws.com/demo_pdf/rag_1.jpg",
        "https://htx-pub.s3.us-east-1.amazonaws.com/demo_pdf/rag_2.jpg",
        "https://htx-pub.s3.us-east-1.amazonaws.com/demo_pdf/rag_3.jpg",
        "https://htx-pub.s3.us-east-1.amazonaws.com/demo_pdf/rag_4.jpg",
    )
]

## Step 1: Create a Label Studio Client

In [None]:
from label_studio_sdk.client import LabelStudio

# Every SDK call in this notebook goes through this client object, which is
# built from the URL and API key defined in the setup cell.
ls = LabelStudio(base_url=label_studio_url, api_key=label_studio_api_key)

## Step 2: Creating and Initializing a Project

In [None]:
from label_studio_sdk.label_interface import LabelInterface
from label_studio_sdk.label_interface.create import labels, choices

# We can define the labeling config in one of two ways:
# 1. a string
# The XML below declares three tags:
#   - an <Image> object tag named "image" that reads its source from the "image"
#     field of each task ("$image"),
#   - a <RectangleLabels> control named "bbox" that draws boxes on that image
#     with two labels, "Heading" and "Paragraph",
#   - a per-region <TextArea> named "transcription" attached to the same image,
#     with the placeholder "Recognized Text".
label_config = """
<View>
   <Image name="image" value="$image" zoom="true" zoomControl="false"
         rotateControl="true" width="100%" height="100%"
         maxHeight="auto" maxWidth="auto"/>

   <RectangleLabels name="bbox" toName="image" strokeWidth="1" smart="true">
      <Label value="Heading" background="green"/>
      <Label value="Paragraph" background="blue"/>
   </RectangleLabels>

   <TextArea name="transcription" toName="image"
   editable="true" perRegion="true" required="false"
   maxSubmissions="1" rows="5" placeholder="Recognized Text"
   displayMode="region-list"/>
</View>
"""

# 2. through LabelInterface.create, which is given a dict of tag name -> tag type.
# The result is only printed for inspection; the project below is created from
# the string config, not from this one.
interface_tags = {"image": "Image", "transcription": "TextArea"}
label_config2 = LabelInterface.create(interface_tags)

print(label_config2)

# A title and a labeling config are the minimum a project needs; workspace and
# color are supplied in addition.
# NOTE(review): 110870 is a hard-coded workspace ID - presumably the author's;
# replace it with a workspace that exists in your own Label Studio instance.
ocr_project_settings = {
    "title": "OCR for RAG PDF (Acronyms)",
    "label_config": label_config,
    "workspace": 110870,
    "color": "#00FFFF",
}
proj = ls.projects.create(**ocr_project_settings)

# Keep the new project's ID around; later cells address the project through it.
pid = proj.id
print(f"Created project {pid} in workspace {proj.workspace}")

## Step 3: Add your Data

In [None]:
# Tasks can be loaded into a project in more than one way.
#
# Option 1 - from a CSV file:
# NOTE(review): this snippet refers to a `project` object that is not defined
# anywhere in this notebook - confirm the intended API before uncommenting.
# task_ids = project.import_tasks('/full/path/to/your_tasks.csv')
# print('Imported task IDs:', task_ids)
#
# Option 2 - from JSON-style records already in memory (the `data` list built
# in Step 0). The call is left as the cell's last expression so that its return
# value is shown as the cell output.
ls.projects.import_tasks(id=pid, request=data)


### Now, you're ready to label! 

## Step 4: Chaining Projects Together

In [None]:
# First, we create a new project as we did before, this time for labeling text.

# 1. a string
# The config declares a <Text> object tag named "text" that reads the "text"
# field of each task ("$text"), and a <Labels> control named
# "ACRONYM_RECOGNITION" applied to that text with a single label, "ACRONYM".
ner_label_config = """
<View>
  	<Text name="text" value="$text" />
    <Labels name="ACRONYM_RECOGNITION" toName="text">
        <Label value="ACRONYM" />
  </Labels>
</View>
"""

# Create the second (NER) project from the config above; title and labeling
# config are the required minimum, the rest is optional.
# NOTE(review): 110870 is a hard-coded workspace ID - replace it with a
# workspace that exists in your own Label Studio instance.
ner_project_settings = {
    "title": "NER on OCR: Acronyms",
    "description": "",
    "label_config": ner_label_config,
    "workspace": 110870,
    "color": "#800080",
}
proj2 = ls.projects.create(**ner_project_settings)

# ID of the NER project, used as the target when tasks are created from the OCR text.
p2id = proj2.id

In [None]:
# Now, we can export data from our first project and upload it to our second project
import json


def collect_transcriptions(annotations):
    """Gather the transcribed text from one task's annotations.

    Args:
        annotations: list of annotation dicts in Label Studio JSON format
            (each with a "result" list), or None.

    Returns:
        The first "text" entry of every result that has one, joined with single
        spaces. Returns "" when there is nothing to collect.
    """
    pieces = []
    for annotation in annotations or []:
        for region in annotation.get("result") or []:
            # Not every result carries a "value" with "text" (bounding boxes and
            # label-only results do not), so look it up defensively.
            transcripts = (region.get("value") or {}).get("text")
            if transcripts:
                pieces.append(transcripts[0])
    # Join with a space so the last word of one region does not fuse with the
    # first word of the next one, which would corrupt the text used for NER.
    return " ".join(pieces)


tasks = ls.tasks.list(project=proj.id)

for i, task in enumerate(tasks):
    # A task that has not been labeled yet may have no annotations at all.
    annotations = task.annotations or []

    # Let's see what a task annotation looks like: print the first annotation of
    # the first task for investigation, but only if that task actually has one.
    if i == 0 and annotations:
        print(f"TASK {i}")
        print(json.dumps(annotations[0], indent=4))

    text = collect_transcriptions(annotations)

    # Only tasks that produced some text become tasks in the NER project; the
    # "text" key matches the "$text" variable of that project's labeling config.
    if text:
        ls.tasks.create(
            project=p2id,
            data={'text': text}
        )

## Step 5: Refining Prompts with the Label Studio SDK

In [None]:
# The prompt itself is created in the Label Studio UI; paste its ID here.
prompt_id = -1  # Change this to your prompt ID

# Fetch every version that exists for that prompt.
versions = ls.prompts.versions.list(prompt_id=prompt_id)

# Print a short summary per version. The attributes are read one at a time, in
# the order they are printed.
for v in versions:
    for label, attr in (
        ("Version ID:", "id"),
        ("Title:", "title"),
        ("Provider:", "provider"),
        ("Provider Model ID:", "provider_model_id"),
        ("Created at:", "created_at"),
        ("Updated at:", "updated_at"),
    ):
        print(label, getattr(v, attr))
    # Only the first 100 characters of the prompt text are shown.
    print("Prompt:", v.prompt[:100])
    print("\n")


In [None]:
# Look up the configured model provider connections and pick out the VertexAI one.
model_providers = ls.model_providers.list()

print(f"Found {len(model_providers)} model provider connection(s)", end="\n\n")

# Show every connection so the right one can also be identified by eye.
for provider in model_providers:
    print(f"Provider: {provider.provider} | ID: {provider.id}")

# Keep the VertexAI connection; if several exist, the last one listed is used.
vertexai_matches = [p for p in model_providers if p.provider == "VertexAI"]
vertexai_connection = vertexai_matches[-1] if vertexai_matches else None
vertexai_connection_id = vertexai_connection.id if vertexai_matches else None

print()

if vertexai_connection:
    print("✓ Found VertexAI connection!")
    print(f"Connection ID: {vertexai_connection_id}")
else:
    print("⚠ No VertexAI provider connection found")


In [None]:
# Ask a "teacher" model to refine one version of the prompt.
# Settings for the refinement request:
version_id = -1  # Change this to the version ID you want to refine
project_id = -1  # Change this to your project ID
teacher_model_name = "gemini-2.5-pro"  # The model to use for refinement
# Connection ID found by the VertexAI lookup in the previous cell.
teacher_model_provider_connection_id = vertexai_connection_id

# Submit the refinement request.
# Optional: pass async_=True to run the refinement asynchronously.
refined_result = ls.prompts.versions.refine_prompt(
    prompt_id=prompt_id,
    version_id=version_id,
    project_id=project_id,
    teacher_model_name=teacher_model_name,
    teacher_model_provider_connection_id=teacher_model_provider_connection_id,
)

# Report what came back, in blank-line separated groups.
print("Prompt refinement completed!", end="\n\n")

print(f"Title: {refined_result.title}")
print(f"Refined Prompt: {refined_result.prompt}", end="\n\n")

print(f"Refinement Status: {refined_result.refinement_status}")
print(f"Refinement Job ID: {refined_result.refinement_job_id}")
print(f"Total Cost: ${refined_result.total_cost}", end="\n\n")

print("Reasoning:")
print(refined_result.reasoning, end="\n\n")


In [None]:
# Store the refined prompt as a new version of the same prompt.
new_version_settings = {
    "prompt_id": prompt_id,
    # Prompt text produced by the refinement in the previous cell.
    "prompt": refined_result.prompt,
    # Fall back to a default title when the refinement did not return one.
    "title": refined_result.title or "Refined Version",
    "provider_model_id": "gemini-2.5-pro",  # Model ID to use
    "provider": "VertexAI",  # Provider type
    "model_provider_connection": vertexai_connection_id,  # Connection ID
}
new_version = ls.prompts.versions.create(**new_version_settings)

print("✓ New prompt version created successfully!", end="\n\n")

print(f"Version ID: {new_version.id}")
print(f"Title: {new_version.title}")
print(f"Provider: {new_version.provider}")
print(f"Model ID: {new_version.provider_model_id}")
print(f"Created at: {new_version.created_at}", end="\n\n")

print("Prompt text:")
print(new_version.prompt)


In [None]:
# Run the newly created prompt version on the "HasGT" subset of the project and
# wait for the run to reach a final state.
import time

# Kick off the run. project_subset="HasGT" restricts it to tasks with ground
# truth annotations.
print("Starting inference run on HasGT subset...")
inference_run = ls.prompts.runs.create(
    prompt_id=prompt_id,
    version_id=new_version.id,
    project=project_id,
    project_subset="HasGT",
)

print(f"✓ Inference run started!")
print(f"Run ID: {inference_run.id}")
print(f"Status: {inference_run.status}")
print(f"Project: {inference_run.project}")
print(f"Subset: {inference_run.project_subset}", end="\n\n")

# Polling budget: at most max_attempts checks, poll_interval seconds apart.
print("Polling for completion...")
max_attempts = 60
poll_interval = 5

for attempt in range(max_attempts):
    # Re-list the runs of this version and pick ours out by ID.
    runs = ls.prompts.runs.list(prompt_id=prompt_id, version_id=new_version.id)
    current_run = next((run for run in runs if run.id == inference_run.id), None)

    if not current_run:
        print(f"Could not find run {inference_run.id}")
        break

    status = current_run.status
    # No newline yet: either a final report or the "waiting" suffix follows.
    print(f"[{attempt + 1}/{max_attempts}] Status: {status}", end="")

    if status == "Completed":
        print(end="\n\n")
        print("✓ Inference run completed successfully!")
        print(f"Total tasks: {current_run.total_tasks}")
        print(f"Total predictions: {current_run.total_predictions}")
        print(f"Total correct predictions: {current_run.total_correct_predictions}")
        print(f"Completed at: {current_run.completed_at}")
        break

    if status == "Failed":
        print()
        print("✗ Inference run failed!")
        break

    if status == "Canceled":
        print()
        print("⚠ Inference run was canceled!")
        break

    # Any other status means the run is still in progress: wait and check again.
    print(f" - waiting {poll_interval}s...")
    time.sleep(poll_interval)
else:
    # Reached only when the loop used up every attempt without hitting a break.
    print()
    print(f"⚠ Inference run did not complete within {max_attempts * poll_interval} seconds")


In [None]:
# Get the micro_avg_prf1 indicator for the inference run
indicator_key = "micro_avg_prf1"

# Look the indicator up by the ID of the run that was started (`inference_run`).
# `current_run` is only a by-product of the polling loop and is None when the
# run could not be found there, whereas `inference_run.id` is always available
# and is the same ID the polling loop matches on.
indicator = ls.prompts.indicators.get(
    id=inference_run.id,
    indicator_key=indicator_key
)

print(f"✓ Retrieved indicator: {indicator_key}")
print()
print(f"Title: {indicator.title}")
print()
print("Values:")
print(indicator.values)
