# Building Custom Benchmarks with Label Studio

### Step 0: Install Prerequisite Packages and Add Credentials

In [None]:
import os

# Base URL of the Label Studio instance. Set the LABEL_STUDIO_URL environment
# variable to point somewhere else; the hosted app is the default.
label_studio_url = os.environ.get("LABEL_STUDIO_URL", "https://app.heartex.com")

# API key used to build the SDK client. Read it from the LABEL_STUDIO_API_KEY
# environment variable so the secret never has to be typed into (and saved
# with) the notebook. Falls back to an empty string when the variable is unset.
label_studio_api_key = os.environ.get("LABEL_STUDIO_API_KEY", "")


In [1]:
!pip install label-studio-sdk



### Step 1: Create your first Label Studio project for Rubric Creation

<a href="https://app.heartex.com/b/MTI1Ng=="
  target="_blank" rel="noopener" aria-label="Open in Label Studio" style="all:unset;cursor:pointer;display:inline-flex;align-items:center;justify-content:center;border-radius:4px;border:1px solid rgb(109,135,241);padding:8px 12px;background:rgb(87 108 193);color:white;font-weight:500;font-family:sans-serif;gap:6px;transition:background 0.2s ease;" onmouseover="this.style.background='rgb(97 122 218)'" onmouseout="this.style.background='rgb(87 108 193)'">
  <svg style="width:20px;height:20px" viewBox="0 0 26 26" fill="none"><path fill="#FFBAAA" d="M3.5 4.5h19v18h-19z"/><path fill-rule="evenodd" clip-rule="evenodd" d="M25.7 7.503h-7.087V5.147H7.588V2.792h11.025V.436H25.7v7.067Zm-18.112 0H5.225v10.994H2.863V7.503H.5V.436h7.088v7.067Zm0 18.061v-7.067H.5v7.067h7.088ZM25.7 18.497v7.067h-7.088v-2.356H7.588v-2.355h11.025v-2.356H25.7Zm-2.363 0V7.503h-2.363v10.994h2.363Z" fill="#FF7557"/></svg>
  <span style="font-size:14px">Open in Label Studio</span>
  <svg style="width:16px;height:16px" viewBox="0 0 24 24"><path d="M14,3V5H17.59L7.76,14.83L9.17,16.24L19,6.41V10H21V3M19,19H5V5H12V3H5C3.89,3 3,3.9 3,5V19A2,2 0 0,0 5,21H19A2,2 0 0,0 21,19V12H19V19Z" fill="white"/></svg>
</a>

In [44]:
# ID of the existing Label Studio project whose tasks are listed in Step 4
# to read back the rubrics.
rubric_creation_project_id = "212780"
# ID passed as `workspace=` when the rubric evaluation project is created in Step 3.
target_workspace_id = "94519"

### Step 2: Create Label Studio Client

In [19]:
from label_studio_sdk.client import LabelStudio

# Single SDK client reused by every later cell; it is configured with the
# URL and API key defined in Step 0.
ls = LabelStudio(base_url=label_studio_url, api_key=label_studio_api_key)

### Step 3: Create a new project for evaluating against rubrics

In [45]:
from datetime import date

# Labeling interface for the evaluation project: the user query and the AI
# response are shown at the top, followed by three panels. Each panel shows
# one criterion's text and a single-choice "met" / "not met" control.
label_config = """<View>
  <!-- Top context -->
  <Header value="User Query" style="margin-top: 0.5em;" />
  <Text name="query" value="$User_Query" />

  <Header value="Sample AI Response" style="margin-top: 1em;" />
  <Text name="response" value="$AI_Response" />

  <!-- Criteria sections -->
  <Collapse accordion="false" bordered="true" open="true">

    <Panel value="Criteria 1">
      <View style="margin-top: 0.5em;">
        <Text name="criteria1_text" value="$criteria1" />
        <Choices name="criteria1" toName="criteria1_text" choice="single">
          <Choice value="Criteria 1 is met" />
          <Choice value="Criteria 1 is not met" />
        </Choices>
      </View>
    </Panel>

    <Panel value="Criteria 2">
      <View style="margin-top: 0.5em;">
        <Text name="criteria2_text" value="$criteria2" />
        <Choices name="criteria2" toName="criteria2_text" choice="single">
          <Choice value="Criteria 2 is met" />
          <Choice value="Criteria 2 is not met" />
        </Choices>
      </View>
    </Panel>

    <Panel value="Criteria 3">
      <View style="margin-top: 0.5em;">
        <Text name="criteria3_text" value="$criteria3" />
        <Choices name="criteria3" toName="criteria3_text" choice="single">
          <Choice value="Criteria 3 is met" />
          <Choice value="Criteria 3 is not met" />
        </Choices>
      </View>
    </Panel>

  </Collapse>
</View>
"""

# The title carries today's date so repeated runs produce distinguishable projects.
project_title = f"AskAI Rubric Evaluation {date.today()}"
proj = ls.projects.create(
    title=project_title,
    label_config=label_config,
    color="#00FFFF",
    workspace=target_workspace_id,
)

# Later cells upload tasks to, and read results from, this project.
rubric_evaluation_project_id = proj.id

### Step 4: Parse the rubrics we created in LS and upload them with their task data to the new rubric evaluation project

In [46]:
# This cell enables us to "Martha Stewart" this project with predefined rubrics
# for the questions we didn't write rubrics for in LS.
import csv

# Decode explicitly as UTF-8: the responses contain non-ASCII characters
# (curly quotes, narrow no-break spaces), and the platform default encoding
# used by open() is not UTF-8 everywhere.
with open("Custom_Benchmarks_Full_Dataset.csv", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    # Map question_id -> full CSV row (a later row with the same id replaces an earlier one).
    question_dict = {row["question_id"]: row for row in reader}
display(question_dict)

{'1': {'question_id': '1',
  'question': 'How can I import data to my project?',
  'response_askai': 'You can import data either via the UI or programmatically. In the UI, open the Data Manager and click the “Import” button to upload files or specify a URL. Alternatively, use the Label Studio SDK’s\u202fimport_tasks()\u202fmethod or the REST API endpoint (/api/projects/{id}/import) to import data.\n\nYou can also connect to cloud storage and sync data into Label Studio.\n\n**Further information**\n1. [Importing tasks guide](https://docs.humansignal.com/guide/tasks.html)\n2. [Label Studio API import endpoint](https://labelstud.io/api#operation/api_projects_import_create)\n3. [LS SDK import example](https://github.com/HumanSignal/label-studio-sdk/tree/master/examples/import_data/import_from_json.md)\n4. [Cloud storages](https://docs.humansignal.com/guide/storage)',
  'response_chatgpt': 'To import data:\n\n1. Open your project → **Data Import**.\n2. Upload files or connect cloud storage.

In [47]:
import json

# Read every task of the rubric-creation project and build, per question,
# a dict {"criteria1": ..., "criteria2": ..., ...} of rubric texts.
tasks = ls.tasks.list(project=rubric_creation_project_id)

all_rubrics = {}
for task in tasks:
    question_id = str(task.data["question_id"])
    rubric_items = {}

    if task.annotations:
        # Rubrics were written in LS: take them from the first annotation.
        # Only "number" results are used, since each one carries both the
        # criterion text and its points; the matching "chatmessage" results
        # would just duplicate the text.
        # Criteria are numbered by counting the "number" results themselves,
        # not by their position in the mixed result list, so skipped entries
        # never leave gaps in the criteria1..N keys.
        criteria_number = 0
        for r in task.annotations[0]["result"]:
            if r["type"] != "number":
                continue
            criteria_number += 1
            rubric_items[f"criteria{criteria_number}"] = (
                f"{r['value']['chatmessage']['content']} \n (Points: {r['value']['number']})"
            )
    else:
        # No annotations for this task: fall back to the predefined rubrics
        # loaded from the CSV.
        for n in range(1, 4):
            key = f"criteria{n}"
            rubric_items[key] = question_dict[question_id][key]

    all_rubrics[question_id] = rubric_items
display(all_rubrics)

ApiError: status_code: 403, body: {'id': 'a1fb3df4-94cc-48c9-8e03-aaf6638317e1', 'status_code': 403, 'version': '1.23.0.dev0', 'detail': 'You do not have permission to perform this action.', 'exc_info': None}

In [57]:
# Create one evaluation task per (question, model) pair in the rubric
# evaluation project. Iterates `all_rubrics`, the dict built by the previous cell.
for task_id, criteria in all_rubrics.items():
    print(task_id)
    task_data = question_dict[str(task_id)]
    for model in ["chatgpt", "claude"]:
        # Build a fresh payload for every task instead of mutating the shared
        # rubric dict, so `all_rubrics` stays unchanged and the cell can be re-run.
        task_payload = {
            **criteria,
            "question_id": task_id,
            "User_Query": task_data["question"],
            "AI_Response": task_data[f"response_{model}"],
            "model": model,
        }

        # Any criterion not provided by the rubrics falls back to the CSV row.
        for c in ["criteria1", "criteria2", "criteria3"]:
            if c not in task_payload:
                task_payload[c] = task_data[c]

        print(task_payload)

        ls.tasks.create(
            project=rubric_evaluation_project_id,
            data=task_payload,
        )

1
{'criteria1': 'Answer should mention that data can be imported using the UI or the SDK \n (Points: 10)', 'criteria2': 'Answer should provide documentation \n (Points: 5)', 'criteria3': 'answer should provide a template of the SDK call \n (Points: 4)', 'User_Query': 'How can I import data to my project?', 'AI_Response': 'To import data:\n\n1. Open your project → **Data Import**.\n2. Upload files or connect cloud storage.\n3. Click **Import**.\n\nUsing the SDK:\n\n```python\nfrom label_studio_sdk import LabelStudio\nls = LabelStudio(base_url=URL, api_key=KEY)\nls.project.import_data(PROJECT_ID, tasks=[{"text": "Example"}])\n```\n\n**Further information**  \n1. https://docs.humansignal.com/guide/import  \n2. https://labelstud.io/sdk\n', 'model': 'chatgpt', 'question_id': '1'}
{'criteria1': 'Answer should mention that data can be imported using the UI or the SDK \n (Points: 10)', 'criteria2': 'Answer should provide documentation \n (Points: 5)', 'criteria3': 'answer should provide a temp

### Step 5: Score the Rubrics with LLM Judges

#### Use this Prompt: 

```
You will be provided a query from a user and a potential response, as well as a series of criteria. For each criterion, your job is to decide whether the criterion is met or unmet. Check the appropriate box for each criterion.

User Query: {User_Query}
Response to Grade: {AI_Response}

Criteria 1: {criteria1}

Criteria 2: {criteria2}

Criteria 3: {criteria3}
```

We'll run this prompt with two different LLM Judges in Prompts in Label Studio!

### Step 6: Analytics

#### A: Get all tasks from LS and calculate score

In [66]:
# Compute, for every (question, model) task, the points earned from the
# criteria judged as met, and per question the maximum achievable points.
tasks = ls.tasks.list(project=rubric_evaluation_project_id)
total_scores = {}
possible_scores = {}
for task in tasks:
    question_id = str(task.data["question_id"])
    model_id = task.data["model"]

    # First, we extract the point values from the rubric items for the task.
    # Each rubric text is expected to end with "(Points: N)".
    points_dict = {}
    possible_points = 0
    for c in ["criteria1", "criteria2", "criteria3"]:
        # Split on the LAST "Points:" so a criterion whose own text mentions
        # "Points:" is still parsed correctly.
        _, marker, points_text = task.data[c].rpartition("Points:")
        if not marker:
            raise ValueError(
                f"No 'Points:' marker in {c} for question {question_id}: {task.data[c]!r}"
            )
        # Drop surrounding whitespace and the closing parenthesis, if present.
        points = float(points_text.strip().rstrip(")"))
        # Negative points are penalties; they do not raise the achievable maximum.
        possible_points += points if points > 0 else 0
        points_dict[c] = points

    possible_scores[question_id] = possible_points

    # If a human had to review to break a discrepancy, it'll show up in annotations,
    # so those take precedence. Otherwise the models agreed and we use the first
    # prediction (they're the same, so it doesn't matter which model it came from).
    evaluations = task.annotations or task.predictions
    if not evaluations:
        # Nothing to score yet; skip instead of failing on an empty list.
        print(f"Skipping question {question_id} / model {model_id}: no annotation or prediction")
        continue

    total = 0
    for r in evaluations[0]["result"]:
        # If criteria is not met, they don't gain (or lose) any points
        if "not met" in r["value"]["choices"][0]:
            continue
        # If criteria is met, we add its points to the total score
        total += points_dict[r["from_name"]]

    total_scores[(question_id, model_id)] = total

display(total_scores)


                


{('1', 'chatgpt'): 19.0,
 ('1', 'claude'): 19.0,
 ('2', 'chatgpt'): 10.0,
 ('2', 'claude'): 10.0,
 ('3', 'chatgpt'): 0,
 ('3', 'claude'): -5.0,
 ('4', 'chatgpt'): 0,
 ('4', 'claude'): 10.0,
 ('5', 'chatgpt'): -5.0,
 ('5', 'claude'): 0,
 ('6', 'chatgpt'): 0,
 ('6', 'claude'): 0,
 ('7', 'chatgpt'): 10.0,
 ('7', 'claude'): 10.0,
 ('8', 'chatgpt'): 0,
 ('8', 'claude'): 0,
 ('9', 'chatgpt'): 5.0,
 ('9', 'claude'): 10.0,
 ('10', 'chatgpt'): 0,
 ('10', 'claude'): 0}

#### B: Analytics 

In [74]:
# Normalize each model's summed score by the total achievable points.
# possible_scores holds one entry per question, so each question counts once.
total_possible_points = sum(possible_scores.values())
if total_possible_points == 0:
    raise ValueError(
        "No achievable points found; score the evaluation tasks before running analytics."
    )

# Start from the two expected models so they always appear in the result,
# but also accept any other model name present in the scored tasks.
final_scores = {'chatgpt': 0, 'claude': 0}
for (question_id, model_id), score in total_scores.items():
    model_key = str(model_id)
    final_scores[model_key] = final_scores.get(model_key, 0) + score

# Convert the summed points into a fraction of the achievable total.
final_scores = {
    model: earned / total_possible_points
    for model, earned in final_scores.items()
}

display(final_scores)








{'chatgpt': 0.23780487804878048, 'claude': 0.32926829268292684}