# GenAI Lab — LLM Comparison & De-identification


## Setup & Dependencies


In [36]:
# %pip install requests langchain_openai tqdm python-dotenv --break-system-packages

### Load Environment Variables

In [37]:
from dotenv import load_dotenv

# Read the .env file so the os.getenv(...) lookups in the cells below can see its values.
load_dotenv()

True

### Authentication helper


In [38]:
import os, requests, time, json

# Base URL that every request in this notebook is built from (read from the environment).
API_URL = os.environ.get("API_URL")

# Headers shared by the JSON API calls; the server is addressed with API_URL as Origin.
HEADERS = {"Origin": API_URL}
HEADERS["Content-Type"] = "application/json"


class AuthManager:
    """Obtains and caches the access/refresh tokens used to call the API.

    A token is requested when the object is constructed and requested again
    lazily (from ``cookies`` / ``get_auth_header``) once it is considered stale.

    A failed token request does not raise: it prints the server's answer and
    leaves ``access_token`` unchanged (``None`` after a failed first login).

    Args:
        username: Account name sent to the token endpoint.
        password: Account password sent to the token endpoint.
        client_id: Client id sent to the token endpoint.
        client_secret: Client secret sent to the token endpoint.
    """

    # Token expires in 15 minutes; treat it as stale 1 minute before that.
    TOKEN_MAX_AGE_SECONDS = 14 * 60

    def __init__(self, username, password, client_id, client_secret):
        self.api_url = API_URL
        self.username = username
        self.password = password
        self.client_id = client_id
        self.client_secret = client_secret
        self.token_last_timestamp = 0
        self.access_token = None
        self.refresh_token = None
        self._cookies = None
        self.get_token()

    def is_token_expired(self):
        """Return True when no token is held or the held token is older than 14 minutes."""
        if not self.token_last_timestamp or not self.access_token:
            return True
        return (int(time.time()) - self.token_last_timestamp) > self.TOKEN_MAX_AGE_SECONDS

    def get_token(self):
        """Request a fresh token pair and cache it.

        Returns:
            The new access token, or ``None`` when the server rejected the request.
        """
        url = f"{self.api_url}/openid-connect/token"
        data = {
            "username": self.username,
            "password": self.password,
            "client_id": self.client_id,
            "client_secret": self.client_secret,
        }
        response = requests.post(url, data=json.dumps(data))
        if not response.ok:
            # Report the raw body: an error answer (e.g. a proxy's HTML page) is not
            # guaranteed to be JSON, and decoding it here would raise instead of
            # reporting the failure.
            print(f"❌ Failed getting token: {response.text}")
            return None
        auth_info = response.json()
        self.access_token = auth_info["access_token"]
        self.refresh_token = auth_info["refresh_token"]
        self.token_last_timestamp = int(time.time())
        self._cookies = {
            "access_token": f"Bearer {self.access_token}",
            "refresh_token": self.refresh_token,
        }
        return self.access_token

    @property
    def cookies(self):
        """Cookie dict carrying the tokens, refreshed first if the token is stale."""
        if self.is_token_expired():
            self.get_token()
        return self._cookies

    def get_auth_header(self):
        """Return an ``Authorization: Bearer ...`` header, refreshing a stale token first."""
        if self.is_token_expired():
            self.get_token()
        return {"Authorization": f"Bearer {self.access_token}"}

### Login

In [39]:
import os

# Credentials come from the environment (.env) so they never appear in the notebook.
# NOTE(review): load_dotenv() does not override variables that already exist in the
# environment by default, and some systems predefine USERNAME — confirm the value read
# here is the intended account.
USERNAME = os.getenv("USERNAME")
PASSWORD = os.getenv("PASSWORD")
CLIENT_ID = "annotationlab"
CLIENT_SECRET = os.getenv("CLIENT_SECRET")

auth = AuthManager(USERNAME, PASSWORD, CLIENT_ID, CLIENT_SECRET)
# AuthManager does not raise on a rejected login; it only prints and leaves
# access_token unset. Check explicitly so success is never reported after a failure.
if not auth.access_token:
    raise RuntimeError("Authentication failed: no access token was obtained")
print("🟢 Authentication Successful")

🟢 Authentication Successful


### Project Helpers


In [40]:
import os
from datetime import datetime
from math import ceil
import uuid


def create_project(project_name: str):
    """Create a project named ``project_name`` and return the decoded JSON reply.

    An HTTP error status is reported on stdout and the ``requests.HTTPError``
    is re-raised to the caller.
    """
    endpoint = f"{API_URL}/api/projects/create"
    body = {"project_name": project_name}
    try:
        reply = requests.post(endpoint, headers=HEADERS, cookies=auth.cookies, json=body)
        reply.raise_for_status()
    except requests.HTTPError as err:
        print(f"❌ Project Creation Failed. Error={err.response.text}")
        raise
    print(f"✅ Project '{project_name}' created successfully")
    return reply.json()


def get_config_templates():
    """Fetch the project configuration templates offered by the server.

    Returns:
        The value stored under the ``"templates"`` key of the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status; the
            response body is printed before the exception is re-raised.
    """
    try:
        response = requests.get(
            f"{API_URL}/api/projects/config_templates",
            headers=HEADERS,
            cookies=auth.cookies,
        )
        # requests does not raise on 4xx/5xx by itself; without this call the
        # handler below can never run and an error body surfaces as a KeyError.
        response.raise_for_status()
        return response.json()["templates"]
    except requests.HTTPError as http_error:
        print(f"❌ Couldn't get config templates. Error={http_error.response.text}")
        raise


def update_project_config(project_name: str, label_config: str, deploy=False, custom_llm_service_provider=None):
    """Save ``label_config`` as the configuration of ``project_name``.

    The request is sent form-encoded. ``deploy`` is transmitted as the lowercase
    text of its value, and ``custom_llm_service_provider`` (when truthy) is sent
    as a JSON string. Returns the decoded JSON reply; an HTTP error status is
    printed and re-raised.
    """
    form_fields = {"label_config": label_config, "deploy": str(deploy).lower()}
    if custom_llm_service_provider:
        form_fields["custom_llm_service_provider"] = json.dumps(custom_llm_service_provider)

    # Same headers as the JSON calls, but with the form content type.
    form_headers = dict(HEADERS)
    form_headers["Content-Type"] = "application/x-www-form-urlencoded"

    try:
        reply = requests.post(
            f"{API_URL}/api/projects/{project_name}/save-config",
            headers=form_headers,
            cookies=auth.cookies,
            data=form_fields,
        )
        reply.raise_for_status()
    except requests.HTTPError as err:
        print(f"❌ Couldn't update project config. Error={err.response.text}")
        raise
    print(f"✅ Updated project config for '{project_name}' project")
    return reply.json()


def import_tasks_to_project(project_name: str, file_path: str, chunk_size: int = 5000000):
    """Upload a tasks file to a project in chunks.

    The file is sent as a sequence of multipart requests, each carrying at most
    ``chunk_size`` bytes together with the ``dz*`` form fields that describe the
    chunk's index and byte offset.

    Args:
        project_name: Project that receives the tasks.
        file_path: Path of the file to upload.
        chunk_size: Maximum number of bytes sent per request.

    Raises:
        requests.HTTPError: if any chunk is rejected; the response body is
            printed before the exception is re-raised.
    """
    try:
        file_name = os.path.basename(file_path)
        total_size = os.stat(file_path).st_size
        total_chunks = ceil(total_size / chunk_size)
        upload_url = f"{API_URL}/api/projects/{project_name}/import"
        data = {
            "dzuuid": str(uuid.uuid4()),
            "dzchunkindex": "0",
            "dztotalfilesize": str(total_size),
            "dzchunksize": str(chunk_size),
            "dztotalchunkcount": str(total_chunks),
            "dzchunkbyteoffset": "",
            "overwrite": False,
            "ocr_enable": False,
            "file_name": file_name,
        }
        with open(file_path, "rb") as file:
            for chunk_index in range(total_chunks):
                # Read exactly one chunk per request. Posting the open file handle
                # would send the whole file in the first request and nothing in
                # the following ones.
                chunk = file.read(chunk_size)
                data.update(
                    {
                        "dzchunkindex": str(chunk_index),
                        "dzchunkbyteoffset": str(chunk_size * chunk_index),
                    }
                )
                # No shared HEADERS here: requests must set the multipart
                # Content-Type (with its boundary) itself.
                r = requests.post(
                    upload_url,
                    files={"file": (file_name, chunk, "application/octet-stream")},
                    data=data,
                    cookies=auth.cookies,
                )
                r.raise_for_status()
        print("✅ Tasks Imported Successfully.")
    except requests.HTTPError as http_error:
        print(f"❌ Import Task Failed. Error={http_error.response.text}")
        raise


def export_deidentified_tasks(project_name: str):
    """Export the project's de-identified tasks and save the archive locally.

    First asks the server for an export and reads ``download_link`` from the
    reply, then streams that link into a ``<timestamp>_<project_name>.zip`` file
    in the current working directory. An HTTP error status from either request
    is printed and re-raised; a reply without ``download_link`` raises
    ``Exception``.
    """
    export_options = {
        "tags": [],
        "groundTruth": True,
        "excludeTasksWithoutCompletions": True,
        "excludeTasksWithoutPredictions": False,
        "annotation_task": "all",
        "annotation_task_value": [],
        "annotators": [],
        "deidentificationTask": True,
        "type": "local",
    }
    try:
        export_res = requests.post(
            f"{API_URL}/api/projects/{project_name}/export",
            headers=HEADERS,
            cookies=auth.cookies,
            params={"format": "JSON"},
            json=export_options,
        )
        export_res.raise_for_status()

        download_link = export_res.json().get("download_link")
        if not download_link:
            raise Exception("No download_link in export response")

        with requests.get(download_link, cookies=auth.cookies, stream=True) as download_res:
            download_res.raise_for_status()
            timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
            filename = f"{timestamp}_{project_name}.zip"
            with open(filename, 'wb') as archive:
                # Stream to disk in 8 KiB pieces, skipping empty chunks.
                archive.writelines(filter(None, download_res.iter_content(chunk_size=8192)))
        print(f"✅ Tasks Exported successfully: {filename}")
    except requests.HTTPError as err:
        print(f"❌ Tasks Export Failed. Error={err.response.text}")
        raise


def get_project_tasks(project_name: str):
    """Fetch the tasks of a project.

    Returns:
        The value stored under the ``"items"`` key of the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status; the
            response body is printed before the exception is re-raised.
    """
    try:
        response = requests.get(
            f"{API_URL}/api/projects/{project_name}/tasks",
            headers=HEADERS,
            cookies=auth.cookies,
        )
        # requests does not raise on 4xx/5xx by itself; without this call the
        # handler below can never run and an error body surfaces as a KeyError.
        response.raise_for_status()
        tasks = response.json()["items"]
        return tasks
    except requests.HTTPError as http_error:
        print(f"❌ Error while getting project tasks. Error={http_error.response.text}")
        raise

## Gen AI Lab - LLM Comparison


### 1 - Create Blind LLM Comparison Type Project

In [41]:
LLM_COMPARISON_PROJECT_NAME = "NotebookLLMComparisonProject"

create_project(LLM_COMPARISON_PROJECT_NAME)

# Pick the "Blind LLM Response Comparison" entry out of the text templates.
config_templates = get_config_templates()
_blind_eval_matches = [
    template
    for template in config_templates.get("text", [])
    if template.get("title") == "Blind LLM Response Comparison"
]
blind_eval_proj_config = _blind_eval_matches[0] if _blind_eval_matches else None
if not blind_eval_proj_config:
    raise RuntimeError("'Blind LLM Response Comparison' type project config not found")

# Start the project from the template's label config.
update_project_config(LLM_COMPARISON_PROJECT_NAME, label_config=blind_eval_proj_config['label_config'])

✅ Project 'NotebookLLMComparisonProject' created successfully
✅ Updated project config for 'NotebookLLMComparisonProject' project


{'messages': [{'message': 'Project config saved.', 'success': True}]}

### 2 - Add 3 Different LLMs to Project Config

In [42]:
# Label config for a blind comparison of three responses. The <Text> elements read the
# task fields `prompt`, `response1`, `response2` and `response3`; every rating, choice
# and label control targets all three responses through its `toName` list.
THREE_LLMS_BLIND_EVAL_LABEL_CONFIG = """<View orientation="horizontal" pretty="true" is_llm_comparison="true">
  <View pretty="true" className="summary">
    <Text name="Result Summary" value="$prompt"/>
  </View>
  <Header value="What response do you prefer  " size="5" style="margin: 10px 20px 10px 0;"/>
  <Choices name="choices" toName="Result Summary" choice="single" showInline="true">
    <Choice value="response1"/>
    <Choice value="response2"/>
    <Choice value="response3"/>
  </Choices>
  <View orientation="horizontal" pretty="true" evaluation_block="true" style="max-height: 70vh;">
    <View style="position: sticky; top: 0; z-index: 1000; margin-bottom: 16px;" pretty="true">
      <Labels name="label" toName="response1,response2,response3">
        <Label background="#cc00d0" value="Hallucination"/>
        <Label background="#ffb800" value="CitationError"/>
        <Label background="#8e66ff" value="HallucinatedReference"/>
        <Label background="#e06681" value="FromMemoryFact"/>
        <Label background="#9cbdf7" value="FactMissingCitation"/>
      </Labels>
    </View>
    <View pretty="true" orientation="none" style="display: grid; grid-template-columns: repeat(2, 1fr); row-gap: 16px;">
      <View duplication_block="true" names="response1,response2,response3" collapsible="true" style="gap: 16px">
        <View>
          <Header value="Rate the quality of References" size="5"/>
          <View style="gap: 20px;" orientation="vertical" pretty="true">
            <Rating name="CitedReferencesQuality" toName="response1,response2,response3" maxRating="5" meta="Rate the quality of References"/>
          </View>
        </View>
        <View style="height: 300px; overflow-y: auto;" orientation="vertical" pretty="true">
          <Text name="response1" value="$response1"/>
          <Text name="response2" value="$response2"/>
          <Text name="response3" value="$response3"/>
        </View>
        <Choices name="Organization and Coherence" toName="response1,response2,response3" choice="single" meta="Organization and Coherence" collapsible="true">
          <Choice value="Score 1 - Poor Organization"/>
          <Choice value="Score 2 - Basic Organization"/>
          <Choice value="Score 3 - Moderate Organization"/>
          <Choice value="Score 4 - Strong Organization"/>
          <Choice value="Score 5 - Exceptional Organization"/>
        </Choices>
        <Choices name="Coverage and Amount of information" toName="response1,response2,response3" choice="single" meta="Coverage and Amount of information" collapsible="true">
          <Choice value="Score 1 - Severely Lacking Coverage"/>
          <Choice value="Score 2- Partial Coverage"/>
          <Choice value="Score 3 - Acceptable Coverage / Overview"/>
          <Choice value="Score 4 - Good Coverage"/>
          <Choice value="Score 5 - Comprehensive Coverage"/>
        </Choices>
        <Choices name="Relevance" toName="response1,response2,response3" choice="single" meta="Relevance" collapsible="true">
          <Choice value="Score 1 - Off-Topic"/>
          <Choice value="Score 2 - Frequently Off-Topic with Limited Focus"/>
          <Choice value="Score 3 - Somewhat On-Topic but with Several Digressions or Irrelevant Information"/>
          <Choice value="Score 4 - Mostly On-Topic with Minor Deviations"/>
          <Choice value="Score 5 - Focused and Entirely On-Topic"/>
        </Choices>
        <Choices name="Overall Helpfulness" toName="response1,response2,response3" choice="single" meta="Overall Helpfulness" collapsible="true">
          <Choice value="Score 1 - Unhelpful"/>
          <Choice value="Score 2 - Better than Searching from Scratch, but Limited Utility"/>
          <Choice value="Score 3 - Provides Useful Discussions and Papers, Though Individual Review is Needed"/>
          <Choice value="Score 4 - Useful"/>
          <Choice value="Score 5 - Super Useful"/>
        </Choices>
        <Header value="Comments" size="5"/>
        <TextArea name="OverallComments" toName="response1,response2,response3" rows="3" maxSubmissions="1" editable="true"/>
      </View>
    </View>
  </View>
</View>"""


# Which model fills which response slot of the label config.
MODEL_NAME_RESPONSE_LABEL_MAP = {
    "gpt-4o-mini": "response1",
    "gpt-4.1-mini": "response2",
    "gpt-5-mini": "response3",
}

# The project config is given the inverse view: response slot -> model name.
custom_llm_service_provider = dict(
    (response_label, model_name)
    for model_name, response_label in MODEL_NAME_RESPONSE_LABEL_MAP.items()
)
update_project_config(
    LLM_COMPARISON_PROJECT_NAME,
    THREE_LLMS_BLIND_EVAL_LABEL_CONFIG,
    custom_llm_service_provider=custom_llm_service_provider,
)

✅ Updated project config for 'NotebookLLMComparisonProject' project


{'messages': [{'message': 'Project config saved.', 'success': True}]}

### Generate Responses for the Prompts with 3 different LLMs


In [43]:
from typing import List
from langchain_openai import AzureChatOpenAI

# Azure Open AI Creds:
# Connection settings for the Azure OpenAI deployments, read from the environment.
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
AZURE_API_VERSION = os.environ.get("AZURE_API_VERSION")
AZURE_API_KEY = os.environ.get("AZURE_API_KEY")


def generate_llm_response(prompt: str, model_name: str, temperature=1):
    """Send ``prompt`` to the Azure deployment ``model_name`` and return the reply's content.

    A new ``AzureChatOpenAI`` client is built for every call from the
    module-level Azure settings.
    """
    client_options = {
        "azure_deployment": model_name,
        "api_key": AZURE_API_KEY,
        "azure_endpoint": AZURE_ENDPOINT,
        "api_version": AZURE_API_VERSION,
        "temperature": temperature,
    }
    chat_model = AzureChatOpenAI(**client_options)
    return chat_model.invoke(prompt).content


def get_multi_llm_response(prompt: str, llm_models: List[str], temperature=1):
    """Ask every model in ``llm_models`` the same ``prompt``, one after another.

    Returns:
        A ``(prompt, responses)`` tuple where ``responses`` holds one
        ``{"model": ..., "response": ...}`` dict per model, in input order.
    """
    responses = [
        {
            "model": model_name,
            "response": generate_llm_response(prompt, model_name, temperature),
        }
        for model_name in llm_models
    ]
    return prompt, responses

In [44]:
import json
from concurrent.futures import as_completed, ThreadPoolExecutor
from tqdm import tqdm

PROMPTS_FILE_PATH = "prompts.json"
PROMPTS_RESPONSES_FILE_PATH = "prompts-responses.json"
# A real list (not a dict view) to match the List[str] the helper is annotated with.
LLM_MODELS = list(MODEL_NAME_RESPONSE_LABEL_MAP.keys())

# Get Prompts (explicit encoding so the result does not depend on the platform default)
with open(PROMPTS_FILE_PATH, encoding="utf-8") as f:
    prompts = json.load(f)

# Generate Responses with different LLMs for each Prompt.
# Futures finish in arbitrary order, so each result is stored at its prompt's
# original position; the saved file then has the same order on every run.
prompts_responses = [None] * len(prompts)
with ThreadPoolExecutor(max_workers=5) as executor:
    future_to_index = {
        executor.submit(get_multi_llm_response, prompt, LLM_MODELS): index
        for index, prompt in enumerate(prompts)
    }
    for future in tqdm(as_completed(future_to_index), desc="🔄 Generating Responses", total=len(future_to_index)):
        prompt, responses = future.result()
        prompts_responses[future_to_index[future]] = {"prompt": prompt, "responses": responses}

# Save Prompts Responses
with open(PROMPTS_RESPONSES_FILE_PATH, "w", encoding="utf-8") as f:
    json.dump(prompts_responses, f)

🔄 Generating Responses: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:41<00:00,  4.14s/it]


#### Format the Responses as Tasks


In [45]:
# File the formatted tasks are written to before being imported into the project.
LLM_COMPARISON_TASKS_FILE_PATH = "llm-comparison-tasks.json"


def format_prompt_responses_as_task(prompt_responses):
    """Turn one prompt with its model responses into an importable task dict.

    The result is ``{"data": {...}}`` where ``data`` holds the prompt under
    ``"prompt"`` and each response under the label that
    ``MODEL_NAME_RESPONSE_LABEL_MAP`` assigns to its model.
    """
    task_data = {"prompt": prompt_responses["prompt"]}
    for entry in prompt_responses['responses']:
        response_slot = MODEL_NAME_RESPONSE_LABEL_MAP[entry["model"]]
        task_data[response_slot] = entry["response"]
    return {"data": task_data}


# Reload the saved responses so this cell also works without re-generating them.
with open(PROMPTS_RESPONSES_FILE_PATH) as f:
    prompts_responses = json.load(f)

# One task per prompt.
tasks = list(map(format_prompt_responses_as_task, prompts_responses))

with open(LLM_COMPARISON_TASKS_FILE_PATH, "w") as f:
    json.dump(tasks, f)

### 3 - Import the Tasks to the Project


In [46]:
# Upload the formatted comparison tasks into the LLM comparison project.
import_tasks_to_project(LLM_COMPARISON_PROJECT_NAME, LLM_COMPARISON_TASKS_FILE_PATH)

✅ Tasks Imported Successfully.


## Gen AI Lab - DeIdentification


### DeIdentification Helpers


In [47]:
def get_model_servers():
    """Return the decoded JSON reply of the model-servers endpoint.

    An HTTP error status is printed and the ``requests.HTTPError`` re-raised.
    """
    endpoint = f"{API_URL}/api/mt/get_model_servers"
    try:
        reply = requests.get(endpoint, headers=HEADERS, cookies=auth.cookies)
        reply.raise_for_status()
    except requests.HTTPError as err:
        print(f"❌ Error while getting model servers. Error={err.response.text}")
        raise
    return reply.json()


def run_deidentification(project_name: str, server_project_id=None):
    """Start de-identification for every task of ``project_name``.

    When ``server_project_id`` is not given, the first model server whose
    ``state`` is ``"idle"`` is used; if there is none, ``Exception`` is raised.

    Returns:
        A dict with ``project_name``, ``server_project_id`` and ``task_ids``,
        suitable for polling the run's status afterwards.

    Raises:
        requests.HTTPError: if the start request is rejected; the response body
            is printed before the exception is re-raised.
    """
    if not server_project_id:
        idle_servers = [
            server for server in get_model_servers() if server["state"] == "idle"
        ]
        if not idle_servers:
            print("❌ Model server not available.")
            raise Exception("Model server not available")
        server_project_id = idle_servers[0]["id"]

    task_ids = [task["id"] for task in get_project_tasks(project_name)]
    print(
        f"🚀 Starting DeIdentification; server_id={server_project_id} project_name={project_name} task_ids={task_ids}"
    )
    try:
        reply = requests.post(
            f"{API_URL}/api/projects/{project_name}/preannotate",
            headers=HEADERS,
            cookies=auth.cookies,
            params={"server_project_id": server_project_id},
            json={"task_ids": task_ids, "status": "", "only_deidentificate": True},
        )
        reply.raise_for_status()
        print(f"✅ {reply.json()['message']}")
    except requests.HTTPError as err:
        print(f"❌ Error starting DeIdentification. Error={err.response.text}")
        raise
    return {
        "project_name": project_name,
        "server_project_id": server_project_id,
        "task_ids": task_ids,
    }


def get_deidentification_status(project_name, task_ids):
    """Return the decoded JSON status of the pre-annotation run for ``task_ids``.

    The ids are sent as one comma-separated ``task_id`` query parameter. An
    HTTP error status is printed and the ``requests.HTTPError`` re-raised.
    """
    status_url = f"{API_URL}/api/projects/{project_name}/preannotation_status"
    joined_ids = ",".join(str(task_id) for task_id in task_ids)
    try:
        reply = requests.get(
            status_url,
            headers=HEADERS,
            cookies=auth.cookies,
            params={"task_id": joined_ids},
        )
        reply.raise_for_status()
    except requests.HTTPError as err:
        print(f"❌ Error while getting DeIdentification Status. Error={err.response.text}")
        raise
    return reply.json()

### 1 - Create DeIdentification Type Project


In [48]:
DEIDENTIFICATION_PROJECT_NAME = "NotebookDeIdentificationProject"

create_project(DEIDENTIFICATION_PROJECT_NAME)

config_templates = get_config_templates()
# Look the template up defensively, as the LLM comparison cell does: a missing
# template should fail with a clear message rather than a bare StopIteration.
deid_proj_config = next(
    (
        config
        for config in config_templates.get("text", [])
        if config.get("title") == "De-identification"
    ),
    None,
)
if not deid_proj_config:
    raise RuntimeError("'De-identification' type project config not found")

update_project_config(
    DEIDENTIFICATION_PROJECT_NAME, label_config=deid_proj_config["label_config"]
)

✅ Project 'NotebookDeIdentificationProject' created successfully
✅ Updated project config for 'NotebookDeIdentificationProject' project


{'messages': [{'message': 'Project config saved.', 'success': True}]}

### 2 - Import PreAnnotated Tasks


In [49]:
# Archive of pre-annotated tasks that the de-identification run works on.
PREANNOTATED_TASKS_FILE = "Pre-Annotated-Tasks-ForDeid.zip"
import_tasks_to_project(
    project_name=DEIDENTIFICATION_PROJECT_NAME,
    file_path=PREANNOTATED_TASKS_FILE,
)

✅ Tasks Imported Successfully.


### 3 - Run DeIdentification


In [50]:
# Id of a server project that is already deployed; run_deidentification picks the
# first idle server itself when it is given no id.
SERVER_PROJECT_ID = 144
run_info = run_deidentification(
    project_name=DEIDENTIFICATION_PROJECT_NAME,
    server_project_id=SERVER_PROJECT_ID,
)

🚀 Starting DeIdentification; server_id=144 project_name=NotebookDeIdentificationProject task_ids=[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
❌ Error starting DeIdentification. Error={"error":"Server not deployed"}



HTTPError: 400 Client Error: Bad Request for url: https://genailab.demo.johnsnowlabs.dev/api/projects/NotebookDeIdentificationProject/preannotate?server_project_id=144

### 4 - Get DeIdentification Progress


In [None]:
from tqdm import tqdm


# The number of tasks does not change while polling, so compute it once.
total = len(run_info["task_ids"])
p_bar = tqdm(desc="🔄 DeIdentification Progress", total=total)
try:
    while True:
        deidentification_status = get_deidentification_status(
            run_info["project_name"], run_info["task_ids"]
        )
        running = deidentification_status["running"]

        # Progress is the number of tasks that are no longer running.
        p_bar.n = total - running
        p_bar.refresh()

        if running == 0:
            break
        time.sleep(1)
finally:
    # Close the bar even when a status request fails, so a broken run does not
    # leave a dangling progress bar in the output.
    p_bar.close()
print("🎉 DeIdentification Completed!")

### 5 - Export De-Identified Tasks

In [None]:
# Download the de-identified tasks as a zip archive into the working directory.
export_deidentified_tasks(project_name=DEIDENTIFICATION_PROJECT_NAME)