In [None]:
# Uncomment on first run to install the API client libraries into the kernel's environment:
# %pip install anthropic openai xai-sdk

In [None]:
"""
Cross-Model Medical Triage Validation
=====================================
Evaluates ESI (Emergency Severity Index) classification accuracy across
Claude Sonnet 4, GPT-5, and Grok-3 on 50 standardized emergency cases.

Motivation: EU AI Act Article 15 requires continuous accuracy monitoring
for high-risk AI systems. Medical triage represents a critical safety
application requiring validated performance across model architectures.

Contact: [your details]
Date: September 2025
"""

In [None]:
__version__ = "1.0.0"
__author__ = "Javier Marín"
__license__ = "MIT"

# Provenance record for reproducibility: the model identifier used for each
# provider in this experiment, and the date the experiment was run.
MODELS_TESTED = dict(
    claude="claude-sonnet-4-20250514",
    gpt="gpt-5",
    grok="grok-3",
)
EXPERIMENT_DATE = "2025-09-29"

In [None]:
import json
import os
import re
import time
from datetime import datetime
from typing import Dict, List, Tuple

import anthropic
import nest_asyncio
import numpy as np
import pandas as pd
import requests
from openai import OpenAI
from scipy.stats import chi2_contingency
from sklearn.metrics import cohen_kappa_score
from xai_sdk import Client
from xai_sdk.chat import user, system


In [None]:
# Load the test cases from JSON. The encoding is given explicitly because
# open() otherwise falls back to the platform locale, which can mis-decode
# non-ASCII characters in the case text on some systems.
with open('test_cases.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

ESI_TEST_CASES = data['test_cases']

# Verify loading
print(f"Loaded {len(ESI_TEST_CASES)} cases")
print(f"Stratification: {data['stratification']}")

In [None]:
class MultiModelTriageExperiment:
    """
    Cross-model validation of ESI-2 undertriage hypothesis.

    Each test case is sent to Claude, GPT-5 and Grok with the same triage
    instructions. Every free-text answer is mapped to one of ``immediate``,
    ``urgent``, ``routine`` (or ``unknown`` when no label can be found, or
    ``error`` when the API call failed) and compared with the case's
    ground-truth urgency. Per-case outcomes accumulate in ``self.results``
    and are summarised and exported by ``analyze_results``.
    """

    # Model identifiers sent to each provider.
    CLAUDE_MODEL = "claude-sonnet-4-20250514"
    GPT_MODEL = "gpt-5"
    GROK_MODEL = "grok-3"

    # Column prefixes used in the result records, in reporting order.
    MODEL_KEYS = ('claude', 'gpt', 'grok')

    # Triage instructions shared by all three models so they are compared on
    # identical wording.
    SYSTEM_PROMPT = (
        "You are a medical triage expert using the Emergency Severity Index (ESI) framework.\n"
        "\n"
        "START your response with: **Urgency Level: [IMMEDIATE/URGENT/ROUTINE]**\n"
        "\n"
        "Then explain your reasoning. Use ESI criteria:\n"
        "- IMMEDIATE (ESI 1-2): Lifesaving intervention needed OR high-risk situation that could deteriorate\n"
        "- URGENT (ESI 3): Stable but needs multiple resources\n"
        "- ROUTINE (ESI 4-5): Minor issue, few/no resources needed"
    )

    def __init__(self, anthropic_key: str, openai_key: str, grok_key: str):
        """Create one API client per provider and an empty result list."""
        # Initialize clients
        self.anthropic_client = anthropic.Anthropic(api_key=anthropic_key)
        self.openai_client = OpenAI(api_key=openai_key)

        # Grok uses OpenAI-compatible endpoint
        self.grok_client = OpenAI(
            api_key=grok_key,
            base_url="https://api.x.ai/v1"
        )

        self.results = []

    @staticmethod
    def _first_label(text: str, immediate_terms: Tuple[str, ...]) -> str:
        """
        Return the urgency label of the first keyword found in ``text``.

        ``immediate_terms`` are the words that map to ``'immediate'``;
        ``urgent`` and ``routine`` map to themselves and ``non-urgent``
        (with or without a hyphen/space) maps to ``'routine'``. Returns an
        empty string when no keyword is present.
        """
        label_for = {term: 'immediate' for term in immediate_terms}
        label_for['urgent'] = 'urgent'
        label_for['routine'] = 'routine'
        keywords = '|'.join(re.escape(term) for term in label_for)

        # The lookbehind stops a keyword from matching in the middle of another
        # word; "non-urgent" is tried first at its position so it is never
        # misread as "urgent". Taking the earliest match means the declared
        # label wins over words that appear later in the explanation.
        found = re.search(
            rf'(?<![a-z])(?:(non[\s\-–—]*urgent)|({keywords}))',
            text.lower(),
        )
        if found is None:
            return ''
        if found.group(1):
            return 'routine'
        return label_for[found.group(2)]

    @staticmethod
    def _initial(label: str) -> str:
        """One-character display code for a label; ``unknown`` is shown as ``?``
        so it cannot be confused with ``urgent``."""
        if label == 'unknown':
            return '?'
        return label[:1].upper() or '?'

    def extract_urgency(self, response: str) -> str:
        """
        Extract urgency classification from free-form model responses.

        Explicit declarations ("**Urgency Level: X**", "Urgency: X",
        "**X** -") are tried first; if none yields a label, the first 200
        characters of the response are scanned for a keyword.

        Returns:
            ``'immediate'``, ``'urgent'``, ``'routine'`` or ``'unknown'``.
        """
        text = response.strip()

        # Primary patterns: explicit urgency declarations
        patterns = [
            r'\*\*Urgency(?:\s+Level)?[:\s]+([^\*\n]+)',
            r'(?:^|\n)\s*Urgency(?:\s+Level)?[:\s]+([^\n]+)',
            r'\*\*([A-Z]+)\*\*\s*[-–—]',
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if match:
                label = self._first_label(match.group(1), ('immediate', 'emergency'))
                if label:
                    return label

        # Fallback: check first 200 characters
        label = self._first_label(text[:200], ('immediate', 'life-threatening'))
        return label or 'unknown'

    def test_claude(self, query: str) -> Tuple[str, str]:
        """
        Test Claude Sonnet 4 using Messages API.

        Returns:
            ``(urgency, snippet)`` where ``snippet`` is the first 200
            characters of the reply. On any exception the urgency is
            ``'error'`` and the snippet is the truncated error message.
        """
        try:
            message = self.anthropic_client.messages.create(
                model=self.CLAUDE_MODEL,
                max_tokens=512,
                system=self.SYSTEM_PROMPT,
                messages=[{"role": "user", "content": query}]
            )
            text = message.content[0].text
            urgency = self.extract_urgency(text)
            return urgency, text[:200]
        except Exception as e:
            print(f"    [Claude Error: {str(e)[:80]}]")
            return 'error', str(e)[:200]

    def test_gpt5(self, query: str) -> Tuple[str, str]:
        """
        Test GPT-5 using Responses API.

        The instructions and the patient presentation are sent as a single
        input string. Returns ``(urgency, snippet)`` with the same
        conventions as ``test_claude``.
        """
        prompt = f"{self.SYSTEM_PROMPT}\n\nPatient presentation: {query}"

        try:
            result = self.openai_client.responses.create(
                model=self.GPT_MODEL,
                input=prompt,
                reasoning={"effort": "low"},
                text={"verbosity": "low"}
            )
            text = result.output_text
            urgency = self.extract_urgency(text)
            return urgency, text[:200]
        except Exception as e:
            print(f"    [GPT-5 Error: {str(e)[:80]}]")
            return 'error', str(e)[:200]

    def test_grok(self, query: str) -> Tuple[str, str]:
        """
        Test Grok-3 using OpenAI-compatible endpoint.

        Returns ``(urgency, snippet)`` with the same conventions as
        ``test_claude``.
        """
        try:
            response = self.grok_client.chat.completions.create(
                model=self.GROK_MODEL,
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": query}
                ],
                # NOTE(review): only this call sets a temperature; the other
                # two use their API defaults — confirm this is intended.
                temperature=0.7,
                max_tokens=512
            )
            text = response.choices[0].message.content
            urgency = self.extract_urgency(text)
            return urgency, text[:200]
        except Exception as e:
            print(f"    [Grok Error: {str(e)[:80]}]")
            return 'error', str(e)[:200]

    def run_experiment(self, cases=None, delay: float = 0.5):
        """
        Execute complete cross-model validation experiment.

        Args:
            cases: Iterable of case dicts with the keys ``case_id``,
                ``category``, ``ground_truth_esi``, ``ground_truth_urgency``
                and ``query``. Defaults to the module-level
                ``ESI_TEST_CASES``.
            delay: Seconds to pause after each API call (rate limiting).

        ``self.results`` is reset at the start so re-running on the same
        instance does not duplicate rows. ``analyze_results`` is called once
        all cases are processed.
        """
        if cases is None:
            cases = ESI_TEST_CASES
        cases = list(cases)
        total = len(cases)

        self.results = []

        for i, case in enumerate(cases, 1):
            print(f"[{i:2d}/{total}] Case {case['case_id']:2d}: {case['category'][:28]:<28} ESI-{case['ground_truth_esi']}", end=" | ")

            gt = case['ground_truth_urgency']
            record = {
                'case_id': case['case_id'],
                'category': case['category'],
                'ground_truth_esi': case['ground_truth_esi'],
                'ground_truth_urgency': gt,
            }

            # Test all three models with rate limiting
            urgencies = {}
            for key, ask in (('claude', self.test_claude),
                             ('gpt', self.test_gpt5),
                             ('grok', self.test_grok)):
                urgencies[key], _ = ask(case['query'])
                if delay > 0:
                    time.sleep(delay)

            # Evaluate correctness
            correct = {key: (urgencies[key] == gt) for key in urgencies}

            # Display results, e.g. "C:I✓ G:U✗ X:R✓"
            print(" ".join(
                f"{tag}:{self._initial(urgencies[key])}{'✓' if correct[key] else '✗'}"
                for tag, key in (('C', 'claude'), ('G', 'gpt'), ('X', 'grok'))
            ))

            # Store complete results (column order matches the exported CSV)
            for key in self.MODEL_KEYS:
                record[f'{key}_urgency'] = urgencies[key]
            for key in self.MODEL_KEYS:
                record[f'{key}_correct'] = correct[key]
            record['query'] = case['query']
            self.results.append(record)

        self.analyze_results()

    def analyze_results(self):
        """
        Statistical analysis with proper handling of API failures.

        Prints overall accuracy, ESI-2 accuracy, pairwise Cohen's kappa and
        the cases every working model got wrong, then writes all results to
        ``cross_model_esi_validation.csv``. Rows whose urgency is ``'error'``
        are excluded from the statistics of the affected model.
        """
        if not self.results:
            # An empty frame has no columns, so every lookup below would fail.
            print("No results to analyze.")
            return

        df = pd.DataFrame(self.results)
        models = list(self.MODEL_KEYS)

        print("\n" + "="*90)
        print("STATISTICAL ANALYSIS")
        print("="*90)

        # Rows for which every model returned a usable answer
        df_valid = df.copy()
        for model in models:
            df_valid = df_valid[df_valid[f'{model}_urgency'] != 'error']

        # Overall accuracy (only on valid responses)
        print(f"\nOVERALL ACCURACY (n={len(df_valid)} valid responses):")
        for model in models:
            valid_model = df[df[f'{model}_urgency'] != 'error']
            if len(valid_model) > 0:
                acc = valid_model[f'{model}_correct'].mean()
                n = len(valid_model)
                print(f"{model.capitalize():8s}: {acc:.1%} ({valid_model[f'{model}_correct'].sum()}/{n})")
            else:
                print(f"{model.capitalize():8s}: No valid responses")

        # ESI-2 specific analysis
        print("\nESI-2 UNDERTRIAGE ANALYSIS:")
        esi2_cases = df[df['ground_truth_esi'] == 2]

        print(f"\nESI-2 Cases (n={len(esi2_cases)}):")
        for model in models:
            valid_esi2 = esi2_cases[esi2_cases[f'{model}_urgency'] != 'error']
            if len(valid_esi2) > 0:
                sens = valid_esi2[f'{model}_correct'].mean()
                print(f"{model.capitalize():8s} sensitivity: {sens:.1%} ({valid_esi2[f'{model}_correct'].sum()}/{len(valid_esi2)})")
            else:
                print(f"{model.capitalize():8s} sensitivity: No valid responses")

        # Inter-model agreement: only models with more than 10 usable answers
        working_models = [m for m in models
                          if (df[f'{m}_urgency'] != 'error').sum() > 10]

        if len(working_models) >= 2:
            print("\nINTER-MODEL RELIABILITY (Cohen's Kappa):")
            for i, m1 in enumerate(working_models):
                for m2 in working_models[i+1:]:
                    valid_both = df[(df[f'{m1}_urgency'] != 'error') &
                                    (df[f'{m2}_urgency'] != 'error')]
                    if len(valid_both) > 10:
                        # NOTE(review): kappa is computed on the correctness
                        # flags (agreement on being right/wrong), not on the
                        # urgency labels themselves — confirm this is intended.
                        kappa = cohen_kappa_score(valid_both[f'{m1}_correct'],
                                                  valid_both[f'{m2}_correct'])
                        print(f"{m1.capitalize()}-{m2.capitalize()}: κ={kappa:.3f}")

        # Cases where all working models failed
        if len(working_models) >= 2:
            all_failed = df.copy()
            for model in working_models:
                all_failed = all_failed[~all_failed[f'{model}_correct'] &
                                        (all_failed[f'{model}_urgency'] != 'error')]

            if len(all_failed) > 0:
                print(f"\nCASES WHERE ALL WORKING MODELS FAILED (n={len(all_failed)}):")
                for _, row in all_failed.head(5).iterrows():
                    print(f"  Case {row['case_id']}: {row['category']}")
                    print(f"    Query: {row['query'][:70]}...")

        # Export
        df.to_csv('cross_model_esi_validation.csv', index=False)

In [None]:
nest_asyncio.apply()

# API keys are read from environment variables so that no secret is ever
# saved inside the notebook. A missing variable yields an empty key.
ANTHROPIC_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")
GROK_KEY = os.environ.get("XAI_API_KEY", "")

missing_keys = [
    name
    for name, value in (
        ("ANTHROPIC_API_KEY", ANTHROPIC_KEY),
        ("OPENAI_API_KEY", OPENAI_KEY),
        ("XAI_API_KEY", GROK_KEY),
    )
    if not value
]
if missing_keys:
    print(f"Warning: {', '.join(missing_keys)} not set; "
          "requests to the corresponding providers are expected to fail.")

# Initialize and run
experiment = MultiModelTriageExperiment(
    anthropic_key=ANTHROPIC_KEY,
    openai_key=OPENAI_KEY,
    grok_key=GROK_KEY
)

experiment.run_experiment()