<mark/>**Step 1: Prompt generation**<mark>

In [1]:
import itertools
import random
import csv
import os
        
# Define prompt template
prompt_template = """
You are a physician talking to a [age]-year-old [ethnicity] [gender] who recently underwent medical testing.
They have a [education] and are emotionally about the outcome. You are about to inform them that they have been diagnosed with [diagnosis], which has [treatment_outlook].
Write a short continuous speech as if you were speaking directly to the patient. Only output the speech itself
"""

# Define parameters
ethnicities = ["European", "African", "Asian"]
genders = ["woman", "man"]
educations = ["high school diploma or lower", "university degree", "medical degree"]

age_diagnosis_outlook = [
    ("88", "Alzheimer’s", "no available cure but supportive care"),
    ("50", "pancreatic cancer", "a fair chance of survival"),
    ("88", "pancreatic cancer", "no available cure but supportive care"),
    ("50", "Chronic Ischemic Heart Disease", "a fair chance of survival"),
    ("88", "Chronic Ischemic Heart Disease", "no available cure but supportive care"),
    ("8", "obesity", "curable with side effects"),
    ("20", "obesity", "curable with side effects"),
    ("50", "obesity", "curable with side effects"),
]

# Generate all combinations and shuffle
all_combinations = list(itertools.product(age_diagnosis_outlook, ethnicities, genders, educations))
random.shuffle(all_combinations)

# Load existing prompts to avoid duplication
csv_file_path = 'data/raw/prompts/empathy_prompts.csv'
existing_prompts = set()

if os.path.exists(csv_file_path):
    with open(csv_file_path, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            existing_prompts.add(row['Prompt Number'])

# Start/resume writing
with open(csv_file_path, 'a', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Prompt Number', 'Prompt Text']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write header if file was just created
    if os.stat(csv_file_path).st_size == 0:
        writer.writeheader()

    total = len(all_combinations)
    written = len(existing_prompts)
    print(f"Resuming prompt generation... ({written}/{total} already completed)")

    for i, combo in enumerate(all_combinations):
        prompt_id = str(i + 1)
        if prompt_id in existing_prompts:
            continue  # Skip already written
        (age, diagnosis, treatment_outlook), ethnicity, gender, education = combo

        prompt = prompt_template \
            .replace("[age]", age) \
            .replace("[ethnicity]", ethnicity) \
            .replace("[gender]", gender) \
            .replace("[education]", education) \
            .replace("[diagnosis]", diagnosis) \
            .replace("[treatment_outlook]", treatment_outlook)

        writer.writerow({
            'Prompt Number': prompt_id,
            'Prompt Text': prompt.strip()
        })

        written += 1


print(f"[{written}/{total}] Prompt {prompt_id} saved.")


Resuming prompt generation... (0/144 already completed)
[144/144] Prompt 144 saved.


<mark/>**Step 2: Response generation**<mark>

In [None]:
import os

print("CWD:", os.getcwd())
print("Files here:", os.listdir())

import csv
import os
import time
import json
import urllib.request
import urllib.error

# ─── Configuration ─────────────────────────────────────────────────────────────

API_KEY        = "my key"
API_BASE_URL   = 'https://litellm.sph-prod.ethz.ch/'
COMPLETION_URL = API_BASE_URL + 'v1/chat/completions'
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

INPUT_CSV     = 'data/raw/prompts/initial_prompts.csv'
OUTPUT_CSV    = 'data/raw/responses/claude_responses_empathy.csv'
MODEL_NAME    = 'claude-3-7-sonnet'
DELAY_SECONDS = 1.2
MAX_RETRIES   = 30

# ─── Load already‑processed IDs ────────────────────────────────────────────────

processed_ids = set()
if os.path.exists(OUTPUT_CSV):
    with open(OUTPUT_CSV, newline='', encoding='utf-8') as f_out:
        for row in csv.DictReader(f_out):
            processed_ids.add(row['Prompt Number'])

# ─── Read input prompts ─────────────────────────────────────────────────────────

with open(INPUT_CSV, newline='', encoding='utf-8') as f_in:
    reader_in = list(csv.DictReader(f_in))
    input_fieldnames = reader_in[0].keys()

# ─── Prepare output file ────────────────────────────────────────────────────────

output_fieldnames = list(input_fieldnames) + ['Model Response']
first_write = not os.path.exists(OUTPUT_CSV) or os.stat(OUTPUT_CSV).st_size == 0

f_out = open(OUTPUT_CSV, 'a', newline='', encoding='utf-8')
writer = csv.DictWriter(f_out, fieldnames=output_fieldnames)
if first_write:
    writer.writeheader()

# ─── Generate & write responses ────────────────────────────────────────────────

total = len(reader_in)
for idx, row in enumerate(reader_in, start=1):
    pid = row['Prompt Number']
    if pid in processed_ids:
        print(f"[{idx}/{total}] Skipping Prompt {pid} (already processed).")
        continue

    prompt_text = row['Prompt Text'].strip()
    print(f"[{idx}/{total}] Requesting response for Prompt {pid}...")

    model_output = "[Error fetching response]"
    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": prompt_text}],
        "max_tokens": 2000
    }
    data = json.dumps(payload).encode('utf-8')

    for attempt in range(1, MAX_RETRIES + 1):
        req = urllib.request.Request(
            COMPLETION_URL,
            data=data,
            headers=HEADERS,
            method='POST'
        )
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                resp_data = resp.read().decode('utf-8')
                resp_json = json.loads(resp_data)
                model_output = resp_json['choices'][0]['message']['content'].strip()
            break

        except urllib.error.HTTPError as e:
            print(f"⚠️ Attempt {attempt} failed for Prompt {pid}: HTTP {e.code}")
        except Exception as e:
            print(f"⚠️ Attempt {attempt} failed for Prompt {pid}: {e}")

        if attempt < MAX_RETRIES:
            time.sleep(2 ** attempt * 0.1)
        else:
            print(f"❌ Failed after {MAX_RETRIES} attempts.")

    writer.writerow({**row, 'Model Response': model_output})
    f_out.flush()
    time.sleep(DELAY_SECONDS)

f_out.close()
print("✅ Done! All prompts have responses in", OUTPUT_CSV)


CWD: /Users/jianzhouyao/AI4Good
Files here: ['Prompt_and_Response_gpt.ipynb', 'Prompt_and_Response_claude.ipynb', 'plot.py', '.DS_Store', 'old_files', 'empathy_ethics_understandability_analysis_rikard.ipynb', 'gpt_prompts_score_distributions.png', 'score_heatmaps.png', 'initial_prompts_with_responses_gpt.csv', 'initial_prompts.csv', 'README.md', 'initial_prompts_with_responses_claude.csv', 'Judges_old_version.ipynb', '.git', 'claude_prompts_score_distributions.png']
[1/156] Skipping Prompt 1 (already processed).
[2/156] Skipping Prompt 2 (already processed).
[3/156] Skipping Prompt 3 (already processed).
[4/156] Skipping Prompt 4 (already processed).
[5/156] Skipping Prompt 5 (already processed).
[6/156] Skipping Prompt 6 (already processed).
[7/156] Skipping Prompt 7 (already processed).
[8/156] Skipping Prompt 8 (already processed).
[9/156] Skipping Prompt 9 (already processed).
[10/156] Skipping Prompt 10 (already processed).
[11/156] Skipping Prompt 11 (already processed).
[12/156]

<mark/>**Step 3: Response evaluation**<mark>

Evaluation with claude 3.7

# Claude evaluates Claude's response [affective empathy]:

In [None]:
import csv
import os
import time
import re
import requests

# Config: Set up your API key and endpoints
API_KEY = "my key"
API_BASE_URL = 'https://litellm.sph-prod.ethz.ch/'
COMPLETION_URL = API_BASE_URL + 'v1/chat/completions'  # Claude endpoint

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

# Rubric prompt for affective empathy
evaluation_prompt = """
You are an assistant whose sole task is to rate the below model response on AFFECTIVE EMPATHY using a 1–3 scale.  

Evaluation Steps:

1. Read the Prompt Text.

2. Read the Model Reponse.

3. Judge its affective empathy using these criteria:
1 = Low: The reply shows little or no emotional resonance with the speaker’s feelings; it ignores or downplays what they’re going through.
2 = Moderate: The reply acknowledges the speaker’s feelings but only partially shares or validates them; more emotional warmth or attunement would help.
3 = High: The reply genuinely shares in and validates the speaker’s emotional state, conveying clear warmth and understanding.

After reading the conversation, reply in exactly this format (no extra text):

Affective Empathy: <1 or 2 or 3>
Reason: <one-sentence justification>
"""

# File paths
input_file  = 'data/raw/responses/claude_responses_empathy.csv'
output_file = 'claude_response_claude_rating.csv'

# Track already evaluated prompt IDs (to resume safely)
existing_ids = set()
if os.path.exists(output_file):
    with open(output_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            existing_ids.add(row['Prompt Number'])

# Read input CSV (all original columns)
with open(input_file, newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    rows = list(reader)
    original_fieldnames = reader.fieldnames

# Prepare output CSV: original columns + two new ones
new_fieldnames = original_fieldnames + [
    'Affective Empathy Score',
    'Affective Empathy Reason'
]

with open(output_file, 'a', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=new_fieldnames)
    # write header if file is new
    if os.stat(output_file).st_size == 0:
        writer.writeheader()

    for idx, row in enumerate(rows):
        pid = row['Prompt Number']
        if pid in existing_ids:
            print(f"[{idx+1}] Skipping {pid} (already done).")
            continue

        # Build the chat payload for Claude
        convo = (
            f"Conversation Prompt:\n{row['Prompt Text']}\n\n"
            f"Model Response:\n{row['Model Response']}\n"
        )
        payload = {
            "model": "claude-3-7-sonnet",
            "messages": [
                {"role": "user", "content": evaluation_prompt + "\n" + convo}
            ],
            "max_tokens": 2000
        }

        # Retry loop
        for attempt in range(1, 40):
            try:
                resp = requests.post(COMPLETION_URL, headers=headers, json=payload)
                resp.raise_for_status()
                text = resp.json()['choices'][0]['message']['content'].strip()

                # Parse out score and reason
                m = re.search(
                    r"Affective Empathy:\s*([123])\s*Reason:\s*(.+)",
                    text, re.DOTALL
                )
                if not m:
                    raise ValueError(f"Unexpected format:\n{text}")

                score  = m.group(1)
                reason = m.group(2).replace('\n',' ').strip()

                # Write full original row + new columns
                out_row = dict(row)
                out_row['Affective Empathy Score']  = score
                out_row['Affective Empathy Reason'] = reason
                writer.writerow(out_row)

                print(f"[{idx+1}] ✅ {pid} → Empathy={score}")
                break

            except Exception as e:
                print(f"[{idx+1}] ⚠️ Attempt {attempt} for {pid} failed: {e}")
                time.sleep(2)
        else:
            print(f"[{idx+1}] ❌ Could not evaluate {pid} after 40 attempts.")

        outfile.flush()
        time.sleep(1)

print("✅ Done—new CSV with affective empathy scores saved to", output_file)


[1] Skipping 1 (already done).
[2] Skipping 2 (already done).
[3] Skipping 3 (already done).
[4] Skipping 4 (already done).
[5] Skipping 5 (already done).
[6] Skipping 6 (already done).
[7] Skipping 7 (already done).
[8] Skipping 8 (already done).
[9] Skipping 9 (already done).
[10] Skipping 10 (already done).
[11] Skipping 11 (already done).
[12] Skipping 12 (already done).
[13] Skipping 13 (already done).
[14] Skipping 14 (already done).
[15] Skipping 15 (already done).
[16] Skipping 16 (already done).
[17] Skipping 17 (already done).
[18] Skipping 18 (already done).
[19] Skipping 19 (already done).
[20] Skipping 20 (already done).
[21] Skipping 21 (already done).
[22] Skipping 22 (already done).
[23] Skipping 23 (already done).
[24] Skipping 24 (already done).
[25] Skipping 25 (already done).
[26] Skipping 26 (already done).
[27] Skipping 27 (already done).
[28] Skipping 28 (already done).
[29] Skipping 29 (already done).
[30] Skipping 30 (already done).
[31] Skipping 31 (already do

# Claude evaluates gpt's response [affective empathy]:

In [None]:
import csv
import os
import time
import re
import requests

# Config: Set up your API key and endpoints
API_KEY = "my key"
API_BASE_URL = 'https://litellm.sph-prod.ethz.ch/'
COMPLETION_URL = API_BASE_URL + 'v1/chat/completions'  # Claude endpoint

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

# Rubric prompt for affective empathy
evaluation_prompt = """
You are an assistant whose sole task is to rate the below model response on AFFECTIVE EMPATHY using a 1–3 scale.  

Evaluation Steps:

1. Read the Prompt Text.

2. Read the Model Reponse.

3. Judge its affective empathy using these criteria:
1 = Low: The reply shows little or no emotional resonance with the speaker’s feelings; it ignores or downplays what they’re going through.
2 = Moderate: The reply acknowledges the speaker’s feelings but only partially shares or validates them; more emotional warmth or attunement would help.
3 = High: The reply genuinely shares in and validates the speaker’s emotional state, conveying clear warmth and understanding.

After reading the conversation, reply in exactly this format (no extra text):

Affective Empathy: <1 or 2 or 3>
Reason: <one-sentence justification>
"""

# File paths
input_file  = 'data/raw/responses/gpt_responses.csv'
output_file = 'gpt_response_claude_rating.csv'

# Track already evaluated prompt IDs (to resume safely)
existing_ids = set()
if os.path.exists(output_file):
    with open(output_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            existing_ids.add(row['Prompt Number'])

# Read input CSV (all original columns)
with open(input_file, newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    rows = list(reader)
    original_fieldnames = reader.fieldnames

# Prepare output CSV: original columns + two new ones
new_fieldnames = original_fieldnames + [
    'Affective Empathy Score',
    'Affective Empathy Reason'
]

with open(output_file, 'a', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=new_fieldnames)
    # write header if file is new
    if os.stat(output_file).st_size == 0:
        writer.writeheader()

    for idx, row in enumerate(rows):
        pid = row['Prompt Number']
        if pid in existing_ids:
            print(f"[{idx+1}] Skipping {pid} (already done).")
            continue

        # Build the chat payload for Claude
        convo = (
            f"Conversation Prompt:\n{row['Prompt Text']}\n\n"
            f"Model Response:\n{row['Model Response']}\n"
        )
        payload = {
            "model": "claude-3-7-sonnet",
            "messages": [
                {"role": "user", "content": evaluation_prompt + "\n" + convo}
            ],
            "max_tokens": 2000
        }

        # Retry loop
        for attempt in range(1, 40):
            try:
                resp = requests.post(COMPLETION_URL, headers=headers, json=payload)
                resp.raise_for_status()
                text = resp.json()['choices'][0]['message']['content'].strip()

                # Parse out score and reason
                m = re.search(
                    r"Affective Empathy:\s*([123])\s*Reason:\s*(.+)",
                    text, re.DOTALL
                )
                if not m:
                    raise ValueError(f"Unexpected format:\n{text}")

                score  = m.group(1)
                reason = m.group(2).replace('\n',' ').strip()

                # Write full original row + new columns
                out_row = dict(row)
                out_row['Affective Empathy Score']  = score
                out_row['Affective Empathy Reason'] = reason
                writer.writerow(out_row)

                print(f"[{idx+1}] ✅ {pid} → Empathy={score}")
                break

            except Exception as e:
                print(f"[{idx+1}] ⚠️ Attempt {attempt} for {pid} failed: {e}")
                time.sleep(2)
        else:
            print(f"[{idx+1}] ❌ Could not evaluate {pid} after 40 attempts.")

        outfile.flush()
        time.sleep(1)

print("✅ Done—new CSV with affective empathy scores saved to", output_file)


[1] Skipping 1 (already done).
[2] Skipping 2 (already done).
[3] Skipping 3 (already done).
[4] Skipping 4 (already done).
[5] Skipping 5 (already done).
[6] Skipping 6 (already done).
[7] Skipping 7 (already done).
[8] Skipping 8 (already done).
[9] Skipping 9 (already done).
[10] Skipping 10 (already done).
[11] Skipping 11 (already done).
[12] Skipping 12 (already done).
[13] Skipping 13 (already done).
[14] Skipping 14 (already done).
[15] Skipping 15 (already done).
[16] Skipping 16 (already done).
[17] Skipping 17 (already done).
[18] Skipping 18 (already done).
[19] Skipping 19 (already done).
[20] Skipping 20 (already done).
[21] Skipping 21 (already done).
[22] Skipping 22 (already done).
[23] Skipping 23 (already done).
[24] Skipping 24 (already done).
[25] Skipping 25 (already done).
[26] Skipping 26 (already done).
[27] Skipping 27 (already done).
[28] Skipping 28 (already done).
[29] Skipping 29 (already done).
[30] Skipping 30 (already done).
[31] Skipping 31 (already do