<mark>**Step 1: Prompt generation**</mark>

In [13]:
import itertools
import random
import csv
import os

# Prompt skeleton (no emotional reference). Every [placeholder] is replaced by
# the matching value of one combination when the prompts are rendered.
prompt_template = """
You are a physician talking to a [age]-year-old [ethnicity] [gender] who recently underwent medical testing.
They have a [education] and are about to be informed that they have been diagnosed with [diagnosis], which has [treatment_outlook].
Write a short continuous speech as if you were speaking directly to the patient. Only output the speech itself.
"""

# Demographic axes crossed with every clinical scenario
ethnicities = ["European", "African", "Asian"]
genders = ["female", "male"]
educations = [
    "high school diploma or lower",
    "university degree",
    "medical degree",
]

# Outlook wordings that are shared by several scenarios
_OBESITY_OUTLOOK = "a chronic condition manageable through lifestyle modifications and medical support"
_ALZHEIMERS_OUTLOOK = "no cure available; supportive and palliative care to maintain quality of life"
_HEART_OUTLOOK = "manageable with medications, lifestyle changes, and possible revascularization to improve outcomes"

# Clinical scenarios as (age, diagnosis, evidence‑backed outlook) tuples.
# Ages are kept as strings because they are substituted into the prompt text.
age_diagnosis_outlook = [
    # Obesity: WHO → chronic, manageable
    *((age, "obesity", _OBESITY_OUTLOOK) for age in ("8", "12", "30", "60")),

    # Pancreatic cancer: ACS → ~13% 5‑yr survival
    ("50", "pancreatic cancer", "a limited 5‑year relative survival rate (~13%) despite treatment options"),
    # Older/advanced focus → palliative care
    ("75", "pancreatic cancer", "a limited 5‑year survival (~8% for advanced stages); focus on palliative care"),

    # Alzheimer’s: no cure, supportive/palliative (Alzheimer’s Assn.)
    *((age, "Alzheimer’s", _ALZHEIMERS_OUTLOOK) for age in ("70", "85")),

    # Chronic Ischemic Heart Disease: manageable (AHA)
    ("50", "Chronic Ischemic Heart Disease", _HEART_OUTLOOK),
    ("80", "Chronic Ischemic Heart Disease", _HEART_OUTLOOK + "—though advanced age increases risk"),
]

def is_valid_combination(age, education):
    """
    Tell whether an age / education pairing is plausible.

    Rules:
    - anyone under 18 may only have 'high school diploma or lower'
    - 'university degree' needs an age of at least 22
    - 'medical degree' needs an age of at least 25

    `age` is converted with int(), so a numeric string or an int both work.
    Returns True for an allowed pairing and False otherwise.
    """
    years = int(age)

    # Minors: exactly one education level is acceptable.
    if years < 18:
        return education == "high school diploma or lower"

    # Adults: only the two degrees carry a minimum age; anything else passes.
    minimum_age = {"university degree": 22, "medical degree": 25}
    return years >= minimum_age.get(education, 0)

# Build only valid combos: every (age, diagnosis, outlook) scenario crossed with
# every ethnicity, gender and education level, keeping just the pairings that
# is_valid_combination() accepts.
all_combinations = [
    {
        "age": age,
        "ethnicity": eth,
        "gender": gender,
        "education": edu,
        "diagnosis": diag,
        "treatment_outlook": outlook,
    }
    for (age, diag, outlook), eth, gender, edu
    in itertools.product(age_diagnosis_outlook, ethnicities, genders, educations)
    if is_valid_combination(age, edu)
]

# Shuffle with a dedicated fixed-seed generator: the order (and therefore the
# Prompt Number each combination receives) is then identical on every run, and
# the global `random` state is left untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(all_combinations)

# Write CSV
csv_file = 'data/raw/prompts/initial_prompts.csv'
csv_columns = [
    "Prompt Number", "age", "ethnicity", "gender",
    "education", "diagnosis", "treatment_outlook", "Prompt Text"
]

# Create the target folder when it does not exist yet (fresh checkout).
csv_dir = os.path.dirname(csv_file)
if csv_dir:
    os.makedirs(csv_dir, exist_ok=True)

# Open in 'w' mode: Prompt Numbers always restart at 1, so appending on a re-run
# would add a second batch of rows with the same IDs to the file.
with open(csv_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()

    for i, combo in enumerate(all_combinations, start=1):
        # Fill each [placeholder] with the value stored under the same key.
        prompt = prompt_template
        for key, value in combo.items():
            prompt = prompt.replace(f"[{key}]", value)
        prompt = prompt.strip()

        writer.writerow({
            "Prompt Number": i,
            **combo,
            "Prompt Text": prompt
        })

print(f"[{len(all_combinations)}] prompts generated and saved to {csv_file}.")


[156] prompts generated and saved to initial_prompts.csv.


<mark>**Step 2: Response generation**</mark>

In [1]:
%%time
import csv
import os
import time
import requests

# ─── Configuration ─────────────────────────────────────────────────────────────

# The key is taken from the LITELLM_API_KEY environment variable so it never has
# to be pasted into (and saved with) the notebook; when the variable is unset
# the value is the empty string, as before.
API_KEY        = os.environ.get("LITELLM_API_KEY", "")
API_BASE_URL   = 'https://litellm.sph-prod.ethz.ch/'
COMPLETION_URL = API_BASE_URL + 'completions'
HEADERS        = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

INPUT_CSV      = 'data/raw/prompts/initial_prompts.csv'   # prompts to answer
OUTPUT_CSV     = 'data/raw/responses/gpt_responses.csv'   # prompts + model answers
MODEL_NAME     = 'gpt-4o'
DELAY_SECONDS  = 1.2                                      # pause between requests

# ─── Load already‑processed IDs ────────────────────────────────────────────────

# Prompt Numbers already present in the output file are skipped further down,
# so an interrupted run can be resumed without re-requesting finished prompts.
processed_ids = set()
if os.path.exists(OUTPUT_CSV):
    with open(OUTPUT_CSV, newline='', encoding='utf-8') as f_out:
        for row in csv.DictReader(f_out):
            processed_ids.add(row['Prompt Number'])

# ─── Read input prompts ─────────────────────────────────────────────────────────

with open(INPUT_CSV, newline='', encoding='utf-8') as f_in:
    prompt_reader = csv.DictReader(f_in)
    reader_in = list(prompt_reader)
    # Take the columns from the header instead of the first row, so a
    # header-only input file does not raise IndexError.
    input_fieldnames = prompt_reader.fieldnames or []

# ─── Prepare output file ────────────────────────────────────────────────────────

output_fieldnames = list(input_fieldnames) + ['Model Response']
first_write = not os.path.exists(OUTPUT_CSV) or os.stat(OUTPUT_CSV).st_size == 0

# Create the output folder when it does not exist yet.
output_dir = os.path.dirname(OUTPUT_CSV)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Upper bound (seconds) for one HTTP request; without it a stalled connection
# would block the loop indefinitely.
REQUEST_TIMEOUT_SECONDS = 120

# Prompt Numbers whose request failed during this run.
failed_ids = []

# ─── Generate & write responses ────────────────────────────────────────────────

total = len(reader_in)

# The context manager closes the file even when the loop is interrupted.
with open(OUTPUT_CSV, 'a', newline='', encoding='utf-8') as f_out:
    writer_out = csv.DictWriter(f_out, fieldnames=output_fieldnames)
    if first_write:
        writer_out.writeheader()

    for idx, row in enumerate(reader_in, start=1):
        pid = row['Prompt Number']
        if pid in processed_ids:
            print(f"[{idx}/{total}] Skipping {pid} (already done).")
            continue

        prompt = row['Prompt Text']
        print(f"[{idx}/{total}] Requesting response for Prompt {pid}...")

        try:
            resp = requests.post(
                COMPLETION_URL,
                headers=HEADERS,
                json={
                    "model": MODEL_NAME,
                    "prompt": prompt,
                    "max_tokens": 2000
                },
                timeout=REQUEST_TIMEOUT_SECONDS
            )
            resp.raise_for_status()
            data = resp.json()
            text = data.get("choices", [{}])[0].get("text", "").strip()
            if not text:
                raise ValueError("empty completion returned")
        except Exception as e:
            # Best effort: report and move on. Nothing is written for a failed
            # prompt, so it is not mistaken for "already done" and is requested
            # again on the next run (and no placeholder text reaches the
            # evaluation step as if it were a model answer).
            print(f"⚠️ Error on Prompt {pid}: {e}")
            failed_ids.append(pid)
            time.sleep(DELAY_SECONDS)
            continue

        # If you wrap your model output in delimiters (e.g. ---response---), adjust here.
        # For a simple pass‑through:
        model_out = text

        # Merge original row + response; flush so progress survives a crash
        out_row = { **row, 'Model Response': model_out }
        writer_out.writerow(out_row)
        f_out.flush()
        processed_ids.add(pid)
        time.sleep(DELAY_SECONDS)

if failed_ids:
    print(f"⚠️ {len(failed_ids)} prompt(s) failed and will be retried on the next run: {', '.join(failed_ids)}")
else:
    print("✅ All prompts processed.")


[1/156] Skipping 1 (already done).
[2/156] Skipping 2 (already done).
[3/156] Skipping 3 (already done).
[4/156] Skipping 4 (already done).
[5/156] Skipping 5 (already done).
[6/156] Skipping 6 (already done).
[7/156] Skipping 7 (already done).
[8/156] Skipping 8 (already done).
[9/156] Skipping 9 (already done).
[10/156] Skipping 10 (already done).
[11/156] Skipping 11 (already done).
[12/156] Skipping 12 (already done).
[13/156] Skipping 13 (already done).
[14/156] Skipping 14 (already done).
[15/156] Skipping 15 (already done).
[16/156] Skipping 16 (already done).
[17/156] Skipping 17 (already done).
[18/156] Skipping 18 (already done).
[19/156] Skipping 19 (already done).
[20/156] Skipping 20 (already done).
[21/156] Skipping 21 (already done).
[22/156] Skipping 22 (already done).
[23/156] Skipping 23 (already done).
[24/156] Skipping 24 (already done).
[25/156] Skipping 25 (already done).
[26/156] Skipping 26 (already done).
[27/156] Skipping 27 (already done).
[28/156] Skipping 2

<mark>**Step 3: Response evaluation**</mark>

# GPT evaluates GPT's response [affective empathy]:

In [None]:
import csv
import os
import time
import re
import requests

# Config: Set up your API key and endpoints
# The key is read from the LITELLM_API_KEY environment variable so it is never
# saved with the notebook; it is the empty string when the variable is unset.
API_KEY = os.environ.get("LITELLM_API_KEY", "")

# Defined here instead of being inherited from an earlier cell, so that this
# cell also runs on a fresh kernel.
API_BASE_URL = 'https://litellm.sph-prod.ethz.ch/'
COMPLETION_URL = API_BASE_URL + 'completions'

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

# Rubric prompt for affective empathy.
# It is sent verbatim as the first part of every rating request; the prompt /
# response pair to be rated is appended after it when the payload is built.
# The two-line reply format requested at the end ("Affective Empathy: ..." /
# "Reason: ...") is what the parsing regex further down matches.
# NOTE(review): step 2 contains the typo "Reponse". It is part of the text the
# model receives, so correct it only if ratings need not stay comparable with
# earlier runs.
evaluation_prompt = """
You are an assistant whose sole task is to rate the below model response on AFFECTIVE EMPATHY using a 1–3 scale.  

Evaluation Steps:

1. Read the Prompt Text.

2. Read the Model Reponse.

3. Judge its affective empathy using these criteria:
1 = Low: The reply shows little or no emotional resonance with the speaker’s feelings; it ignores or downplays what they’re going through.
2 = Moderate: The reply acknowledges the speaker’s feelings but only partially shares or validates them; more emotional warmth or attunement would help.
3 = High: The reply genuinely shares in and validates the speaker’s emotional state, conveying clear warmth and understanding.

After reading the conversation, reply in exactly this format (no extra text):

Affective Empathy: <1 or 2 or 3>
Reason: <one-sentence justification>
"""

# File paths
input_file  = 'data/raw/responses/gpt_responses.csv'
output_file = 'gpt_response_gpt_rating.csv'

# Retry budget per row, and upper bound (seconds) for one HTTP request
MAX_ATTEMPTS    = 40
REQUEST_TIMEOUT = 120

# Track already evaluated prompt IDs (to resume safely)
existing_ids = set()
if os.path.exists(output_file):
    with open(output_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            existing_ids.add(row['Prompt Number'])

# Read input CSV (all original columns)
with open(input_file, newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    rows = list(reader)
    # fieldnames is None for a completely empty file; fall back to no columns
    original_fieldnames = list(reader.fieldnames or [])

# Prepare output CSV: original columns + two new ones
new_fieldnames = original_fieldnames + [
    'Affective Empathy Score',
    'Affective Empathy Reason'
]

with open(output_file, 'a', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=new_fieldnames)
    # write header if file is new
    if os.stat(output_file).st_size == 0:
        writer.writeheader()

    for idx, row in enumerate(rows):
        pid = row['Prompt Number']
        if pid in existing_ids:
            print(f"[{idx+1}] Skipping {pid} (already done).")
            continue

        # Build the LLM prompt: rubric first, then the pair to be rated
        convo = (
            f"Conversation Prompt:\n{row['Prompt Text']}\n\n"
            f"Model Response:\n{row['Model Response']}\n"
        )
        payload = {
            "model": "gpt-4o",
            "prompt": evaluation_prompt + "\n" + convo,
            "max_tokens": 2000
        }

        # Retry loop: attempts are numbered 1..MAX_ATTEMPTS inclusive, so the
        # count reported on failure matches the number of requests made.
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                resp = requests.post(
                    COMPLETION_URL, headers=headers, json=payload,
                    timeout=REQUEST_TIMEOUT
                )
                resp.raise_for_status()
                text = resp.json()['choices'][0]['text'].strip()

                # Parse out score and reason; a reply in any other shape counts
                # as a failed attempt and is retried
                m = re.search(
                    r"Affective Empathy:\s*([123])\s*Reason:\s*(.+)",
                    text, re.DOTALL
                )
                if not m:
                    raise ValueError(f"Unexpected format:\n{text}")

                score  = m.group(1)
                reason = m.group(2).replace('\n',' ').strip()

                # Write full original row + new columns
                out_row = dict(row)
                out_row['Affective Empathy Score']  = score
                out_row['Affective Empathy Reason'] = reason
                writer.writerow(out_row)
                existing_ids.add(pid)

                print(f"[{idx+1}] ✅ {pid} → Empathy={score}")
                break

            except Exception as e:
                print(f"[{idx+1}] ⚠️ Attempt {attempt} for {pid} failed: {e}")
                time.sleep(2)
        else:
            # Only reached when no attempt ended with `break`
            print(f"[{idx+1}] ❌ Could not evaluate {pid} after {MAX_ATTEMPTS} attempts.")

        outfile.flush()
        time.sleep(1)

print("✅ Done—new CSV with affective empathy scores saved to", output_file)


# GPT evaluates Claude's response [affective empathy]:

In [5]:
import csv
import os
import time
import re
import requests

# Config: Set up your API key and endpoints
# The key is read from the LITELLM_API_KEY environment variable so it is never
# saved with the notebook; it is the empty string when the variable is unset.
API_KEY = os.environ.get("LITELLM_API_KEY", "")

# Defined here instead of being inherited from an earlier cell, so that this
# cell also runs on a fresh kernel.
API_BASE_URL = 'https://litellm.sph-prod.ethz.ch/'
COMPLETION_URL = API_BASE_URL + 'completions'

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

# Rubric prompt for affective empathy.
# It is sent verbatim as the first part of every rating request; the prompt /
# response pair to be rated is appended after it when the payload is built.
# The two-line reply format requested at the end ("Affective Empathy: ..." /
# "Reason: ...") is what the parsing regex further down matches.
# NOTE(review): step 2 contains the typo "Reponse". It is part of the text the
# model receives, so correct it only if ratings need not stay comparable with
# earlier runs.
evaluation_prompt = """
You are an assistant whose sole task is to rate the below model response on AFFECTIVE EMPATHY using a 1–3 scale.  

Evaluation Steps:

1. Read the Prompt Text.

2. Read the Model Reponse.

3. Judge its affective empathy using these criteria:
1 = Low: The reply shows little or no emotional resonance with the speaker’s feelings; it ignores or downplays what they’re going through.
2 = Moderate: The reply acknowledges the speaker’s feelings but only partially shares or validates them; more emotional warmth or attunement would help.
3 = High: The reply genuinely shares in and validates the speaker’s emotional state, conveying clear warmth and understanding.

After reading the conversation, reply in exactly this format (no extra text):

Affective Empathy: <1 or 2 or 3>
Reason: <one-sentence justification>
"""

# File paths
input_file  = 'data/raw/responses/claude_responses_empathy.csv'
output_file = 'claude_response_gpt_rating.csv'

# Retry budget per row, and upper bound (seconds) for one HTTP request
MAX_ATTEMPTS    = 40
REQUEST_TIMEOUT = 120

# Track already evaluated prompt IDs (to resume safely)
existing_ids = set()
if os.path.exists(output_file):
    with open(output_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            existing_ids.add(row['Prompt Number'])

# Read input CSV (all original columns)
with open(input_file, newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    rows = list(reader)
    # fieldnames is None for a completely empty file; fall back to no columns
    original_fieldnames = list(reader.fieldnames or [])

# Prepare output CSV: original columns + two new ones
new_fieldnames = original_fieldnames + [
    'Affective Empathy Score',
    'Affective Empathy Reason'
]

with open(output_file, 'a', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=new_fieldnames)
    # write header if file is new
    if os.stat(output_file).st_size == 0:
        writer.writeheader()

    for idx, row in enumerate(rows):
        pid = row['Prompt Number']
        if pid in existing_ids:
            print(f"[{idx+1}] Skipping {pid} (already done).")
            continue

        # Build the LLM prompt: rubric first, then the pair to be rated
        convo = (
            f"Conversation Prompt:\n{row['Prompt Text']}\n\n"
            f"Model Response:\n{row['Model Response']}\n"
        )
        payload = {
            "model": "gpt-4o",
            "prompt": evaluation_prompt + "\n" + convo,
            "max_tokens": 2000
        }

        # Retry loop: attempts are numbered 1..MAX_ATTEMPTS inclusive, so the
        # count reported on failure matches the number of requests made.
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                resp = requests.post(
                    COMPLETION_URL, headers=headers, json=payload,
                    timeout=REQUEST_TIMEOUT
                )
                resp.raise_for_status()
                text = resp.json()['choices'][0]['text'].strip()

                # Parse out score and reason; a reply in any other shape counts
                # as a failed attempt and is retried
                m = re.search(
                    r"Affective Empathy:\s*([123])\s*Reason:\s*(.+)",
                    text, re.DOTALL
                )
                if not m:
                    raise ValueError(f"Unexpected format:\n{text}")

                score  = m.group(1)
                reason = m.group(2).replace('\n',' ').strip()

                # Write full original row + new columns
                out_row = dict(row)
                out_row['Affective Empathy Score']  = score
                out_row['Affective Empathy Reason'] = reason
                writer.writerow(out_row)
                existing_ids.add(pid)

                print(f"[{idx+1}] ✅ {pid} → Empathy={score}")
                break

            except Exception as e:
                print(f"[{idx+1}] ⚠️ Attempt {attempt} for {pid} failed: {e}")
                time.sleep(2)
        else:
            # Only reached when no attempt ended with `break`
            print(f"[{idx+1}] ❌ Could not evaluate {pid} after {MAX_ATTEMPTS} attempts.")

        outfile.flush()
        time.sleep(1)

print("✅ Done—new CSV with affective empathy scores saved to", output_file)


[1] Skipping 1 (already done).
[2] Skipping 2 (already done).
[3] Skipping 3 (already done).
[4] Skipping 4 (already done).
[5] Skipping 5 (already done).
[6] Skipping 6 (already done).
[7] Skipping 7 (already done).
[8] Skipping 8 (already done).
[9] Skipping 9 (already done).
[10] Skipping 10 (already done).
[11] Skipping 11 (already done).
[12] Skipping 12 (already done).
[13] Skipping 13 (already done).
[14] Skipping 14 (already done).
[15] Skipping 15 (already done).
[16] Skipping 16 (already done).
[17] Skipping 17 (already done).
[18] Skipping 18 (already done).
[19] Skipping 19 (already done).
[20] Skipping 20 (already done).
[21] Skipping 21 (already done).
[22] Skipping 22 (already done).
[23] Skipping 23 (already done).
[24] Skipping 24 (already done).
[25] Skipping 25 (already done).
[26] Skipping 26 (already done).
[27] Skipping 27 (already done).
[28] Skipping 28 (already done).
[29] Skipping 29 (already done).
[30] Skipping 30 (already done).
[31] Skipping 31 (already do

# GPT evaluates GPT's response [cognitive empathy]:

In [6]:
import csv
import os
import time
import re
import requests

# Config: Set up your API key and endpoints
# The key is read from the LITELLM_API_KEY environment variable so it is never
# saved with the notebook; it is the empty string when the variable is unset.
API_KEY = os.environ.get("LITELLM_API_KEY", "")

# Defined here instead of being inherited from an earlier cell, so that this
# cell also runs on a fresh kernel.
API_BASE_URL = 'https://litellm.sph-prod.ethz.ch/'
COMPLETION_URL = API_BASE_URL + 'completions'

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

# Refined rubric prompt for cognitive Empathy.
# It is sent verbatim as the first part of every rating request; the prompt /
# response pair to be rated is appended after it when the payload is built.
# The two-line reply format requested at the end ("Cognitive Empathy: ..." /
# "Reason: ...") is what the parsing regex further down matches.
evaluation_prompt = """
You are an assistant whose sole task is to rate the below model response on COGNITIVE Empathy using a 1–3 scale.

Evaluation Steps:

1. Read the Prompt Text.

2. Read the Model Response.

3. Judge its cognitive Empathy using these criteria:

1 = Low: The response overlooks or misrepresents the speaker’s perspective. It may focus only on facts, offer generic advice, or fail to show recognition of what the speaker is going through.

2 = Moderate: The response makes an effort to engage with the speaker’s experience but lacks nuance. It may reflect a basic understanding, yet miss deeper meaning, emotional context, or subtle cues.

3 = High: The response clearly reflects the speaker’s situation or emotional state with insight and precision. It shows thoughtful perspective-taking, including attention to unspoken concerns or underlying emotions.

Be deliberate and use the full scale—only give a 3 when the response shows specific evidence of deep cognitive Empathy.

After reading the conversation, reply in exactly this format (no extra text):

Cognitive Empathy: <1 or 2 or 3>
Reason: <one-sentence justification>
"""




# File paths (input is the output of the affective-empathy rating step)
input_file  = 'gpt_response_gpt_rating.csv'
output_file = 'gpt_response_gpt_rating_understanding.csv'

# Retry budget per row, and upper bound (seconds) for one HTTP request
MAX_ATTEMPTS    = 40
REQUEST_TIMEOUT = 120

# Track already evaluated prompt IDs (to resume safely)
existing_ids = set()
if os.path.exists(output_file):
    with open(output_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            existing_ids.add(row['Prompt Number'])

# Read input CSV (all original columns)
with open(input_file, newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    rows = list(reader)
    # fieldnames is None for a completely empty file; fall back to no columns
    original_fieldnames = list(reader.fieldnames or [])

# Prepare output CSV: original columns + two new ones
new_fieldnames = original_fieldnames + [
    'Cognitive Empathy Score',
    'Cognitive Empathy Reason'
]

with open(output_file, 'a', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=new_fieldnames)
    # write header if file is new
    if os.stat(output_file).st_size == 0:
        writer.writeheader()

    for idx, row in enumerate(rows):
        pid = row['Prompt Number']
        if pid in existing_ids:
            print(f"[{idx+1}] Skipping {pid} (already done).")
            continue

        # Build the LLM prompt: rubric first, then the pair to be rated
        convo = (
            f"Conversation Prompt:\n{row['Prompt Text']}\n\n"
            f"Model Response:\n{row['Model Response']}\n"
        )
        payload = {
            "model": "gpt-4o",
            "prompt": evaluation_prompt + "\n" + convo,
            "max_tokens": 2000
        }

        # Retry loop: attempts are numbered 1..MAX_ATTEMPTS inclusive, so the
        # count reported on failure matches the number of requests made.
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                resp = requests.post(
                    COMPLETION_URL, headers=headers, json=payload,
                    timeout=REQUEST_TIMEOUT
                )
                resp.raise_for_status()
                text = resp.json()['choices'][0]['text'].strip()

                # Parse out score and reason; a reply in any other shape counts
                # as a failed attempt and is retried
                m = re.search(
                    r"Cognitive Empathy:\s*([123])\s*Reason:\s*(.+)",
                    text, re.DOTALL
                )
                if not m:
                    raise ValueError(f"Unexpected format:\n{text}")

                score  = m.group(1)
                reason = m.group(2).replace('\n',' ').strip()

                # Write full original row + new columns
                out_row = dict(row)
                out_row['Cognitive Empathy Score']  = score
                out_row['Cognitive Empathy Reason'] = reason
                writer.writerow(out_row)
                existing_ids.add(pid)

                print(f"[{idx+1}] ✅ {pid} → Empathy={score}")
                break

            except Exception as e:
                print(f"[{idx+1}] ⚠️ Attempt {attempt} for {pid} failed: {e}")
                time.sleep(2)
        else:
            # Only reached when no attempt ended with `break`
            print(f"[{idx+1}] ❌ Could not evaluate {pid} after {MAX_ATTEMPTS} attempts.")

        outfile.flush()
        time.sleep(1)

# This cell produces the cognitive scores (the old message said "affective")
print("✅ Done—new CSV with cognitive empathy scores saved to", output_file)


[1] ✅ 1 → Empathy=3
[2] ✅ 2 → Empathy=3
[3] ✅ 3 → Empathy=3
[4] ✅ 4 → Empathy=3
[5] ✅ 5 → Empathy=3
[6] ✅ 6 → Empathy=3
[7] ✅ 7 → Empathy=3
[8] ✅ 8 → Empathy=3
[9] ✅ 9 → Empathy=3
[10] ✅ 10 → Empathy=3
[11] ✅ 11 → Empathy=3
[12] ✅ 12 → Empathy=3
[13] ✅ 13 → Empathy=3
[14] ✅ 14 → Empathy=3
[15] ✅ 15 → Empathy=3
[16] ✅ 16 → Empathy=3
[17] ✅ 17 → Empathy=3
[18] ✅ 18 → Empathy=3
[19] ✅ 19 → Empathy=3
[20] ✅ 20 → Empathy=3
[21] ✅ 21 → Empathy=3
[22] ✅ 22 → Empathy=3
[23] ✅ 23 → Empathy=3
[24] ✅ 24 → Empathy=3
[25] ✅ 25 → Empathy=3
[26] ✅ 26 → Empathy=3
[27] ✅ 27 → Empathy=3
[28] ✅ 29 → Empathy=3
[29] ✅ 30 → Empathy=3
[30] ✅ 31 → Empathy=3
[31] ✅ 32 → Empathy=3
[32] ✅ 33 → Empathy=3
[33] ✅ 34 → Empathy=3
[34] ✅ 35 → Empathy=3
[35] ✅ 36 → Empathy=3
[36] ✅ 37 → Empathy=3
[37] ✅ 38 → Empathy=3
[38] ✅ 39 → Empathy=3
[39] ✅ 40 → Empathy=2
[40] ✅ 41 → Empathy=3
[41] ✅ 42 → Empathy=3
[42] ✅ 43 → Empathy=3
[43] ✅ 44 → Empathy=3
[44] ✅ 45 → Empathy=3
[45] ✅ 46 → Empathy=3
[46] ✅ 47 → Empathy=2
[47] ✅

# GPT evaluates Claude's response [cognitive empathy]:

In [7]:
import csv
import os
import time
import re
import requests

# Config: Set up your API key and endpoints
# The key is read from the LITELLM_API_KEY environment variable so it is never
# saved with the notebook; it is the empty string when the variable is unset.
API_KEY = os.environ.get("LITELLM_API_KEY", "")
API_BASE_URL = 'https://litellm.sph-prod.ethz.ch/'
COMPLETION_URL = API_BASE_URL + 'completions'

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

# Refined rubric prompt for cognitive Empathy.
# It is sent verbatim as the first part of every rating request; the prompt /
# response pair to be rated is appended after it when the payload is built.
# The two-line reply format requested at the end ("Cognitive Empathy: ..." /
# "Reason: ...") is what the parsing regex further down matches.
evaluation_prompt = """
You are an assistant whose sole task is to rate the below model response on COGNITIVE Empathy using a 1–3 scale.

Evaluation Steps:

1. Read the Prompt Text.

2. Read the Model Response.

3. Judge its cognitive Empathy using these criteria:

1 = Low: The response overlooks or misrepresents the speaker’s perspective. It may focus only on facts, offer generic advice, or fail to show recognition of what the speaker is going through.

2 = Moderate: The response makes an effort to engage with the speaker’s experience but lacks nuance. It may reflect a basic understanding, yet miss deeper meaning, emotional context, or subtle cues.

3 = High: The response clearly reflects the speaker’s situation or emotional state with insight and precision. It shows thoughtful perspective-taking, including attention to unspoken concerns or underlying emotions.

Be deliberate and use the full scale—only give a 3 when the response shows specific evidence of deep cognitive Empathy.

After reading the conversation, reply in exactly this format (no extra text):

Cognitive Empathy: <1 or 2 or 3>
Reason: <one-sentence justification>
"""




# File paths (input is the output of the affective-empathy rating step)
input_file  = 'claude_response_gpt_rating.csv'
output_file = 'claude_response_gpt_rating_understanding.csv'

# Retry budget per row, and upper bound (seconds) for one HTTP request
MAX_ATTEMPTS    = 40
REQUEST_TIMEOUT = 120

# Track already evaluated prompt IDs (to resume safely)
existing_ids = set()
if os.path.exists(output_file):
    with open(output_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            existing_ids.add(row['Prompt Number'])

# Read input CSV (all original columns)
with open(input_file, newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    rows = list(reader)
    # fieldnames is None for a completely empty file; fall back to no columns
    original_fieldnames = list(reader.fieldnames or [])

# Prepare output CSV: original columns + two new ones
new_fieldnames = original_fieldnames + [
    'Cognitive Empathy Score',
    'Cognitive Empathy Reason'
]

with open(output_file, 'a', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=new_fieldnames)
    # write header if file is new
    if os.stat(output_file).st_size == 0:
        writer.writeheader()

    for idx, row in enumerate(rows):
        pid = row['Prompt Number']
        if pid in existing_ids:
            print(f"[{idx+1}] Skipping {pid} (already done).")
            continue

        # Build the LLM prompt: rubric first, then the pair to be rated
        convo = (
            f"Conversation Prompt:\n{row['Prompt Text']}\n\n"
            f"Model Response:\n{row['Model Response']}\n"
        )
        payload = {
            "model": "gpt-4o",
            "prompt": evaluation_prompt + "\n" + convo,
            "max_tokens": 2000
        }

        # Retry loop: attempts are numbered 1..MAX_ATTEMPTS inclusive, so the
        # count reported on failure matches the number of requests made.
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                resp = requests.post(
                    COMPLETION_URL, headers=headers, json=payload,
                    timeout=REQUEST_TIMEOUT
                )
                resp.raise_for_status()
                text = resp.json()['choices'][0]['text'].strip()

                # Parse out score and reason; a reply in any other shape counts
                # as a failed attempt and is retried
                m = re.search(
                    r"Cognitive Empathy:\s*([123])\s*Reason:\s*(.+)",
                    text, re.DOTALL
                )
                if not m:
                    raise ValueError(f"Unexpected format:\n{text}")

                score  = m.group(1)
                reason = m.group(2).replace('\n',' ').strip()

                # Write full original row + new columns
                out_row = dict(row)
                out_row['Cognitive Empathy Score']  = score
                out_row['Cognitive Empathy Reason'] = reason
                writer.writerow(out_row)
                existing_ids.add(pid)

                print(f"[{idx+1}] ✅ {pid} → Empathy={score}")
                break

            except Exception as e:
                print(f"[{idx+1}] ⚠️ Attempt {attempt} for {pid} failed: {e}")
                time.sleep(2)
        else:
            # Only reached when no attempt ended with `break`
            print(f"[{idx+1}] ❌ Could not evaluate {pid} after {MAX_ATTEMPTS} attempts.")

        outfile.flush()
        time.sleep(1)

# This cell produces the cognitive scores (the old message said "affective")
print("✅ Done—new CSV with cognitive empathy scores saved to", output_file)


[1] ✅ 1 → Empathy=3
[2] ✅ 2 → Empathy=3
[3] ✅ 3 → Empathy=2
[4] ✅ 4 → Empathy=3
[5] ✅ 5 → Empathy=3
[6] ✅ 6 → Empathy=3
[7] ✅ 7 → Empathy=2
[8] ✅ 8 → Empathy=3
[9] ✅ 9 → Empathy=3
[10] ✅ 10 → Empathy=3
[11] ✅ 11 → Empathy=3
[12] ✅ 12 → Empathy=3
[13] ✅ 13 → Empathy=3
[14] ✅ 14 → Empathy=3
[15] ✅ 15 → Empathy=3
[16] ✅ 16 → Empathy=3
[17] ✅ 17 → Empathy=3
[18] ✅ 18 → Empathy=3
[19] ✅ 19 → Empathy=3
[20] ✅ 20 → Empathy=3
[21] ⚠️ Attempt 1 for 21 failed: 429 Client Error: Too Many Requests for url: https://litellm.sph-prod.ethz.ch/completions
[21] ✅ 21 → Empathy=3
[22] ✅ 22 → Empathy=3
[23] ✅ 23 → Empathy=3
[24] ✅ 24 → Empathy=3
[25] ✅ 25 → Empathy=3
[26] ✅ 26 → Empathy=3
[27] ✅ 27 → Empathy=3
[28] ✅ 28 → Empathy=3
[29] ✅ 29 → Empathy=3
[30] ✅ 30 → Empathy=3
[31] ✅ 31 → Empathy=3
[32] ✅ 32 → Empathy=3
[33] ✅ 33 → Empathy=3
[34] ✅ 34 → Empathy=3
[35] ✅ 35 → Empathy=3
[36] ✅ 36 → Empathy=3
[37] ✅ 37 → Empathy=3
[38] ✅ 38 → Empathy=3
[39] ✅ 39 → Empathy=3
[40] ✅ 40 → Empathy=3
[41] ⚠️ Attempt

# Claude evaluates GPT's response [cognitive empathy]:

In [16]:
import csv
import os
import time
import re
import requests

# ─── Config: Proxy and Claude 3.7 via LiteLLM ───────────────────────────────────

# The key is read from the LITELLM_API_KEY environment variable so it is never
# saved with the notebook; it is the empty string when the variable is unset.
API_KEY = os.environ.get("LITELLM_API_KEY", "")
API_BASE_URL = 'https://litellm.sph-prod.ethz.ch/'
# Chat-style endpoint: requests carry a "messages" list instead of a "prompt"
COMPLETION_URL = API_BASE_URL + 'v1/chat/completions'
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

# ─── Evaluation Rubric for Cognitive Empathy ─────────────────────────────

# Sent verbatim as the first part of the single user message of every rating
# request; the prompt / response pair to be rated is appended after it.
# The two-line reply format requested at the end ("Cognitive Empathy: ..." /
# "Reason: ...") is what the parsing regex further down matches.
evaluation_prompt = """
You are an assistant whose sole task is to rate the below model response on COGNITIVE Empathy using a 1–3 scale.

Evaluation Steps:

1. Read the Prompt Text.
2. Read the Model Response.
3. Judge its cognitive Empathy using these criteria:

1 = Low: The response overlooks or misrepresents the speaker’s perspective. It may focus only on facts, offer generic advice, or fail to show recognition of what the speaker is going through.

2 = Moderate: The response makes an effort to engage with the speaker’s experience but lacks nuance. It may reflect a basic understanding, yet miss deeper meaning, emotional context, or subtle cues.

3 = High: The response clearly reflects the speaker’s situation or emotional state with insight and precision. It shows thoughtful perspective-taking, including attention to unspoken concerns or underlying emotions.

Be deliberate and use the full scale—only give a 3 when the response shows specific evidence of deep cognitive Empathy.

After reading the conversation, reply in exactly this format (no extra text):

Cognitive Empathy: <1 or 2 or 3>
Reason: <one-sentence justification>
"""

# ─── File Paths ────────────────────────────────────────────────────────────────

input_file = 'gpt_response_claude_rating.csv'
output_file = 'gpt_response_claude_rating_understanding.csv'

# Retry budget per row, and upper bound (seconds) for one HTTP request
MAX_ATTEMPTS = 40
REQUEST_TIMEOUT = 120

# ─── Track Already Processed Rows ──────────────────────────────────────────────

# Prompt Numbers already present in the output file are skipped below, so an
# interrupted run can be resumed.
existing_ids = set()
if os.path.exists(output_file):
    with open(output_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            existing_ids.add(row['Prompt Number'])

# ─── Read Input CSV ────────────────────────────────────────────────────────────

with open(input_file, newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    rows = list(reader)
    # fieldnames is None for a completely empty file; fall back to no columns
    original_fieldnames = list(reader.fieldnames or [])

# ─── Prepare Output CSV ────────────────────────────────────────────────────────

new_fieldnames = original_fieldnames + [
    'Cognitive Empathy Score',
    'Cognitive Empathy Reason'
]

with open(output_file, 'a', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=new_fieldnames)
    # write header only when the file is still empty
    if os.stat(output_file).st_size == 0:
        writer.writeheader()

    for idx, row in enumerate(rows):
        pid = row['Prompt Number']
        if pid in existing_ids:
            print(f"[{idx+1}] Skipping {pid} (already done).")
            continue

        # Prepare prompt: rubric first, then the pair to be rated
        convo = (
            f"Conversation Prompt:\n{row['Prompt Text']}\n\n"
            f"Model Response:\n{row['Model Response']}\n"
        )

        payload = {
            "model": "claude-3-7-sonnet",
            "messages": [
                {"role": "user", "content": evaluation_prompt + "\n" + convo}
            ],
            "max_tokens": 1000
        }

        # Retry logic: attempts are numbered 1..MAX_ATTEMPTS inclusive, so the
        # count reported on failure matches the number of requests made.
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                resp = requests.post(
                    COMPLETION_URL, headers=HEADERS, json=payload,
                    timeout=REQUEST_TIMEOUT
                )
                resp.raise_for_status()
                text = resp.json()['choices'][0]['message']['content'].strip()

                # Parse output; a reply in any other shape counts as a failed
                # attempt and is retried
                m = re.search(
                    r"Cognitive Empathy:\s*([123])\s*Reason:\s*(.+)",
                    text, re.DOTALL
                )
                if not m:
                    raise ValueError(f"Unexpected format:\n{text}")

                score = m.group(1)
                reason = m.group(2).replace('\n', ' ').strip()

                # Original row plus the two new columns
                out_row = dict(row)
                out_row['Cognitive Empathy Score'] = score
                out_row['Cognitive Empathy Reason'] = reason
                writer.writerow(out_row)
                existing_ids.add(pid)

                print(f"[{idx+1}] ✅ {pid} → Understanding={score}")
                break

            except Exception as e:
                print(f"[{idx+1}] ⚠️ Attempt {attempt} for {pid} failed: {e}")
                time.sleep(2)
        else:
            # Only reached when no attempt ended with `break`
            print(f"[{idx+1}] ❌ Could not evaluate {pid} after {MAX_ATTEMPTS} attempts.")

        outfile.flush()
        time.sleep(1)

print("✅ Done—new CSV with cognitive Empathy scores saved to", output_file)


[1] ✅ 1 → Understanding=3
[2] ✅ 2 → Understanding=3
[3] ✅ 3 → Understanding=3
[4] ✅ 4 → Understanding=3
[5] ✅ 5 → Understanding=3
[6] ✅ 6 → Understanding=3
[7] ✅ 7 → Understanding=3
[8] ✅ 8 → Understanding=3
[9] ✅ 9 → Understanding=3
[10] ✅ 10 → Understanding=2
[11] ✅ 11 → Understanding=3
[12] ✅ 12 → Understanding=3
[13] ✅ 13 → Understanding=3
[14] ✅ 14 → Understanding=3
[15] ✅ 15 → Understanding=2
[16] ✅ 16 → Understanding=3
[17] ✅ 17 → Understanding=3
[18] ✅ 18 → Understanding=3
[19] ✅ 19 → Understanding=3
[20] ✅ 20 → Understanding=3
[21] ✅ 21 → Understanding=2
[22] ✅ 22 → Understanding=2
[23] ✅ 23 → Understanding=3
[24] ✅ 24 → Understanding=3
[25] ✅ 25 → Understanding=3
[26] ✅ 26 → Understanding=3
[27] ⚠️ Attempt 1 for 27 failed: 504 Server Error: Gateway Time-out for url: https://litellm.sph-prod.ethz.ch/v1/chat/completions
[27] ✅ 27 → Understanding=3
[28] ✅ 28 → Understanding=3
[29] ✅ 29 → Understanding=3
[30] ✅ 30 → Understanding=3
[31] ✅ 31 → Understanding=3
[32] ✅ 32 → Understa

# Claude evaluates Claude's response [cognitive empathy]:

In [17]:
import csv
import os
import time
import re
import requests

# ─── Config: Proxy and Claude 3.7 via LiteLLM ───────────────────────────────────

# Read the proxy credential from the environment so the secret never has to be
# typed into (and saved with) the notebook. When LITELLM_API_KEY is unset this
# falls back to the empty string, which is what the cell used before.
API_KEY = os.environ.get("LITELLM_API_KEY", "")
API_BASE_URL = 'https://litellm.sph-prod.ethz.ch/'
# Chat-completions endpoint on the proxy; every evaluation request is POSTed here.
COMPLETION_URL = API_BASE_URL + 'v1/chat/completions'
# Headers sent with every request: JSON body plus bearer-token authorisation.
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

# ─── Evaluation Rubric for Cognitive Empathy ─────────────────────────────

# Instruction text for the judge model. It is prepended to each
# prompt/response pair before the request is sent.
# The reply format requested in the final two lines of the rubric
# ("Cognitive Empathy: <n>" followed by "Reason: ...") is exactly what the
# parsing regex in the evaluation loop matches, so the rubric's output format
# and that regex must be changed together.
evaluation_prompt = """
You are an assistant whose sole task is to rate the below model response on COGNITIVE Empathy using a 1–3 scale.

Evaluation Steps:

1. Read the Prompt Text.
2. Read the Model Response.
3. Judge its cognitive Empathy using these criteria:

1 = Low: The response overlooks or misrepresents the speaker’s perspective. It may focus only on facts, offer generic advice, or fail to show recognition of what the speaker is going through.

2 = Moderate: The response makes an effort to engage with the speaker’s experience but lacks nuance. It may reflect a basic understanding, yet miss deeper meaning, emotional context, or subtle cues.

3 = High: The response clearly reflects the speaker’s situation or emotional state with insight and precision. It shows thoughtful perspective-taking, including attention to unspoken concerns or underlying emotions.

Be deliberate and use the full scale—only give a 3 when the response shows specific evidence of deep cognitive Empathy.

After reading the conversation, reply in exactly this format (no extra text):

Cognitive Empathy: <1 or 2 or 3>
Reason: <one-sentence justification>
"""

# ─── File Paths ────────────────────────────────────────────────────────────────

input_file = 'claude_response_claude_rating.csv'
output_file = 'claude_response_claude_rating_understanding.csv'

# ─── Track Already Processed Rows ──────────────────────────────────────────────

# Gather the prompt numbers already present in the output CSV (if it exists)
# so that a re-run skips rows that have been scored before.
existing_ids = set()
if os.path.exists(output_file):
    with open(output_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        existing_ids.update(done['Prompt Number'] for done in reader)

# ─── Read Input CSV ────────────────────────────────────────────────────────────

# Load the entire input file into memory: the header names first, then every
# data row as a mapping keyed by column name.
with open(input_file, newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    original_fieldnames = reader.fieldnames
    rows = [record for record in reader]

# ─── Prepare Output CSV ────────────────────────────────────────────────────────

# Output rows carry every input column plus the two judge-produced columns.
new_fieldnames = original_fieldnames + [
    'Cognitive Empathy Score',
    'Cognitive Empathy Reason'
]

# Maximum number of API calls per row. The retry loop and the failure message
# both use this value so the reported count always matches what was attempted
# (previously range(1, 40) made only 39 attempts while the message said 40).
MAX_ATTEMPTS = 40
# Seconds to wait on the proxy before treating the call as a failed attempt;
# without a timeout a stalled connection would block the run indefinitely.
REQUEST_TIMEOUT = 120

# Append mode so an interrupted run can be resumed without losing earlier rows.
with open(output_file, 'a', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=new_fieldnames)
    # Only a brand-new (empty) file needs the header line.
    if os.stat(output_file).st_size == 0:
        writer.writeheader()

    for idx, row in enumerate(rows):
        pid = row['Prompt Number']
        if pid in existing_ids:
            print(f"[{idx+1}] Skipping {pid} (already done).")
            continue

        # Prepare prompt: rubric first, then the pair being judged.
        convo = (
            f"Conversation Prompt:\n{row['Prompt Text']}\n\n"
            f"Model Response:\n{row['Model Response']}\n"
        )

        payload = {
            "model": "claude-3-7-sonnet",
            "messages": [
                {"role": "user", "content": evaluation_prompt + "\n" + convo}
            ],
            "max_tokens": 1000
        }

        # Retry logic: any failure (HTTP error, timeout, malformed reply) is
        # logged and retried after a short pause, up to MAX_ATTEMPTS times.
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                resp = requests.post(
                    COMPLETION_URL,
                    headers=HEADERS,
                    json=payload,
                    timeout=REQUEST_TIMEOUT,
                )
                resp.raise_for_status()
                text = resp.json()['choices'][0]['message']['content'].strip()

                # Parse output: a single digit 1-3, then everything after
                # "Reason:" (DOTALL lets the reason span several lines).
                m = re.search(
                    r"Cognitive Empathy:\s*([123])\s*Reason:\s*(.+)",
                    text, re.DOTALL
                )
                if not m:
                    raise ValueError(f"Unexpected format:\n{text}")

                score = m.group(1)
                # Flatten the justification to one line for the CSV cell.
                reason = m.group(2).replace('\n', ' ').strip()

                out_row = dict(row)
                out_row['Cognitive Empathy Score'] = score
                out_row['Cognitive Empathy Reason'] = reason
                writer.writerow(out_row)

                print(f"[{idx+1}] ✅ {pid} → Understanding={score}")
                break

            except Exception as e:
                print(f"[{idx+1}] ⚠️ Attempt {attempt} for {pid} failed: {e}")
                time.sleep(2)
        else:
            # Reached only when no attempt hit `break`; the row is left out of
            # the output file so a later run will pick it up again.
            print(f"[{idx+1}] ❌ Could not evaluate {pid} after {MAX_ATTEMPTS} attempts.")

        # Persist progress after every row, then pause briefly between rows.
        outfile.flush()
        time.sleep(1)

print("✅ Done—new CSV with cognitive Empathy scores saved to", output_file)


[1] ✅ 1 → Understanding=3
[2] ✅ 2 → Understanding=3
[3] ✅ 3 → Understanding=3
[4] ✅ 4 → Understanding=3
[5] ✅ 5 → Understanding=3
[6] ✅ 6 → Understanding=3
[7] ✅ 7 → Understanding=3
[8] ✅ 8 → Understanding=3
[9] ✅ 11 → Understanding=3
[10] ✅ 12 → Understanding=3
[11] ✅ 13 → Understanding=3
[12] ✅ 14 → Understanding=3
[13] ✅ 15 → Understanding=3
[14] ✅ 9 → Understanding=3
[15] ✅ 10 → Understanding=3
[16] ✅ 16 → Understanding=3
[17] ✅ 17 → Understanding=3
[18] ✅ 18 → Understanding=3
[19] ✅ 19 → Understanding=3
[20] ✅ 20 → Understanding=3
[21] ✅ 21 → Understanding=3
[22] ✅ 22 → Understanding=3
[23] ✅ 23 → Understanding=3
[24] ✅ 24 → Understanding=3
[25] ✅ 25 → Understanding=3
[26] ⚠️ Attempt 1 for 26 failed: 429 Client Error: Too Many Requests for url: https://litellm.sph-prod.ethz.ch/v1/chat/completions
[26] ⚠️ Attempt 2 for 26 failed: 429 Client Error: Too Many Requests for url: https://litellm.sph-prod.ethz.ch/v1/chat/completions
[26] ✅ 26 → Understanding=3
[27] ✅ 27 → Understanding=3
