In [None]:
# Install necessary libraries (run this only once)
# pip install judges instructor openai

"""
This script produces the scores for the initial prompts and responses from Claude and GPT-4o using the Emotion Queen judges.
The monkey patch is used to force judges to go through the ETH proxy.
The voting_methods.py file was also modified to handle NaN values, strings, ... in the scores.
If you want to run this code, please insert your key and change voting_methods.py in Libs/site-packages/judges/
"""


import os

# Single source of truth for the credentials and the endpoint: insert your key
# here only. Every use below refers to these two constants, so the environment
# variables and the explicit client can never drift apart.
# NOTE: do not commit a real key to version control.
API_KEY = "My key"
# v1 root of the LiteLLM proxy.
API_BASE = "https://litellm.sph-prod.ethz.ch/v1"

# Exported before `import openai` (same ordering as before) so that anything
# reading these variables sees the proxy settings. Existing values are
# deliberately overwritten.
os.environ["OPENAI_API_KEY"] = API_KEY
os.environ["OPENAI_API_BASE"] = API_BASE

import openai
# these two lines are optional if you set the env-vars above,
# but safe to do again in code:
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]

from openai import OpenAI

# Explicit client pointed at the proxy; shared by the rest of the script.
client = OpenAI(
    api_key=API_KEY,
    base_url=API_BASE,
)

# ── monkey-patch judges to use the client ────────────────────────
import judges._client as _jc


def _shared_proxy_client(*args, **kwargs):
    """Return the module-level ``client``, ignoring every argument."""
    return client


# judges._client.llm_client is replaced by a callable that always hands back
# the proxy-configured client created above.
_jc.llm_client = _shared_proxy_client

# Smoke test: a single short chat completion, printed so a failing key or
# proxy setup shows up before any scoring starts.
smoke_messages = [{"role": "user", "content": "Say hello"}]
resp = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=smoke_messages,
)
first_choice = resp.choices[0]
print("MODEL SAYS:", first_choice.message.content)

import pandas as pd
from judges.graders.empathy import EmotionQueenImplicitEmotionRecognition,EmotionQueenIntentionRecognition,EmotionQueenKeyEventRecognition,EmotionQueenMixedEventRecognition
from judges import Jury

# Load the model responses that will be scored.
df_claude = pd.read_csv("data/raw/responses/claude_responses_empathy.csv", encoding="utf-8")
df_gpt = pd.read_csv("data/raw/responses/gpt_responses.csv", encoding="utf-8")

# Model used by every judge, and the number of judges per jury. Defined once so
# that all four juries are guaranteed to be configured identically.
JUDGE_MODEL = "gpt-4o"
JURY_SIZE = 5

# One list of JURY_SIZE independent judge instances per Emotion Queen criterion.
agents_implicit = [
    EmotionQueenImplicitEmotionRecognition(model=JUDGE_MODEL) for _ in range(JURY_SIZE)
]
agents_intention = [
    EmotionQueenIntentionRecognition(model=JUDGE_MODEL) for _ in range(JURY_SIZE)
]
agents_keyEvent = [
    EmotionQueenKeyEventRecognition(model=JUDGE_MODEL) for _ in range(JURY_SIZE)
]
agents_mixedEvent = [
    EmotionQueenMixedEventRecognition(model=JUDGE_MODEL) for _ in range(JURY_SIZE)
]

# Each jury is built with the "average" voting method over its judges.
jury_implicit = Jury(judges=agents_implicit, voting_method="average")
jury_intention = Jury(judges=agents_intention, voting_method="average")
jury_keyEvent = Jury(judges=agents_keyEvent, voting_method="average")
jury_mixedEvent = Jury(judges=agents_mixedEvent, voting_method="average")

# Map each row → a verdict
def score(row, jury):
    """Return the jury's score for a single row as a float.

    Args:
        row: Record indexable by the keys "Prompt Text" and "Model Response".
        jury: Object whose ``vote(input=..., output=..., expected=...)`` call
            returns a verdict exposing a ``score`` attribute.

    Returns:
        ``verdict.score`` converted with ``float``.
    """
    cast_vote = jury.vote
    prompt_text = row["Prompt Text"]
    model_response = row["Model Response"]
    # No reference answer is supplied, so ``expected`` is always None.
    result = cast_vote(input=prompt_text, output=model_response, expected=None)
    return float(result.score)

# Output column -> jury that fills it. Insertion order fixes the order in
# which the criteria are scored and the columns are appended.
juries_by_column = {
    "Implicit Emotion Recognition": jury_implicit,
    "Intention Recognition": jury_intention,
    "Key Event Recognition": jury_keyEvent,
    "Mixed Event Recognition": jury_mixedEvent,
}

# (progress message, frame to score, CSV written once all columns are filled)
scoring_runs = (
    ("scoring claude", df_claude, "initial_prompts_with_responses_claude_scored_by_Queen.csv"),
    ("scoring gpt", df_gpt, "initial_prompts_with_responses_gpt_scored_by_Queen.csv"),
)

for banner, frame, out_path in scoring_runs:
    print(banner)
    # Score every row with each jury in turn; one new column per criterion.
    for column, panel in juries_by_column.items():
        frame[column] = frame.apply(score, axis=1, args=(panel,))
    frame.to_csv(out_path, encoding="utf-8")





        
        
    


MODEL SAYS: Hello! How can I assist you today?
scoring claude
scoring gpt
