In [2]:
import json

# Load the annotated question set; each record carries the question plus the
# two annotators' answers under 'FC_Ans' and 'GS_Ans'.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('dataset/60qas_ans.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Sentence-Transformer Approach

In [2]:
# Display the first record to see which fields each entry carries.
data[0]

{'Q#': 1,
 'Question': 'What are the common toppings of a Margherita Pizza?',
 'Category': 'IND',
 'SPARQL': '\nPREFIX pizza: <http://www.co-ode.org/ontologies/pizza/pizza.owl#>\n\nSELECT ?topping WHERE {\n  pizza:MargheritaPizza pizza:hasTopping ?topping .\n}\n',
 'FC_Ans': 'Mozzarella and tomato',
 'GS_Ans': 'MozzarellaTopping, TomatoTopping |MargheritaPizza has hasTopping some (MozzarellaTopping and TomatoTopping).'}

In [3]:
# Collect the question numbers for which at least one annotator's answer
# is the literal marker "WRONG".
wrongs = []
for record in data:
    flagged = record['FC_Ans'] == "WRONG" or record['GS_Ans'] == "WRONG"
    if flagged:
        wrongs.append(record['Q#'])

print(f"Total WRONG answers: {len(wrongs)},  {wrongs}")

Total WRONG answers: 8,  [14, 15, 17, 44, 47, 48, 50, 58]


In [4]:
from sentence_transformers import SentenceTransformer

# Instantiate the "all-mpnet-base-v2" sentence-embedding model by name.
# NOTE(review): the progress bars in this cell's output suggest the model files
# are fetched on first use — confirm they are cached for offline re-runs.
model = SentenceTransformer("all-mpnet-base-v2")

2025-07-30 11:09:07.123565: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-30 11:09:07.191440: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-30 11:09:07.211053: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-30 11:09:07.330141: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Keep only the questions that were not flagged in `wrongs`, then split the
# surviving records into one answer list per annotator (same order in both).
kept_items = [item for item in data if item['Q#'] not in wrongs]
annotator1 = [item['FC_Ans'] for item in kept_items]
annotator2 = [item['GS_Ans'] for item in kept_items]

# Embed each annotator's answers with the sentence-transformer model.
annotator1_embedding = model.encode(annotator1)
annotator2_embedding = model.encode(annotator2)

In [10]:
# Sanity check: both answer lists should have the same length, since they are
# compared position by position afterwards.
len(annotator1), len(annotator2)

(52, 52)

In [11]:
# Similarity between every annotator-1 embedding and every annotator-2 embedding.
# Only the entries at [i][i] (both annotators on the same question) are read later.
# NOTE(review): the similarity function is whatever the model is configured with
# (presumably cosine) — confirm.
similarities = model.similarity(annotator1_embedding, annotator2_embedding)

In [12]:
# Entry [i][i] of the similarity matrix compares the two annotators' answers
# to the same question; pull those out as plain Python floats.
agreements = [similarities[idx][idx].item() for idx in range(len(annotator1))]

# Mean same-question similarity across all kept questions.
aggreement_score = sum(agreements) / len(agreements)

print("Annotator agreements:", aggreement_score)

Annotator agreements: 0.4878907341223497


# LLM-as-a-judge agreement maker

In [3]:
# Judge prompt. The placeholders {question}, {ontology}, {annotator1} and
# {annotator2} are filled in with str.replace by the judging loop; str.format
# cannot be used because the output-format example at the end contains
# literal braces.
prompt_template = """I have two annotators who answered the same question based on an ontology. 
Compare their answers and judge whether they agree or disagree.
If they disagree, explain the nature of the disagreement (e.g., different entities, relations, interpretations, or missing concepts).
Next, provide a single answer based on two annotators. This response should be straightforward with no extra explanation. 

<question> 
{question}
</question> 


<ontology> 
{ontology}
</ontology> 

<Annotator 1 Answer>
{annotator1}
</Annotator 1 Answer>

<Annotator 2 Answer>
{annotator2}
</Annotator 2 Answer>

Return your output as a following format:

{'agreement': 'agree', 'rationale': '...', 'answer': '...'}
"""

In [5]:
import scripts, config
from openai import OpenAI
import time
from tqdm.notebook import tqdm
import json

# Load the pizza ontology through the project-local `scripts` helper.
# NOTE(review): presumably returns the ontology as a single string, since it is
# later substituted into the prompt with str.replace — confirm.
ontology_text = scripts.load_ontology("dataset/pizza.owl")

2025-07-30 11:44:48.549150: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-30 11:44:48.569552: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-30 11:44:48.575838: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-30 11:44:48.590703: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# OpenAI client; the API key comes from the local `config` module instead of
# being hardcoded in the notebook.
client = OpenAI(api_key=config.openai_token)

# Function-calling schema used to get the judge's verdict back as structured
# arguments (agreement label, rationale, unified answer) rather than free text.
functions = [
  {
    "name": "evaluate_characteristic",
    "description": "Extracting the exact `agreement` and `rationale` from the given text.",
    "parameters": {
      "type": "object",
      "properties": {
        "agreement": {
          "type": "string",
          # Restrict the label to the two values the scoring cell maps to 1/0,
          # so variants such as "Agree" or "partially agree" cannot be returned.
          "enum": ["agree", "disagree"],
          "description": "A agree or disagree string values describing whatever two annotator are agree or not.",
        },
        "rationale": {
          "type": "string",
          "description": "The explanation for the assigned rating."
        },
        "answer": {
          "type": "string",
          "description": "Single unified answer if both are agree."
        }
      },
      "required": ["agreement", "rationale", "answer"]
    }
  }
]



def save_json(data, path):
    """Serialize `data` as JSON and write it to `path`, replacing any existing file.

    The output is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters as-is instead of escaping them.

    Args:
        data: Any JSON-serializable object.
        path: Destination file path.
    """
    with open(path, mode='w', encoding='utf-8') as handle:
        handle.write(json.dumps(data, indent=4, ensure_ascii=False))


In [7]:
# Display the first record again before running the judge over the dataset.
data[0]

{'Q#': 1,
 'Question': 'What are the common toppings of a Margherita Pizza?',
 'Category': 'IND',
 'SPARQL': '\nPREFIX pizza: <http://www.co-ode.org/ontologies/pizza/pizza.owl#>\n\nSELECT ?topping WHERE {\n  pizza:MargheritaPizza pizza:hasTopping ?topping .\n}\n',
 'FC_Ans': 'Mozzarella and tomato',
 'GS_Ans': 'MozzarellaTopping, TomatoTopping |MargheritaPizza has hasTopping some (MozzarellaTopping and TomatoTopping).'}

In [8]:
# Retry policy for a single question. The loop is bounded so a permanent failure
# (bad API key, malformed request, ...) surfaces as an error instead of
# spinning forever.
MAX_ATTEMPTS = 12
RETRY_DELAY_SECONDS = 5

new_data = []
for item in tqdm(data):
    # Loop-local names are deliberately distinct from the `annotator1` /
    # `annotator2` lists built for the embedding comparison, so running this
    # cell does not clobber them.
    question = item['Question']
    fc_answer = item['FC_Ans']
    gs_answer = item['GS_Ans']

    # The template contains literal braces in its output-format example, so the
    # placeholders are filled with str.replace rather than str.format.
    user_prompt = (
        prompt_template
        .replace("{question}", question)
        .replace("{annotator1}", fc_answer)
        .replace("{annotator2}", gs_answer)
        .replace("{ontology}", ontology_text)
    )
    conversation = [
        {"role": "system", "content": "You are a LLM-as-a-Judge System."},
        {"role": "user", "content": user_prompt},
    ]

    inference = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            completion = client.chat.completions.create(
                model="gpt-4o",
                messages=conversation,
                functions=functions,
                # Force the function call; otherwise the model may answer in
                # plain text and `message.function_call` would be None.
                function_call={"name": functions[0]["name"]},
            )

            # The arguments arrive as a JSON string; parse them as data
            # instead of executing model output with eval().
            inference = json.loads(completion.choices[0].message.function_call.arguments)
            break
        except Exception as error:  # KeyboardInterrupt / SystemExit still propagate
            if attempt == MAX_ATTEMPTS:
                raise RuntimeError(
                    f"Judging question {item['Q#']} failed after {MAX_ATTEMPTS} attempts"
                ) from error
            print(f"sleep for {RETRY_DELAY_SECONDS} seconds (attempt {attempt}/{MAX_ATTEMPTS} failed: {error!r})")
            time.sleep(RETRY_DELAY_SECONDS)

    new_data.append({
        "Q#": item['Q#'],
        "Question": item['Question'],
        "Category": item['Category'],
        "FC_Ans": item['FC_Ans'],
        "GS_Ans": item['GS_Ans'],
        "agreement": inference
    })
    # Checkpoint after every question so partial progress survives a failure.
    # NOTE(review): this writes to the working directory, while the results are
    # later read from 'datasets/AI-Judger.json' — confirm the intended location.
    save_json(new_data, "AI-Judger.json")
            

  0%|          | 0/60 [00:00<?, ?it/s]

# Agreements

In [9]:
import json

# Load the LLM-judge results. From here on `data` refers to the judged records,
# no longer the raw dataset.
# The file is written as UTF-8 with non-ASCII characters left unescaped, so it
# is read back with the same explicit encoding.
# NOTE(review): the judging loop saves "AI-Judger.json" in the working directory,
# while this reads from "datasets/" — confirm the file was moved there.
with open('datasets/AI-Judger.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [10]:
# Display the first judged record, including its nested 'agreement' result.
data[0]

{'Q#': 1,
 'Question': 'What are the common toppings of a Margherita Pizza?',
 'Category': 'IND',
 'FC_Ans': 'Mozzarella and tomato',
 'GS_Ans': 'MozzarellaTopping, TomatoTopping |MargheritaPizza has hasTopping some (MozzarellaTopping and TomatoTopping).',
 'agreement': {'agreement': 'agree',
  'rationale': "Both annotators mention the same toppings for a Margherita Pizza: Mozzarella and Tomato. Annotator 2 uses the terminology from the ontology ('MozzarellaTopping', 'TomatoTopping') but these directly correlate with 'Mozzarella' and 'Tomato' from Annotator 1's answer.",
  'answer': 'Mozzarella and tomato'}}

In [12]:
# Map the judge's label onto a binary score.
ag2id = {"agree": 1, "disagree":0}

# The label is model-generated text, so normalize case and surrounding
# whitespace before the lookup and report anything unexpected explicitly
# rather than failing with a bare KeyError.
labels = [str(item['agreement']['agreement']).strip().lower() for item in data]
unknown_labels = sorted({label for label in labels if label not in ag2id})
if unknown_labels:
    raise ValueError(f"Unexpected agreement labels: {unknown_labels}")
agreements = [ag2id[label] for label in labels]

# Fraction of questions on which the judge found the two annotators in agreement.
print("agreements:", sum(agreements)/len(agreements))

agreements: 0.7666666666666667
