In [1]:
import pandas as pd 
import os 
from pathlib import Path 

In [21]:
# Directory holding the results files, resolved relative to the parent of the current working directory.
data_dir = Path.cwd().parent.joinpath('data/data_files/robustness_results_v2/')
# Name of the CSV inside data_dir that this notebook reads.
file_name = 'Robust1_lexical_n_grammatical.csv'

In [24]:
# Read the selected results file into a DataFrame.
file_path = data_dir.joinpath(file_name)
df = pd.read_csv(file_path)

In [33]:
from pydantic import BaseModel, Field
from typing import List

# Judge output for a single evaluation criterion: a bounded score plus its justification.
class MetricEvaluation(BaseModel):
    # Required integer score; ge=1 / le=5 restrict it to the 1-5 rating scale.
    rating: int = Field(..., ge=1, le=5)
    # Required free-text explanation accompanying the rating.
    explanation: str

# Judge's aggregate verdict across all criteria.
# The score field is named `overall_rating` here (not `rating` as in MetricEvaluation).
class FinalAssessment(BaseModel):
    # Required integer score; ge=1 / le=5 restrict it to the 1-5 rating scale.
    overall_rating: int = Field(..., ge=1, le=5)
    # Required free-text explanation accompanying the overall rating.
    explanation: str

# Top-level schema for one judged answer pair: three per-criterion evaluations
# plus the final assessment. This is the model handed to PydanticOutputParser.
class SimilarityEvaluation(BaseModel):
    # One MetricEvaluation per criterion listed in the prompt.
    semantic_similarity: MetricEvaluation
    key_point_alignment: MetricEvaluation
    overall_message_consistency: MetricEvaluation
    # Aggregate verdict over the three criteria above.
    final_assessment: FinalAssessment


from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

# Prompt for the LLM judge. Placeholders: {answer_a} and {answer_b} are the two
# answers being compared; {format_instructions} receives the output-format text.
# The closing "Additional Guidance" section is the weighted-calculation hint for
# the overall rating (this note used to sit inside the string, where it was sent
# to the model as part of the prompt).
template = """
You are an expert judge tasked with evaluating the contextual similarity between two answers. 

Evaluation Criteria:
1. Semantic Similarity: How closely do the answers match in terms of meaning?
2. Key Point Alignment: To what extent do the main points or arguments in both answers align?
3. Overall Message Consistency: How consistent are the overall messages conveyed by both answers?

Rating Scale:
1 = Not similar at all
2 = Slightly similar
3 = Moderately similar
4 = Very similar
5 = Extremely similar or identical

Detailed Instructions:
- Rate each metric independently
- Provide a clear explanation for each rating
- Compute an overall rating that mathematically and contextually represents the aggregate similarity
- The final assessment should:
  a) Calculate an overall rating (weighted average of individual metrics)
  b) Provide a comprehensive explanation justifying the overall rating
  c) Highlight nuanced differences or similarities

Answer A:
{answer_a}

Answer B:
{answer_b}

{format_instructions}

Additional Guidance for Final Assessment:
- Overall Rating Calculation:
  * Semantic Similarity: 40% weight
  * Key Point Alignment: 30% weight
  * Overall Message Consistency: 30% weight
- Round to nearest whole number
- Explanation should capture the essence of similarity comprehensively
"""

# Output parser bound to the SimilarityEvaluation schema; it is also the last stage of the chain.
parser = PydanticOutputParser(pydantic_object=SimilarityEvaluation)

# answer_a / answer_b are supplied on every call; format_instructions is filled in
# once, here, from the parser so the prompt and the parser agree on the output format.
prompt = PromptTemplate(
    template=template,
    input_variables=["answer_a", "answer_b"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

In [44]:
from langchain_ollama.llms import OllamaLLM

# Judge model; only the model name is passed, every other setting is left at the library default.
# NOTE(review): no temperature or seed is pinned here, so ratings may differ between runs — confirm
# whether reproducible judgments are required.
llm = OllamaLLM(model='qwen2.5:32b')
# Pipeline: render the prompt, call the model, then parse the reply with `parser`.
chain = prompt | llm | parser

In [64]:
def parse_results(result):
    """Flatten a judge result into a single-level dict.

    Every (name, section) pair obtained from ``dict(result)`` contributes two
    entries: ``<name>_rating`` and ``<name>_explanation``. The rating is read
    from the section's ``rating`` attribute when ``dict(section)`` has a
    ``rating`` key, and from ``overall_rating`` otherwise.

    Args:
        result: Object convertible with ``dict()`` whose values are themselves
            convertible with ``dict()`` and expose the attributes above.

    Returns:
        dict: Flat mapping of ``<name>_rating`` / ``<name>_explanation`` keys,
        in the iteration order of ``dict(result)``.
    """
    flattened = {}
    for name, section in dict(result).items():
        # Per-metric sections carry `rating`; the final assessment carries `overall_rating`.
        score_field = 'rating' if 'rating' in dict(section) else 'overall_rating'
        flattened[f'{name}_rating'] = getattr(section, score_field)
        flattened[f'{name}_explanation'] = section.explanation
    return flattened

# Most recent non-missing 'Original Answer'; carried forward over rows where the cell is not a string.
original_answer = None

# Judge only the first five rows.
df_sample = df.head(5)
response_list = []
for idx, row in df_sample.iterrows():
    # Update the reference answer only when this row actually provides one.
    if isinstance(row['Original Answer'], str):
        original_answer = row['Original Answer']

    # Answer A is the reference answer, Answer B is this row's response.
    answer_a, answer_b = original_answer, row['response']

    result = chain.invoke({"answer_a": answer_a, "answer_b": answer_b})

    # Flatten the structured verdict so each row becomes one record.
    parsed_results = parse_results(result=result)
    response_list.append(parsed_results)

In [66]:
# Build the judge-results frame on the same index as the rows that were judged.
# Column-wise concatenation aligns on index labels, so reusing df_sample's index keeps
# every verdict paired with its source row even when the sample is not rows 0..n-1.
# A length mismatch (e.g. an interrupted loop) raises here instead of misaligning silently.
response_df = pd.DataFrame(response_list, index=df_sample.index)

Unnamed: 0,semantic_similarity_rating,semantic_similarity_explanation,key_point_alignment_rating,key_point_alignment_explanation,overall_message_consistency_rating,overall_message_consistency_explanation,final_assessment_rating,final_assessment_explanation
0,4,Both answers explain that radiation therapy do...,4,"The key points in both answers align well, dis...",4,Both messages consistently convey that radiati...,4,The answers are very similar in their explanat...
1,5,Both Answer A and Answer B use similar languag...,5,The key points in both answers align perfectly...,5,The overall messages in Answer A and Answer B ...,5,"Based on the ratings for semantic similarity, ..."
2,4,Both answers explain that radiation therapy do...,3,While both answers cover the non-immediate nat...,4,Both answers communicate the same overall mess...,4,Both answers exhibit high semantic similarity ...
3,5,Both answers explain that radiation therapy do...,4,The main points about the non-immediate nature...,5,Both answers convey the same overall message t...,4,"The overall rating is 4, indicating a very hig..."
4,5,Both answers use very similar language and con...,4,The key points of both answers are largely ali...,4,Both answers convey that radiation therapy tak...,4,The answers are extremely similar in their sem...


In [69]:
# Show the column labels of the flattened judge results.
# NOTE(review): the output saved under this cell is a ValueError that looks like it came from a
# dict() construction, which this expression does not perform, and the cell's execution count (69)
# is out of order — re-run on a fresh kernel to refresh the output.
response_df.columns

ValueError: dictionary update sequence element #0 has length 26; 2 is required

In [67]:
# Place the judge columns next to the sampled source rows (rows are matched on index labels).
pd.concat([df_sample, response_df], axis="columns")

Unnamed: 0,Diease Site,Original Question,Linguistic Challenge Variations/Categories,Modified Question,parsed_question,retrieval_similarity,retrieval_relevancy,response,correctness_score,correctness_reasoning,...,polarity_scores,Original Answer,semantic_similarity_rating,semantic_similarity_explanation,key_point_alignment_rating,key_point_alignment_explanation,overall_message_consistency_rating,overall_message_consistency_explanation,final_assessment_rating,final_assessment_explanation
0,General,How long does radiation therapy take to work?,Spelling Errors: Introduce common typos or mis...,How long does radiaiton therpay take to work?,How long does radiation therapy take to work f...,0.870652,0.7,I completely understand your concern about rad...,0.6,**Evaluation Explanation**\n\nThe assistant pr...,...,"{'neg': 0.091, 'neu': 0.798, 'pos': 0.111, 'co...",I understand your concern about how long radia...,4,Both answers explain that radiation therapy do...,4,"The key points in both answers align well, dis...",4,Both messages consistently convey that radiati...,4,The answers are very similar in their explanat...
1,General,,"Grammatical Errors: Use incorrect tense, subje...",How long do radiation therapy takes to work?,How long does radiation therapy take to work f...,0.870652,0.8,I'm glad you're interested in learning more ab...,0.6,**Evaluation Explanation:**\nThe AI assistant ...,...,"{'neg': 0.165, 'neu': 0.734, 'pos': 0.1, 'comp...",,5,Both Answer A and Answer B use similar languag...,5,The key points in both answers align perfectly...,5,The overall messages in Answer A and Answer B ...,5,"Based on the ratings for semantic similarity, ..."
2,General,,Punctuation Errors: Add or remove punctuation ...,How long does radiation therapy take to work,How long does radiation therapy take to work f...,0.870652,0.8,I completely understand your concern about rad...,0.6,**Evaluation Explanation:**\n\nThe assistant's...,...,"{'neg': 0.134, 'neu': 0.774, 'pos': 0.093, 'co...",,4,Both answers explain that radiation therapy do...,3,While both answers cover the non-immediate nat...,4,Both answers communicate the same overall mess...,4,Both answers exhibit high semantic similarity ...
3,General,,Word Order Changes: Rearrange the order of wor...,"Radiation therapy to work, how long does it take?",How long does radiation therapy take to work?,1.0,0.9,I understand you're asking about the duration ...,0.8,**Evaluation Explanation:**\n\nThe assistant's...,...,"{'neg': 0.118, 'neu': 0.672, 'pos': 0.209, 'co...",,5,Both answers explain that radiation therapy do...,4,The main points about the non-immediate nature...,5,Both answers convey the same overall message t...,4,"The overall rating is 4, indicating a very hig..."
4,General,,"Fragmented sentence, missing proper structure",How long radiation therapy take work?,How long does radiation therapy take to work?,1.0,0.9,I understand you're asking about how long it t...,0.7,**Evaluation Explanation**\n\nThe assistant's ...,...,"{'neg': 0.121, 'neu': 0.739, 'pos': 0.14, 'com...",,5,Both answers use very similar language and con...,4,The key points of both answers are largely ali...,4,Both answers convey that radiation therapy tak...,4,The answers are extremely similar in their sem...


In [45]:
# Single-pair smoke run: judge the first row's response against its original answer.
result = chain.invoke({
    "answer_a": df['Original Answer'].iloc[0],
    "answer_b": df['response'].iloc[0],
})

In [57]:
# `judge_response` was only ever a local inside parse_results, so displaying it failed on a
# fresh kernel. Build it explicitly from the single-row `result` before showing it.
judge_response = parse_results(result=result)
judge_response

{'semantic_similarity_rating': 4,
 'semantic_similarity_explanation': 'Both answers convey similar explanations regarding how radiation therapy works and its timeline, using comparable analogies such as comparing it to a process that takes time.',
 'key_point_alignment_rating': 3,
 'key_point_alignment_explanation': 'While both answers highlight the delayed effect of radiation therapy on cancer cells and the ongoing death of these cells post-treatment, Answer A provides more specific timelines (e.g., within 1-2 weeks for side effects), whereas Answer B is less detailed in this aspect.',
 'overall_message_consistency_rating': 4,
 'overall_message_consistency_explanation': 'Both answers consistently communicate that radiation therapy does not have an immediate effect and requires time to show its full impact, emphasizing the gradual process involved. The overall tone of both responses is understanding and patient-focused.',
 'final_assessment_rating': 4,
 'final_assessment_explanation': 