In [1]:
import sys
from typing import Any, Dict
import json
sys.path.append("..")

from evaluator.base_evaluator import RAGEvaluator
from evaluator.llm import OpenAIClientLLM
from evaluator.prompt_manager import EvaluationType, PromptManager

class LLMAsJudgeEvaluator(RAGEvaluator):
    """LLM-as-judge evaluator: build a prompt, query the LLM, parse JSON scores.

    Uses ``self.prompt_manager`` and ``self.llm``; both are passed to the
    constructor where this class is instantiated (presumably stored by
    ``RAGEvaluator`` -- TODO confirm against the base class).
    """

    # Keys inspected, in priority order, to pick the headline 'score' value.
    _PRIMARY_SCORE_KEYS = ('score', 'relevance_score', 'coherence_score', 'accuracy_score')

    def pre_process(
        self,
        answer: str,
        **kwargs
    ) -> str:
        """Build the judge prompt for a single answer.

        Args:
            answer: The generated answer to evaluate.
            **kwargs: Optional ``question`` and ``context`` (each defaults to
                ``""``) and an optional ``eval_type`` that overrides the
                default ``EvaluationType.RELEVANCE``.

        Returns:
            The value produced by ``self.prompt_manager.build_prompt``.
        """
        return self.prompt_manager.build_prompt(
            answer=answer,
            question=kwargs.get("question", ""),
            context=kwargs.get("context", ""),
            # Callers that do not pass eval_type keep the original behaviour.
            eval_type=kwargs.get("eval_type", EvaluationType.RELEVANCE),
        )

    def call_llm(self, processed_data: str) -> str:
        """Send the constructed prompt to ``self.llm`` and return its reply."""
        return self.llm.generate(processed_data)

    def post_process(self, llm_response: str) -> Dict[str, Any]:
        """Parse the LLM's JSON reply into a flat scores dictionary.

        Args:
            llm_response: Raw text returned by the LLM, optionally wrapped in
                Markdown code fences (with or without a ``json`` tag).

        Returns:
            A dict with ``'score'`` (first key present among
            ``_PRIMARY_SCORE_KEYS``, else ``0.0``), ``'confidence'`` (``0.0``
            when absent) and every ``*_score`` entry of the reply. When the
            reply cannot be parsed as a JSON object the dict is
            ``{'score': 0.0, 'confidence': 0.0, 'error': <message>}``.
        """
        try:
            if not isinstance(llm_response, str):
                raise TypeError(
                    f"expected a string response, got {type(llm_response).__name__}"
                )

            # Drop Markdown code fences before decoding.
            response_text = llm_response.strip().replace('```json', '').replace('```', '')
            result = json.loads(response_text)

            # Valid JSON that is not an object (list, number, ...) has no
            # score fields; report it instead of failing on `.get`.
            if not isinstance(result, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(result).__name__}"
                )

            primary_score = 0.0
            for key in self._PRIMARY_SCORE_KEYS:
                if key in result:
                    primary_score = result[key]
                    break

            scores = {
                'score': primary_score,
                'confidence': result.get('confidence', 0.0),
            }

            # Carry over every specific metric the judge reported.
            scores.update(
                {key: value for key, value in result.items() if key.endswith('_score')}
            )
            return scores

        # json.JSONDecodeError is a ValueError subclass, so it is covered here.
        except (ValueError, TypeError, KeyError) as e:
            print(f"Error parsing LLM response: {e}")
            return {
                'score': 0.0,
                'confidence': 0.0,
                'error': str(e)
            }
    

In [None]:
from datasets import load_dataset

# Load the "delucionqa" configuration of rungalileo/ragbench and work on its
# train split as a DataFrame.
delucionqa = load_dataset("rungalileo/ragbench", "delucionqa")
df = delucionqa['train'].to_pandas()

# Copy the preview rows so the new column lands on an independent frame rather
# than on a slice of `df` (assigning to the slice raises SettingWithCopyWarning).
a = df.head().copy()

# Flatten each row's documents into one string: every (label, sentence) pair of
# every document becomes a "`label` sentence" line.
a['flatten_doc'] = a['documents_sentences'].apply(
    lambda docs: "\n".join(
        f"`{label}` {sentence}" for doc in docs for label, sentence in doc
    )
)

# The second preview row is the example used in the cells below.
answer = a.iloc[1]['response']
documents = a.iloc[1]['flatten_doc']
question = a.iloc[1]['question']

In [None]:
# Show the selected answer, its flattened documents and the question in turn,
# each followed by blank lines so the three pieces are easy to tell apart.
for text in (answer, documents, question):
    print(text, "\n\n")

In [None]:
import os
from getpass import getpass

# OPENAI_API_KEY carries the CentML key (presumably picked up by
# OpenAIClientLLM -- TODO confirm). Never store the key in the notebook: take
# it from the environment, or prompt for it once without echoing it.
# NOTE(review): the key that used to be hard-coded here must be rotated.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("CentML API key: ")

# NOTE(review): LLMAsJudgeEvaluator.pre_process requests RELEVANCE prompts by
# default, so this default_type may not take effect -- confirm which is intended.
evaluator = LLMAsJudgeEvaluator(
    llm=OpenAIClientLLM(),
    prompt_manager=PromptManager(default_type=EvaluationType.FACTUAL_ACCURACY)
)

# Judge the example answer against its question and flattened documents.
result = evaluator.evaluate(
    answer=answer,
    question=question,
    context=documents,
)

In [None]:
# Display the value returned by `evaluator.evaluate` for the example above.
result

## Test the Factual Correctness
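Factual correctness is the F1 score over factual statements: statements in the RAG answer that the golden answer supports are True Positives (TP), statements in the RAG answer that it does not support are False Positives (FP), and golden-answer statements missing from the RAG answer are False Negatives (FN), so $F_1 = \frac{2\,TP}{2\,TP + FP + FN}$.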

### 1. Implement an evaluator for Factual Correctness

In [1]:
import sys
from typing import Dict
import json
sys.path.append("..")

from evaluator.base_evaluator import RAGEvaluator
from evaluator.llm import OpenAIClientLLM
from evaluator.prompt_manager import EvaluationType, PromptManager

class FacCorEvaluator(RAGEvaluator):
    """Factual-correctness evaluator: compares an answer with a golden answer.

    Uses ``self.prompt_manager`` and ``self.llm``; both are passed to the
    constructor where this class is instantiated (presumably stored by
    ``RAGEvaluator`` -- TODO confirm against the base class).
    """

    def pre_process(
        self,
        answer: str,
        golden_answer: str,
        **kwargs
    ) -> str:
        """Build the factual-correctness prompt.

        Args:
            answer: The generated answer to evaluate.
            golden_answer: The reference answer to compare against.
            **kwargs: Accepted and ignored, so extra keyword arguments do not
                raise (mirrors the ``**kwargs`` signature of
                ``LLMAsJudgeEvaluator.pre_process``).

        Returns:
            The value produced by ``self.prompt_manager.build_prompt``; no
            ``eval_type`` is passed, so the prompt manager chooses the type.
        """
        return self.prompt_manager.build_prompt(
            answer=answer,
            golden_answer=golden_answer
        )

    def call_llm(self, processed_data: str) -> str:
        """Send the constructed prompt to ``self.llm`` and return its reply."""
        return self.llm.generate(processed_data)

    def post_process(self, llm_response: str) -> str:
        """Return the LLM response unchanged.

        No JSON parsing happens here: the raw reply (code fences included, if
        the model emitted any) is handed back so it can be inspected as-is.
        TODO: parse the reply into a scores dictionary once its schema is settled.
        """
        return llm_response
    

### 2. Test with a test case

In [2]:
import os
from getpass import getpass

# OPENAI_API_KEY carries the CentML key (presumably picked up by
# OpenAIClientLLM -- TODO confirm). Never store the key in the notebook: take
# it from the environment, or prompt for it once without echoing it.
# NOTE(review): the key that used to be hard-coded here must be rotated.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("CentML API key: ")

evaluator = FacCorEvaluator(
    llm=OpenAIClientLLM(),
    prompt_manager=PromptManager(default_type=EvaluationType.FACTUAL_CORRECTNESS)
)

# Test case: the generated answer gets the location, purpose and length wrong
# relative to the golden answer.
result = evaluator.evaluate(
    answer="The Great Wall of China is located in southern China. It was built to protect against Mongolian invasions and stretches over 15,000 miles.",
    golden_answer="The Great Wall of China is located in northern China. It was originally built to protect against invasions and raids from nomadic groups and stretches over 13,000 miles."
)

In [3]:
# Display the value returned by `evaluator.evaluate` for the test case above.
result

'```json\n{\n  "extracted_statements": {\n    "golden": ["The Great Wall of China is located in northern China", "It was originally built to protect against invasions and raids from nomadic groups", "It stretches over 13,000 miles"],\n    "generated": ["The Great Wall of China is located in southern China", "It was built to protect against Mongolian invasions", "It stretches over 15,000 miles"]\n  },\n  "TP": 0,\n  "FP": 3,\n  "FN": 3,\n  "factual_correctness_score": 0.0,\n  "reasons": ["Incorrect location mentioned", "Incorrect purpose and invader details", "Incorrect length"]\n}\n```'