In [None]:
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, ContextualPrecisionMetric, ContextualRecallMetric, ContextualRelevancyMetric
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase
import json
from pydantic import BaseModel
import regex
import torch
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

In [None]:
# Input files read by the later cells: the generated answers to evaluate, the
# benchmark instances, and the per-question answerability predictions.
generated_answers_path = (
    "/kaggle/input/generated-answers/generated_answers.json"
)
benchmark_file_path = (
    "/kaggle/input/benchmark-techqa/benchmark_query_rewriting.json"
)
answerable_file_path = (
    "/kaggle/input/answerability/answerable_questions.json"
)

In [None]:
# Load the benchmark instances (source of the "is_impossible" ground truth)
# and the answerability predictions. The files are decoded explicitly as
# UTF-8 so the result does not depend on the platform's default encoding.
with open(benchmark_file_path, "r", encoding="utf-8") as file:
    benchmark_instances = json.load(file)

with open(answerable_file_path, "r", encoding="utf-8") as file:
    answerable_questions = json.load(file)

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Initialize counters
# The positive class is "unanswerable": a true positive is a question that is
# impossible in the benchmark and was also predicted as not answerable.
TP = TN = FP = FN = 0

# NOTE: zip() stops at the shorter sequence, so only the questions that have
# both a prediction and a benchmark instance are counted.
for is_answerable, benchmark_instance in zip(answerable_questions, benchmark_instances):
    # Ground truth is read from the current instance, not from the whole list.
    is_impossible = bool(benchmark_instance["is_impossible"])

    # Any flag other than 1 counts as a prediction of "unanswerable".
    predicted_impossible = is_answerable != 1

    # Count TP, TN, FP, FN based on conditions
    if is_impossible and predicted_impossible:
        TP += 1
    elif is_impossible:
        FN += 1
    elif predicted_impossible:
        FP += 1
    else:
        TN += 1

accuracy = _safe_ratio(TP + TN, TP + TN + FP + FN)

# Compute metrics for positive class (unanswerable)
# Each ratio falls back to 0.0 when its denominator is zero (e.g. a class that
# never occurs or is never predicted) instead of raising ZeroDivisionError.
precision_pos = _safe_ratio(TP, TP + FP)
recall_pos = _safe_ratio(TP, TP + FN)
f1_score_pos = _safe_ratio(2 * (precision_pos * recall_pos), precision_pos + recall_pos)

# Compute metrics for negative class (answerable)
precision_neg = _safe_ratio(TN, TN + FN)
recall_neg = _safe_ratio(TN, TN + FP)
f1_score_neg = _safe_ratio(2 * (precision_neg * recall_neg), precision_neg + recall_neg)

# Print the results
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"Accuracy: {accuracy:.4f}")

print(f"Precision unanswerable: {precision_pos:.4f}")
print(f"Recall unanswerable: {recall_pos:.4f}")
print(f"F1-Score unanswerable: {f1_score_pos:.4f}")

print(f"Precision answerable: {precision_neg:.4f}")
print(f"Recall answerable: {recall_neg:.4f}")
print(f"F1-Score answerable: {f1_score_neg:.4f}")

# Macro F1 is the unweighted mean of the two per-class F1 scores.
macro_f1 = (f1_score_pos + f1_score_neg) / 2
print(f"macro_f1: {macro_f1}")

In [None]:
# Define evaluator class

class CustomEvaluator(DeepEvalBaseLLM):
    """DeepEval judge model that generates with a local vLLM engine.

    With a pydantic ``schema`` the generation is constrained to that schema's
    JSON schema through vLLM guided decoding and returned as a ``schema``
    instance; without one the raw generated text is returned.
    """

    def __init__(self, model_name, max_len = 16834):
        """Load the tokenizer and build the vLLM engine.

        Args:
            model_name: Model identifier passed both to
                ``AutoTokenizer.from_pretrained`` and to ``LLM(model=...)``.
            max_len: Forwarded to vLLM as ``max_model_len``.
                NOTE(review): 16834 may be a typo for 16384 (2**14) — confirm
                before changing, since it alters the engine configuration.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.model = LLM(model=model_name, tensor_parallel_size=2, trust_remote_code=True, max_model_len = max_len, gpu_memory_utilization=0.90)

    def load_model(self):
        """Return the vLLM engine created in ``__init__``."""
        return self.model

    @staticmethod
    def _extract_json_object(generated_text: str) -> dict:
        """Parse the JSON object contained in ``generated_text``.

        The whole text is tried first. If it is not a JSON object on its own,
        the first balanced ``{...}`` span is extracted and parsed instead.

        Raises:
            ValueError: If no JSON object is found, or if the extracted span
                is not valid JSON (``json.JSONDecodeError`` subclasses
                ``ValueError``).
        """
        # Parsing the full text is string-aware, unlike the brace-matching
        # fallback below, which can be confused by braces inside string values.
        try:
            parsed = json.loads(generated_text)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed

        match = regex.search(r"\{(?:[^{}]|(?R))*\}", generated_text, regex.DOTALL)  # match JSON from first { to its closing }
        if match is None:
            raise ValueError(
                f"No JSON object found in model output: {generated_text[:200]!r}"
            )
        return json.loads(match.group(0))

    def generate(self, prompt: str, schema: type[BaseModel] | None = None) -> BaseModel | str:
        """Generate an answer for ``prompt``.

        Args:
            prompt: User prompt; it is wrapped in a chat template together
                with a fixed system message.
            schema: Optional pydantic model class. When given, decoding is
                guided by its JSON schema and the result is validated by
                instantiating the class.

        Returns:
            The generated text when ``schema`` is ``None``, otherwise an
            instance of ``schema`` built from the generated JSON.

        Raises:
            ValueError: If ``schema`` is given and the output contains no
                parseable JSON object.
        """
        llm = self.load_model()

        json_prompt = ""
        if schema is None:
            # The generated answer will be a normal string
            sampling_params = SamplingParams(temperature=0.2, max_tokens=8096, stop_token_ids=[self.tokenizer.eos_token_id])
        else:
            # The generated answer will be a JSON document
            json_schema = schema.model_json_schema()
            guided_decoding_params = GuidedDecodingParams(json = json_schema)
            sampling_params = SamplingParams(temperature=0.15, max_tokens=8096, guided_decoding = guided_decoding_params)
            json_prompt = "You must always respond only with valid JSON that matches the provided schema. No explanations or extra text."

        # Create messages for the model
        # NOTE(review): json_prompt is appended without any separator, so it
        # runs straight on from the last character of the prompt.
        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": prompt + json_prompt}
        ]

        # Generate model answer
        # The template output is handed to vLLM as prompt token ids
        # (presumably apply_chat_template tokenizes by default — confirm for
        # the installed transformers version).
        prompt_ids = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True
        )
        with torch.no_grad():
            generated_text = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params)[0].outputs[0].text

        if schema is None:
            # String output
            return generated_text

        # JSON output: fail loudly instead of returning None, which would
        # otherwise surface as an unrelated error in the calling metric.
        json_result = self._extract_json_object(generated_text)
        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: type[BaseModel] | None = None) -> BaseModel | str:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the display name: "CustomEvaluator-" plus the model name."""
        return "CustomEvaluator-" + self.model_name

In [None]:
# Load custom evaluator
model_name = "Qwen/Qwen3-1.7B"
custom_evaluator = CustomEvaluator(model_name)

# Define relevant metrics
# All four metrics are built with the same options: the custom evaluator as
# judge model, a threshold of 0.0, reasons included, and both verbose and
# async modes switched off.
_shared_metric_options = {
    "threshold": 0.0,
    "model": custom_evaluator,
    "include_reason": True,
    "verbose_mode": False,
    "async_mode": False,
}

answer_relevancy_metric = AnswerRelevancyMetric(**_shared_metric_options)

contextual_precision_metric = ContextualPrecisionMetric(**_shared_metric_options)

contextual_recall_metric = ContextualRecallMetric(**_shared_metric_options)

contextual_relevancy_metric = ContextualRelevancyMetric(**_shared_metric_options)

In [None]:

# Load the generated answers; decoded explicitly as UTF-8 so the result does
# not depend on the platform's default encoding.
with open(generated_answers_path, "r", encoding="utf-8") as file:
    generated_answers = json.load(file)

# Indexes can be edited for specific testing on some questions
start_index = 0
end_index = len(generated_answers)
all_test_cases = []
for answer_instance in generated_answers[start_index:end_index]:
    question = answer_instance["user_input"]
    ground_truth = answer_instance["reference"]
    llm_answer = answer_instance["answer"]
    retrieved_contexts = answer_instance["retrieved_contexts"]

    # Create test case for current answer
    test_case = LLMTestCase(
        input=question,
        actual_output=llm_answer,
        retrieval_context=retrieved_contexts,
        expected_output=ground_truth,
    )
    all_test_cases.append(test_case)

# Evaluate all created tests
# `evaluations` is defined up front so that it is None (rather than an
# undefined name) if the evaluation fails and the error is only printed.
evaluations = None
try:
    evaluations = evaluate(
        test_cases=all_test_cases,
        metrics=[
            answer_relevancy_metric,
            contextual_precision_metric,
            contextual_recall_metric,
            contextual_relevancy_metric,
        ],
    )
except Exception as e:
    print(f"Error evaluating test case: {e}")