## LLM Response Generation and Evaluation Framework
<img src="https://raw.githubusercontent.com/NetoAI/Netoai-Llm-Eval/main/netoai_logo.png" alt="NetoAI Logo" width="300"/>


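This section demonstrates how to generate responses with a selected LLM and evaluate them with a judge model. The input dataset must contain a question column (set by `question_column_name`; `question` in the sample configuration) and, for the reference answer, a SQuAD-style `answers` field whose `text` list holds the ground truth (samples without it are recorded as `N/A`). The prompt template must include a `<question>` placeholder, which is replaced with each sample's question during inference. The generated responses are saved as JSON and then evaluated by the judge model, which assigns numerical scores on a 0-10 scale.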

In [None]:
# --- Imports ---

import json
import logging
import os
from pathlib import Path
from typing import Dict, Any, List, Tuple, Union

import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer, PreTrainedModel, pipeline
from langchain.prompts import PromptTemplate
from langchain.schema import BaseOutputParser

# Send INFO-and-above records to the root logger using a
# "timestamp - level - message" layout.
_logging_options = {
    "level": logging.INFO,
    "format": "%(asctime)s - %(levelname)s - %(message)s",
    "datefmt": "%Y-%m-%d %H:%M:%S",
}
logging.basicConfig(**_logging_options)

print("Imports and logging configured.")

In [None]:
# --- Inference Configuration ---

# Chat-style prompt. Every "<question>" occurrence is replaced with the
# sample's question by format_prompt before the chat template is applied.
_chat_prompt = [
    {
        "role": "system",
        "content": "You are an expert assistant who provides concise and accurate answers.",
    },
    {
        "role": "user",
        "content": "Please answer the following question: <question>",
    },
]

# Settings for the INFERENCE step (generating answers). The "judge_model"
# entry is read later by the evaluation step. Key order is kept stable
# because the dict is serialised with json.dumps further down.
config = dict(
    model_to_be_tested="Qwen/Qwen2-0.5B-Instruct",
    dataset_hf_name="squad",
    judge_model="microsoft/Phi-4-mini-instruct",
    dataset_split="validation",
    question_column_name="question",
    prompt_template=_chat_prompt,
    num_samples=3,
    max_new_tokens=1000,
    output_file="inference_results.json",
)

print("Inference configuration loaded.")

In [None]:
# --- Inference Helper Functions ---

def load_hf_dataset(name: str, split: str, question_column: str) -> Dataset:
    """Load one split of a Hugging Face dataset and verify the question column.

    Args:
        name: Dataset identifier passed to ``load_dataset``.
        split: Split to load (for example ``"validation"``).
        question_column: Name of the column expected to hold the questions.

    Returns:
        The loaded dataset split.

    Raises:
        ValueError: If loading fails (the underlying error is chained as
            ``__cause__``) or the dataset has no ``question_column`` column.
    """
    logging.info(f"Loading dataset '{name}' (split: {split})...")
    try:
        dataset = load_dataset(name, split=split)
    except Exception as e:
        # Chain the cause so the real failure stays visible in the traceback.
        raise ValueError(f"Failed to load dataset '{name}': {e}") from e

    available_columns = dataset.column_names
    if question_column not in available_columns:
        raise ValueError(
            f"Dataset '{name}' does not have a '{question_column}' column. "
            f"Available columns: {available_columns}"
        )
    logging.info(f"Dataset loaded. Using '{question_column}' as the question column.")
    return dataset

def load_model_and_tokenizer(model_name: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    """Load a causal language model and its tokenizer from Hugging Face Hub.

    The model is loaded with ``torch_dtype="auto"`` and ``device_map="auto"``.

    Args:
        model_name: Hub identifier (or local path) of the model to load.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If either the tokenizer or the model cannot be loaded;
            the underlying error is chained as ``__cause__``.
    """
    logging.info(f"Loading model and tokenizer for '{model_name}'...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto"
        )
    except Exception as e:
        # Keep the original exception attached so the root cause is not lost.
        raise ValueError(f"Failed to load model or tokenizer '{model_name}': {e}") from e

    logging.info(f"Model loaded successfully on device: {model.device}")
    return model, tokenizer

def format_prompt(item: Dict, question_column: str, prompt_template: Any, tokenizer: PreTrainedTokenizer) -> str:
    """Build the model prompt for one data item.

    Args:
        item: Data record containing the question under ``question_column``.
        question_column: Key of the question text inside ``item``.
        prompt_template: Either a plain string or a list of chat messages
            (dicts with ``role`` and ``content``). Every ``<question>``
            occurrence is replaced with the item's question.
        tokenizer: Tokenizer whose ``apply_chat_template`` renders chat-style
            templates; unused for string templates.

    Returns:
        The fully formatted prompt string.

    Raises:
        KeyError: If ``question_column`` is missing from ``item``.
        ValueError: If the template has an unsupported type or formatting
            fails for any other reason (including a malformed chat message).
    """
    # Look the question up on its own so that only a genuinely missing column
    # is reported as such; a malformed template must not be mistaken for it.
    try:
        question = item[question_column]
    except KeyError as e:
        raise KeyError(f"Question column '{question_column}' not found in the data item.") from e
    except Exception as e:
        raise ValueError(f"Error formatting prompt: {e}") from e

    try:
        if isinstance(prompt_template, str):
            return prompt_template.replace("<question>", question)
        if isinstance(prompt_template, list):
            messages = [
                {"role": msg["role"], "content": msg["content"].replace("<question>", question)}
                for msg in prompt_template
            ]
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        raise ValueError(f"Unsupported prompt_template type: {type(prompt_template)}")
    except Exception as e:
        raise ValueError(f"Error formatting prompt: {e}") from e

def perform_inference(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prompt: str, max_new_tokens: int) -> Tuple[List[int], int]:
    """Generate a continuation for ``prompt``.

    Args:
        model: Model whose ``generate`` method produces the output ids.
        tokenizer: Tokenizer used to encode ``prompt`` as PyTorch tensors.
        prompt: Fully formatted prompt text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A tuple of (token ids of the first generated sequence as a list,
        number of prompt tokens that were fed to the model).

    Raises:
        Any exception raised while encoding or generating is logged and
        re-raised unchanged.
    """
    try:
        # Encode on the model's device so generate() receives matching tensors.
        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_token_count = encoded.input_ids.shape[1]
        sequences = model.generate(**encoded, max_new_tokens=max_new_tokens)
        first_sequence = sequences[0].tolist()
        return first_sequence, prompt_token_count
    except torch.cuda.OutOfMemoryError:
        logging.error("CUDA Out of Memory. Try a smaller model, fewer samples, or quantization.")
        raise
    except Exception as exc:
        logging.error(f"An unexpected error occurred during model generation: {exc}")
        raise

def parse_generation(tokenizer: PreTrainedTokenizer, full_output_ids: List[int], input_length: int) -> str:
    """Turn generated token ids into text, dropping the echoed prompt.

    Args:
        tokenizer: Tokenizer used to decode the ids.
        full_output_ids: Prompt ids followed by the newly generated ids.
        input_length: Number of leading ids that belong to the prompt.

    Returns:
        The decoded completion with special tokens skipped and surrounding
        whitespace stripped.
    """
    completion_ids = full_output_ids[input_length:]
    decoded = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return decoded.strip()

print("Inference helper functions defined.")

In [None]:
# --- Inference Pipeline ---

def extract_ground_truth(item: Dict[str, Any], default: str = "N/A") -> str:
    """Return the first reference answer of a SQuAD-style record.

    Expects ``item["answers"]["text"]`` to be a list of answer strings.
    Falls back to ``default`` when the field is missing, is not shaped that
    way, or the list is empty, so one incomplete sample does not abort the run.
    """
    answers = item.get("answers")
    texts = answers.get("text") if isinstance(answers, dict) else None
    if isinstance(texts, (list, tuple)) and texts:
        return texts[0]
    return default


results = []
# Pre-bind so the cleanup in `finally` is valid even if loading fails.
model = None
tokenizer = None
try:
    # Unpack configuration
    model_name = config["model_to_be_tested"]
    dataset_name = config["dataset_hf_name"]
    dataset_split = config["dataset_split"]
    question_column = config["question_column_name"]
    prompt_template = config["prompt_template"]
    num_samples = config["num_samples"]
    max_new_tokens = config["max_new_tokens"]
    output_file = Path(config["output_file"])

    # Load Model and Dataset
    model, tokenizer = load_model_and_tokenizer(model_name)
    dataset = load_hf_dataset(dataset_name, dataset_split, question_column)

    # Process Samples. The dataset may be smaller than num_samples, so report
    # progress against the number of samples that will actually be processed.
    total_samples = min(num_samples, len(dataset))
    for i in range(total_samples):
        logging.info(f"--- Processing sample {i+1}/{total_samples} ---")
        item = dataset[i]

        prompt = format_prompt(item, question_column, prompt_template, tokenizer)
        generated_ids, input_len = perform_inference(model, tokenizer, prompt, max_new_tokens)
        llm_response = parse_generation(tokenizer, generated_ids, input_len)

        ground_truth = extract_ground_truth(item)

        # Stored under 'ground_truth' so the evaluation step can consume the
        # records without renaming keys.
        results.append({
            "question": item[question_column],
            "ground_truth": ground_truth,
            "llm_response": llm_response,
        })
        logging.info(f"Generated response for sample {i+1}.")

    # Save Results
    logging.info(f"Saving {len(results)} results to {output_file}...")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\nInference completed successfully! Results saved to {output_file}")

except Exception as e:
    logging.error(f"An unexpected error occurred during inference: {e}", exc_info=True)
    print(f"\nInference pipeline failed. Error: {e}")

finally:
    # Release the model on failure as well, so memory is free for the judge model.
    model = None
    tokenizer = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    logging.info("Model and tokenizer unloaded from GPU, CUDA cache cleared.")

In [None]:
class EvaluationOutputParser(BaseOutputParser):
    """Parser that extracts the judge model's JSON verdict from free-form text."""

    def parse(self, text: str) -> Dict[str, Any]:
        """Return the first JSON object found in ``text``.

        Decoding starts at the first ``{`` and stops at the end of that JSON
        value, so any text (or stray braces) the model emits after the object
        does not break parsing.

        Args:
            text: Raw text generated by the judge model.

        Returns:
            The decoded JSON object. If ``text`` contains no decodable JSON
            object, a fallback dict with zeroed scores and the reasoning
            ``"Failed to parse evaluation"`` is returned instead of raising.
        """
        if isinstance(text, str):
            start_idx = text.find('{')
            if start_idx != -1:
                try:
                    # raw_decode tolerates trailing data after the JSON value.
                    parsed, _ = json.JSONDecoder().raw_decode(text[start_idx:])
                except json.JSONDecodeError:
                    parsed = None
                if isinstance(parsed, dict):
                    return parsed

        # Fallback when JSON extraction fails
        return {
            "overall_score": 0.0,
            "accuracy": 0.0,
            "completeness": 0.0,
            "relevance": 0.0,
            "clarity": 0.0,
            "reasoning": "Failed to parse evaluation",
            "specific_issues": [],
            "strengths": []
        }

# Default judge prompt. It shows the judge the question, the reference answer
# and the response under evaluation, and asks for 0-10 scores on four criteria
# plus commentary, returned as a single JSON object.
# Single-brace names ({question}, {ground_truth}, {llm_response}) are the
# substitution slots listed in input_variables; the doubled braces around the
# JSON skeleton are escapes that become literal braces after formatting.
evaluation_prompt = PromptTemplate(
    input_variables=["question", "ground_truth", "llm_response"],
    template="""
You are an expert evaluator tasked with assessing the quality of an LLM response against ground truth data.

**EVALUATION TASK:**
Question: {question}

Ground Truth Answer: {ground_truth}

LLM Response to Evaluate: {llm_response}

**EVALUATION CRITERIA:**
Please evaluate the LLM response across the following dimensions (scale 0-10):

1. **Accuracy**: How factually correct is the response compared to ground truth?
2. **Completeness**: Does the response cover all key points from the ground truth?
3. **Relevance**: How well does the response address the specific question asked?
4. **Clarity**: Is the response clear, well-structured, and easy to understand?

**EVALUATION INSTRUCTIONS:**
1. Compare the LLM response directly with the ground truth
2. Identify any factual errors, omissions, or additions
3. Consider both content accuracy and presentation quality
4. Provide specific examples for your scoring decisions

**OUTPUT FORMAT:**
Provide ONLY your evaluation as a JSON object with the following structure:

{{
    "overall_score": <float between 0-10>,
    "accuracy": <float between 0-10>,
    "completeness": <float between 0-10>,
    "relevance": <float between 0-10>,
    "clarity": <float between 0-10>,
    "reasoning": "<detailed explanation of your evaluation>",
    "specific_issues": ["<list of specific problems found>"],
    "strengths": ["<list of response strengths>"],
    "factual_errors": ["<list of any factual errors>"],
    "missing_information": ["<list of key information missing from ground truth>"],
    "additional_information": ["<list of relevant info added beyond ground truth>"]
}}

Begin your evaluation:
"""
)

# Alternative judge prompt, selected with use_detailed=True. In addition to
# the question, reference answer and response it takes an {evaluation_context}
# slot and asks for a nested JSON object ("scores", "analysis", feedback,
# suggestions and a confidence level).
# NOTE(review): the "DETAILED EVALUATION FRAMEWORK" section is a literal "..."
# in the template - presumably the rubric text was elided; confirm whether it
# should be filled in.
detailed_evaluation_prompt = PromptTemplate(
    input_variables=["question", "ground_truth", "llm_response", "evaluation_context"],
    template="""
You are conducting a comprehensive evaluation of an LLM response against established ground truth.

**CONTEXT:** {evaluation_context}

**QUESTION:** {question}

**GROUND TRUTH:** {ground_truth}

**LLM RESPONSE:** {llm_response}

**DETAILED EVALUATION FRAMEWORK:**
...

**OUTPUT:**
Provide ONLY a structured JSON evaluation with scores, detailed reasoning, and actionable feedback:

{{
    "scores": {{
        "overall": <0-10>,
        "accuracy": <0-10>,
        "completeness": <0-10>,
        "relevance": <0-10>,
        "clarity": <0-10>,
        "organization": <0-10>
    }},
    "analysis": {{
        "key_strengths": ["strength1", "strength2"],
        "critical_weaknesses": ["weakness1", "weakness2"],
        "factual_errors": ["error1", "error2"],
        "missing_key_points": ["point1", "point2"],
        "added_value": ["addition1", "addition2"]
    }},
    "detailed_feedback": "<comprehensive explanation of evaluation>",
    "improvement_suggestions": ["suggestion1", "suggestion2"],
    "confidence_level": "<high/medium/low> - your confidence in this evaluation"
}}
"""
)

# Parse JSON configuration from string
def load_config_from_string(config_json: str) -> Dict[str, Any]:
    """Decode a JSON-encoded configuration object.

    Args:
        config_json: JSON text whose top-level value is an object.

    Returns:
        The decoded configuration as a dict (values may be strings, numbers,
        lists or nested objects).

    Raises:
        ValueError: If ``config_json`` is not valid JSON (the decode error is
            chained as ``__cause__``) or its top-level value is not an object.
    """
    try:
        parsed = json.loads(config_json)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON string provided: {str(e)}") from e

    # Callers use dict access (.get) on the result, so reject lists, strings
    # and other top-level values here with a clear message.
    if not isinstance(parsed, dict):
        raise ValueError(
            "Invalid JSON string provided: expected a JSON object, "
            f"got {type(parsed).__name__}"
        )
    return parsed

# Hugging Face Evaluator Setup with device-agnostic support
def setup_hf_evaluator(config_json: str, hf_token: Union[str, None] = None):
    """
    Setup Hugging Face model as the evaluator LLM based on JSON string config.

    The model is placed on CUDA when available, otherwise on Apple MPS when
    available, otherwise on CPU (float16 on CUDA/MPS, float32 on CPU).

    Args:
        config_json: JSON string containing ``judge_model`` and, optionally,
            ``judge_max_new_tokens`` (maximum tokens the judge may generate
            per evaluation; defaults to 200).
        hf_token: Hugging Face API token (if None, will use HF_TOKEN env var)

    Returns:
        Hugging Face pipeline for text generation

    Raises:
        ValueError: If the config is invalid or lacks ``judge_model``, no
            token is available, or the model/tokenizer cannot be loaded (the
            underlying error is chained as ``__cause__``).
    """
    # Determine device
    # torch.backends.mps is absent on older torch builds, so look it up defensively.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        device = "cuda"
        torch_dtype = torch.float16  # Use FP16 for CUDA
        print("Using CUDA device:", torch.cuda.get_device_name(0))
    elif mps_backend is not None and mps_backend.is_available():  # For Apple silicon
        device = "mps"
        torch_dtype = torch.float16
        print("Using MPS device")
    else:
        device = "cpu"
        torch_dtype = torch.float32  # Use FP32 for CPU
        print("Using CPU device")

    # Load configuration from JSON string
    config = load_config_from_string(config_json)
    judge_model = config.get("judge_model")
    if not judge_model:
        raise ValueError("Judge model not specified in the JSON configuration.")

    # Generation budget for the verdict. NOTE(review): the evaluation prompts
    # request a fairly long JSON object; if verdicts come back truncated (and
    # therefore unparseable), raise this via "judge_max_new_tokens".
    judge_max_new_tokens = config.get("judge_max_new_tokens", 200)

    # Get Hugging Face token
    if hf_token is None:
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise ValueError("Hugging Face API token not found. Set HF_TOKEN environment variable or pass hf_token parameter.")

    # Load model and tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(judge_model, token=hf_token)
        model = AutoModelForCausalLM.from_pretrained(
            judge_model,
            token=hf_token,
            device_map=device,  # Use selected device
            torch_dtype=torch_dtype  # Use appropriate precision
        )

        # Setup pipeline for text generation
        evaluator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=judge_max_new_tokens,
            temperature=0.1,  # Low temperature for consistent evaluation
            device_map=device,  # Use selected device
            return_full_text=False  # Return only generated text
        )
    except Exception as e:
        # Chain the cause so auth/network/memory failures remain diagnosable.
        raise ValueError(f"Failed to load Hugging Face model {judge_model}: {str(e)}") from e

    print(f"Model {judge_model} loaded successfully on {device}")
    return evaluator

# Evaluate a single response and return the parsed verdict plus raw judge output
def evaluate_llm_response(question: str, ground_truth: str, llm_response: str,
                         config_json: str, use_detailed: bool = False,
                         evaluation_context: str = "General knowledge evaluation",
                         hf_token: str = None, evaluator=None):
    """
    Evaluate an LLM response against ground truth using Hugging Face model from JSON string config

    Args:
        question: The original question asked
        ground_truth: The correct/expected answer
        llm_response: The LLM's response to evaluate
        config_json: JSON string containing judge_model
        use_detailed: Whether to use the detailed evaluation prompt
        evaluation_context: Context for the evaluation task
        hf_token: Hugging Face API token (optional if env var is set)
        evaluator: Optional, already-built text-generation pipeline (as
            returned by setup_hf_evaluator). When given, it is reused and
            config_json/hf_token are not used to load a model; when None, a
            new evaluator is set up for this call.

    Returns:
        Dictionary containing evaluation results and raw LLM response. If the
        judge call fails, a dict with "error", "overall_score" 0.0 and
        "raw_response" None is returned instead of raising.
    """

    # Reuse the caller's evaluator when provided; loading the judge model is
    # by far the most expensive step and must not be repeated per sample.
    if evaluator is not None:
        llm_evaluator = evaluator
    else:
        llm_evaluator = setup_hf_evaluator(config_json=config_json, hf_token=hf_token)

    # Format prompt
    if use_detailed:
        prompt = detailed_evaluation_prompt.format(
            question=question,
            ground_truth=ground_truth,
            llm_response=llm_response,
            evaluation_context=evaluation_context
        )
    else:
        prompt = evaluation_prompt.format(
            question=question,
            ground_truth=ground_truth,
            llm_response=llm_response
        )

    # Get evaluation from Hugging Face model
    try:
        evaluation_response = llm_evaluator(
            prompt,
            do_sample=True,
            top_p=0.9,
            return_full_text=False
        )[0]['generated_text']
        print(f"Raw LLM Response:\n{evaluation_response}\n{'-'*50}")
    except Exception as e:
        print(f"Error calling Hugging Face model: {e}")
        return {
            "error": f"Model inference failed: {str(e)}",
            "overall_score": 0.0,
            "raw_response": None
        }

    # Parse the response
    parser = EvaluationOutputParser()
    evaluation_results = parser.parse(evaluation_response)

    # Include raw response in results
    evaluation_results["raw_response"] = evaluation_response

    return evaluation_results

# Evaluate a list of responses with one shared judge model
def batch_evaluate_responses(evaluation_data: list, config_json: str,
                           use_detailed: bool = False, hf_token: str = None):
    """
    Evaluate multiple LLM responses in batch using Hugging Face model from JSON string config

    The judge model is loaded once and shared across all items.

    Args:
        evaluation_data: List of dicts with keys: 'question', 'ground_truth', 'llm_response'
        config_json: JSON string containing judge_model
        use_detailed: Whether to use detailed evaluation
        hf_token: Hugging Face API token

    Returns:
        List of evaluation results including raw responses; each result also
        carries an 'evaluation_id' equal to its index in evaluation_data.
        An empty input yields an empty list without loading the judge model.
    """
    if not evaluation_data:
        print("No responses to evaluate.")
        return []

    evaluator = setup_hf_evaluator(config_json=config_json, hf_token=hf_token)

    results = []
    total = len(evaluation_data)
    for i, data in enumerate(evaluation_data):
        print(f"Evaluating response {i+1}/{total}...")

        result = evaluate_llm_response(
            question=data['question'],
            ground_truth=data['ground_truth'],
            llm_response=data['llm_response'],
            config_json=config_json,
            use_detailed=use_detailed,
            hf_token=hf_token,
            evaluator=evaluator
        )

        result['evaluation_id'] = i
        results.append(result)

    return results

print("Judge Model Setup Complete")

In [None]:
# JSON configuration as a string
config_json = json.dumps(config)

# Responses produced by the inference cell (question / ground_truth / llm_response)
evaluation_data = results

# Evaluate responses.
# NOTE: `results` is rebound to the evaluation output below, so re-running this
# cell requires re-running the inference cell first.
results = batch_evaluate_responses(
    evaluation_data=evaluation_data,
    config_json=config_json,
    use_detailed=False,
    # None makes setup_hf_evaluator read the HF_TOKEN environment variable.
    # Pass a real token string here only if the variable is not set; avoid
    # committing tokens to the notebook.
    hf_token=None
)

# Print results
for result in results:
    print(json.dumps(result, indent=2))

In [None]:
# Persist the evaluation results as a JSON file

output_file = "evaluation_results.json"
try:
    with open(output_file, mode="w", encoding="utf-8") as out_handle:
        json.dump(results, out_handle, ensure_ascii=False, indent=2)
except OSError as write_error:
    # File-system level failure (permissions, missing directory, disk full, ...)
    logging.error(f"Error: Failed to write to '{output_file}': {write_error}")
    raise
except Exception as unexpected_error:
    # Anything else, e.g. a result that is not JSON-serialisable
    logging.error(f"Error: An unexpected error occurred while saving '{output_file}': {unexpected_error}")
    raise
else:
    logging.info(f"Successfully saved evaluation results to '{output_file}'")