In [1]:
import torch
# Select PyTorch's 'high' float32 matmul precision mode for the whole kernel session.
# See the torch.set_float32_matmul_precision docs for the speed/accuracy trade-off
# of each mode; the setting is process-wide, so it is applied before any model loads.
torch.set_float32_matmul_precision('high')

In [2]:
# --- Imports ---

import json
import logging
import os
from pathlib import Path
from typing import Dict, Any, List, Tuple, Union

import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer, PreTrainedModel, pipeline
from langchain.prompts import PromptTemplate
from langchain.schema import BaseOutputParser

# Configure logging for the notebook: records at INFO level and above are
# emitted as "<timestamp> - <LEVEL> - <message>", with the timestamp formatted
# as YYYY-MM-DD HH:MM:SS. The helper functions below report progress through
# `logging.info` / `logging.error`, so this must run before them.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

print("Imports and logging configured.")

  from .autonotebook import tqdm as notebook_tqdm


Imports and logging configured.


In [3]:
# --- Inference Configuration ---

# Shared run configuration. Most keys drive the INFERENCE step (generating
# answers); "judge_model" is read later by the evaluation step.
config = {
  # Hugging Face model id of the model whose answers are being generated.
  "model_to_be_tested": "Qwen/Qwen2-0.5B-Instruct",
  # Hugging Face dataset id passed to `load_dataset`.
  "dataset_hf_name": "squad",
  # Hugging Face model id of the model used to score the answers.
  "judge_model":"google/gemma-3-1b-it",
  # Dataset split to read samples from.
  "dataset_split": "validation",
  # Name of the dataset column that holds the question text.
  "question_column_name": "question",
  # Either a plain string or a list of chat messages. Every occurrence of the
  # literal "<question>" is replaced with the sample's question text.
  "prompt_template": [
    {
      "role": "system",
      "content": "You are an expert assistant who provides concise and accurate answers."
    },
    {
      "role": "user",
      "content": "Please answer the following question: <question>"
    }
  ],
  # Upper bound on how many dataset rows are processed (rows 0..num_samples-1).
  "num_samples": 3,
  # Forwarded to `model.generate` as `max_new_tokens`.
  "max_new_tokens": 1000,
  # JSON file the inference step writes and the evaluation step reads back.
  "output_file": "inference_results.json"
}

print("Inference configuration loaded.")

Inference configuration loaded.


In [4]:
# --- Inference Helper Functions ---

def load_hf_dataset(name: str, split: str, question_column: str) -> Dataset:
    """Fetch one split of a Hugging Face dataset and check the question column exists.

    Args:
        name: Dataset identifier handed to `load_dataset`.
        split: Name of the split to load.
        question_column: Column that must be present in the loaded split.

    Returns:
        The loaded dataset split.

    Raises:
        ValueError: If loading fails for any reason, or if `question_column`
            is not among the split's columns.
    """
    logging.info(f"Loading dataset '{name}' (split: {split})...")
    try:
        loaded_split = load_dataset(name, split=split)
    except Exception as load_error:
        raise ValueError(f"Failed to load dataset '{name}': {load_error}")

    # Happy path first: the column is there, so report it and hand the split back.
    if question_column in loaded_split.column_names:
        logging.info(f"Dataset loaded. Using '{question_column}' as the question column.")
        return loaded_split

    raise ValueError(
        f"Dataset '{name}' does not have a '{question_column}' column. "
        f"Available columns: {loaded_split.column_names}"
    )

def load_model_and_tokenizer(model_name: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    """Load a causal language model together with its tokenizer.

    The model is requested with `torch_dtype="auto"` and `device_map="auto"`,
    and the device it ends up on is logged.

    Args:
        model_name: Model identifier passed to both `from_pretrained` calls.

    Returns:
        A `(model, tokenizer)` pair.

    Raises:
        ValueError: If any step of loading (or reporting the device) fails.
    """
    logging.info(f"Loading model and tokenizer for '{model_name}'...")
    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
        loaded_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto",
        )
        logging.info(f"Model loaded successfully on device: {loaded_model.device}")
    except Exception as load_error:
        raise ValueError(f"Failed to load model or tokenizer '{model_name}': {load_error}")
    return loaded_model, loaded_tokenizer

def format_prompt(item: Dict, question_column: str, prompt_template: Any, tokenizer: PreTrainedTokenizer) -> str:
    """Build the prompt text for one data item.

    Args:
        item: A single dataset row (mapping of column name to value).
        question_column: Key in `item` that holds the question text.
        prompt_template: Either a string, in which every "<question>" is
            replaced by the question, or a list of chat messages
            (`{"role": ..., "content": ...}`) whose contents get the same
            substitution before being rendered with the tokenizer's chat
            template.
        tokenizer: Only used for list templates, via `apply_chat_template`.

    Returns:
        The fully formatted prompt string.

    Raises:
        KeyError: If `question_column` is missing from `item`.
        ValueError: For any other failure, including an unsupported template
            type or a malformed chat message (e.g. a missing "content" key).
    """
    # Look the question up on its own, so a KeyError here can only mean the
    # column is missing. Previously a KeyError raised while walking the chat
    # template was also reported as a missing question column.
    try:
        question = item[question_column]
    except KeyError as e:
        raise KeyError(f"Question column '{question_column}' not found in the data item.") from e
    except Exception as e:
        raise ValueError(f"Error formatting prompt: {e}") from e

    try:
        if isinstance(prompt_template, str):
            return prompt_template.replace("<question>", question)
        if isinstance(prompt_template, list):
            messages = [
                {"role": msg["role"], "content": msg["content"].replace("<question>", question)}
                for msg in prompt_template
            ]
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        raise ValueError(f"Unsupported prompt_template type: {type(prompt_template)}")
    except Exception as e:
        # Template problems are all surfaced uniformly as ValueError; the
        # original exception stays attached as the cause for debugging.
        raise ValueError(f"Error formatting prompt: {e}") from e

def perform_inference(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prompt: str, max_new_tokens: int) -> Tuple[List[int], int]:
    """Tokenize a prompt, run generation, and return the raw token ids.

    Args:
        model: Model exposing `.device` and `.generate(...)`.
        tokenizer: Callable that encodes `prompt` with `return_tensors="pt"`.
        prompt: Fully formatted prompt text.
        max_new_tokens: Forwarded to `model.generate`.

    Returns:
        A pair `(token_ids, prompt_length)`: the first generated sequence as a
        plain list of ints, and the number of tokens in the encoded prompt
        (second dimension of `input_ids`).

    Raises:
        Whatever tokenization or generation raises; the error is logged first
        and then re-raised unchanged.
    """
    try:
        encoded_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_token_count = encoded_prompt.input_ids.shape[1]
        sequences = model.generate(**encoded_prompt, max_new_tokens=max_new_tokens)
        first_sequence = sequences[0].tolist()
    except torch.cuda.OutOfMemoryError:
        logging.error("CUDA Out of Memory. Try a smaller model, fewer samples, or quantization.")
        raise
    except Exception as generation_error:
        logging.error(f"An unexpected error occurred during model generation: {generation_error}")
        raise
    return first_sequence, prompt_token_count

def parse_generation(tokenizer: PreTrainedTokenizer, full_output_ids: List[int], input_length: int) -> str:
    """Turn generated token ids into text, leaving out the prompt portion.

    Args:
        tokenizer: Object whose `decode` converts token ids to text.
        full_output_ids: Token ids of the prompt followed by the completion.
        input_length: Number of leading ids that belong to the prompt.

    Returns:
        The decoded completion with special tokens skipped and surrounding
        whitespace stripped.
    """
    completion_ids = full_output_ids[input_length:]
    decoded_text = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return decoded_text.strip()

# Printed once the cell finishes, confirming the helper definitions above were executed.
print("Inference helper functions defined.")

Inference helper functions defined.


In [5]:
# --- Inference Pipeline ---

# Runs the whole inference step: load the model and dataset named in `config`,
# generate one answer per sample, and write the collected records to the
# configured output file. Any failure is logged with its traceback and reported
# via print instead of propagating. When that happens no new output file is
# written, so a later cell may still find a file left over from a previous run.
results = []
try:
    # Unpack configuration
    model_name = config["model_to_be_tested"]
    dataset_name = config["dataset_hf_name"]
    dataset_split = config["dataset_split"]
    question_column = config["question_column_name"]
    prompt_template = config["prompt_template"]
    num_samples = config["num_samples"]
    max_new_tokens = config["max_new_tokens"]
    output_file = Path(config["output_file"])

    # Load Model and Dataset
    model, tokenizer = load_model_and_tokenizer(model_name)
    dataset = load_hf_dataset(dataset_name, dataset_split, question_column)

    # The split may hold fewer rows than requested; compute the real count once
    # so the loop bound and the progress messages agree.
    total_samples = min(num_samples, len(dataset))

    # Process Samples
    for i in range(total_samples):
        logging.info(f"--- Processing sample {i+1}/{total_samples} ---")
        item = dataset[i]

        prompt = format_prompt(item, question_column, prompt_template, tokenizer)
        generated_ids, input_len = perform_inference(model, tokenizer, prompt, max_new_tokens)
        llm_response = parse_generation(tokenizer, generated_ids, input_len)

        # Take the first reference answer when there is one. Fall back to "N/A"
        # when the row has no "answers" mapping, no "text" entry, or an empty
        # answer list (the last case used to raise IndexError).
        answers = item.get("answers")
        answer_texts = answers.get("text") if isinstance(answers, dict) else None
        ground_truth = answer_texts[0] if answer_texts else "N/A"

        # Stored under 'ground_truth' so the evaluation step can use the
        # record as-is without renaming keys.
        results.append({
            "question": item[question_column],
            "ground_truth": ground_truth,
            "llm_response": llm_response,
        })
        logging.info(f"Generated response for sample {i+1}.")

    # Save Results
    logging.info(f"Saving {len(results)} results to {output_file}...")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\nInference completed successfully! Results saved to {output_file}")

except Exception as e:
    logging.error(f"An unexpected error occurred during inference: {e}", exc_info=True)
    print(f"\nInference pipeline failed. Error: {e}")

2025-06-17 11:11:33 - INFO - Loading model and tokenizer for 'Qwen/Qwen2-0.5B-Instruct'...
2025-06-17 11:11:34 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
2025-06-17 11:11:34 - INFO - Model loaded successfully on device: cuda:0
2025-06-17 11:11:34 - INFO - Loading dataset 'squad' (split: validation)...
2025-06-17 11:11:35 - INFO - Dataset loaded. Using 'question' as the question column.
2025-06-17 11:11:35 - INFO - --- Processing sample 1/3 ---
2025-06-17 11:11:36 - INFO - Generated response for sample 1.
2025-06-17 11:11:36 - INFO - --- Processing sample 2/3 ---
2025-06-17 11:11:36 - INFO - Generated response for sample 2.
2025-06-17 11:11:36 - INFO - --- Processing sample 3/3 ---
2025-06-17 11:11:37 - INFO - Generated response for sample 3.
2025-06-17 11:11:37 - INFO - Saving 3 results to inference_results.json...



Inference completed successfully! Results saved to inference_results.json


In [6]:
# --- Simplified Evaluation Functions ---

import json
import logging
from langchain.prompts import PromptTemplate
from langchain.schema import BaseOutputParser
from transformers import pipeline

class SimpleScoreParser(BaseOutputParser):
    """Extracts a numeric score and an explanation from the judge's free-form reply."""

    def parse(self, text: str) -> dict:
        """Parse the first valid JSON object found in `text`.

        The reply is scanned for `{` characters and each one is offered to the
        standard JSON decoder, which understands strings and nesting. A `}`
        inside the explanation text therefore does not cut the object short,
        and a brace-delimited fragment that is not valid JSON is skipped in
        favour of a later valid object.

        Args:
            text: Raw text produced by the judge model.

        Returns:
            A dict with exactly two keys: "overall_score" (float) and
            "explanation". If nothing usable can be extracted, the score is 0.0
            and the explanation starts with "Parsing failed:"; a warning is
            logged in that case. This method does not raise for malformed
            replies.
        """
        try:
            candidate_start = text.find('{')
            if candidate_start == -1:
                raise ValueError("No JSON object found in the response.")

            decoder = json.JSONDecoder()
            while candidate_start != -1:
                try:
                    # raw_decode parses one JSON value starting at the given
                    # index and ignores whatever text follows it.
                    data, _ = decoder.raw_decode(text, candidate_start)
                    break
                except json.JSONDecodeError:
                    candidate_start = text.find('{', candidate_start + 1)
            else:  # loop ran out of '{' candidates without a successful decode
                raise ValueError("Could not find a complete JSON object.")

            # Ensure required keys exist, default to safe values
            score = float(data.get("overall_score", 0.0))
            explanation = data.get("explanation", "No explanation provided")
            return {"overall_score": score, "explanation": explanation}

        except (ValueError, TypeError, AttributeError) as e:
            # ValueError covers JSON errors and non-numeric score strings;
            # TypeError covers scores such as null or a list.
            logging.warning(f"Could not parse score from response: {text}\nError: {e}")
            return {"overall_score": 0.0, "explanation": f"Parsing failed: {str(e)}"}

# Prompt sent to the judge model for every evaluated record.
# It takes two variables, `ground_truth` and `llm_response`, and asks for a
# JSON object with the keys "overall_score" (0.0-10.0) and "explanation".
# The doubled braces in the example line are escapes, so the rendered prompt
# shows a literal JSON object rather than being treated as template variables.
# NOTE(review): the question itself is not part of the prompt, so the judge
# only ever compares the response against the ground-truth text.
simple_evaluation_prompt = PromptTemplate(
    input_variables=["ground_truth", "llm_response"],
    template="""You are a strict evaluator. Your only job is to provide a single score and a detailed explanation for the score.
Based on the Ground Truth, score the LLM Response from 0.0 (completely wrong) to 10.0 (perfectly correct).

Ground Truth: {ground_truth}
LLM Response: {llm_response}

You MUST ONLY output a JSON object with exactly two keys: "overall_score" and "explanation". Do not add any other text or code blocks.
Example JSON output: {{"overall_score": 8.5, "explanation": "The response is accurate but lacks some details present in the ground truth."}}

JSON output:
"""
)

def setup_hf_evaluator(judge_model_name: str, hf_token: str = None):
    """Create the text-generation pipeline used as the judge.

    Args:
        judge_model_name: Model identifier passed to `pipeline(model=...)`.
        hf_token: Optional access token forwarded as `token=`.

    Returns:
        The constructed "text-generation" pipeline, placed on "cuda" when
        `torch.cuda.is_available()` is true and on "cpu" otherwise.

    Raises:
        RuntimeError: If the pipeline cannot be created.
    """
    logging.info(f"Setting up judge model: {judge_model_name}")
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    try:
        judge_pipeline = pipeline(
            "text-generation",
            model=judge_model_name,
            device=target_device,
            token=hf_token,
        )
    except Exception as load_error:
        raise RuntimeError(f"Failed to load judge model '{judge_model_name}': {load_error}")
    return judge_pipeline

def batch_evaluate_simple_score(evaluation_data: list, judge_model_name: str, hf_token: str = None):
    """Score every record with the judge model and return the enriched records.

    Args:
        evaluation_data: Records with the keys 'question', 'ground_truth' and
            'llm_response'. An empty (or otherwise falsy) value short-circuits.
        judge_model_name: Model identifier used to build the judge pipeline.
        hf_token: Optional access token forwarded to the pipeline setup.

    Returns:
        One dict per input record containing the three input fields plus
        'final_score' (the parsed score) and 'judge_raw_response'. Returns an
        empty list when there is nothing to evaluate; the judge model is not
        loaded in that case.
    """
    if not evaluation_data:
        return []

    judge = setup_hf_evaluator(judge_model_name, hf_token)
    score_parser = SimpleScoreParser()
    total_items = len(evaluation_data)
    scored_records = []

    for position, record in enumerate(evaluation_data, start=1):
        logging.info(f"--- Evaluating item {position}/{total_items} ---")

        judge_prompt = simple_evaluation_prompt.format(
            ground_truth=record['ground_truth'],
            llm_response=record['llm_response'],
        )

        # Only the newly generated text is wanted, not the echoed prompt.
        generations = judge(judge_prompt, max_new_tokens=200, return_full_text=False)
        verdict = score_parser.parse(generations[0]['generated_text'])

        # NOTE: despite its name, 'judge_raw_response' carries the parsed
        # explanation, not the unprocessed judge output.
        scored_records.append({
            "question": record['question'],
            "ground_truth": record['ground_truth'],
            "llm_response": record['llm_response'],
            "final_score": verdict["overall_score"],
            "judge_raw_response": verdict["explanation"],
        })

    return scored_records

# Printed once the cell finishes, confirming the evaluation definitions above were executed.
print("Simplified evaluation functions defined.")

Simplified evaluation functions defined.


In [7]:
# --- Final Execution Pipeline ---

# 1. Define configuration for the JUDGE model
judge_model_name = config["judge_model"]
# Credentials must never be written into the notebook: read the Hugging Face
# access token from the HF_TOKEN environment variable instead. When the
# variable is unset this is None and no explicit token is passed on.
# Any token that has ever appeared in a saved or shared copy of this notebook
# should be treated as compromised and revoked.
hf_token = os.environ.get("HF_TOKEN")

# 2. Load the results from the inference step
def load_json_from_file(file_path: str) -> Union[Dict, List, Any]:
    """Load JSON data from a file without letting I/O or parse errors escape.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The decoded JSON value, or None when the file is missing, cannot be
        read, is not valid UTF-8, or does not contain valid JSON. Each failure
        is logged as an error.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        return None
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A truncated or corrupted file (e.g. from an interrupted run) is
        # reported the same way as a missing one, so the caller's
        # "no data" branch handles it.
        logging.error(f"Invalid JSON in {file_path}: {e}")
        return None
    except OSError as e:
        logging.error(f"Could not read {file_path}: {e}")
        return None

# Read back whatever the inference step wrote (falling back to the default file name).
inference_results_file = config.get("output_file", "inference_results.json")
data_to_evaluate = load_json_from_file(inference_results_file)

# 3. Score the records with the judge, or explain why that is not possible
if not data_to_evaluate:
    print("\nCould not proceed with evaluation. No data loaded from inference file.")
else:
    print("\nStarting simplified batch evaluation...")
    final_scores = batch_evaluate_simple_score(
        evaluation_data=data_to_evaluate,
        judge_model_name=judge_model_name,
        hf_token=hf_token,
    )

    # 4. Persist the scores, then echo them in the cell output
    evaluation_output_file = "final_scores.json"
    with open(evaluation_output_file, "w", encoding="utf-8") as f:
        json.dump(final_scores, f, indent=2, ensure_ascii=False)

    for message in (
        f"\nEvaluation complete. Results saved to {evaluation_output_file}",
        "\n--- FINAL SCORES ---",
        json.dumps(final_scores, indent=2),
    ):
        print(message)

2025-06-17 11:11:41 - INFO - Setting up judge model: google/gemma-3-1b-it



Starting simplified batch evaluation...


Device set to use cuda
2025-06-17 11:11:44 - INFO - --- Evaluating item 1/3 ---
2025-06-17 11:12:11 - INFO - --- Evaluating item 2/3 ---
2025-06-17 11:12:30 - INFO - --- Evaluating item 3/3 ---



Evaluation complete. Results saved to final_scores.json

--- FINAL SCORES ---
[
  {
    "question": "Which NFL team represented the AFC at Super Bowl 50?",
    "ground_truth": "Denver Broncos",
    "llm_response": "The New England Patriots represented the AFC at Super Bowl 50.",
    "final_score": 2.0,
    "judge_raw_response": "The response is incorrect and fails to accurately represent the Denver Broncos."
  },
  {
    "question": "Which NFL team represented the NFC at Super Bowl 50?",
    "ground_truth": "Carolina Panthers",
    "llm_response": "The New Orleans Saints represented the NFC at Super Bowl 50. They played their final game against the Pittsburgh Steelers in Super Bowl LII, where they won by a score of 34-28.",
    "final_score": 9.0,
    "judge_raw_response": "The response accurately states the Carolina Panthers' representation at Super Bowl 50, the opposing team, and the outcome of the game. It is a solid and direct answer."
  },
  {
    "question": "Where did Super Bow