In [17]:
!pip install tensorflow-datasets bert-score rouge-score langchain-google-genai git+https://github.com/google-research/bleurt.git nltk

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-wj4e5fhj
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /tmp/pip-req-build-wj4e5fhj
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456764 sha256=3f57a26571360cd654fd38b6f8d30c3f2eabee3d3fd894d3c9872607d469cc96
  Stored in directory: /tmp/pip-ephem-wheel-cache-k0cywmqq/wheels/30/af/34/e148007788b060e4c76e7ecf68e70c692dff0f2632e62ac454
Successfully built BLEURT
Installing collected packages: BLEURT
Successfully installed BLEURT-0.0.2


In [20]:
!wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
!unzip BLEURT-20.zip -d bleurt-checkpoint

--2025-05-01 03:05:15--  https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.107.207, 173.194.202.207, 192.178.163.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.107.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2140294207 (2.0G) [application/octet-stream]
Saving to: ‘BLEURT-20.zip’


2025-05-01 03:05:29 (140 MB/s) - ‘BLEURT-20.zip’ saved [2140294207/2140294207]

Archive:  BLEURT-20.zip
   creating: bleurt-checkpoint/BLEURT-20/
  inflating: bleurt-checkpoint/BLEURT-20/bert_config.json  
  inflating: bleurt-checkpoint/BLEURT-20/saved_model.pb  
   creating: bleurt-checkpoint/BLEURT-20/variables/
  inflating: bleurt-checkpoint/BLEURT-20/variables/variables.index  
  inflating: bleurt-checkpoint/BLEURT-20/variables/variables.data-00000-of-00001  
  inflating: bleurt-checkpoint/BLEURT-20/sent_piece.vocab  
  inflating: bleurt-checkpoint/BLEURT-20/bleur

In [22]:
import os
import logging
import tensorflow_datasets as tfds
import numpy as np
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from bleurt import score as bleurt_score
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from langchain_google_genai import GoogleGenerativeAI
import nltk
from tqdm import tqdm

# NLTK resources: WordNet for METEOR, Punkt for word_tokenize
nltk.download('wordnet')
nltk.download('punkt')

# Configure logging at INFO level
logging.basicConfig(level=logging.INFO)

# Gemini API key taken from the environment (None when the variable is unset)
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Prompt template; filled in with str.format(context=..., question=...)
QNA_PROMPT = (
    "\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Answer:\n"
)

def clean_context(context):
    """Return ``context`` with leading and trailing whitespace removed."""
    trimmed = context.strip()
    return trimmed

class Generator:
    """Answer questions with a Gemini model, given supporting context chunks.

    The API key is never embedded in the source. It is taken from the
    ``api_key`` argument or, failing that, from the ``GEMINI_API_KEY``
    environment variable.
    """

    def __init__(self, model_name="models/gemini-2.0-flash", api_key=None):
        """Create the ``GoogleGenerativeAI`` client.

        Args:
            model_name: Model identifier passed to ``GoogleGenerativeAI``.
            api_key: Optional Gemini API key. When omitted, the
                ``GEMINI_API_KEY`` environment variable is used.

        Raises:
            ValueError: If no key is supplied and the environment variable
                is unset or empty.
        """
        # SECURITY: a key literal used to be committed on this line. Treat that
        # key as compromised and revoke it; keys must come from the environment.
        resolved_key = api_key or os.getenv("GEMINI_API_KEY")
        if not resolved_key:
            raise ValueError(
                "No Gemini API key found: set the GEMINI_API_KEY environment "
                "variable or pass api_key= to Generator()."
            )
        self.llm = GoogleGenerativeAI(model=model_name, google_api_key=resolved_key)

    def generate_answer(self, question, chunks):
        """Generate an answer to ``question`` from the given context chunks.

        Args:
            question: Question text inserted into the prompt.
            chunks: Iterable of context strings; joined with newlines and
                whitespace-trimmed before being inserted into the prompt.

        Returns:
            A dict with keys ``"question"``, ``"answer"`` (the stripped model
            output) and ``"references"`` (always ``None``).

        Raises:
            Exception: Any error raised while building the prompt or invoking
                the model is logged and re-raised unchanged.
        """
        try:
            context = "\n".join(chunks)
            prompt = QNA_PROMPT.format(question=question, context=clean_context(context))
            answer = self.llm.invoke(prompt)
            logging.info(f"Generated answer for question: {question}")
            return {"question": question, "answer": answer.strip(), "references": None}
        except Exception as e:
            logging.error(f"Error generating answer: {e}")
            raise

def evaluate_metrics(generated_answers, reference_answers,
                     bleurt_checkpoint="bleurt-checkpoint/BLEURT-20"):
    """Score generated answers against references with four text metrics.

    Both inputs are coerced element-wise with ``str()`` and compared pairwise
    by position.

    Args:
        generated_answers: Sequence of model outputs.
        reference_answers: Sequence of gold answers, same length and order
            as ``generated_answers``.
        bleurt_checkpoint: Path to the BLEURT checkpoint directory.

    Returns:
        A dict with:
          * ``"bertscore"``: dict of mean ``precision`` / ``recall`` / ``f1``
          * ``"rouge"``: dict of mean F-measure for ``rouge1`` / ``rouge2`` /
            ``rougeL``
          * ``"bleurt"``: mean BLEURT score
          * ``"meteor"``: mean METEOR score

    Raises:
        ValueError: If the inputs differ in length or are empty. Without this
            check, ``zip`` would silently drop unmatched items and the ROUGE
            average would divide by zero.
    """
    generated = [str(ans) for ans in generated_answers]
    references = [str(ref) for ref in reference_answers]

    # Validate before any model is loaded so bad input fails fast and clearly.
    if len(generated) != len(references):
        raise ValueError(
            f"Got {len(generated)} generated answers but "
            f"{len(references)} reference answers; the counts must match."
        )
    if not generated:
        raise ValueError("Cannot evaluate metrics on an empty set of answers.")

    # BERTScore
    P, R, F1 = bert_score(generated, references, lang="en", verbose=False)
    bert_scores = {
        "precision": np.mean(P.numpy()),
        "recall": np.mean(R.numpy()),
        "f1": np.mean(F1.numpy())
    }

    # ROUGE (F-measure of each variant, averaged over all pairs)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
    for gen, ref in zip(generated, references):
        scores = scorer.score(ref, gen)
        for key in rouge_scores:
            rouge_scores[key] += scores[key].fmeasure
    rouge_scores = {k: v / len(generated) for k, v in rouge_scores.items()}

    # BLEURT, loaded from the checkpoint directory (e.g. BLEURT-20)
    bleurt_scorer = bleurt_score.BleurtScorer(bleurt_checkpoint)
    bleurt_scores = bleurt_scorer.score(references=references, candidates=generated)
    avg_bleurt = np.mean(bleurt_scores)

    # METEOR (operates on token lists, so tokenize both sides first)
    meteor_scores = []
    for gen, ref in zip(generated, references):
        gen_tokens = word_tokenize(gen)
        ref_tokens = word_tokenize(ref)
        meteor_scores.append(meteor_score([ref_tokens], gen_tokens))
    avg_meteor = np.mean(meteor_scores)

    return {
        "bertscore": bert_scores,
        "rouge": rouge_scores,
        "bleurt": avg_bleurt,
        "meteor": avg_meteor
    }

def main(num_samples=100, request_delay=5):
    """Generate answers for QuAC validation examples and print averaged metrics.

    For each example, the question and context are sent to ``Generator`` and
    the first reference answer is kept as the gold answer. All pairs are then
    scored with ``evaluate_metrics`` and the averages are printed.

    Args:
        num_samples: Number of validation examples to take from the dataset.
        request_delay: Seconds to sleep after each generation request
            (presumably to stay under the API rate limit; confirm the quota).
    """
    from time import sleep

    # Load the QuAC validation split
    dataset = tfds.load("quac", split="validation")

    # Initialize generator
    generator = Generator()

    generated_answers = []
    reference_answers = []

    # Process the first `num_samples` examples
    for example in tqdm(dataset.take(num_samples)):
        question = example["question"].numpy().decode("utf-8")
        context = example["context"].numpy().decode("utf-8")
        # Only the first listed reference answer is used as ground truth.
        answer = example["answers"]["text"][0].numpy().decode("utf-8")

        reference_answers.append(answer)
        response = generator.generate_answer(question, [context])
        generated_answers.append(response["answer"])
        sleep(request_delay)

    # Evaluate metrics
    metrics = evaluate_metrics(generated_answers, reference_answers)

    # Print results; the header reports the number of samples actually scored
    print(f"Evaluation Metrics ({len(generated_answers)} samples):")
    print("BERTScore:")
    print(f"  Precision: {metrics['bertscore']['precision']:.4f}")
    print(f"  Recall: {metrics['bertscore']['recall']:.4f}")
    print(f"  F1: {metrics['bertscore']['f1']:.4f}")
    print("ROUGE:")
    print(f"  ROUGE-1: {metrics['rouge']['rouge1']:.4f}")
    print(f"  ROUGE-2: {metrics['rouge']['rouge2']:.4f}")
    print(f"  ROUGE-L: {metrics['rouge']['rougeL']:.4f}")
    print(f"BLEURT: {metrics['bleurt']:.4f}")
    print(f"METEOR: {metrics['meteor']:.4f}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 100/100 [10:50<00:00,  6.51s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics (100 samples):
BERTScore:
  Precision: 0.8525
  Recall: 0.8652
  F1: 0.8578
ROUGE:
  ROUGE-1: 0.2356
  ROUGE-2: 0.1492
  ROUGE-L: 0.2140
BLUERT: 0.3771
METEOR: 0.2631
