In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import datasets
import torch
import pandas as pd
from nltk.translate.bleu_score import SmoothingFunction
# Checkpoints to evaluate. Each entry pairs a model id with the max_length
# that is passed to the tokenizer when encoding a (question, context) pair.
models = [
    ["damapika/roberta-base_mod", 386],
    ["damapika/distilbert-base-uncased_mod", 384],
    ["damapika/electra-base-discriminator_squad_mod", 386],
]

# Validation split of SQuAD; every model is scored on the same examples.
val_dataset = load_dataset(path="squad", split="validation")

# Collects one [model id, average BLEU] row per evaluated model.
results = []

# Smoothing function handed to sentence_bleu for every example.
smoother = SmoothingFunction().method1
# Score every QA checkpoint on the SQuAD validation split with sentence-level
# BLEU and record one [checkpoint, average BLEU] row per model in `results`.
for model_name in models:
  # Each entry of `models` is [checkpoint id, max_length for the encoded pair].
  checkpoint, max_length = model_name
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
  bleu_scores = []
  for example in val_dataset:
    # Only the first reference answer of each example is used for scoring.
    reference_answer = example["answers"]["text"][0]
    question = example["question"]
    context = example["context"]

    # Encode the (question, context) pair, truncated to this model's max_length.
    # Calling the tokenizer directly replaces the deprecated encode_plus().
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
    )

    # Pure inference: no_grad avoids building an autograd graph per example.
    with torch.no_grad():
      outputs = model(**inputs)

    # Greedy span: most likely start and end positions, picked independently.
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits))
    # If end_index < start_index the slice is empty and so is the prediction.
    answer_ids = inputs["input_ids"][0][start_index:end_index + 1].tolist()
    predicted_answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(answer_ids)
    )

    # BLEU on lower-cased word tokens, smoothed with `smoother`.
    # NOTE(review): nltk.word_tokenize needs the NLTK tokenizer data to be
    # installed already; nothing in this notebook downloads it.
    reference_tokens = nltk.word_tokenize(reference_answer.lower())
    predicted_tokens = nltk.word_tokenize(predicted_answer.lower())
    bleu_score = sentence_bleu([reference_tokens], predicted_tokens, smoothing_function=smoother)
    bleu_scores.append(bleu_score)

  # Average over all validation examples and record the row for this model.
  avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
  result = [checkpoint, avg_bleu_score]
  results.append(result)
  # Report only the per-model summary; per-example scores stay in bleu_scores.
  print(checkpoint)
  print(f"Average BLEU-4 score: {avg_bleu_score}")


In [4]:
# Persist the collected per-model rows as a CSV file in the working directory.
# Column labels and the row index are left at the pandas defaults.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf='qa_models_squad_bleu_eval.csv')