In [2]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import datasets
import torch
import pandas as pd
from nltk.translate.bleu_score import SmoothingFunction
# All QA model names that need to be evaluated
models = [
    ["damapika/roberta-base_mod_squad",386],
    ["damapika/distilbert-base-uncased_mod_squad",384],
    ["damapika/electra-base-discriminator_squad_mod",386],
]
# Load the SQuAD validation dataset 
test_dataset = load_dataset("squad", split="validation").shuffle()
results=[]

smoother = SmoothingFunction().method1
for model_name in models:
  tokenizer = AutoTokenizer.from_pretrained(model_name[0])
  model=AutoModelForQuestionAnswering.from_pretrained(model_name[0])
  result=[]
  bleu_scores=[]
  for example in test_dataset:
    
    # Get the reference answer and question
    reference_answer = example["answers"]["text"][0]
    question = example["question"]
    context = example["context"]

    # Tokenize the input
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt",max_length=model_name[1],truncation=True)

    # Generate an answer using the model
    outputs = model(**inputs)
    start_index = torch.argmax(outputs.start_logits)
    end_index = torch.argmax(outputs.end_logits)
    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index + 1].tolist()))
    # Compute BLEU-4 score
    reference_tokens = nltk.word_tokenize(reference_answer.lower())
    predicted_tokens = nltk.word_tokenize(predicted_answer.lower())
    bleu_score = sentence_bleu([reference_tokens], predicted_tokens, smoothing_function=smoother)
    bleu_scores.append(bleu_score)
    print(bleu_score)

    # Calculate the average scores
  avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
  result.append(model_name[0])
  result.append(avg_bleu_score)
  results.append(result)
  # Print the average evaluation results
  print(model_name[0])
  print(f"Average BLEU-4 score: {avg_bleu_score}")
df = pd.DataFrame(results)
df.to_csv('qa_models_squad_bleu_eval.csv') 

Found cached dataset squad (C:/Users/dama_/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


1.0
0.1778279410038923
0.06541924356118012
0.7598356856515925
0.1778279410038923
0.316227766016838
0.06541924356118012
0.1778279410038923
0.11362193664674995
0.19180183554164504
0.17216896116316355
1.0
0.24028114141347542
0.316227766016838
0.16348126556655487
1.0
0.5623413251903491
0.316227766016838
0.007913247271422612
0.3976353643835253
0.316227766016838
0.1778279410038923
1.0
0.5623413251903491
0.316227766016838
1.0
0.008853531856477262
0.1778279410038923
0.5623413251903491
0.08034284189446518
0.5623413251903491
0.1778279410038923
0.316227766016838
0.8187307530779819
0.316227766016838
0.5623413251903491
0.1778279410038923
0.316227766016838
0
1.0
1.0
0.06541924356118012
0.14823156396438122
0
1.0
0.08034284189446518
0.316227766016838
0.1778279410038923
0.05372849659117709
0.316227766016838
0.5623413251903491
0.668740304976422
1.0
0.1778279410038923
1.0
1.0
2.1945711360427958e-05
0.5623413251903491
0.19180183554164504
0.1778279410038923
0.05372849659117709
0.316227766016838
0.397635364

In [4]:
df = pd.DataFrame(results)
df.to_csv('qa_models_squad_bleu_eval.csv') 