In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import datasets
import torch
import pandas as pd
from nltk.translate.bleu_score import SmoothingFunction
# Evaluate each fine-tuned extractive QA checkpoint on the SQuAD validation
# split with sentence-level BLEU and save the per-model averages to CSV.

# Each entry is [checkpoint name, max_length handed to the tokenizer].
models = [
    ["damapika/roberta-base_mod_squad", 386],
    ["damapika/distilbert-base-uncased_mod_squad", 384],
    ["damapika/electra-base-discriminator_squad_mod", 386],
]

# Load the SQuAD validation dataset. It is shuffled without a seed, so the
# order of the per-example scores printed below differs between runs.
test_dataset = load_dataset("squad", split="validation").shuffle()

# One [checkpoint name, average BLEU] row per evaluated model.
results = []

# method1 smoothing keeps a missing higher-order n-gram match from zeroing
# the whole score (see the NLTK SmoothingFunction documentation).
smoother = SmoothingFunction().method1

for model_name in models:
    checkpoint, max_length = model_name
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
    model.eval()  # inference only: make sure dropout is disabled

    result = []
    bleu_scores = []
    for example in test_dataset:
        # Only the first reference answer is scored against.
        reference_answer = example["answers"]["text"][0]
        question = example["question"]
        context = example["context"]

        # Tokenize the question/context pair; anything beyond max_length is
        # truncated, so the answer span is searched in the truncated input.
        inputs = tokenizer(
            question,
            context,
            add_special_tokens=True,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
        )

        # No gradients are needed for evaluation; skipping autograd avoids
        # building a computation graph for every example.
        with torch.no_grad():
            outputs = model(**inputs)

        # Most likely start and end token positions, chosen independently.
        # If end falls before start the slice is empty and so is the answer.
        start_index = int(torch.argmax(outputs.start_logits))
        end_index = int(torch.argmax(outputs.end_logits))
        answer_ids = inputs["input_ids"][0][start_index:end_index + 1].tolist()
        predicted_answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(answer_ids)
        )

        # Compute the smoothed sentence-level BLEU on lower-cased word tokens.
        reference_tokens = nltk.word_tokenize(reference_answer.lower())
        predicted_tokens = nltk.word_tokenize(predicted_answer.lower())
        bleu_score = sentence_bleu(
            [reference_tokens], predicted_tokens, smoothing_function=smoother
        )
        bleu_scores.append(bleu_score)
        print(bleu_score)

    # Calculate the average score for this model over the whole dataset.
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
    result.append(checkpoint)
    result.append(avg_bleu_score)
    results.append(result)

    # Print the average evaluation results.
    print(checkpoint)
    print(f"Average BLEU-4 score: {avg_bleu_score}")

df = pd.DataFrame(results)
df.to_csv('qa_models_squad_bleu_eval.csv')

### Quoref eval

In [1]:
import nltk
from nltk.translate.meteor_score import meteor_score as calc_meteor
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import datasets
import torch
import pandas as pd

# Evaluate each fine-tuned extractive QA checkpoint on the Quoref validation
# split with METEOR and save the per-model averages to CSV.

# Each entry is [checkpoint name, max_length handed to the tokenizer].
models = [
    ["damapika/roberta-base_mod_quoref", 386],
    ["damapika/distilbert-base-uncased_mod", 384],
    ["damapika/electra-base-discriminator_mod_quoref", 386],
]

# Load the Quoref validation dataset. It is shuffled without a seed, so the
# order of the per-example scores printed below differs between runs.
test_dataset = load_dataset("quoref", split="validation").shuffle()

# One [checkpoint name, average METEOR] row per evaluated model.
results = []

for model_name in models:
    checkpoint, max_length = model_name
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
    model.eval()  # inference only: make sure dropout is disabled

    result = []
    meteor_scores = []
    for example in test_dataset:
        # Only the first reference answer is scored against.
        reference_answer = example["answers"]["text"][0]
        question = example["question"]
        context = example["context"]

        # Tokenize the question/context pair; anything beyond max_length is
        # truncated, so the answer span is searched in the truncated input.
        inputs = tokenizer(
            question,
            context,
            add_special_tokens=True,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
        )

        # No gradients are needed for evaluation; skipping autograd avoids
        # building a computation graph for every example.
        with torch.no_grad():
            outputs = model(**inputs)

        # Most likely start and end token positions, chosen independently.
        # If end falls before start the slice is empty and so is the answer.
        start_index = int(torch.argmax(outputs.start_logits))
        end_index = int(torch.argmax(outputs.end_logits))
        answer_ids = inputs["input_ids"][0][start_index:end_index + 1].tolist()
        predicted_answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(answer_ids)
        )

        # Compute the METEOR score on lower-cased word tokens.
        reference_tokens = nltk.word_tokenize(reference_answer.lower())
        predicted_tokens = nltk.word_tokenize(predicted_answer.lower())
        meteor_score = calc_meteor([reference_tokens], predicted_tokens)
        meteor_scores.append(meteor_score)
        print(meteor_score)

    # Calculate the average score for this model over the whole dataset.
    avg_meteor_score = sum(meteor_scores) / len(meteor_scores)
    result.append(checkpoint)
    result.append(avg_meteor_score)
    results.append(result)

    # Print the average evaluation results.
    print(checkpoint)
    print(f"Average meteor score: {avg_meteor_score}")

df = pd.DataFrame(results)
# The file name states the metric actually stored (METEOR). It previously
# said "bleu", which mislabelled the data and would overwrite any Quoref
# BLEU results saved under that name.
df.to_csv('qa_models_quoref_meteor_eval.csv')

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset quoref (C:/Users/dama_/.cache/huggingface/datasets/quoref/default/0.1.0/82bb58a6b25cd8dbb4625a7ba6a5d0a224af1f4d392ca0de8b9e0c23e78557fe)


0.996
0.0
0.5
0.5
0.9375
0.5
0.9375
0.0
0.9375
0.5
0.5
0.5
0.0
0.9375
0.0
0.5
0.0
0.0
0.5
0.996
0.5
0.0
0.9375
0.0
0.9375
0.5
0.5
0.5
0.0
0.21739130434782608
0.0
0.5
0.5
0.9814814814814815
0.0
0.996
0.0
0.5
0.0
0.9375
0.5
0.5
0.5
0.0
0.5
0.5
0.5
0.0
0.9375
0.9375
0.0
0.5
0.0
0.9814814814814815
0.5
0.0
0.0
0.5
0.9375
0.5
0.0
0.0
0.0
0.0
0.5
0.0
0.027855153203342625
0.5
0.0
0.9814814814814815
0.0
0.9375
0.9375
0.5
0.5
0.9921875
0.5
0.0
0.5
0.0
0.9375
0.5
0.5
0.9375
0.5
0.6465517241379309
0.0
0.0
0.0
0.9814814814814815
0.9375
0.9921875
0.5
0.0
0.0
0.9375
0.0
0.5
0.0
0.5
0.0
0.5
0.5
0.0
0.5
0.0
0.5
0.9814814814814815
0.0
0.9375
0.9375
0.0
0.5
0.9375
0.0
0.5
0.0
0.5
0.9375
0.5
0.5
0.5
0.9375
0.0
0.0
0.0
0.0
0.9814814814814815
0.5
0.9498207885304659
0.9375
0.38265306122448983
0.0
0.9679878048780488
0.5
0.45454545454545453
0.0
0.5
0.5
0.0
0.03546099290780143
0.0
0.5
0.9375
0.0
0.5
0.5
0.2173913043478261
0.0
0.5
0.0
0.0
0.0
0.9921875
0.08333333333333336
0.9375
0.9375
0.5
0.0
0.0
0.0
0.5
0.9375