In [2]:
import nltk
from nltk.translate.meteor_score import meteor_score as calc_meteor
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import datasets
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
test_dataset = load_dataset("squad", split="validation")
test_dataset.shuffle()

Found cached dataset squad (C:/Users/dama_/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [6]:
models = [
    ["damapika/roberta-base_mod_squad",386],
    ["damapika/distilbert-base-uncased_mod_squad",384],
    ["damapika/electra-base-discriminator_squad_mod",386]
]
# Load the SQuAD test dataset 
results=[]

for model_name in models:
  tokenizer = AutoTokenizer.from_pretrained(model_name[0])
  model=AutoModelForQuestionAnswering.from_pretrained(model_name[0])
  result=[]
  meteor_scores=[]
  for example in test_dataset:
    
    # Get the reference answer and question
    reference_answer = example["answers"]["text"][0]
    question = example["question"]
    context = example["context"]

    # Tokenize the input
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt",max_length=model_name[1],truncation=True)

    # Generate an answer using the model
    outputs = model(**inputs)
    start_index = torch.argmax(outputs.start_logits)
    end_index = torch.argmax(outputs.end_logits)
    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index + 1].tolist()))
    # Compute METEOR score
    reference_tokens = nltk.word_tokenize(reference_answer.lower())
    predicted_tokens = nltk.word_tokenize(predicted_answer.lower())
    meteor_score = calc_meteor([reference_tokens], predicted_tokens)
    meteor_scores.append(meteor_score)
    print(meteor_score)

    # Calculate the average scores
  avg_meteor_score = sum(meteor_scores) / len(meteor_scores)
  result.append(model_name[0])
  result.append(avg_meteor_score)
  results.append(result)
  # Print the average evaluation results
  print(model_name[0])
  print(f"Average meteor score: {avg_meteor_score}")
df = pd.DataFrame(results)
df.to_csv('qa_models_squad_meteor_eval.csv') 

0.9375
0.9375
0.7937500000000001
0.9375
0.5
0.4934210526315789
0.9921875
0.9814814814814815
0.4934210526315789
0.9814814814814815
0.9921875
0.9375
0.9814814814814815
0.8522727272727273
0.9814814814814815
0.5
0.0
0.8522727272727273
0.9814814814814815
0.5
0.9921875
0.0
0.9375
0.0
0.9375
0.5
0.9375
0.8099489795918368
0.8928571428571429
0.9375
0.9375
0.5
0.5
0.9375
0.9814814814814815
0.9375
0.9814814814814815
0.9814814814814815
0.5
0.9375
0.5
0.9375
0.5
0.5
0.9814814814814815
0.9375
0.9375
0.0
0.9814814814814815
0.9375
0.9814814814814815
0.9375
0.9375
0.9375
0.2631578947368421
0.9375
0.5
0.5
0.6465517241379309
0.17857142857142855
0.078125
0.5
0.9375
0.5
0.5
0.9375
0.5
0.5
0.5
0.9375
0.5
0.5
0.5
0.2631578947368421
0.9375
0.5
0.5
0.5
0.9375
0.5
0.5
0.9814814814814815
0.5
0.9921875
0.9814814814814815
0.5
0.9814814814814815
0.5
0.9375
0.5
0.5
0.9814814814814815
0.9375
0.5
0.5
0.9814814814814815
0.5
0.9921875
0.5
0.5
0.9921875
0.9814814814814815
0.9814814814814815
0.5
0.9921875
0.9375
0.9375
0.

### Quoref eval meteor

In [1]:
import nltk
from nltk.translate.meteor_score import meteor_score as calc_meteor
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import datasets
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
models = [["damapika/roberta-base_mod_quoref",386],
["damapika/distilbert-base-uncased_mod",384],
["damapika/electra-base-discriminator_mod_quoref",386]]

In [5]:
test_dataset = load_dataset("quoref", split="validation").shuffle()

Found cached dataset quoref (C:/Users/dama_/.cache/huggingface/datasets/quoref/default/0.1.0/82bb58a6b25cd8dbb4625a7ba6a5d0a224af1f4d392ca0de8b9e0c23e78557fe)


In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [6]:
results=[]

for model_name in models:
  tokenizer = AutoTokenizer.from_pretrained(model_name[0])
  model=AutoModelForQuestionAnswering.from_pretrained(model_name[0])
  result=[]
  meteor_scores=[]
  for example in test_dataset:
    
    # Get the reference answer and question
    reference_answer = example["answers"]["text"][0]
    question = example["question"]
    context = example["context"]

    # Tokenize the input
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt",max_length=model_name[1],truncation=True)

    # Generate an answer using the model
    outputs = model(**inputs)
    start_index = torch.argmax(outputs.start_logits)
    end_index = torch.argmax(outputs.end_logits)
    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index + 1].tolist()))
    # Compute METEOR score
    reference_tokens = nltk.word_tokenize(reference_answer.lower())
    predicted_tokens = nltk.word_tokenize(predicted_answer.lower())
    meteor_score = calc_meteor([reference_tokens], predicted_tokens)
    meteor_scores.append(meteor_score)
    print(meteor_score)

    # Calculate the average scores
  avg_meteor_score = sum(meteor_scores) / len(meteor_scores)
  result.append(model_name[0])
  result.append(avg_meteor_score)
  results.append(result)
  # Print the average evaluation results
  print(model_name[0])
  print(f"Average meteor score: {avg_meteor_score}")
df = pd.DataFrame(results)
df.to_csv('qa_models_quoref_meteor_eval.csv') 

0.0
0.5
0.0
0.0
0.0
0.9921875
0.9375
0.9375
0.9375
0.0
0.5
0.0
0.0
0.2631578947368421
0.5
0.5
0.5
0.5
0.5
0.0
0.5681818181818182
0.5
0.0
0.9921875
0.25
0.19230769230769232
0.6465517241379309
0.0
0.0
0.9976851851851852
0.5
0.0
0.0
0.0
0.5
0.5
0.0
0.0
0.2631578947368421
0.5
0.5
0.5
0.0
0.0
0.2631578947368421
0.5
0.5
0.0
0.06097560975609756
0.9375
0.5
0.5
0.754985754985755
0.5
0.9375
0.9375
0.5
0.5
0.2631578947368421
0.0
0.5
0.5
0.5
0.0
0.0
0.9814814814814815
0.9375
0.5
0.5
0.5
0.5
0.5
0.0
0.0
0.5
0.5
0.0
0.0
0.0
0.0
0.0
0.0
0.9921875
0.5
0.5
0.0
0.9375
0.5
0.5
0.0
0.5
0.5
0.5
0.0
0.0
0.0
0.5
0.5
0.9375
0.0
0.0
0.5
0.0
0.0
0.5
0.5
0.0
0.5
0.5
0.5
0.0
0.9375
0.0
0.5
0.5
0.9814814814814815
0.5
0.5
0.0
0.5
0.9375
0.9375
0.9976851851851852
0.9814814814814815
0.5
0.45454545454545453
0.5
0.5
0.9375
0.5
0.9814814814814815
0.0
0.6465517241379309
0.9375
0.5
0.5
0.0
0.5
0.9375
0.0
0.5
0.5
0.5
0.0
0.9375
0.9375
0.8152173913043478
0.5
0.0
0.0
0.5
0.5
0.5
0.9375
0.9814814814814815
0.45454545454545453
