In [None]:
!pip install transformers datasets evaluate sentencepiece

In [None]:
import gc
import pickle
import re

import evaluate
import torch
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoModelWithLMHead, AutoTokenizer

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Single source of truth for the Hub checkpoint, so the tokenizer and the model
# are always loaded from the same repository.
MODEL_NAME = "mrm8488/t5-base-finetuned-qasc"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# AutoModelWithLMHead is deprecated in transformers; AutoModelForSeq2SeqLM is the
# replacement for encoder-decoder checkpoints such as this T5 model.
# Assigning the result of .to(device) also stops the cell from printing the
# full module repr as its output.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

## Loading Dataset and Distractors

In [None]:
# Only the validation split of SQuAD is loaded; it is indexed row by row below.
squad = load_dataset(path='squad', split='validation')

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


In [None]:
# NOTE(review): this path lives under /content/drive, so Google Drive presumably has
# to be mounted before this cell runs — no mount step is visible in this notebook.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you produced
# yourself or otherwise trust.
distractors_path = '/content/drive/MyDrive/QuestionAnsweringModels/distractors.pkl'
with open(distractors_path, 'rb') as distractors_file:
    distractors = pickle.load(distractors_file)

## Get Answer for Multiple-Choice Question

## Evaluate Distractors

In [None]:
def get_response(input_text, max_length=64, max_input_length=1000):
  """Generate the model's output for one prompt and return the raw decoded text.

  Args:
    input_text: Prompt string handed to the tokenizer.
    max_length: Maximum length passed to `model.generate`.
    max_input_length: Maximum number of input tokens kept by the tokenizer.

  Returns:
    The first generated sequence decoded with `tokenizer.decode`. Special
    tokens are left in the string; the caller parses the answer out of it.
  """
  # truncation=True makes the input cap explicit instead of relying on the
  # tokenizer's implicit fallback when only max_length is supplied.
  features = tokenizer([input_text], return_tensors='pt',
                       max_length=max_input_length, truncation=True).to(device)

  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    output = model.generate(input_ids=features['input_ids'],
                            attention_mask=features['attention_mask'],
                            max_length=max_length)
  response = tokenizer.decode(output[0])

  # Drop the tensor references first so the collector and the CUDA cache
  # release can actually free them.
  del features, output
  gc.collect()
  torch.cuda.empty_cache()

  return response

In [None]:
metric = evaluate.load("squad")


def _extract_answer(decoded):
    """Return the answer text from a decoded generation that still has special tokens."""
    match = re.search(r'> (.*)<', decoded)
    if match is not None:
        return match.group(1)
    # The generation did not have the "<tok> answer<tok>" shape (for example an empty
    # answer, or no closing token). Strip tag-like tokens rather than raising
    # IndexError and losing the whole evaluation run.
    return re.sub(r'</?\w+>', '', decoded).strip()


def evaluate_distractors():
    """Answer every SQuAD example as a multiple-choice question and score the answers.

    For each example the first gold answer becomes option (A) and up to three
    entries from `distractors[i]` become options (B)-(D). The prompt is sent to
    `get_response` and the parsed answer is scored with the SQuAD metric.

    Returns:
        A tuple `(scores, predicted_answers)`: the result of `metric.compute`
        and the list of `{'id', 'prediction_text'}` dicts that were scored.
    """
    # NOTE(review): the gold answer is always option (A); if the model has any
    # positional preference this inflates the score — consider shuffling options.
    theoretical_answers = list()
    predicted_answers = list()
    for i in tqdm(range(len(squad))):
        # Fetch the row once instead of re-indexing the dataset for every field.
        example = squad[i]
        context = example['context']
        question_text = example['question']

        choices = ['(A) ' + example['answers']['text'][0]]
        # Only the first value stored in distractors[i] is used, capped at three
        # candidates. NOTE(review): presumably a dict mapping to a candidate
        # list — verify against the code that produced distractors.pkl.
        candidates = list(distractors[i].values())[0][:3]
        for j, candidate in enumerate(candidates):
            choices.append('(' + chr(ord('B') + j) + ') ' + candidate)
        question = ' '.join([question_text, ' '.join(choices)])
        input_text = 'question: %s  context: %s' % (question, context)

        response = _extract_answer(get_response(input_text))

        theoretical_answers.append({'id': example['id'], 'answers': example['answers']})
        predicted_answers.append({'id': example['id'], "prediction_text": response})
    return metric.compute(predictions=predicted_answers, references=theoretical_answers), predicted_answers

In [None]:
# Runs generation for every loaded example, so this is the slow cell of the notebook.
score, predicted_answers = evaluate_distractors()
# Show the metric as the cell result instead of leaving it bound but unseen.
score

In [None]:
# Persist the predictions so the scoring below can be repeated without regenerating them.
predictions_path = '/content/drive/MyDrive/QuestionAnsweringModels/multiple_choice_answers.pkl'
with open(predictions_path, 'wb') as predictions_file:
    pickle.dump(predicted_answers, predictions_file)

In [None]:
# Rebuild the gold references in dataset order, then score the stored predictions.
theoretical_answers = [
    {'id': example['id'], 'answers': example['answers']}
    for example in squad
]

metric = evaluate.load("squad")
metric.compute(predictions=predicted_answers, references=theoretical_answers)

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

{'exact_match': 90.53926206244087, 'f1': 93.23965073879435}