In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

In [3]:
class QAEvaluator():
    def __init__(self, model_dir=None):

        QAE_PRETRAINED = 'iarfmoose/bert-base-cased-qa-evaluator'
        self.SEQ_LENGTH = 512

        self.device = torch.device('cpu')
        # self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.qae_tokenizer = AutoTokenizer.from_pretrained(QAE_PRETRAINED)
        self.qae_model = AutoModelForSequenceClassification.from_pretrained(QAE_PRETRAINED)
        self.qae_model.to(self.device)


    def encode_qa_pairs(self, questions, answers):
        encoded_pairs = []
        for i in range(len(questions)):
            encoded_qa = self._encode_qa(questions[i], answers[i])
            encoded_pairs.append(encoded_qa.to(self.device))
        return encoded_pairs

    def get_scores(self, encoded_qa_pairs):
        scores = {}
        self.qae_model.eval()
        with torch.no_grad():
            for i in range(len(encoded_qa_pairs)):
                scores[i] = self._evaluate_qa(encoded_qa_pairs[i])

        return [k for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)]

    def _encode_qa(self, question, answer):
        if type(answer) is list:
            for a in answer:
                if a['correct']:
                    correct_answer = a['answer']
        else:
            correct_answer = answer
        return self.qae_tokenizer(
            text=question,
            text_pair=correct_answer,
            pad_to_max_length=True,
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt"
        )

    def _evaluate_qa(self, encoded_qa_pair):
        output = self.qae_model(**encoded_qa_pair)
        return output[0][0][1]

In [4]:
qa_evaluator = QAEvaluator()

In [None]:
values = pd.DataFrame()

### Grammar-based QG Results

In [5]:
import json 

generated_questions = []
qg_answers = []

with open('result-0-grammar-based-qg.json') as f:
    data = json.load(f)
    
for item in data:
    question = item['Question']
    answer = item['Answer']
    
    generated_questions.append(question)
    qg_answers.append(answer)

print("Question: ", generated_questions[0])
print("Answer: ", qg_answers[0])

Question:  What is Windows?
Answer:   
Examples of Operating Systems are Windows, Linux, Mac OS, etc


In [11]:
grammar_encoded_qa_pairs = qa_evaluator.encode_qa_pairs(generated_questions, qg_answers)
grammar_scores = qa_evaluator.get_scores(grammar_encoded_qa_pairs)



In [13]:
print(grammar_scores)

[10, 9, 7, 5, 3, 8, 0, 6, 2, 4, 1]


### PKE Question Generation Model Results

In [15]:
import json 

generated_questions = []
qg_answers = []

with open('result-1-pke-qg-model.json') as f:
    data = json.load(f)
    
for item in data:
    question = item['Question']
    answer = item['answer']
    
    generated_questions.append(question)
    qg_answers.append(answer)

print("Question: ", generated_questions[0])
print("Answer: ", qg_answers[0])

Question:  What is one of the most important parts of a computer?
Answer:  Operating system


In [16]:
pke_encoded_qa_pairs = qa_evaluator.encode_qa_pairs(generated_questions, qg_answers)
pke_scores = qa_evaluator.get_scores(pke_encoded_qa_pairs)



In [17]:
print(pke_scores)

[0, 4, 2, 5, 3, 1]


### T5 Base Question Generation Model Results

In [18]:
import json 

generated_questions = []
qg_answers = []

with open('result-2-t5-base-qg.json') as f:
    data = json.load(f)
    
for item in data:
    question = item['Question']
    answer = item['answer']
    
    generated_questions.append(question)
    qg_answers.append(answer)

print("Question: ", generated_questions[0])
print("Answer: ", qg_answers[0])

Question:  What is the most common operating system?
Answer:  Windows


In [19]:
t5_encoded_qa_pairs = qa_evaluator.encode_qa_pairs(generated_questions, qg_answers)
t5_scores = qa_evaluator.get_scores(t5_encoded_qa_pairs)



In [20]:
print(t5_scores)

[1, 0, 2, 5, 4, 3, 7, 6, 9, 8]


### KeyBERT Question Generation Model Results

In [21]:
import json 

generated_questions = []
qg_answers = []

with open('result-3-keybert-qg-model.json') as f:
    data = json.load(f)
    
for item in data:
    question = item['Question']
    answer = item['answer']
    
    generated_questions.append(question)
    qg_answers.append(answer)

print("Question: ", generated_questions[0])
print("Answer: ", qg_answers[0])

Question:  What is the abbreviation for an operating system?
Answer:  Operating systems


In [22]:
keybert_encoded_qa_pairs = qa_evaluator.encode_qa_pairs(generated_questions, qg_answers)
keybert_scores = qa_evaluator.get_scores(keybert_encoded_qa_pairs)



In [23]:
print(keybert_scores)

[4, 0, 1, 2, 3]


### Gemini-based Question Generation Model Results

In [25]:
import json 

generated_questions = []
qg_answers = []

with open('result-5-gemini-prompt-qg-1.json') as f:
    data = json.load(f)
    
for item in data:
    question = item['question']
    answer = item['answer']
    
    generated_questions.append(question)
    qg_answers.append(answer)

print("Question: ", generated_questions[0])
print("Answer: ", qg_answers[0])

Question:  What is the Operating System?
Answer:  System software


In [26]:
gemini_encoded_qa_pairs = qa_evaluator.encode_qa_pairs(generated_questions, qg_answers)
gemini_scores = qa_evaluator.get_scores(gemini_encoded_qa_pairs)



In [27]:
print(gemini_scores)

[4, 2, 3, 5, 1, 0]


### Mistral-based Question Generation Models

In [32]:
import json 

generated_questions = []
qg_answers = []

with open('result-6-mistral-7B-instruct-v2.json', 'r') as f:
    data = json.load(f)

In [34]:
for question_data in data['questions']:
    generated_questions.append(question_data['question'])
    qg_answers.append(answer)

In [35]:
mistral_encoded_qa_pairs = qa_evaluator.encode_qa_pairs(generated_questions, qg_answers)
mistral_scores = qa_evaluator.get_scores(mistral_encoded_qa_pairs)



In [36]:
print(mistral_scores)

[5, 0, 8, 7, 1, 9, 4, 2, 11, 10, 12, 3, 6]
