# Multiple-Choice Question Answering

In [1]:
# Show the GPU (nvidia-smi) and CPU (lscpu) of the runtime before any model is loaded.
!nvidia-smi
!lscpu

Sat Aug 14 14:34:30 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8    34W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Install the notebook's third-party dependencies, each pinned to an exact version.
!pip install hazm==0.7.0
!pip install seqeval==1.2.2
!pip install sentencepiece==0.1.96
!pip install transformers==4.7.0
!pip install clean-text[gpl]==0.4.0
!pip install editdistance==0.5.3

Collecting hazm==0.7.0
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 4.2 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 40.1 MB/s 
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 48.4 MB/s 
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394484 sha256=ea80f5cb0a55f2ba66bf23159ad8bc98b6066ac01a05049f60e2e2f5fd05c181
  Stored in directory: /root/.cache/pip/wheels/9b/fd/0c/d92302c876e5de87ebd7fc0979d82edb93e2d8d768bf71fac4
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp37-cp37m-linux_x86_64.whl size=154594 sha256=fdc323abaa0b8b39cc32cc7e25ee8ea3222a96eaa4928141930d176a042674bd
 

In [3]:
# Install PyDrive (version not pinned) and build an authenticated Google Drive client.
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client object used later in the notebook to upload result files.
drive = GoogleDrive(gauth)



In [4]:
# Import required packages
import os
import gc
import re
import hazm
import time
import json
import collections
import numpy as np
import pandas as pd
import editdistance

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import transformers
from transformers import AutoConfig, AutoTokenizer
from transformers import AutoModelForMultipleChoice
from transformers import MT5Config, MT5ForConditionalGeneration, MT5Tokenizer
from transformers.data.metrics.squad_metrics import compute_exact, compute_f1

from cleantext import clean

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Report the versions of the main libraries, surrounded by blank lines.
print()
print('numpy', np.__version__)
print('pandas', pd.__version__)
print('transformers', transformers.__version__)
print('torch', torch.__version__)
print()

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')



numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [5]:
class MultipleChoiceQADataset(torch.utils.data.Dataset):
    """Map-style dataset producing model-ready samples for multiple-choice QA.

    With ``model_type == "mt5"`` a sample is one text: the question followed by all of
    its candidates, joined with ``' <sep> '`` and tokenized to ``max_length`` tensors.
    For any other ``model_type`` every candidate is tokenized on its own, as the second
    segment of a pair whose first segment is the empty string, and the per-candidate
    encodings are stacked into one tensor per field.
    """

    def __init__(self, questions, candidates, choices, answers, tokenizer, max_length, model_type):
        # Parallel sequences: position i of each one describes sample i.
        self.questions, self.candidates = questions, candidates
        self.choices, self.answers = choices, answers
        self.tokenizer, self.max_length = tokenizer, max_length
        self.model_type = model_type

    def __len__(self):
        """Return the number of questions."""
        return len(self.questions)

    def __getitem__(self, item):
        """Build the sample at index ``item`` as a dict of metadata and encoded inputs.

        The mt5 sample carries the raw ``choice`` plus ``input_text``; the other layout
        carries ``choice`` converted to a zero-based int plus ``token_type_ids``.
        """
        question = self.questions[item]
        options = self.candidates[item]

        if self.model_type != "mt5":
            # One tokenizer call per candidate; only the first entry ([0]) of each
            # returned field is kept.
            encoded_options = [
                self.tokenizer(
                    "",  # empty context
                    question + " " + option,
                    add_special_tokens=True,
                    max_length=self.max_length,
                    padding="max_length",
                    truncation=True,
                    return_overflowing_tokens=True
                )
                for option in options
            ]
            return {
                'item': str(item),
                'question': question,
                'candidates': ' <sep> '.join(options),
                'choice': int(self.choices[item]) - 1,
                'answer': self.answers[item],
                'input_ids': torch.LongTensor([enc.input_ids[0] for enc in encoded_options]),
                'attention_mask': torch.LongTensor([enc.attention_mask[0] for enc in encoded_options]),
                'token_type_ids': torch.LongTensor([enc.token_type_ids[0] for enc in encoded_options])
            }

        # mt5: question and candidates travel together as a single input text.
        joined_options = ' <sep> '.join(options)
        input_text = question + ' <sep> ' + joined_options
        encoding = self.tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        return {
            'item': str(item),
            'question': question,
            'candidates': joined_options,
            'input_text': input_text,
            'choice': self.choices[item],
            'answer': self.answers[item],
            'input_ids': encoding.input_ids.flatten(),
            'attention_mask': encoding.attention_mask.flatten()
        }


class MultipleChoiceQA:
    """Inference and evaluation helper for pretrained multiple-choice QA models.

    Two code paths exist. ``model_type="mt5"`` loads an ``MT5ForConditionalGeneration``
    model that generates the answer text; ``"mbert"``, ``"parsbert"`` and ``"wikibert"``
    load an ``AutoModelForMultipleChoice`` model whose logits are arg-maxed over the
    candidates. Problems are reported by printing a message and returning ``None``
    rather than by raising.
    """

    # Model types served by the AutoModelForMultipleChoice code path.
    _ENCODER_MODEL_TYPES = ("mbert", "parsbert", "wikibert")

    def __init__(self, model_name, model_type):
        """Load tokenizer, config and model for ``model_name``.

        Args:
            model_name: identifier handed to the ``from_pretrained`` loaders.
            model_type: "mt5", "mbert", "parsbert" or "wikibert" (case-insensitive).

        For any other ``model_type`` a message is printed and ``tokenizer``, ``config``
        and ``model`` stay ``None``, so the public methods print their guard message and
        return ``None`` instead of failing on a missing attribute.
        """
        self.normalizer = hazm.Normalizer()
        self.model_name = model_name
        # Set on every path: `evaluation` reads it and the mt5 path used to leave it undefined.
        self.model_type = model_type.lower()
        self.tokenizer = None
        self.config = None
        self.model = None

        if self.model_type == "mt5":
            self.tokenizer = MT5Tokenizer.from_pretrained(model_name)
            self.model = MT5ForConditionalGeneration.from_pretrained(model_name)
            self.config = MT5Config.from_pretrained(self.model_name)
        elif self.model_type in self._ENCODER_MODEL_TYPES:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.config = AutoConfig.from_pretrained(self.model_name)
            self.model = AutoModelForMultipleChoice.from_pretrained(self.model_name, config=self.config)
        else:
            print('model_type not supported!')

    def _is_ready(self):
        """Return True when both model and tokenizer are loaded; otherwise print and return False."""
        if getattr(self, 'model', None) is None or getattr(self, 'tokenizer', None) is None:
            print('Something wrong has been happened!')
            return False
        return True

    @staticmethod
    def _lengths_match(questions, candidates, choices, answers):
        """Return True when the four parallel inputs have equal lengths; otherwise print which pair differs."""
        if len(questions) != len(candidates):
            print('length of two inputs is not equal!!')
            return False
        if len(choices) != len(answers):
            print('length of choices and answers is not equal!!')
            return False
        if len(questions) != len(answers):
            print('length of inputs and answers is not equal!!')
            return False
        return True

    def _prepare_model(self, device):
        """Free cached memory, move the model to ``device`` and switch it to eval mode."""
        gc.collect()
        torch.cuda.empty_cache()
        # `.to(device)` honours the requested device, including the CPU and a specific GPU index.
        self.model.to(device)
        self.model.eval()

    @staticmethod
    def _print_choice_metrics(golden_choices, predicted_choices):
        """Print accuracy, weighted precision/recall/F1 and the classification report."""
        print("Test Accuracy: {}".format(accuracy_score(golden_choices, predicted_choices)))
        print("Test Precision: {}".format(precision_score(golden_choices, predicted_choices, average="weighted")))
        print("Test Recall: {}".format(recall_score(golden_choices, predicted_choices, average="weighted")))
        print("Test F1-Score(weighted average): {}".format(
            f1_score(golden_choices, predicted_choices, average="weighted")))
        print("Test classification Report:\n{}".format(
            classification_report(golden_choices, predicted_choices, digits=10)))

    def load_dataset_test_file(self, dataset_name, dataset_file, **kwargs):
        """Read a ParsiNLU multiple-choice JSON-lines file.

        Each non-blank line must be a JSON object with the keys ``question``,
        ``candidates`` and ``answer``, where ``answer`` is the 1-based index of the
        correct candidate.

        Args:
            dataset_name: one of the supported ParsiNLU names (case-insensitive).
            dataset_file: path of the ``.jsonl`` file.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            ``(questions, candidates, choices, answers)`` as four parallel lists, where
            ``choices`` holds the raw ``answer`` values and ``answers`` the candidate
            text they point to. ``None`` (after printing a message) when the dataset
            name is unsupported or the file does not exist.
        """
        supported_names = ("parsinlu", "parsinlu-literature", "parsinlu-math_and_logic",
                           "parsinlu-common_knowledge")
        if dataset_name.lower() not in supported_names:
            print(f'{dataset_name} not supported!')
            return
        if not os.path.exists(dataset_file):
            print(f'{dataset_file} not exists!')
            return

        questions, candidates, choices, answers = [], [], [], []
        with open(dataset_file, encoding="utf8") as infile:
            for line in infile:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing empty line) are not records.
                    continue
                record = json.loads(line)
                candidate_answers = record['candidates']
                questions.append(record['question'])
                candidates.append(candidate_answers)
                choices.append(record['answer'])
                answers.append(candidate_answers[int(record['answer']) - 1])
        return questions, candidates, choices, answers

    def multiple_choice_qa_inference(self, questions, candidates, device, max_length=512):
        """Pick one candidate per question with the AutoModelForMultipleChoice model.

        All questions are encoded and scored as a single batch.

        Args:
            questions: list of question strings.
            candidates: list of candidate-string lists, parallel to ``questions``.
            device: ``torch.device`` the tensors and the model are moved to.
            max_length: tokenizer padding/truncation length per (question, candidate) text.

        Returns:
            A list of ``(question, candidates, selected_candidate)`` tuples, or ``None``
            when no model/tokenizer is loaded.
        """
        if not self._is_ready():
            return

        input_ids, attention_masks, token_type_ids = [], [], []
        for q, cs in zip(questions, candidates):
            choices_input_ids, choices_attention_masks, choices_token_type_ids = [], [], []
            for c in cs:
                text_a = ""  # empty context
                text_b = q + " " + c
                inputs = self.tokenizer(
                    text_a,
                    text_b,
                    add_special_tokens=True,
                    max_length=max_length,
                    padding="max_length",
                    truncation=True,
                    return_overflowing_tokens=True,
                )
                # Only the first entry of each returned field is used.
                choices_input_ids.append(inputs.input_ids[0])
                choices_attention_masks.append(inputs.attention_mask[0])
                choices_token_type_ids.append(inputs.token_type_ids[0])
            input_ids.append(choices_input_ids)
            attention_masks.append(choices_attention_masks)
            token_type_ids.append(choices_token_type_ids)

        input_ids = torch.LongTensor(input_ids).to(device)
        attention_masks = torch.LongTensor(attention_masks).to(device)
        token_type_ids = torch.LongTensor(token_type_ids).to(device)

        self._prepare_model(device)

        # Inference only: no autograd graph is needed for the forward pass.
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_masks,
                                 token_type_ids=token_type_ids)
        predictions = torch.argmax(outputs.logits, dim=1)
        return [(questions[i], candidates[i], candidates[i][p.item()]) for i, p in enumerate(predictions)]

    def mt5_multiple_choice_qa_inference(self, questions, candidates, device):
        """Generate an answer text per question with the mT5 model.

        Each input is the question followed by its candidates, joined with ``' <sep> '``.

        Args:
            questions: list of question strings.
            candidates: list of candidate-string lists, parallel to ``questions``.
            device: ``torch.device`` the tensors and the model are moved to.

        Returns:
            A list of ``(question, candidates, generated_text)`` tuples, or ``None`` when
            no model/tokenizer is loaded. The generated text is returned as decoded and
            is not mapped back onto a candidate.
        """
        if not self._is_ready():
            return

        new_input = [q + ' <sep> ' + ' <sep> '.join(cs) for q, cs in zip(questions, candidates)]

        tokenized_batch = self.tokenizer(
            new_input,
            padding=True,
            return_tensors="pt"
        )

        self._prepare_model(device)

        input_ids = tokenized_batch.input_ids.to(device)
        attention_mask = tokenized_batch.attention_mask.to(device)

        outputs = self.model.generate(input_ids=input_ids, attention_mask=attention_mask)
        predictions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return [(questions[i], candidates[i], p) for i, p in enumerate(predictions)]

    def evaluation(self, questions, candidates, choices, answers, device, max_length, batch_size=4):
        """Evaluate the AutoModelForMultipleChoice model and print loss, timing and metrics.

        Args:
            questions, candidates, choices, answers: parallel lists as returned by
                ``load_dataset_test_file``; ``choices`` are 1-based.
            device: ``torch.device`` used for tensors and model.
            max_length: tokenizer padding/truncation length.
            batch_size: number of questions per batch.

        Returns:
            A list of ``(question, candidates, gold_index, gold_answer, predicted_index,
            predicted_answer)`` tuples with zero-based indices; an empty list for empty
            input; ``None`` when no model is loaded or the input lengths differ.
        """
        if not self._is_ready():
            return
        if not self._lengths_match(questions, candidates, choices, answers):
            return
        if len(questions) == 0:
            # Nothing to average or score; avoids dividing by zero below.
            print('no samples to evaluate!')
            return []

        dataset = MultipleChoiceQADataset(questions=questions, candidates=candidates, choices=choices, answers=answers,
                                          tokenizer=self.tokenizer, max_length=max_length, model_type=self.model_type)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#question:{len(questions)}, #candidates:{len(candidates)}, #answer:{len(answers)}')
        print("#batch:", len(data_loader))

        self._prepare_model(device)

        total_loss, total_time = 0, 0
        output_predictions = []
        golden_choices, predicted_choices = [], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)
            b_token_type_ids = batch['token_type_ids'].to(device)
            b_choices = batch['choice'].to(device)

            # Passing `labels` makes the model return a loss alongside the logits.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model(input_ids=b_input_ids, attention_mask=b_attention_mask,
                                       token_type_ids=b_token_type_ids, labels=b_choices)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')
            total_loss += b_outputs.loss.item()

            golden_choices.extend(b_choices.cpu().detach().numpy().tolist())
            b_predictions = torch.argmax(b_outputs.logits, dim=1)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            predicted_choices.extend(b_predictions)

            for i in range(len(b_input_ids)):
                sample_candidates = batch['candidates'][i].split(' <sep> ')
                output_predictions.append((
                    batch['question'][i],
                    sample_candidates,
                    batch['choice'][i].item(),
                    batch['answer'][i],
                    b_predictions[i],
                    sample_candidates[b_predictions[i]]
                ))

        # Average of the per-batch losses over the evaluated data.
        avg_loss = total_loss / len(data_loader)
        print("average loss:", avg_loss)

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(questions))

        # evaluate
        self._print_choice_metrics(golden_choices, predicted_choices)
        return output_predictions

    def mt5_evaluation(self, questions, candidates, choices, answers, device, max_length, batch_size=4):
        """Evaluate the mT5 model and print timing, choice metrics and exact/F1 scores.

        The generated text is mapped to a 1-based choice: its position among the
        candidates when it matches one exactly, otherwise the candidate with the smallest
        length-normalized edit distance.

        Args:
            questions, candidates, choices, answers: parallel lists as returned by
                ``load_dataset_test_file``.
            device: ``torch.device`` used for tensors and model.
            max_length: tokenizer padding/truncation length of the input text.
            batch_size: number of questions per batch.

        Returns:
            A list of ``(question, candidates, gold_choice, gold_answer, predicted_choice,
            generated_text, exact, f1)`` tuples with choices as strings; an empty list for
            empty input; ``None`` when no model is loaded or the input lengths differ.
        """
        if not self._is_ready():
            return
        if not self._lengths_match(questions, candidates, choices, answers):
            return
        if len(questions) == 0:
            # Nothing to score; avoids dividing by zero below.
            print('no samples to evaluate!')
            return []

        dataset = MultipleChoiceQADataset(questions=questions, candidates=candidates, choices=choices, answers=answers,
                                          tokenizer=self.tokenizer, max_length=max_length, model_type="mt5")
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#question:{len(questions)}, #candidates:{len(candidates)}, #answer:{len(answers)}')
        print("#batch:", len(data_loader))

        self._prepare_model(device)

        total_time = 0
        output_predictions = []
        golden_choices, predicted_choices, exact_score_list, f1_score_list = [], [], [], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)

            # `generate` returns generated token ids; no labels are passed and no loss is computed.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model.generate(input_ids=b_input_ids, attention_mask=b_attention_mask)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')

            b_predictions = self.tokenizer.batch_decode(b_outputs, skip_special_tokens=True)

            for i in range(len(b_input_ids)):
                generated = b_predictions[i]
                sample_candidates = batch['candidates'][i].split(' <sep> ')
                if generated in sample_candidates:
                    predicted_choice = str(sample_candidates.index(generated) + 1)
                else:
                    # The denominator is non-zero here: it is zero only when both strings
                    # are empty, and that case is an exact match handled above.
                    normalized_edit_distance_list = [
                        editdistance.distance(ca, generated) / max(len(ca), len(generated))
                        for ca in sample_candidates
                    ]
                    predicted_choice = str(normalized_edit_distance_list.index(min(normalized_edit_distance_list)) + 1)

                # Integer choices are collated into tensors; compare gold and predicted as strings.
                gold_choice = batch['choice'][i]
                if torch.is_tensor(gold_choice):
                    gold_choice = gold_choice.item()
                gold_choice = str(gold_choice)

                golden_choices.append(gold_choice)
                predicted_choices.append(predicted_choice)

                exact_score_list.append(compute_exact(batch['answer'][i], generated))
                f1_score_list.append(compute_f1(batch['answer'][i], generated))

                output_predictions.append((
                    batch['question'][i],
                    sample_candidates,
                    gold_choice,
                    batch['answer'][i],
                    predicted_choice,
                    generated,
                    exact_score_list[-1],
                    f1_score_list[-1]
                ))

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(questions))

        # evaluate
        self._print_choice_metrics(golden_choices, predicted_choices)

        total = len(exact_score_list)
        evaluation_results = collections.OrderedDict(
            [
                ("exact", 100.0 * sum(exact_score_list) / total),
                ("f1", 100.0 * sum(f1_score_list) / total),
                ("total", total),
            ]
        )
        print("evaluation results:\n", evaluation_results)

        return output_predictions


In [6]:
# Load the `persiannlp/mt5-large-parsinlu-multiple-choice` checkpoint through the mt5 code path
# of MultipleChoiceQA and print the configuration it was loaded with.
model_name='persiannlp/mt5-large-parsinlu-multiple-choice'
mcqa_model = MultipleChoiceQA(model_name=model_name, model_type="mt5")
print(mcqa_model.config)

Downloading:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/697 [00:00<?, ?B/s]

You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.


MT5Config {
  "_name_or_path": "/home/patrick/hugging_face/t5/mt5-large",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.7.0",
  "use_cache": true,
  "vocab_size": 250112
}



## Sample Inference

In [7]:
# Three hand-written questions; candidate_list[i] holds the four candidate answers of question_list[i].
question_list = [
    "وسیع ترین کشور جهان کدام است؟",
    "طامع یعنی ؟",
    "زمینی به ۳۱ قطعه متساوی مفروض شده است و هر روز مساحت آماده شده برای احداث، دو برابر مساحت روز قبل است.اگر پس از (۵ روز) تمام زمین آماده شده باشد، در چه روزی یک قطعه زمین آماده شده"
]
candidate_list=[
    ["آمریکا", "کانادا", "روسیه", "چین"],
    ["آزمند", "خوش شانس", "محتاج", "مطمئن"],
    ["روز اول", "روز دوم", "روز سوم", "هیچکدام"]
]
# Last expression of the cell: the returned (question, candidates, generated answer) tuples are displayed.
mcqa_model.mt5_multiple_choice_qa_inference(question_list, candidate_list, device)

[('وسیع ترین کشور جهان کدام است؟',
  ['آمریکا', 'کانادا', 'روسیه', 'چین'],
  'چین'),
 ('طامع یعنی ؟', ['آزمند', 'خوش شانس', 'محتاج', 'مطمئن'], 'محتاج'),
 ('زمینی به ۳۱ قطعه متساوی مفروض شده است و هر روز مساحت آماده شده برای احداث، دو برابر مساحت روز قبل است.اگر پس از (۵ روز) تمام زمین آماده شده باشد، در چه روزی یک قطعه زمین آماده شده',
  ['روز اول', 'روز دوم', 'روز سوم', 'هیچکدام'],
  'روز اول')]

## Multiple-Choice Dataset


In [8]:
# Fetch the ParsiNLU repository and list its top level and the multiple-choice data files.
!git clone https://github.com/persiannlp/parsinlu
!ls parsinlu
!ls parsinlu/data/multiple-choice/

Cloning into 'parsinlu'...
remote: Enumerating objects: 1434, done.[K
remote: Counting objects: 100% (182/182), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 1434 (delta 110), reused 139 (delta 82), pack-reused 1252[K
Receiving objects: 100% (1434/1434), 27.81 MiB | 12.89 MiB/s, done.
Resolving deltas: 100% (913/913), done.
data  LICENSE  README.md  requirements.txt  scripts  src
test_ck.jsonl  test_lit.jsonl  train.jsonl
test.jsonl     test_ml.jsonl   valid.jsonl


### Samples with literature as their category

In [9]:
# Load the literature test split, then show its first sample and the length of each list.
(
    test_questions_lit,
    test_candidates_lit,
    test_choices_lit,
    test_answers_lit,
) = mcqa_model.load_dataset_test_file(
    dataset_file="./parsinlu/data/multiple-choice/test_lit.jsonl",
    dataset_name="parsinlu-literature",
)
print(test_questions_lit[0], test_candidates_lit[0], test_choices_lit[0], test_answers_lit[0], sep='\n')
print(len(test_questions_lit), len(test_candidates_lit), len(test_choices_lit), len(test_answers_lit), sep='\n')

رابطه‌ی شیر با جنگل مثل رابطه‌ی
['سرباز است با پادگان', 'اتوبوس است با ایستگاه', 'هواپیما است با آسمان', 'کشتی است با بندر']
3
هواپیما است با آسمان
350
350
350
350


In [10]:
# Show GPU (including current memory usage) and CPU details again at this point of the run.
!nvidia-smi
!lscpu

Sat Aug 14 14:46:04 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    89W / 149W |   5355MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [13]:
# Run the mT5 evaluation on the literature test split.
evaluation_output = mcqa_model.mt5_evaluation(
    questions=test_questions_lit,
    candidates=test_candidates_lit,
    choices=test_choices_lit,
    answers=test_answers_lit,
    device=device,
    max_length=512,
    batch_size=32,
)

#question:350, #candidates:350, #answer:350
#batch: 11
Start to evaluate test data ...
inference time for step 0: 12.992401514999983
inference time for step 1: 12.982623370999931
inference time for step 2: 12.964670806999948
inference time for step 3: 12.916437658000063
inference time for step 4: 12.92494306399999
inference time for step 5: 12.890353849000007
inference time for step 6: 12.859695777999946
inference time for step 7: 12.881272179999996
inference time for step 8: 12.882229668000036
inference time for step 9: 12.865879897000013
inference time for step 10: 12.186814739999818
total inference time: 141.34732252699973
total inference time / #samples: 0.40384949293428496
Test Accuracy: 0.3742857142857143
Test Precision: 0.3756504651047188
Test Recall: 0.3742857142857143
Test F1-Score(weighted average): 0.3748199718407707
Test classification Report:
              precision    recall  f1-score   support

           1  0.3461538462 0.3600000000 0.3529411765        75
           2  

In [14]:
# Preview the first 25 prediction tuples, one tab-separated row per sample.
for (question, candidates, true_choice, true_answer,
     predicted_choice, predicted_answer, exact_value, f1_value) in evaluation_output[:25]:
    print(question, candidates, true_choice, true_answer,
          predicted_choice, predicted_answer, exact_value, f1_value, sep='\t')

رابطه‌ی شیر با جنگل مثل رابطه‌ی	['سرباز است با پادگان', 'اتوبوس است با ایستگاه', 'هواپیما است با آسمان', 'کشتی است با بندر']	3	هواپیما است با آسمان	3	هواپیما است با آسمان	1	1.0
رابطه ي بخار با یخ مثل رابطه ي:	['خمیر است با نان', 'گندم است با آرد', 'غوره است با کشمش', 'باران است با برف']	3	غوره است با کشمش	3	غوره است با کشمش	1	1.0
در عبارت زیر، به‌ترتیب « مضاف‌الیه مضاف‌الیه، صفت مضاف‌الیه و متمم اسم» کدام است؟
«مطالعه تفاسیر قرآن، روح اشعار حافظ شیراز را جلایی خاص بخشیده و از غزلیات این شاعر بی‌بدیل می‌توان به مهارت
خاص او در کشف رموز عرفانی پی برد.»	['قرآن، این، کشف', 'حافظ، عرفانی، غزلیات', 'شیراز، این شاعر، مهارت', 'رموز، بی\u200cبدیل، کشف رموز عرفانی']	1	قرآن، این، کشف	4	رموز، بی بدیل، کشف رموز عرفانی	0	0.2222222222222222
مشهورترین شاعر رمانتیک قرن نوزدهم فرانسه چه کسی است؟	['ولتر', 'ویکتورهوگو', 'لافونتن', 'ژان ژاک  روسو']	2	ویکتورهوگو	2	ویکتورهوگو	1	1.0
مفرد كدام كلمه صحيح است	['الوان : لون', 'حواس : احساس', 'اعضا: عضوها', 'الف وب']	3	اعضا: عضوها	4	الف وب	0	0
کدام عبارت، نادرس

In [15]:
# Write every prediction tuple as one tab-separated line of a text file named after the model.
output_file_name = f"multiple_choice_qa_literature_testset_{model_name.replace('/', '-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
    for (question, candidates, true_choice, true_answer,
         predicted_choice, predicted_answer, exact_value, f1_value) in evaluation_output:
        print(question, candidates, true_choice, true_answer,
              predicted_choice, predicted_answer, exact_value, f1_value, sep='\t', file=output_file)

# Authenticate again and rebuild the Drive client before uploading.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file with the same title as the local file and upload the local content.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Samples with math_and_logic as their category

In [16]:
# Load the math_and_logic test split, then show its first sample and the length of each list.
(
    test_questions_ml,
    test_candidates_ml,
    test_choices_ml,
    test_answers_ml,
) = mcqa_model.load_dataset_test_file(
    dataset_file="./parsinlu/data/multiple-choice/test_ml.jsonl",
    dataset_name="parsinlu-math_and_logic",
)
print(test_questions_ml[0], test_candidates_ml[0], test_choices_ml[0], test_answers_ml[0], sep='\n')
print(len(test_questions_ml), len(test_candidates_ml), len(test_choices_ml), len(test_answers_ml), sep='\n')

تفاوت سن علیرضا و خواهرش A سال است B سال دیگر سن علیرضا دوبرابر سن امروز خواهرش خواهد بود .سن خواهر علیرضا کدام است؟
['2A', '2A+B', '3A+B', 'A-B']
2
2A+B
350
350
350
350


In [17]:
# Show GPU (including current memory usage) and CPU details again at this point of the run.
!nvidia-smi
!lscpu

Sat Aug 14 14:49:03 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    86W / 149W |   9045MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [18]:
# Run the mT5 evaluation on the math_and_logic test split.
evaluation_output = mcqa_model.mt5_evaluation(
    questions=test_questions_ml,
    candidates=test_candidates_ml,
    choices=test_choices_ml,
    answers=test_answers_ml,
    device=device,
    max_length=512,
    batch_size=32,
)

#question:350, #candidates:350, #answer:350
#batch: 11
Start to evaluate test data ...
inference time for step 0: 12.119243632999996
inference time for step 1: 11.898377600999993
inference time for step 2: 12.14634113399984
inference time for step 3: 10.199875003999978
inference time for step 4: 12.880044091999935
inference time for step 5: 11.156186896999998
inference time for step 6: 12.374638318000052
inference time for step 7: 10.200894085000073
inference time for step 8: 11.172698388000072
inference time for step 9: 12.891607314999874
inference time for step 10: 12.194478566000043
total inference time: 129.23438503299985
total inference time / #samples: 0.3692411000942853
Test Accuracy: 0.42
Test Precision: 0.425134436488822
Test Recall: 0.42
Test F1-Score(weighted average): 0.42177174116618693
Test classification Report:
              precision    recall  f1-score   support

           1  0.5047619048 0.4491525424 0.4753363229       118
           2  0.4356435644 0.4444444444 0.4

In [19]:
# Preview the first 25 evaluated samples, one tab-separated row per sample:
# question, candidates, true choice, true answer, predicted choice,
# predicted answer, exact-match value, F1 value.
preview_row_template = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'
for preview_row in evaluation_output[:25]:
  # Unpacking requires every record to carry exactly eight fields.
  (question, candidates, true_choice, true_answer,
   predicted_choice, predicted_answer, exact_value, f1_value) = preview_row
  print(preview_row_template.format(
      question, candidates, true_choice, true_answer,
      predicted_choice, predicted_answer, exact_value, f1_value))

تفاوت سن علیرضا و خواهرش A سال است B سال دیگر سن علیرضا دوبرابر سن امروز خواهرش خواهد بود .سن خواهر علیرضا کدام است؟	['2A', '2A+B', '3A+B', 'A-B']	2	2A+B	2	2A+B	1	1.0
در ادامه این رشته چه عددی باید نوشت؟ ۹۱،۸۶،۷۶،۶۱،...	['۴۶', '۴۱', '۵۱', '۳۶']	2	۴۱	2	۲۱	0	0
50 تا 20 تا برابر است با ......	['10000', '100', '1000', '500']	3	1000	3	1000	1	1.0
در ادامه این رشته چه عددی باید نوشت؟             3, 5, 5, 9, 7, 13, 9, …	['17', '11', '14', '15']	1	17	3	14	0	0
مساحت مربع ۸ ،p برابر مساحت مربع Q است. نسبت قطر مربع p به ضلع مربع Q کدامست؟	['۴', '۲', '۳', '۱']	1	۴	2	۲	0	0
%50 عدد 24 برابر است با ....	['4', '6', '10', '12']	4	12	2	6	0	0
کدام عدد نزدیکتر۷ به است؟	['۴', '۶', '۹', '۱۱']	2	۶	3	۹	0	0
چند درصد ۵۰۰ برابر ۵۰ می‌شود؟	['۱', '۱۰', '۲۰', '۳۰']	2	۱۰	4	۳۰	0	0
قیمت یک کالا %۲۵ تخفیف داده شده است برای آنکه این کالا به قیمت قبل از تخفیف فروخته شود چند درصد باید به قیمت آن افزوده گردد؟	['۲۵', '۲۰', '۳۳.۳۳', 'هیچکدام']	3	۳۳.۳۳	3	۳۳.۳۳	1	1.0
حاصل عبارت ۵ - ۳ برابر است با ؟	['-2', '2', '1', '-1']	1	-2	4

In [20]:
import csv  # stdlib; imported here so the cell works on its own

# Save every evaluated math_and_logic sample to a tab-separated text file and
# upload that file to Google Drive.
output_file_name = "multiple_choice_qa_math_and_logic_testset_{}_outputs.txt".format(model_name.replace('/','-'))

# newline='' hands control of line endings to the csv writer.
with open(output_file_name, "w", encoding='utf8', newline='') as output_file:
  # Dataset text can contain tabs or line breaks; written raw, such a field
  # splits one record over several lines/columns and corrupts the file.
  # csv.writer quotes only the fields that need it (tab, newline or double
  # quote inside), so all other rows are written exactly as before.
  # Read the file back with csv.reader(f, delimiter='\t').
  tsv_writer = csv.writer(output_file, delimiter='\t', lineterminator='\n')
  for question, candidates, true_choice, true_answer, predicted_choice, predicted_answer, exact_value, f1_value in evaluation_output:
    record = (question, candidates, true_choice, true_answer,
              predicted_choice, predicted_answer, exact_value, f1_value)
    # str() mirrors what '{}'.format(...) produced for each field.
    tsv_writer.writerow([str(field) for field in record])

# Authenticate the Colab user and build a Drive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file with the same title as the local file and push its content.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Samples with common_knowledge as their category

In [21]:
# Load the common_knowledge test split. The loader hands back four sequences:
# questions, candidate lists, gold choices and gold answers.
(
    test_questions_ck,
    test_candidates_ck,
    test_choices_ck,
    test_answers_ck,
) = mcqa_model.load_dataset_test_file(
    dataset_name="parsinlu-common_knowledge",
    dataset_file="./parsinlu/data/multiple-choice/test_ck.jsonl",
)

ck_test_fields = (test_questions_ck, test_candidates_ck, test_choices_ck, test_answers_ck)

# Sanity check: show the first sample of every field, then the size of every field.
for ck_field in ck_test_fields:
    print(ck_field[0])
for ck_field in ck_test_fields:
    print(len(ck_field))

کدام کشور اولین تولید کننده خرما در جهان است؟
['ایران', 'عربستان', 'عراق', 'سوریه']
1
ایران
350
350
350
350


In [22]:
# Record the machine state right before the common_knowledge evaluation:
# nvidia-smi reports the GPU (model, memory in use, utilisation), lscpu the CPU.
!nvidia-smi
!lscpu

Sat Aug 14 14:51:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    84W / 149W |   9045MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [23]:
# Run the evaluation on the common_knowledge test split. Per the recorded output,
# the call logs the inference time of every batch and the test metrics.
# NOTE: this rebinds `evaluation_output`, replacing the previous split's records.
evaluation_output = mcqa_model.mt5_evaluation(
    test_questions_ck,
    test_candidates_ck,
    test_choices_ck,
    test_answers_ck,
    device,
    max_length=512,
    batch_size=32,
)

#question:350, #candidates:350, #answer:350
#batch: 11
Start to evaluate test data ...
inference time for step 0: 12.889736590000211
inference time for step 1: 12.863275674000079
inference time for step 2: 12.886943401000053
inference time for step 3: 12.902277243000071
inference time for step 4: 12.128373880000026
inference time for step 5: 12.608282894000013
inference time for step 6: 11.647455787999888
inference time for step 7: 12.90022824000016
inference time for step 8: 11.870837520999885
inference time for step 9: 11.165505679000034
inference time for step 10: 12.202887381999972
total inference time: 136.0658042920004
total inference time / #samples: 0.3887594408342868
Test Accuracy: 0.2742857142857143
Test Precision: 0.2791361105989014
Test Recall: 0.2742857142857143
Test F1-Score(weighted average): 0.276126761850521
Test classification Report:
              precision    recall  f1-score   support

           1  0.2826086957 0.2653061224 0.2736842105        98
           2  0.3

In [24]:
# Preview the first 25 evaluated samples, one tab-separated row per sample:
# question, candidates, true choice, true answer, predicted choice,
# predicted answer, exact-match value, F1 value.
preview_row_template = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'
for preview_row in evaluation_output[:25]:
  # Unpacking requires every record to carry exactly eight fields.
  (question, candidates, true_choice, true_answer,
   predicted_choice, predicted_answer, exact_value, f1_value) = preview_row
  print(preview_row_template.format(
      question, candidates, true_choice, true_answer,
      predicted_choice, predicted_answer, exact_value, f1_value))

کدام کشور اولین تولید کننده خرما در جهان است؟	['ایران', 'عربستان', 'عراق', 'سوریه']	1	ایران	1	ایران	1	1.0
مقام رهبری در قانون اساسی جمهوری اسلامی دارای چه کار ویژه ای است؟	['ریاست کشور', 'نظارت عالیه', 'تنظیم کننده قوای سه گانه', 'حاکمیت مطلق']	3	تنظیم کننده قوای سه گانه	1	ریاست کشور	0	0
الماس سخت تر است یا گرانیت؟	['الماس', 'گرانیت', '', '']	1	الماس	2	گرانیت	0	0
طبق قانون اساسی شورای نگهبان طی چند روز از تاریخ وصول باید نظر خود را نسبت به مصوبات مجلس
اعلام نماید؟	['یک ماه', 'ده روز', 'دو هفته', 'تا حصول اطمینان']	2	ده روز	1	یک ماه	0	0
در کشور ایران بیشترین نرخ بیکاری متعلق به کدام یک از گزینه های زیر میباشد؟	['دارندگان مدرک تحصیلی تکمیلی', 'افراد زیر دیپلم', 'افراد بالای 50 سال', 'فارغ التحصیلان دانشگاهها']	1	دارندگان مدرک تحصیلی تکمیلی	3	افراد بالای 50 سال	0	0
ریاست اولین دوره مجلس شورای اسلامی بر عهده چه کسی بود؟	['آیت الله بهشتی', 'آیت الله کروبی', 'آیت الله رفسنجانی', 'آیت الله ناطق نوری']	4	آیت الله ناطق نوری	4	آیت الله ناطق نوری	1	1.0
سوره بیست و هفتم قرآن کریم کدامست؟	['سوره 

In [25]:
import csv  # stdlib; imported here so the cell works on its own

# Save every evaluated common_knowledge sample to a tab-separated text file and
# upload that file to Google Drive.
output_file_name = "multiple_choice_qa_common_knowledge_testset_{}_outputs.txt".format(model_name.replace('/','-'))

# newline='' hands control of line endings to the csv writer.
with open(output_file_name, "w", encoding='utf8', newline='') as output_file:
  # Some questions in this split contain an embedded line break; written raw,
  # such a field splits one record over two lines and corrupts the file.
  # csv.writer quotes only the fields that need it (tab, newline or double
  # quote inside), so all other rows are written exactly as before.
  # Read the file back with csv.reader(f, delimiter='\t').
  tsv_writer = csv.writer(output_file, delimiter='\t', lineterminator='\n')
  for question, candidates, true_choice, true_answer, predicted_choice, predicted_answer, exact_value, f1_value in evaluation_output:
    record = (question, candidates, true_choice, true_answer,
              predicted_choice, predicted_answer, exact_value, f1_value)
    # str() mirrors what '{}'.format(...) produced for each field.
    tsv_writer.writerow([str(field) for field in record])

# Authenticate the Colab user and build a Drive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file with the same title as the local file and push its content.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### All Samples

In [26]:
# Load the complete test set. The loader hands back four sequences:
# questions, candidate lists, gold choices and gold answers.
(
    test_questions_all,
    test_candidates_all,
    test_choices_all,
    test_answers_all,
) = mcqa_model.load_dataset_test_file(
    dataset_name="parsinlu",
    dataset_file="./parsinlu/data/multiple-choice/test.jsonl",
)

all_test_fields = (test_questions_all, test_candidates_all, test_choices_all, test_answers_all)

# Sanity check: show the first sample of every field, then the size of every field.
for all_field in all_test_fields:
    print(all_field[0])
for all_field in all_test_fields:
    print(len(all_field))

تفاوت سن علیرضا و خواهرش A سال است B سال دیگر سن علیرضا دوبرابر سن امروز خواهرش خواهد بود .سن خواهر علیرضا کدام است؟
['2A', '2A+B', '3A+B', 'A-B']
2
2A+B
1050
1050
1050
1050


In [27]:
# Record the machine state right before the full-test-set evaluation:
# nvidia-smi reports the GPU (model, memory in use, utilisation), lscpu the CPU.
!nvidia-smi
!lscpu

Sat Aug 14 14:53:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    74W / 149W |   9045MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [28]:
# Run the evaluation on the complete test set. Per the recorded output,
# the call logs the inference time of every batch.
# NOTE: this rebinds `evaluation_output`, replacing the previous split's records.
evaluation_output = mcqa_model.mt5_evaluation(
    test_questions_all,
    test_candidates_all,
    test_choices_all,
    test_answers_all,
    device,
    max_length=512,
    batch_size=32,
)

#question:1050, #candidates:1050, #answer:1050
#batch: 33
Start to evaluate test data ...
inference time for step 0: 12.157164536999971
inference time for step 1: 11.924291478999976
inference time for step 2: 12.157591723999985
inference time for step 3: 10.202435959000013
inference time for step 4: 12.871933175999857
inference time for step 5: 11.180724005999991
inference time for step 6: 12.420253884999966
inference time for step 7: 10.170680975000096
inference time for step 8: 11.16217489099995
inference time for step 9: 12.917602334999856
inference time for step 10: 12.868824708000147
inference time for step 11: 12.890622640999936
inference time for step 12: 11.432489805999921
inference time for step 13: 12.889541090999955
inference time for step 14: 12.884711702999994
inference time for step 15: 12.148824914999977
inference time for step 16: 12.626804380999829
inference time for step 17: 11.67027102599991
inference time for step 18: 12.84941700100012
inference time for step 19: 11

In [29]:
# Preview the first 25 evaluated samples, one tab-separated row per sample:
# question, candidates, true choice, true answer, predicted choice,
# predicted answer, exact-match value, F1 value.
preview_row_template = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'
for preview_row in evaluation_output[:25]:
  # Unpacking requires every record to carry exactly eight fields.
  (question, candidates, true_choice, true_answer,
   predicted_choice, predicted_answer, exact_value, f1_value) = preview_row
  print(preview_row_template.format(
      question, candidates, true_choice, true_answer,
      predicted_choice, predicted_answer, exact_value, f1_value))

تفاوت سن علیرضا و خواهرش A سال است B سال دیگر سن علیرضا دوبرابر سن امروز خواهرش خواهد بود .سن خواهر علیرضا کدام است؟	['2A', '2A+B', '3A+B', 'A-B']	2	2A+B	2	2A+B	1	1.0
در ادامه این رشته چه عددی باید نوشت؟ ۹۱،۸۶،۷۶،۶۱،...	['۴۶', '۴۱', '۵۱', '۳۶']	2	۴۱	2	۲۱	0	0
50 تا 20 تا برابر است با ......	['10000', '100', '1000', '500']	3	1000	3	1000	1	1.0
در ادامه این رشته چه عددی باید نوشت؟             3, 5, 5, 9, 7, 13, 9, …	['17', '11', '14', '15']	1	17	3	14	0	0
مساحت مربع ۸ ،p برابر مساحت مربع Q است. نسبت قطر مربع p به ضلع مربع Q کدامست؟	['۴', '۲', '۳', '۱']	1	۴	2	۲	0	0
%50 عدد 24 برابر است با ....	['4', '6', '10', '12']	4	12	2	6	0	0
کدام عدد نزدیکتر۷ به است؟	['۴', '۶', '۹', '۱۱']	2	۶	3	۹	0	0
چند درصد ۵۰۰ برابر ۵۰ می‌شود؟	['۱', '۱۰', '۲۰', '۳۰']	2	۱۰	4	۳۰	0	0
قیمت یک کالا %۲۵ تخفیف داده شده است برای آنکه این کالا به قیمت قبل از تخفیف فروخته شود چند درصد باید به قیمت آن افزوده گردد؟	['۲۵', '۲۰', '۳۳.۳۳', 'هیچکدام']	3	۳۳.۳۳	3	۳۳.۳۳	1	1.0
حاصل عبارت ۵ - ۳ برابر است با ؟	['-2', '2', '1', '-1']	1	-2	4

In [30]:
import csv  # stdlib; imported here so the cell works on its own

# Save every evaluated sample of the complete test set to a tab-separated text
# file and upload that file to Google Drive.
output_file_name = "multiple_choice_qa_all_testset_{}_outputs.txt".format(model_name.replace('/','-'))

# newline='' hands control of line endings to the csv writer.
with open(output_file_name, "w", encoding='utf8', newline='') as output_file:
  # Dataset text can contain tabs or line breaks; written raw, such a field
  # splits one record over several lines/columns and corrupts the file.
  # csv.writer quotes only the fields that need it (tab, newline or double
  # quote inside), so all other rows are written exactly as before.
  # Read the file back with csv.reader(f, delimiter='\t').
  tsv_writer = csv.writer(output_file, delimiter='\t', lineterminator='\n')
  for question, candidates, true_choice, true_answer, predicted_choice, predicted_answer, exact_value, f1_value in evaluation_output:
    record = (question, candidates, true_choice, true_answer,
              predicted_choice, predicted_answer, exact_value, f1_value)
    # str() mirrors what '{}'.format(...) produced for each field.
    tsv_writer.writerow([str(field) for field in record])

# Authenticate the Colab user and build a Drive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file with the same title as the local file and push its content.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()