In [None]:
!pip install lmqg

Collecting lmqg
  Downloading lmqg-0.1.1.tar.gz (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.1/100.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytextrank (from lmqg)
  Downloading pytextrank-3.3.0-py3-none-any.whl (26 kB)
Collecting datasets (from lmqg)
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Collecting sudachipy (from lmqg)
  Downloading SudachiPy-0.6.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sudachidict_core (from lmqg)
  Downloading SudachiDict_core-20240409-py3-none-any.whl (72.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 MB[0m [31m7.0 MB/s[0m

In [None]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.2.4-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.8.0 (from lightning)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.4.0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.2.4-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.2/802.2 kB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
Collecting pretty-errors==1.2.25 (from torchmetrics<3.0,>=0.7.0->lightning)
  Downloading pretty_errors-1.2.25-py3-none-any.whl (17 kB)
Installing collected packages: pretty-errors, lightning-utilities, torchme

In [None]:
from lmqg import TransformersQG
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
    )
import pytorch_lightning as pl
import torch

In [None]:
# Question/answer generator from lmqg. It is the `model` later handed to
# generation_question_answer(), which calls its generate_qa() method.
QAModel = TransformersQG(
    model='lmqg/t5-large-squad-qg-ae',
    language='en',
)

In [None]:
def generation_question_answer(context, model):
    """Generate question/answer pairs for a passage.

    Args:
        context: The passage handed to ``model.generate_qa``.
        model: Any object exposing a ``generate_qa(context)`` method.

    Returns:
        Whatever ``model.generate_qa(context)`` returns, unchanged.
    """
    return model.generate_qa(context)

In [None]:
# --- Configuration constants ---
MODEL_NAME = 't5-base'
SEP_TOKEN = '<sep>'
LEARNING_RATE = 1e-4

# --- Tokenizer setup ---
# Load the tokenizer for MODEL_NAME and register SEP_TOKEN as an extra token,
# reporting the vocabulary size before and after the addition.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
print('tokenizer len before: ', len(tokenizer))
tokenizer.add_tokens([SEP_TOKEN])

# Vocabulary size including the added token; the model's embedding matrix is
# resized to this value in QGModel.__init__.
TOKENIZER_LEN = len(tokenizer)
print('tokenizer len after: ', TOKENIZER_LEN)

tokenizer len before:  32100
tokenizer len after:  32101


In [None]:
class QGModel(pl.LightningModule):
    """Lightning wrapper around a T5 conditional-generation model.

    The wrapped network is loaded from ``MODEL_NAME`` and its token embedding
    matrix is resized to ``TOKENIZER_LEN`` so that it covers the tokens added
    to the tokenizer.
    """

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        # Resize after adding new tokens to the tokenizer.
        self.model.resize_token_embeddings(TOKENIZER_LEN)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model.

        Args:
            input_ids: Forwarded to the wrapped model as ``input_ids``.
            attention_mask: Forwarded to the wrapped model as ``attention_mask``.
            labels: Optional targets forwarded to the wrapped model as ``labels``.

        Returns:
            A ``(loss, logits)`` tuple taken from the wrapped model's output.
        """
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits

    def _shared_step(self, batch, stage):
        """Compute the loss for one batch and log it under ``'<stage>_loss'``.

        Args:
            batch: Mapping with ``'input_ids'``, ``'attention_mask'`` and
                ``'labels'`` entries.
            stage: Prefix for the logged metric name (``'train'``, ``'val'``
                or ``'test'``).

        Returns:
            The loss for the batch.
        """
        loss, _ = self(batch['input_ids'], batch['attention_mask'], batch['labels'])
        self.log(f'{stage}_loss', loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the batch loss, logged as ``train_loss``."""
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        """Return the batch loss, logged as ``val_loss``."""
        return self._shared_step(batch, 'val')

    def test_step(self, batch, batch_idx):
        """Return the batch loss, logged as ``test_loss``."""
        return self._shared_step(batch, 'test')

    def configure_optimizers(self):
        """Build the optimizer over all module parameters with ``LEARNING_RATE``."""
        # NOTE(review): this is the AdamW imported from transformers; confirm it
        # is still provided by the installed transformers version
        # (torch.optim.AdamW is presumably the drop-in replacement).
        return AdamW(self.parameters(), lr=LEARNING_RATE)

In [None]:
# Fine-tuned distractor-generation checkpoint.
# NOTE(review): absolute /content/drive path — presumably a mounted Colab Drive; confirm it exists before running.
checkpoint_path = '/content/drive/MyDrive/NLP/NLP Project/Distractor/race-distractors.ckpt-v3.ckpt'
device = torch.device("cpu")

# map_location loads the checkpoint tensors directly onto `device`, so the load
# does not depend on the device the checkpoint was saved from. The model is
# only used for generation below, so put it in eval mode explicitly.
distractor_model = (
    QGModel.load_from_checkpoint(checkpoint_path, map_location=device)
    .to(device)
    .eval()
)



In [None]:
SOURCE_MAX_TOKEN_LEN = 512  # max token length of the encoded "answer <sep> context" input
TARGET_MAX_TOKEN_LEN = 64   # max_length passed to generate()


def generate_distractors(qgmodel: QGModel, answer: str, context: str) -> str:
    """Generate distractor text for ``answer`` given ``context``.

    The input is formatted as ``"<answer> <SEP_TOKEN> <context>"``, tokenized
    (padded/truncated to ``SOURCE_MAX_TOKEN_LEN``) and passed to
    ``qgmodel.model.generate``.

    Args:
        qgmodel: Wrapper whose ``model`` attribute provides ``generate``.
        answer: The correct answer the distractors should compete with.
        context: The passage the answer was drawn from.

    Returns:
        The decoded generations concatenated into one string. Special tokens
        are NOT removed (``skip_special_tokens=False``), so the caller is
        responsible for stripping them and for splitting on ``SEP_TOKEN``.
    """
    source_encoding = tokenizer(
        '{} {} {}'.format(answer, SEP_TOKEN, context),
        max_length=SOURCE_MAX_TOKEN_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Feed the inputs on the same device as the wrapped model so generation
    # also works when the model has been moved off the CPU.
    target_device = qgmodel.model.device

    # NOTE(review): length_penalty / early_stopping presumably only matter for
    # beam search; with num_beams=1 they look like no-ops — confirm before
    # relying on them.
    generated_ids = qgmodel.model.generate(
        input_ids=source_encoding['input_ids'].to(target_device),
        attention_mask=source_encoding['attention_mask'].to(target_device),
        num_beams=1,
        max_length=TARGET_MAX_TOKEN_LEN,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True
    )

    # A list (not a set) keeps the decoded sequences in generation order and
    # does not silently drop duplicates before they are joined.
    preds = [
        tokenizer.decode(generated_id, skip_special_tokens=False, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    ]

    return ''.join(preds)

In [None]:
# Sample passage used to demonstrate the pipeline.
context = '''
A Turing machine is a mathematical model of computation describing an abstract machine that manipulates
symbols on a strip of tape according to a table of rules. Despite the model's simplicity, it is capable
of implementing any computer algorithm. The machine operates on an infinite memory tape divided into
discrete cells, each of which can hold a single symbol drawn from a finite set of symbols called the
alphabet of the machine.
'''

# Question/answer generation for the passage; each item is later read as
# item[0] = question and item[1] = correct answer.
question_answer = generation_question_answer(context=context, model=QAModel)

100%|██████████| 3/3 [00:00<00:00, 796.13it/s]
100%|██████████| 2/2 [00:00<00:00, 314.47it/s]


In [None]:
# For every generated (question, answer) pair, generate distractors and print them.
for qa_pair in question_answer:
    question, answer = qa_pair[0], qa_pair[1]
    print("Question: ", question)
    print("Correct Answer: ", answer)

    raw_output = generate_distractors(distractor_model, answer, context)
    # generate_distractors keeps special tokens in its output; remove the
    # padding and end-of-sequence markers before splitting.
    for marker in ("<pad>", "</s>"):
        raw_output = raw_output.replace(marker, "")

    # Split on the separator, trim surrounding whitespace and drop fragments
    # that are empty after trimming.
    distractors = [
        part.strip()
        for part in raw_output.split(SEP_TOKEN)
        if part.strip()
    ]
    print("Distractors: ", distractors)
    print()

Question:  What is a mathematical model of computation describing an abstract machine that manipulates symbols on a strip of tape according to a table of rules?
Correct Answer:  A Turing machine
Distractors:  [' A computer machine', ' An infinite memory tape ganduril']

Question:  What does the Turing machine operate on?
Correct Answer:  infinite memory tape
Distractors:  [' a single symbol on the strip of tape', ' a finite set OF symbols called the alphabetofthe machine.']

