In [None]:
!pip install datasets evaluate rouge_score bert_score

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading

In [None]:
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset

In [None]:
# Load SQuAD (train and validation splits) via `datasets.load_dataset`.
squad = load_dataset('squad')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
def prepare_dataset(example):
    """Mark the answer span inside the context with ' <h> ' delimiters.

    The previous implementation appended every piece of ``context.split(answer)``
    twice and emitted a highlighted answer after each piece, which duplicated the
    context and highlighted the answer more often than it occurs. This version
    keeps the context intact and highlights exactly one span.

    Args:
        example: Mapping with a 'context' string and an 'answers' mapping that
            holds a 'text' list and, optionally, an 'answer_start' list of
            character offsets into the context.

    Returns:
        dict with 'answer_highlighted_context': the context with the first
        listed answer wrapped as ' <h> answer <h> '. If there is no usable
        answer (empty list, empty string, or answer not found in the context)
        the context is returned unchanged.

    NOTE(review): if the question-generation checkpoint was fine-tuned on the
    old, duplicated format, its inputs will now differ from training — confirm
    and re-check the reported metrics.
    """
    context = example['context']
    answers = example['answers']
    texts = answers['text']

    # No answer to highlight (e.g. unanswerable examples): leave context as is.
    if not texts or not texts[0]:
        return {'answer_highlighted_context': context}
    answer = texts[0]

    # Prefer the annotated character offset so that the right occurrence is
    # highlighted when the answer text appears several times in the context.
    starts = answers.get('answer_start') or []
    start = starts[0] if starts else -1

    # Fall back to the first textual occurrence when the offset is missing or
    # does not actually point at the answer text.
    if start < 0 or context[start:start + len(answer)] != answer:
        start = context.find(answer)
    if start < 0:
        return {'answer_highlighted_context': context}

    end = start + len(answer)
    text = context[:start] + ' <h> ' + answer + ' <h> ' + context[end:]

    return {'answer_highlighted_context': text}

# Apply prepare_dataset to every example, adding the 'answer_highlighted_context' column.
answer_highlighted_squad = squad.map(prepare_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
def prepare_instruction_dataset(example):
    """Embed the answer-highlighted context in the question-generation prompt.

    Args:
        example: Mapping with an 'answer_highlighted_context' string.

    Returns:
        dict with 'instruction_prompt': the instruction text followed by the
        highlighted context fenced in triple backticks.
    """
    highlighted = example['answer_highlighted_context']

    # The continuation lines of the template are indented by four spaces; that
    # whitespace is part of the prompt and is kept exactly as is.
    prompt = f"""Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.
    context:
    ```
    {highlighted}
    ```
    """

    return dict(instruction_prompt=prompt)

# Apply prepare_instruction_dataset to every example, adding the 'instruction_prompt' column.
instruction_squad = answer_highlighted_squad.map(prepare_instruction_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# Checkpoint id shared by the tokenizer and the model, so the two cannot drift apart.
QG_CHECKPOINT = 'mohammedaly2222002/t5-small-squad-qg-v2'

tokenizer = AutoTokenizer.from_pretrained(QG_CHECKPOINT)
# The model is placed on the GPU; this cell requires a CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained(QG_CHECKPOINT).to('cuda')

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:
def generate_question(example, model):
    """Generate a question for one prompt or for a batch of prompts.

    Args:
        example: Mapping with an 'instruction_prompt' entry holding either a
            single prompt string or a list of prompt strings (the shape passed
            by ``Dataset.map(..., batched=True)``).
        model: Seq2seq model used for generation. Inputs are moved to
            ``model.device``, so the function follows the model wherever it
            lives instead of assuming a CUDA device.

    Returns:
        dict with 'generated_question': a string when a single prompt was
        given, otherwise a list with one decoded question per prompt.
    """
    prompts = example['instruction_prompt']
    is_single = isinstance(prompts, str)

    # Uses the notebook-level `tokenizer`; it must belong to the same
    # checkpoint as `model`.
    inputs = tokenizer(
        prompts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    ).to(model.device)

    # Forward the attention mask so padded positions are ignored when several
    # prompts of different lengths are generated together.
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=128,
        num_beams=4,
    )

    questions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return {'generated_question': questions[0] if is_single else questions}

In [None]:
# Run generate_question over each validation example and replace the
# 'validation' split with the result, which gains a 'generated_question' column.
instruction_squad['validation'] = instruction_squad['validation'].map(generate_question, fn_kwargs={'model': model})

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# References: the original SQuAD validation questions.
target_questions = squad['validation']['question']
# Predictions: the questions produced by the model for the same split.
# NOTE(review): the metrics below pair these lists by position — assumes the
# mapped split keeps the row order of squad['validation']; confirm.
generated_questions = instruction_squad['validation']['generated_question']

In [None]:
# Score the generated questions against the reference questions with BLEU.
bleu = evaluate.load('bleu')
results = bleu.compute(predictions=generated_questions, references=target_questions)
print(results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.20000143430168738, 'precisions': [0.47960170226831383, 0.2330028520983295, 0.14704380475594492, 0.09737416717988914], 'brevity_penalty': 1.0, 'length_ratio': 1.0075599257328882, 'translation_length': 121015, 'reference_length': 120107}


In [None]:
# Score the generated questions against the reference questions with ROUGE.
rouge = evaluate.load('rouge')
results = rouge.compute(predictions=generated_questions, references=target_questions)
print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.4769335672046394, 'rouge2': 0.26430399368165813, 'rougeL': 0.4415759482824099, 'rougeLsum': 0.44152725602200005}


In [None]:
# Score the generated questions against the reference questions with METEOR.
meteor = evaluate.load('meteor')
results = meteor.compute(predictions=generated_questions, references=target_questions)
print(results)

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


{'meteor': 0.4584178134930935}


In [None]:
# Score the generated questions against the reference questions with BERTScore.
bertscore = evaluate.load('bertscore')
results = bertscore.compute(predictions=generated_questions, references=target_questions, lang='en')

# The metric returns one value per example for each of precision, recall and
# F1. Previously only the mean precision was printed under the bare label
# "BertSCORE", which is easily read as the F1 score; report all three
# explicitly.
bertscore_means = {
    key: sum(results[key]) / len(results[key])
    for key in ('precision', 'recall', 'f1')
}

# `bs` keeps its original meaning (mean precision) for any later cell that uses it.
bs = bertscore_means['precision']

for key, value in bertscore_means.items():
    print(f'BertSCORE {key}: {value}')

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertSCORE: 0.9182608786233778
