# 0.0 Install Required Packages

In [3]:
!pip install datasets==2.13.1 fsspec==2023.9.2 transformers accelerate



# 1.0 Import Required Packages

In [70]:
from tqdm import tqdm
import re

from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)

from datasets import load_dataset

# 2.0 Dataset Preparation

## 2.1 Load SQuAD Dataset
The Stanford Question Answering Dataset (SQuAD) is a popular benchmark dataset in the field of natural language processing (NLP) and machine reading comprehension. It was developed by researchers at Stanford University. SQuAD consists of a large collection of real questions posed by crowdworkers on a set of Wikipedia articles, where each question is paired with a corresponding passage from the article, and the answer to each question is a segment of text from the corresponding passage.

The goal of SQuAD is to train and evaluate machine learning models to understand and answer questions posed in natural language. It has been widely used as a benchmark for evaluating the performance of various question answering systems and models, including both rule-based systems and deep learning-based approaches such as neural network models.

In [5]:
squad = load_dataset('squad')
squad

Downloading readme:   0%|          | 0.00/7.83k [00:00<?, ?B/s]

Downloading and preparing dataset None/plain_text to /root/.cache/huggingface/datasets/parquet/plain_text-e62e132f7d97dd8b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/plain_text-e62e132f7d97dd8b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

## 2.2 A Look at the Dataset

In [6]:
squad['train'].features

{'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}

In [7]:
example = squad['train'][900]

for k, v in example.items():
    print(f'{k}: {v}\n')

id: 56becc903aeaaa14008c94a0

title: Beyoncé

context: Beyoncé has won 20 Grammy Awards, both as a solo artist and member of Destiny's Child, making her the second most honored female artist by the Grammys, behind Alison Krauss and the most nominated woman in Grammy Award history with 52 nominations. "Single Ladies (Put a Ring on It)" won Song of the Year in 2010 while "Say My Name" and "Crazy in Love" had previously won Best R&B Song. Dangerously in Love, B'Day and I Am... Sasha Fierce have all won Best Contemporary R&B Album. Beyoncé set the record for the most Grammy awards won by a female artist in one night in 2010 when she won six awards, breaking the tie she previously held with Alicia Keys, Norah Jones, Alison Krauss, and Amy Winehouse, with Adele equaling this in 2012. Following her role in Dreamgirls she was nominated for Best Original Song for "Listen" and Best Actress at the Golden Globe Awards, and Outstanding Actress in a Motion Picture at the NAACP Image Awards. Beyoncé 

## 2.3 Highlight Answers in `context`
I followed Chan and Fan (2019) by introducing highlight token `<h>` to take into account an answer `a` within context `c` as below:

$x = [c_1, ..., <h>, a_1, ..., a_a, <h>, ..., c_c]$


In [8]:
def highlight_answer(example):
    """
    Highlight the answer in the context of the given example.

    Parameters:
        - example (dict): A dictionary containing 'context' and 'answers' keys.

    Returns:
        - dict: A dictionary with a single key 'answer_highlighted_context',
        where the value is the context with the answer highlighted by '<h>' tags.

    Example:
    >>> example = {'context': 'The quick brown fox jumps over the lazy dog.', 
    ...            'answers': {'text': ['fox']}}
    >>> highlight_answer(example)
    {'answer_highlighted_context': 'The quick brown <h> fox <h> jumps over the lazy dog.'}
    """
    
    context = example['context']
    answer = example['answers']['text'][0]
    context_splits = context.split(answer)

    text = ""
    
    for split in context_splits:
        text += split
        text += ' <h> '
        text += answer
        text += ' <h> '
        text += split
    
    return {'answer_highlighted_context': text}

In [9]:
answer_highlighted_squad = squad.map(highlight_answer)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [10]:
example = answer_highlighted_squad['train'][900]

for k, v in example.items():
    print(f'{k}: {v}\n')

id: 56becc903aeaaa14008c94a0

title: Beyoncé

context: Beyoncé has won 20 Grammy Awards, both as a solo artist and member of Destiny's Child, making her the second most honored female artist by the Grammys, behind Alison Krauss and the most nominated woman in Grammy Award history with 52 nominations. "Single Ladies (Put a Ring on It)" won Song of the Year in 2010 while "Say My Name" and "Crazy in Love" had previously won Best R&B Song. Dangerously in Love, B'Day and I Am... Sasha Fierce have all won Best Contemporary R&B Album. Beyoncé set the record for the most Grammy awards won by a female artist in one night in 2010 when she won six awards, breaking the tie she previously held with Alicia Keys, Norah Jones, Alison Krauss, and Amy Winehouse, with Adele equaling this in 2012. Following her role in Dreamgirls she was nominated for Best Original Song for "Listen" and Best Actress at the Golden Globe Awards, and Outstanding Actress in a Motion Picture at the NAACP Image Awards. Beyoncé 

## 2.4 Create Instruction Promot

In [11]:
def prepare_instruction_dataset(example):
    """
    Prepare an instruction dataset for a given example.

    Parameters:
        - example (dict): A dictionary containing an 'answer_highlighted_context' key.

    Returns:
        - dict: A dictionary with a single key 'instruction_prompt', where the value 
        is the instruction prompt string.

    Example:
    >>> example = {'answer_highlighted_context': 'The quick brown <h> fox <h> jumps over 
    ...            the lazy dog.'}
    >>> prepare_instruction_dataset(example)
    {'instruction_prompt': 'Generate a question whose answer is highlighted by <h> from\
    the context delimited by the triple backticks.\n    context:\n    ```\n    The quick\
    brown <h> fox <h> jumps over the lazy dog.\n    ```\n    '}
    """
    
    answer_highlighted_context = example['answer_highlighted_context']
    
    instruction_prompt = f"""Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.
    context:
    ```
    {answer_highlighted_context}
    ```
    """
    
    return {'instruction_prompt': instruction_prompt}

In [12]:
instruction_squad = answer_highlighted_squad.map(prepare_instruction_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [13]:
example = instruction_squad['train'][900]

for k, v in example.items():
    print(f'{k}: {v}\n')

id: 56becc903aeaaa14008c94a0

title: Beyoncé

context: Beyoncé has won 20 Grammy Awards, both as a solo artist and member of Destiny's Child, making her the second most honored female artist by the Grammys, behind Alison Krauss and the most nominated woman in Grammy Award history with 52 nominations. "Single Ladies (Put a Ring on It)" won Song of the Year in 2010 while "Say My Name" and "Crazy in Love" had previously won Best R&B Song. Dangerously in Love, B'Day and I Am... Sasha Fierce have all won Best Contemporary R&B Album. Beyoncé set the record for the most Grammy awards won by a female artist in one night in 2010 when she won six awards, breaking the tie she previously held with Alicia Keys, Norah Jones, Alison Krauss, and Amy Winehouse, with Adele equaling this in 2012. Following her role in Dreamgirls she was nominated for Best Original Song for "Listen" and Best Actress at the Golden Globe Awards, and Outstanding Actress in a Motion Picture at the NAACP Image Awards. Beyoncé 

## 2.5 Tokenize the Dataset

In [14]:
model_ckpt = 't5-small'
tokenizer = T5TokenizerFast.from_pretrained(model_ckpt)
model = T5ForConditionalGeneration.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [18]:
def tokenize_dataset(batch):
    """
    Tokenize a batch of data for a model.

    Parameters:
        - batch (dict): A dictionary containing 'instruction_prompt' and 'question' keys.

    Returns:
        - dict: A dictionary containing the tokenized inputs and labels, ready to be used as input to a model.
    """
    
    model_inputs = tokenizer(batch['instruction_prompt'], max_length=512, truncation=True, padding=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch['question'], max_length=128, truncation=True, padding=True)
    
    # Make sure that the labels have the same shape as the inputs
    labels['input_ids'] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']
    ]
    model_inputs['labels'] = labels['input_ids']
    
    return model_inputs

In [19]:
tokenized_squad = instruction_squad.map(tokenize_dataset, batched=True, remove_columns=squad['train'].column_names)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [20]:
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['answer_highlighted_context', 'instruction_prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['answer_highlighted_context', 'instruction_prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10570
    })
})

# 3.0 Model Training

## 3.1 Configure HyperParameters

In [33]:
training_args = TrainingArguments(
    output_dir='t5-small-squad-qg',
    num_train_epochs=3,
    evaluation_strategy='steps',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    save_steps=1000,
    load_best_model_at_end=True,
)

## 3.2 Define `DataCollatorForSeq2Seq`

In [34]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

## 3.3 Define the `Trainer` API

In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad['train'],
    eval_dataset=tokenized_squad['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer
)

## 3.4 Start Fine-Tuning

In [36]:
trainer.train()

Step,Training Loss,Validation Loss
500,2.4623,2.373411
1000,2.4617,2.285991
1500,2.3629,2.245
2000,2.2836,2.215383
2500,2.2393,2.196553
3000,2.2242,2.184935
3500,2.2134,2.176022
4000,2.2058,2.17314


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=4107, training_loss=2.30376795268796, metrics={'train_runtime': 6223.4759, 'train_samples_per_second': 42.227, 'train_steps_per_second': 0.66, 'total_flos': 3.5567419401437184e+16, 'train_loss': 2.30376795268796, 'epoch': 3.0})

## 3.5 Pushing the Model to the Hub

In [99]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [100]:
trainer.push_to_hub('Commit Successfully!')

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mohammedaly2222002/t5-small-squad-qg/commit/0c75ac479d6ffb0ad281e24745a1c5f58a2f0efb', commit_message='Commit Successfully!', commit_description='', oid='0c75ac479d6ffb0ad281e24745a1c5f58a2f0efb', pr_url=None, pr_revision=None, pr_num=None)

# 4.0 Using the Model in Inference

In [65]:
def highlight_answer(context, answer):
    """
    Highlight the answer in the given context.

    Parameters:
        - context (str): The context in which the answer is found.
        - answer (str): The answer to be highlighted.

    Returns:
        - str: The context with the answer highlighted by '<h>' tags.

    Example:
    >>> context = 'The quick brown fox jumps over the lazy dog.'
    >>> answer = 'fox'
    >>> highlight_answer(context, answer)
    'The quick brown <h> fox <h> jumps over the lazy dog.'
    """
    
    context_splits = context.split(answer)
    
    text = ""
    for split in context_splits:
        text += split
        text += ' <h> '
        text += answer
        text += ' <h> '
        text += split
    
    return text


def prepare_instruction(answer_highlighted_context):
    """
    Prepare an instruction prompt for generating a question.

    Parameters:
        - answer_highlighted_context (str): The context with the answer highlighted by '<h>' tags.

    Returns:
        - str: The instruction prompt string.

    Example:
    >>> answer_highlighted_context = 'The quick brown <h> fox <h> jumps over the lazy dog.'
    >>> prepare_instruction(answer_highlighted_context)
    'Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.\\n    context:\\n    ```\\n    The quick brown <h> fox <h> jumps over the lazy dog.\\n    ```\\n    '
    """
    
    instruction_prompt = f"""Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.
    context:
    ```
    {answer_highlighted_context}
    ```
    """
    
    return instruction_prompt

In [156]:
from transformers import pipeline

pipe = pipeline('text2text-generation', model='mohammedaly2222002/t5-small-squad-qg', device_map='auto')

In [185]:
context = """During the 2011–12 season, he set the La Liga and European records\
for most goals scored in a single season, while establishing himself as Barcelona's\
all-time top scorer. The following two seasons, Messi finished second for the Ballon\
d'Or behind Cristiano Ronaldo (his perceived career rival), before regaining his best\
form during the 2014–15 campaign, becoming the all-time top scorer in La Liga and \
leading Barcelona to a historic second treble, after which he was awarded a fifth \
Ballon d'Or in 2015. Messi assumed captaincy of Barcelona in 2018, and won a record \
sixth Ballon d'Or in 2019. Out of contract, he signed for French club Paris Saint-Germain\
in August 2021, spending two seasons at the club and winning Ligue 1 twice. Messi \
joined American club Inter Miami in July 2023, winning the Leagues Cup in August of that year.
"""

answer_highlighted_context = highlight_answer(context=context, answer='Inter Miami')
prompt = prepare_instruction(answer_highlighted_context)

print(prompt)

"Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.\n    context:\n    ```\n    During the 2011–12 season, he set the La Liga and European recordsfor most goals scored in a single season, while establishing himself as Barcelona'sall-time top scorer. The following two seasons, Messi finished second for the Ballond'Or behind Cristiano Ronaldo (his perceived career rival), before regaining his bestform during the 2014–15 campaign, becoming the all-time top scorer in La Liga and leading Barcelona to a historic second treble, after which he was awarded a fifth Ballon d'Or in 2015. Messi assumed captaincy of Barcelona in 2018, and won a record sixth Ballon d'Or in 2019. Out of contract, he signed for French club Paris Saint-Germainin August 2021, spending two seasons at the club and winning Ligue 1 twice. Messi joined American club  <h> Inter Miami <h> During the 2011–12 season, he set the La Liga and European recordsfor most goals scor

In [186]:
outputs = pipe(prompt, num_return_sequences=5, num_beams=5, num_beam_groups=5, diversity_penalty=1.0)
for output in outputs:
    print(output['generated_text'])

What club did Messi join in the 2023 season?
What was Messi's name of the club that won the Leagues Cup in July 20
In what city did Messi join?
Which club won the Leagues Cup in July 2023?
What club did Messi join in the Leagues Cup in July 2023?
