In [None]:
!pip install --quiet  datasets #to access squad dataset
!pip install --quiet pyarrow   #to deal with parquet files for saving dataset if required
!pip install --quiet  tqdm     #for progress bars
!pip install --quiet transformers # for t5 model
!pip install --quiet tokenizers  #tokenizers from HuggingFace
!pip install --quiet sentencepiece #subword tokenizer used by T5
!pip install --quiet pytorch-lightning # pytorch wrapper
!pip install --quiet torchtext # text utilities

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

# Fetching Datasets

In [1]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy

In [2]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = "cpu"

In [3]:
# Let pandas render up to 100 rows and 100 columns before truncating output.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

In [4]:
def create_pandas_dataset(data,
                          answer_threshold=7,
                          verbose = False):

  ''' Create a Pandas Dataframe from hugging face dataset.

  Each record of `data` is read through its 'context', 'question' and
  'answers' -> 'text' fields; only the first answer text is used.

  Params:
        data: iterable of records shaped like the SQuAD examples.
        answer_threshold: Only consider those Question Answer pairs where the
                          Answer is short, i.e. has fewer than this many
                          whitespace-separated words.
        verbose: when True, also return how many records were dropped / kept.
  Returns:
        A DataFrame with columns ['context', 'answer', 'question'] indexed
        0..n-1, or the tuple (DataFrame, count_long, count_short) when
        `verbose` is True.
  '''
  count_long = 0
  # Collect the kept rows in a plain list and build the frame once at the end:
  # appending to a DataFrame row by row re-allocates it on every insert.
  short_records = []
  for val in tqdm(data):
      answer = val['answers']['text'][0]
      if len(answer.split()) >= answer_threshold:
          count_long += 1
          continue
      short_records.append((val['context'], answer, val['question']))

  result_df = pd.DataFrame(short_records,
                           columns = ['context', 'answer','question'])
  count_short = len(short_records)

  if verbose:
    return (result_df,
            count_long,
            count_short)
  return result_df

In [5]:
# Load the train and validation splits of SQuAD, in that order.
train_dataset, valid_dataset = (
    load_dataset('squad', split=split_name)
    for split_name in ('train', 'validation')
)
print(f"Total Train Samples:{len(train_dataset)} , Total Validation Samples:{len(valid_dataset)}")

Reusing dataset squad (/nfs/home/kabenamualus/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)
Reusing dataset squad (/nfs/home/kabenamualus/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)


Total Train Samples:87599 , Total Validation Samples:10570


In [6]:
# Take the first validation record and dump it in full.
sample_validation_dataset = next(iter(valid_dataset))
pprint(sample_validation_dataset)

# Pull out the fields used later on; only the first answer text is kept.
context, question = (sample_validation_dataset[field] for field in ('context', 'question'))
answer = sample_validation_dataset['answers']['text'][0]

print('---------------'*9)
print('\nBreaking it Down\n')
for _label, _value in (("context:", context), ("question:", question), ("answer:", answer)):
    print(_label, _value)

{'answers': {'answer_start': [177, 177, 177],
             'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']},
 'context': 'Super Bowl 50 was an American football game to determine the '
            'champion of the National Football League (NFL) for the 2015 '
            'season. The American Football Conference (AFC) champion Denver '
            'Broncos defeated the National Football Conference (NFC) champion '
            'Carolina Panthers 24–10 to earn their third Super Bowl title. The '
            "game was played on February 7, 2016, at Levi's Stadium in the San "
            'Francisco Bay Area at Santa Clara, California. As this was the '
            '50th Super Bowl, the league emphasized the "golden anniversary" '
            'with various gold-themed initiatives, as well as temporarily '
            'suspending the tradition of naming each Super Bowl game with '
            'Roman numerals (under which the game would have been known as '
            '"Super

In [7]:
# Flatten each split into a (context, answer, question) DataFrame, train split first.
df_train = create_pandas_dataset(train_dataset)
df_validation = create_pandas_dataset(valid_dataset)
print(f"\n Total Train Samples:{df_train.shape} , Total Validation Samples:{df_validation.shape}")

100%|██████████| 87599/87599 [06:58<00:00, 209.30it/s]
100%|██████████| 10570/10570 [00:14<00:00, 747.56it/s]


 Total Train Samples:(78664, 3) , Total Validation Samples:(9652, 3)





In [8]:
# Persist both frames as parquet files so later cells can reload them from disk.
for _frame, _parquet_path in ((df_train, 'train_squad.parquet'),
                              (df_validation, 'validation_squad.parquet')):
    _frame.to_parquet(_parquet_path)

# Creating a PyTorch Dataset for T5 Training and Validation

In [9]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [10]:
# Load the 't5-small' checkpoint: the tokenizer (with model_max_length set to 512)
# and the conditional-generation model.
t5_tokenizer = T5Tokenizer.from_pretrained(
    't5-small',
    model_max_length=512,
)
t5_model = T5ForConditionalGeneration.from_pretrained(
    't5-small',
)

In [11]:
class QuestionGenerationDataset(Dataset):
    """Pre-tokenized (context + answer) -> question pairs read from a parquet file.

    Every row is tokenized once, at construction time, and padded to a fixed
    length, so `__getitem__` only has to slice out ready-made tensors.

    Args:
        tokenizer: callable tokenizer returning a mapping with "input_ids" and
            "attention_mask" tensors of shape [1, length] when given a
            one-element list of texts and ``return_tensors="pt"``.
        filepath: parquet file with "context", "answer" and "question" columns.
        max_len_inp: fixed token length of the encoder input.
        max_len_out: fixed token length of the target question.
        max_samples: keep only the first ``max_samples`` rows of the file
            (default 2000); pass ``None`` to use every row.
    """

    def __init__(self, tokenizer, filepath, max_len_inp=512,max_len_out=96,
                 max_samples=2000):
        self.path = filepath

        self.passage_column = "context"
        self.answer = "answer"
        self.question = "question"

        frame = pd.read_parquet(self.path)
        if max_samples is not None:
            # Explicit cap instead of a silent, hard-coded truncation.
            frame = frame.iloc[:max_samples, :]
        self.data = frame

        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self._build()

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of one example, with the batch dimension removed.

        "labels" is a copy of "target_ids" in which every padded position
        (attention mask 0) is set to -100, the value Hugging Face seq2seq
        models exclude from the loss.
        """
        source, target = self.inputs[index], self.targets[index]

        # squeeze(0) drops only the leading batch dimension: [1, dim] -> [dim]
        source_ids = source["input_ids"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)
        src_mask = source["attention_mask"].squeeze(0)
        target_mask = target["attention_mask"].squeeze(0)

        # clone() so the stored target ids are never modified; mask padding via
        # the attention mask rather than assuming the pad token id is 0.
        labels = target_ids.clone()
        labels[target_mask == 0] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        """Tokenize every row of ``self.data`` into ``self.inputs`` / ``self.targets``."""
        columns = (
            self.data[self.passage_column],
            self.data[self.answer],
            self.data[self.question],
        )
        # total= lets the progress bar show a percentage and an ETA
        for passage, answer, target in tqdm(zip(*columns), total=len(self.data)):
            input_ = f"context: {passage}  answer: {answer}" # model input: passage plus the answer to ask about
            target = f"question: {str(target)}" # Output format we require

            # tokenize inputs
            tokenized_inputs = self.tokenizer(
                [input_], max_length=self.max_len_input, padding='max_length',
                truncation=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer(
                [target], max_length=self.max_len_output, padding='max_length',
                truncation=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [12]:
# Parquet files to tokenize (change these accordingly); the train set is built first.
train_path, validation_path = 'train_squad.parquet', 'validation_squad.parquet'
train_dataset, validation_dataset = (
    QuestionGenerationDataset(t5_tokenizer, parquet_path)
    for parquet_path in (train_path, validation_path)
)

2000it [00:02, 747.98it/s]
2000it [00:02, 818.24it/s]


In [13]:
# Data Sample: decode one tokenized example back to text as a sanity check.

train_sample = train_dataset[50]  # indexing goes through __getitem__
decoded_train_input, decoded_train_output = (
    t5_tokenizer.decode(train_sample[key])
    for key in ('source_ids', 'target_ids')
)

print(decoded_train_input, decoded_train_output, sep="\n")

context: In 1882, Albert Zahm (John Zahm's brother) built an early wind tunnel used to compare lift to drag of aeronautical models. Around 1899, Professor Jerome Green became the first American to send a wireless message. In 1931, Father Julius Nieuwland performed early work on basic reactions that was used to create neoprene. Study of nuclear physics at the university began with the building of a nuclear accelerator in 1936, and continues now partly through a partnership in the Joint Institute for Nuclear Astrophysics. answer: Professor Jerome Green</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

# Fine-Tuning T5

In [14]:
import pytorch_lightning as pl
from torch.optim import AdamW
import argparse
from transformers import (
    get_linear_schedule_with_warmup
  )

class T5Tuner(pl.LightningModule):
    """LightningModule that fine-tunes a seq2seq model on tokenized batches.

    Batches are expected to be dicts with the keys "source_ids",
    "source_mask", "target_mask" and "labels".

    Args:
        t5model: the model to fine-tune; it is called with ``input_ids``,
            ``attention_mask``, ``decoder_attention_mask`` and ``labels``.
        t5tokenizer: tokenizer kept alongside the model (not used for training).
        batchsize: batch size of both dataloaders.
        train_data: dataset for training; when ``None`` the notebook-level
            ``train_dataset`` variable is used.
        val_data: dataset for validation; when ``None`` the notebook-level
            ``validation_dataset`` variable is used.
    """

    def __init__(self,t5model, t5tokenizer,batchsize=4,
                 train_data=None, val_data=None):
        super().__init__()
        self.model = t5model
        self.tokenizer = t5tokenizer
        self.batch_size = batchsize
        self._train_data = train_data
        self._val_data = val_data

    def forward( self, input_ids, attention_mask=None,
                decoder_attention_mask=None,
                lm_labels=None):
        """Run the wrapped model; ``lm_labels`` is forwarded as ``labels``."""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

        return outputs

    def _batch_loss(self, batch):
        """Forward one batch and return the first model output, used here as the loss."""
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute, log ('train_loss') and return the loss of a training batch."""
        loss = self._batch_loss(batch)
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log ('val_loss') and return the loss of a validation batch."""
        loss = self._batch_loss(batch)
        self.log("val_loss",loss)
        return loss

    def train_dataloader(self):
        """Training loader; shuffled so each epoch sees the examples in a new order."""
        dataset = self._train_data if self._train_data is not None else train_dataset
        return DataLoader(dataset, batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=2)

    def val_dataloader(self):
        """Validation loader; kept in dataset order."""
        dataset = self._val_data if self._val_data is not None else validation_dataset
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          num_workers=2)

    def configure_optimizers(self):
        """AdamW over all parameters with a constant learning rate of 3e-4."""
        optimizer = AdamW(self.parameters(), lr=3e-4, eps=1e-8)
        return optimizer

In [15]:
# Wrap the pretrained model and tokenizer in the Lightning module.
model = T5Tuner(t5model=t5_model, t5tokenizer=t5_tokenizer)

# Train for 3 epochs on the accelerator named by `device`.
trainer = pl.Trainer(
    accelerator=device,
    max_epochs=3,
)
trainer.fit(model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  "Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning`"
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)
SLURM 

Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
# saving the model
# save_pretrained() creates the target directory when it does not exist, so no
# shell `mkdir` is needed; the cell can be re-run and works on any platform.
model.model.save_pretrained('t5_trained_model')
t5_tokenizer.save_pretrained('t5_tokenizer')

('t5_tokenizer/tokenizer_config.json',
 't5_tokenizer/special_tokens_map.json',
 't5_tokenizer/spiece.model',
 't5_tokenizer/added_tokens.json')

# Inference / Predictions

In [None]:
# Directories written by the save cell, and the device used for inference.
trained_model_path, trained_tokenizer, device = (
    't5_trained_model',
    't5_tokenizer',
    'cpu',
)

In [None]:
# Reload the fine-tuned weights and the tokenizer from their saved directories.
model = T5ForConditionalGeneration.from_pretrained(
    trained_model_path,
)
tokenizer = T5Tokenizer.from_pretrained(
    trained_tokenizer,
)

Text Sample

In [None]:
# First sample prompt: "context: <passage> answer: <answer>"
context, answer = (
    "President Donald Trump said and predicted that some states would reopen this month.",
    "Donald Trump",
)
text = " ".join(("context: " + context, "answer: " + answer))
print(text)

context: President Donald Trump said and predicted that some states would reopen this month. answer: Donald Trump


In [None]:
# Second sample prompt; it replaces the previous `text` and is the one encoded below.
context, answer = (
    "Since its topping out in 2013, One World Trade Center in New York City has been the tallest skyscraper in the United States.",
    "World Trade Center",
)
text = " ".join(("context: " + context, "answer: " + answer))
print(text)

context: Since its topping out in 2013, One World Trade Center in New York City has been the tallest skyscraper in the United States. answer: World Trade Center


In [None]:
# Tokenize the prompt to a fixed 512-token tensor batch and place it on `device`.
encoding = tokenizer.encode_plus(
    text,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_tensors="pt",
)
encoding = encoding.to(device)
print(encoding.keys())

# Keep the two tensors that generate() needs under their own names.
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)

dict_keys(['input_ids', 'attention_mask'])


In [None]:
# Switch to evaluation mode before generating.
model.eval()

# Beam search with 5 beams, returning the 2 best candidates.
beam_outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    num_beams=5,
    num_return_sequences=2,
    early_stopping=True,
    max_length=72,  # How long the generated questions should be
)

# Decode each returned sequence to plain text, dropping special tokens.
for beam_output in beam_outputs:
    sent = tokenizer.decode(
        beam_output,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print(sent)

question: What is the tallest skyscraper in the United States?
question: What is the name of the tallest skyscraper in the United States?


# Deployment Demo

In [None]:
!pip install --quiet gradio==3.9

In [None]:
def get_question(sentence,answer,mdl,tknizer):

  ''' Generate one question about `sentence` whose answer is `answer`.

  Params:
        sentence: context text the question should be about.
        answer: answer span / keyword the question should target.
        mdl: model exposing `generate(...)`; when it has a `device`
             attribute the inputs are moved to that device first.
        tknizer: tokenizer matching `mdl`; it is called on the prompt and
                 its `decode` method is applied to the generated ids.
  Returns:
        The best beam decoded to text, stripped, without a leading
        "question:" tag.
  '''

  text = "context: {} answer: {}".format(sentence,answer)
  print (text)
  max_len = 256
  # `padding=False` replaces the deprecated `pad_to_max_length=False`.
  encoding = tknizer(text, max_length=max_len, padding=False,
                     truncation=True, return_tensors="pt")

  input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

  # Inputs must live on the same device as the model, otherwise generate() fails
  # for a model that was moved to the GPU.
  model_device = getattr(mdl, "device", None)
  if model_device is not None:
    input_ids = input_ids.to(model_device)
    attention_mask = attention_mask.to(model_device)

  outs = mdl.generate(input_ids=input_ids,
                      attention_mask=attention_mask,
                      early_stopping=True,
                      num_beams=5,
                      num_return_sequences=1,
                      no_repeat_ngram_size=2,
                      max_length=72)

  dec = [tknizer.decode(ids,skip_special_tokens=True) for ids in outs]

  # Drop only the leading tag; a "question:" occurring later in the generated
  # text is part of the question and is kept.
  prefix = "question:"
  question = dec[0].strip()
  if question.startswith(prefix):
    question = question[len(prefix):]
  return question.strip()

In [None]:
# Try the helper on a single hand-written example.
context, answer = (
    "Donald Trump is an American media personality and businessman who served as the 45th president of the United States.",
    "Donald Trump",
)

ques = get_question(sentence=context, answer=answer, mdl=model, tknizer=tokenizer)
print("question: ", ques)

context: Donald Trump is an American media personality and businessman who served as the 45th president of the United States. answer: Donald Trump
question:  Who served as the 45th president of the United States?


In [None]:
import gradio as gr

# Widgets for the demo. They get their own names so the `context` / `answer` /
# `question` strings defined earlier in the notebook are not overwritten, and
# they use `gr.Textbox` instead of the deprecated `gr.inputs` / `gr.outputs`.
context_box = gr.Textbox(lines=5, placeholder="Enter paragraph/context here...")
answer_box = gr.Textbox(lines=3, placeholder="Enter answer/keyword here...")
question_box = gr.Textbox(label="Question")

def generate_question(context,answer):
  """Gradio callback: generate a question for `answer` given `context`.

  Uses the notebook-level `model` and `tokenizer` through `get_question`.
  """
  return get_question(context,answer,model,tokenizer)

iface = gr.Interface(
  fn=generate_question,
  inputs=[context_box, answer_box],
  outputs=question_box)

# share=True exposes the demo through a temporary public URL.
iface.launch(debug=False,share=True)



IMPORTANT: You are using gradio version 3.9, however version 3.14.0 is available, please upgrade.
--------
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://7c67647ce4455126.gradio.app

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


(<gradio.routes.App at 0x7fc62ea10df0>,
 'http://127.0.0.1:7860/',
 'https://7c67647ce4455126.gradio.app')