<a href="https://colab.research.google.com/github/Karthick47v2/question-generator/blob/main/model_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## ADD test
## Add opt

### Install 3rd party libraries

In [None]:
!pip3 install transformers==4.1.1
!pip3 install pytorch-lightning==1.1.3
!pip3 install tokenizers==0.9.4

!pip install git+https://github.com/PyTorchLightning/lightning-bolts

### Import libraries

> You ***may*** need to restart runtime after installing python packages. (If importing `pytorch_lightning` throws error)

In [None]:
import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
import copy

from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer
from sklearn.model_selection import train_test_split

# set random state
pl.seed_everything(42)

### Load and split dataset

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
dataset = 'sciq' ## squad or sciq

df = pd.read_csv(f"gdrive/MyDrive/mcq-gen/{'SQuAD' if dataset == 'squad' else 'SciQ'}-processed.csv")
df.head()

NameError: ignored

In [5]:
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
train_df.shape, test_df.shape

((8512, 2), (2128, 2))

### Load base model

In [None]:
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

### Create dataset class inheriting Torch Dataset

In [7]:
class QuestionGenerationDataset(Dataset):
  def __init__(self, tokenizer, data, max_in_len=512, max_out_len=96):
    self.data = data
    self.max_in_len = max_in_len
    self.max_out_len = max_out_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []
    self.__tokenize()

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, index):
    source_ids = self.inputs[index]["input_ids"].squeeze()
    target_ids = self.targets[index]["input_ids"].squeeze()

    src_mask = self.inputs[index]["attention_mask"].squeeze()
    target_mask = self.targets[index]["attention_mask"].squeeze()
    
    labels = copy.deepcopy(target_ids)
    labels[labels==0] = -100

    return {"source_ids" : source_ids, "source_mask" : src_mask, "target_ids" : target_ids, "target_mask" : target_mask, "labels" : labels}

  def __check_token_len(self, text, max):
    test_encoding = self.tokenizer.encode_plus(
                              text,
                              truncation=False,
                              return_tensors='pt'
                              )
    token_len = len(test_encoding['input_ids'][0])

    return token_len > max


  
  def __tokenize(self):
    for _, row in self.data.iterrows():
      source_text, target_text = row['source_text'], row['target_text']

      if self.__check_token_len(source_text, self.max_in_len):
          continue
      if self.__check_token_len(target_text, self.max_out_len):
          continue     

      tokenized_source = self.tokenizer.batch_encode_plus(
                              [source_text],
                              max_length=self.max_in_len,
                              pad_to_max_length=True,
                              return_tensors='pt'
                              )
      
      tokenized_target = self.tokenizer.batch_encode_plus(
                              [target_text],
                              max_length=self.max_out_len,
                              pad_to_max_length=True,
                              return_tensors='pt'
                              )
      
      self.inputs.append(tokenized_source)
      self.targets.append(tokenized_input)

In [8]:
train_dataset = QuestionGenerationDataset(t5_tokenizer, train_df)
validation_dataset = QuestionGenerationDataset(t5_tokenizer, test_df)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Token indices sequence length is longer than the specified maximum sequence length for this model (639 > 512). Running this sequence through the model will result in indexing errors


### ***SQuAD (80-20 split)***

***Train***

Before: 70,079

After filtering out data with exceeding tokens: 69,869

***Validation***

Before: 17,520

After filtering out data with exceeding tokens: 17,467


### ***SciQ (80-20 split)***

***Train***

Before: 8,512

After filtering out data with exceeding tokens: 8,409

***Validation***

Before: 2,128

After filtering out data with exceeding tokens: 2,092


### Create module class inheriting Python Lightning, LightningModule

In [9]:
class T5FineTuner(pl.LightningModule):
  def __init__(self, batch_size, t5model, t5tokenizer):
    super(T5FineTuner, self).__init__()
    self.batch_size = batch_size
    self.model = t5model
    self.tokenizer = t5tokenizer

  def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, 
              decoder_attention_mask=None, lm_labels=None):
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
        labels=lm_labels,
    )
    return outputs

  def training_step(self, batch, batch_idx):
    outputs = self.forward(
        input_ids=batch['source_ids'],
        attention_mask=batch['source_mask'],
        decoder_input_ids=batch['target_ids'],
        decoder_attention_mask=batch['target_mask'],
        lm_labels=batch['labels']
    )

    loss = outputs[0]
    self.log('train_loss',loss)
    return loss

  def validation_step(self, batch, batch_idx):
    outputs = self.forward(
        input_ids=batch['source_ids'],
        attention_mask=batch['source_mask'],
        decoder_input_ids=batch['target_ids'],
        decoder_attention_mask=batch['target_mask'],
        lm_labels=batch['labels']
    )

    loss = outputs[0]
    self.log('val_loss',loss, on_step=True, on_epoch=True, prog_bar=True, 
             logger=True)
    return loss

  def train_dataloader(self):
    return DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True,
                      num_workers=4)

  def val_dataloader(self):
    return DataLoader(validation_dataset, batch_size=self.batch_size,
                      num_workers=4)

  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=3e-4, eps=1e-8)
    return optimizer

In [11]:
model = T5FineTuner(8, t5_model, t5_tokenizer)
trainer = pl.Trainer(max_epochs=1 ,gpus=1)
trainer.fit(model)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"


MisconfigurationException: ignored