<a href="https://colab.research.google.com/github/ausdauerer/capstone-2023/blob/main/question_generation/models/T5/Test_HF_QG(local_HotpotQA_finetuned).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
%pip install datasets
%pip install keras-tuner --upgrade
%pip install rouge_score
#%pip install --upgrade spacy
#%pip install torch
%pip install tensorflow
#!pip install tflearn - batchnorm
%pip install datasets
%pip install transformers[sentencepiece]

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelWithLMHead
from datasets import list_datasets, load_dataset, load_metric
import pandas as pd
import spacy
import numpy as np
import json
import sklearn
from sklearn.model_selection import train_test_split, GroupShuffleSplit
import re
import os
import random
import typing
from typing import Any, Tuple, List, NamedTuple
import gensim
import gensim.downloader as gloader
from gensim.models import KeyedVectors
from itertools import chain
from tqdm import tqdm
import datetime
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
import torch
from torch.utils.data import DataLoader
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

## Configs

In [None]:
# Batch size shared by the tf.data pipeline (via dataset_config) and by the
# PyTorch DataLoader / evaluation loop further down in the notebook.
batch_size = 10

# Settings consumed by SQuAD.__call__ and the methods it invokes.
dataset_config = {
    # Maximum number of rows kept after loading the dataset.
    'num_examples': 90000,
    # 'num_examples': 100,
    # Fraction of the data given to the training split.
    'train_size': 0.65,
    # Fraction of the *remaining* (non-training) rows given to the test split;
    # the rest of the remainder becomes the validation split.
    'test_size': 0.40,
    # `num_words` passed to the Keras tokenizers for contexts and questions.
    'num_words_context': 45000,
    'num_words_question': 28000,
    # Shuffle buffer size used when building tf.data datasets.
    'buffer_size': 32000,
    'batch_size': batch_size,
    'random_seed': 13,
}

# File-system locations (Google Drive paths, so Drive must be mounted).
path = {
    'training_json_path': "/content/drive/MyDrive/Shared drive/Qgen_seq2seq_keras/data/hotpot_dev_fullwiki_v1.json",
    #'save_pkl_path': "./data/squadv2.pkl",
    # Pickle cache of the parsed JSON; read instead of the JSON when it exists.
    'save_pkl_path': "/content/drive/MyDrive/Shared drive/Qgen_seq2seq_keras/data/hotpotqa.pkl",
    # NOTE(review): not referenced anywhere in the visible part of the notebook.
    'checkpoint_dir': "./training_checkpoints",
}

# NOTE(review): not referenced anywhere in the visible part of the notebook.
evaluation_config = {
    'temperature' : 0.7
}

## Dataset Class

In [None]:
class Dataset(NamedTuple):
  """
  This class represents a 3-way split (train / validation / test) processed dataset.
  """
  # Reference :- https://github.com/topper-123/Articles/blob/master/New-interesting-data-types-in-Python3.rst
  # NOTE(review): a later cell runs `from torch.utils.data import Dataset`, which
  # rebinds this module-level name; SQuAD.__call__ looks `Dataset` up at call
  # time, so calling it with tensor_type=True after that cell would pick up the
  # torch class instead of this one.
  train: tf.data.Dataset
  val: tf.data.Dataset
  test: tf.data.Dataset

class SQuAD:
  def __init__(self):
    """Create an unconfigured loader; the real configuration is done by ``__call__``."""
    self.random_seed = None
    # Raw dataframe, filled in by load_dataset().
    self.squad_df = None
    # Cleaned copy of squad_df, filled in by preprocess().
    self.preproc_squad_df = None
    # NOTE(review): never assigned again in the visible class; the Keras
    # tokenizers live in `tokenizer_context` / `tokenizer_question` instead.
    self.tokenizer = None
    self.buffer_size = 0

  def __call__(self, dataset_config, path, tokenized=True, tensor_type=True,split=True):
    """Load the dataset, preprocess it and optionally split / tokenize it.

    Args:
        dataset_config (dict): must provide 'random_seed', 'buffer_size',
            'batch_size', 'train_size', 'test_size', 'num_examples',
            'num_words_context' and 'num_words_question'.
        path (dict): must provide 'training_json_path' and 'save_pkl_path'.
        tokenized (bool): tokenize and pad contexts and questions with Keras tokenizers.
        tensor_type (bool): when tokenized, wrap the splits in batched tf.data datasets.
        split (bool): when False, skip splitting and return the raw
            (not preprocessed) dataframe loaded by ``load_dataset``.

    Returns (depending on the input parameters):
        split=False: the raw ``pd.DataFrame``.
        tokenized=False: ``X_train, y_train, X_val, y_val, X_test, y_test``.
        tokenized=True, tensor_type=False: the same six objects, tokenized and padded.
        tokenized=True, tensor_type=True: ``(Dataset, word_to_idx_context,
            word_to_idx_question)`` where each ``word_to_idx_*`` is a
            (train, val, test) tuple of word-index dicts.
    """
    self.random_seed = dataset_config['random_seed']
    self.buffer_size = dataset_config['buffer_size']
    self.batch_size = dataset_config['batch_size']
    self.train_size = dataset_config['train_size']
    self.test_size = dataset_config['test_size']
    self.training_json_path = path['training_json_path']
    self.save_pkl_path = path['save_pkl_path']
    self.max_length_context = 0
    self.max_length_question = 0

    # Load dataset from file
    self.load_dataset(dataset_config['num_examples'])
    # Extract answer
    #self.extract_answer()
    # Preprocess context and question
    self.preprocess()
    self.compute_max_length()

    if not split:
        return self.squad_df

    # Perform splitting
    X_train, y_train, X_val, y_val, X_test, y_test = self.split_train_val(self.preproc_squad_df)

    # Initialize Tokenizer for the source: in our case the context sentences
    self.tokenizer_context = tf.keras.preprocessing.text.Tokenizer(filters='',
                                                                   oov_token='<unk>',
                                                                   num_words=dataset_config['num_words_context'])
    # initialize also for the target, namely the question sentences
    self.tokenizer_question = tf.keras.preprocessing.text.Tokenizer(filters='',
                                                                   oov_token='<unk>',
                                                                   num_words=dataset_config['num_words_question'])

    if not tokenized:
        return X_train, y_train, X_val, y_val, X_test, y_test

    # Only the training split is used to fit the vocabularies.
    X_train_tokenized, word_to_idx_train_context = self.__tokenize_context(X_train, test=False)
    y_train_tokenized, word_to_idx_train_question = self.__tokenize_question(y_train, test=False)

    # update the max length for the other splits
    self.max_length_context = X_train_tokenized.context.iloc[0].shape[0]
    self.max_length_question = y_train_tokenized.iloc[0].shape[0]

    # Validation and test must reuse the vocabulary fitted on train: fitting
    # again rebuilds the tokenizer's word index, so ids would no longer match
    # the already-encoded training data (and held-out words would leak in).
    X_val_tokenized, word_to_idx_val_context = self.__tokenize_context(X_val, test=True)
    y_val_tokenized, word_to_idx_val_question = self.__tokenize_question(y_val, test=True)

    # The test set should handle the oov words as unknown words
    X_test_tokenized, word_to_idx_test_context = self.__tokenize_context(X_test, test=True)
    y_test_tokenized, word_to_idx_test_question = self.__tokenize_question(y_test, test=True)

    if not tensor_type:
        # Returns pd.DataFrame objects (tokenized)
        return X_train_tokenized, y_train_tokenized, X_val_tokenized, y_val_tokenized, X_test_tokenized, y_test_tokenized

    word_to_idx_context = (word_to_idx_train_context, word_to_idx_val_context, word_to_idx_test_context)
    word_to_idx_question = (word_to_idx_train_question, word_to_idx_val_question, word_to_idx_test_question)

    AUTOTUNE = tf.data.AUTOTUNE

    # Returns tf.data.Dataset objects (tokenized); only the training split is shuffled.
    train_dataset = self.to_tensor(X_train_tokenized, y_train_tokenized)
    val_dataset = self.to_tensor(X_val_tokenized, y_val_tokenized, train=False)
    test_dataset = self.to_tensor(X_test_tokenized, y_test_tokenized, train=False)

    # Configure the dataset for performance
    # NOTE(review): cache() is applied after shuffle(), so the training order of
    # the first pass is likely replayed on later passes -- confirm this is intended.
    train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
    val_dataset = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)
    test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)

    dataset = Dataset(
        train=train_dataset,
        val=val_dataset,
        test=test_dataset)

    return dataset, word_to_idx_context, word_to_idx_question

  def compute_max_length(self):
    context_list = list(self.preproc_squad_df.context)
    question_list = list(self.preproc_squad_df.question)

    context_length = [len(sen.split()) for sen in context_list]
    question_length = [len(sen.split()) for sen in question_list]

    self.max_length_context = int(np.quantile(context_length, 0.995))
    self.max_length_question = int(np.quantile(question_length, 0.995))

  def load_dataset(self, num_examples):
    """
    Extract the dataset from the json file. Already grouped by title.

    :param path: [Optional] specifies the local path where the training_set.json file is located

    :return
        - the extracted dataset in a dataframe format
    """
    if os.path.exists(self.save_pkl_path):
      print('File already exists! Loading from .pkl...\n')
      print(f'Dir path {self.save_pkl_path}')
      self.squad_df = pd.read_pickle(self.save_pkl_path)
      self.squad_df = self.squad_df[:num_examples]
    else:
      print('Loading from .json...\n')
      print(f'Dir path {self.training_json_path}')
      with open(self.training_json_path) as f:
          data = json.load(f)

      df_array = []
      #for current_subject in data['data']:
      for current_subject in data:
          id=current_subject['_id']
          answer=current_subject['answer']
          question=current_subject['question']
          context=""

          for supporting_fact in current_subject['supporting_facts']:
              fact_title=supporting_fact[0]
              for fact in current_subject['context']:
                if(fact[0]==fact_title):
                  for sent in fact[1]:
                    context+=sent

          record = { "id": id,
                    "context": context,
                    "question": question,
                    "answer": answer
                    }
          df_array.append(record)
        
      # Save file
      pd.to_pickle(pd.DataFrame(df_array), self.save_pkl_path)
      self.squad_df = pd.DataFrame(df_array)[:num_examples]

  def preprocess(self):
    df = self.squad_df.copy()

    # Pre-processing context
    context = list(df.context)
    preproc_context = []

    for c in context:
      c = self.__preprocess_sentence(c, question=False)
      preproc_context.append(c)
    
    df.context = preproc_context

    # Pre-processing questions
    question = list(df.question)
    preproc_question = []

    for q in question:
      q = self.__preprocess_sentence(q, question=True)
      preproc_question.append(q)
    
    df.question = preproc_question

    # Remove features that are not useful
    df = df.drop(['id'], axis=1)
    self.preproc_squad_df = df

  def __preprocess_sentence(self, sen, question):
    # Creating a space between a word and the punctuation following it
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    sen = re.sub(r"([?.!,¿])", r" \1 ", sen)
    sen = re.sub(r'[" "]+', " ", sen)

    # Replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    sen = re.sub(r"[^a-zA-Z0-9?.!,¿]+", " ", sen)

    sen = sen.strip()

    # Adding a start and an end token to the sentence so that the model know when to 
    # start and stop predicting.
    # if not question: sen = '<SOS> ' + sen + ' <EOS>'
    sen = '<SOS> ' + sen + ' <EOS>'
    return sen

  def __answer_start_end(self, df):
    """
    Creates a list of starting indexes and ending indexes for the answers.

    :param df: the target Dataframe

    :return: a dataframe containing the start and the end indexes foreach answer (ending index is excluded).

    """
    start_idx = df.answer_start
    end_idx = [start + len(list(answer)) for start, answer in zip(list(start_idx), list(df.answer))]
    return pd.DataFrame(list(zip(start_idx, end_idx)), columns=['start', 'end'])

  def split_train_val(self, df):
    """
    Split the dataframe into training, validation and test sets.

    ``self.train_size`` is the fraction of rows used for training;
    ``self.test_size`` is the fraction of the *remaining* rows used for
    testing, the rest being validation. When the frame has a ``title`` column
    the split is grouped by it (rows sharing a title stay together);
    otherwise every row is its own group, i.e. a plain shuffled split. When
    the frame has an ``answer_start`` column, ``start`` / ``end`` answer
    offsets are added to the features.

    Args
        :param df: the target Dataframe; must contain a ``question`` column

    Returns:
        - X_train, y_train, X_val, y_val, X_test, y_test, where each ``X`` is
          a dataframe of features and each ``y`` the matching question Series
    """
    # The frame built by load_dataset() has no 'answer_start' / 'title'
    # columns, so only drop / use the ones that are actually present.
    non_features = [col for col in ('answer_start', 'question', 'answer') if col in df.columns]
    X = df.drop(non_features, axis=1).copy()
    if 'answer_start' in df.columns:
      idx = self.__answer_start_end(df)
      # Positional assignment: idx has a fresh 0..n-1 index that need not match df's.
      X['start'] = idx['start'].to_numpy()
      X['end'] = idx['end'].to_numpy()
    y = df['question']

    def groups_of(frame):
      # One group per title when available, otherwise one group per row.
      if 'title' in frame.columns:
        return frame['title']
      return list(range(len(frame)))

    # In the first step we will split the data in training and remaining dataset
    splitter = GroupShuffleSplit(train_size=self.train_size, n_splits=2, random_state=self.random_seed)
    train_idx, rem_idx = next(splitter.split(X, groups=groups_of(X)))

    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_rem = X.iloc[rem_idx]
    y_rem = y.iloc[rem_idx]

    # Val and test sets account for the remaining percentage of the total data
    splitter = GroupShuffleSplit(test_size=self.test_size, n_splits=2, random_state=self.random_seed)
    val_idx, test_idx = next(splitter.split(X_rem, groups=groups_of(X_rem)))

    X_val = X_rem.iloc[val_idx]
    y_val = y_rem.iloc[val_idx]

    X_test = X_rem.iloc[test_idx]
    y_test = y_rem.iloc[test_idx]

    return X_train, y_train, X_val, y_val, X_test, y_test

  def __tokenize_context(self, X, test):
    """Encode and pad the ``context`` column with ``self.tokenizer_context``.

    :param X: dataframe with a ``context`` column of strings
    :param test: when True the tokenizer is only applied; when False it is
        first fitted on these contexts

    :return: (copy of X whose ``context`` cells are padded id arrays,
        the tokenizer's word index with ``'<pad>'`` mapped to 0)
    """
    context = X.context
    if not test: self.tokenizer_context.fit_on_texts(context)
    context_tf = self.tokenizer_context.texts_to_sequences(context)

    # Pad to the known maximum length when one is set, else to the longest row.
    pad_kwargs = {'padding': 'post'}
    if self.max_length_context != 0:
      pad_kwargs['maxlen'] = self.max_length_context
    context_tf_pad = tf.keras.preprocessing.sequence.pad_sequences(context_tf, **pad_kwargs)

    # Replace the whole column on a copy instead of writing cell by cell through
    # chained indexing (`X['context'].iloc[i] = ...`), which pandas does not
    # guarantee to reach the frame and which is a no-op under copy-on-write.
    X = X.copy()
    X['context'] = pd.Series(list(context_tf_pad), index=X.index, dtype=object)

    # Add the padding
    self.tokenizer_context.word_index['<pad>'] = 0
    self.tokenizer_context.index_word[0] = '<pad>'

    return X, self.tokenizer_context.word_index

  def __tokenize_question(self, y, test):
    """Encode and pad the question Series with ``self.tokenizer_question``.

    :param y: Series of question strings
    :param test: when True the tokenizer is only applied; when False it is
        first fitted on these questions

    :return: (new Series, same index and name as ``y``, whose cells are padded
        id arrays, the tokenizer's word index with ``'<pad>'`` mapped to 0)
    """
    question = y
    if not test: self.tokenizer_question.fit_on_texts(question)
    question_tf = self.tokenizer_question.texts_to_sequences(question)

    # Pad to the known maximum length when one is set, else to the longest row.
    pad_kwargs = {'padding': 'post'}
    if self.max_length_question != 0:
      pad_kwargs['maxlen'] = self.max_length_question
    question_tf_pad = tf.keras.preprocessing.sequence.pad_sequences(question_tf, **pad_kwargs)

    # Build a fresh Series rather than overwriting the caller's strings one
    # element at a time with arrays.
    y = pd.Series(list(question_tf_pad), index=question.index, name=question.name, dtype=object)

    # Add the padding
    self.tokenizer_question.word_index['<pad>'] = 0
    self.tokenizer_question.index_word[0] = '<pad>'

    return y, self.tokenizer_question.word_index

  def extract_answer(self):
    """
    Replace every context in ``self.squad_df`` by the sentence(s) containing the answer.

    The answer span comes from ``__answer_start_end``, so the frame must
    provide ``answer_start`` and ``answer`` columns. Sentence boundaries are
    approximated by adding one space after every tokenized sentence. If the
    span crosses a sentence boundary the following sentence is included too;
    if no sentence can be matched the whole paragraph is kept, so the result
    always has one entry per row.
    """
    df = self.squad_df.copy()
    start_end = self.__answer_start_end(df)
    context = list(df.context)

    selected_sentences = []
    for i, par in enumerate(context):
      sentences = sent_tokenize(par)
      start = start_end.iloc[i].start
      end = start_end.iloc[i].end
      # Fallback when the span is never reached (e.g. offsets beyond the text).
      right_sentence = par
      context_characters = 0

      for j, sen in enumerate(sentences):
        sen += ' '
        context_characters += len(sen)
        if start >= context_characters:
          continue
        if end <= context_characters or j + 1 >= len(sentences):
          # The answer is completely in the current sentence (or there is no
          # next sentence left to append).
          right_sentence = sen
        else:
          # the answer is in both the current and the next sentence
          right_sentence = sen + sentences[j+1]
        break

      selected_sentences.append(right_sentence)

    self.squad_df.context = selected_sentences

  def to_tensor(self, X, y, train=True):
    """Build a batched ``tf.data.Dataset`` of (context ids, question ids) pairs.

    ``X.context`` and ``y`` are cast to int64 tensors. With ``train=True`` the
    examples are shuffled using ``self.buffer_size`` before batching. Batches
    have ``self.batch_size`` examples and an incomplete last batch is dropped.
    """
    features = tf.cast(list(X.context.copy()), tf.int64)
    targets = tf.cast(list(y.copy()), tf.int64)

    # Reference:- https://www.tensorflow.org/api_docs/python/tf/data/Dataset
    dataset = tf.data.Dataset.from_tensor_slices((features, targets))
    if train:
      dataset = dataset.shuffle(self.buffer_size)

    return dataset.batch(self.batch_size, drop_remainder=True)

## Dataset creation

In [None]:
dataset_creator = SQuAD()

# With split=False the call returns the raw dataframe (id, context, question,
# answer) right after loading; `tokenized` has no effect on that path.
hotpot_df= dataset_creator(dataset_config, path, tokenized=False,split=False)

File already exists! Loading from .pkl...

Dir path /content/drive/MyDrive/Shared drive/Qgen_seq2seq_keras/data/hotpotqa.pkl


In [None]:
# Work on the first 5000 examples only, re-indexed from 0.
df=hotpot_df.iloc[:5000]
df = df.reset_index(drop=True)

print(df)

                            id  \
0     5a8b57f25542995d1e6f1371   
1     5a8c7595554299585d9e36b6   
2     5a85ea095542994775f606a8   
3     5adbf0a255429947ff17385a   
4     5a8e3ea95542995a26add48d   
...                        ...   
4995  5ab8ae585542991b5579efd8   
4996  5a84bda45542992a431d1a96   
4997  5a710bb15542994082a3e50d   
4998  5a7270395542992359bc30a8   
4999  5a73d3ea5542992d56e7e3af   

                                                context  \
0                                                         
1     Kiss and Tell is a 1945 American comedy film s...   
2     Animorphs is a science fantasy series of young...   
3     The Laleli Mosque (Turkish: "Laleli Camii, or ...   
4     Big Stone Gap is a 2014 American drama romanti...   
...                                                 ...   
4995  Best Foot Forward is a 1941 musical with songs...   
4996                                                      
4997                                                      
4

In [None]:
# Positional (unshuffled) split: rows 0-499 test, 500-999 validation, rest train.
test_df=df[:500]
val_df=df[500:1000]
train_df=df[1000:]

## Loader Creation

In [None]:
from torch.utils.data import Dataset
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
    )

class QGDataset(Dataset):
    """Torch dataset that turns dataframe rows into T5 question-generation examples.

    Source text is ``"<answer> <sep> <context>"`` and target text is
    ``"<answer> <sep> <question>"``; both are tokenized, truncated and padded
    to the configured maximum lengths.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "T5Tokenizer",
        source_max_token_len: int,
        target_max_token_len: int
        ):
        """
        :param data: frame with ``answer``, ``context`` and ``question`` columns
        :param tokenizer: callable tokenizer exposing ``pad_token_id``
        :param source_max_token_len: padded/truncated length of the model input
        :param target_max_token_len: padded/truncated length of the labels
        """
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        return len(self.data)

    def _encode(self, text: str, max_length: int):
        # Use the tokenizer handed to this dataset, not a module-level global.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
            )

    def __getitem__(self, index: int):
        """Return one example as a dict of raw texts plus flattened tensors."""
        data_row = self.data.iloc[index]

        source_encoding = self._encode(
            '{} {} {}'.format(data_row['answer'], '<sep>', data_row['context']),
            self.source_max_token_len)

        target_encoding = self._encode(
            '{} {} {}'.format(data_row['answer'], '<sep>', data_row['question']),
            self.target_max_token_len)

        # Padding positions are set to -100 so that the loss ignores them.
        labels = target_encoding['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            answer_text = data_row['answer'],
            context = data_row['context'],
            question = data_row['question'],
            input_ids = source_encoding['input_ids'].flatten(),
            attention_mask = source_encoding['attention_mask'].flatten(),
            labels=labels.flatten()
            )

In [None]:
%pip install pytorch_lightning
import pytorch_lightning as pl

class QGDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train / validation / test dataframes.

    Training batches use ``batch_size`` and are shuffled; validation and test
    loaders yield one example at a time. All loaders use two worker processes.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size,
        source_max_token_len: int,
        target_max_token_len: int
        ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def _as_dataset(self, frame: pd.DataFrame):
        # Every split shares the tokenizer and the length limits.
        return QGDataset(
            frame,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len,
        )

    def setup(self,stage=None):
        """Create the three ``QGDataset`` objects (``stage`` is ignored)."""
        self.train_dataset = self._as_dataset(self.train_df)
        self.val_dataset = self._as_dataset(self.val_df)
        self.test_dataset = self._as_dataset(self.test_df)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=2,
        )

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=1, num_workers=2)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, num_workers=2)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning
  Downloading pytorch_lightning-1.7.2-py3-none-any.whl (705 kB)
[K     |████████████████████████████████| 705 kB 33.8 MB/s 
[?25hCollecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.9.3-py3-none-any.whl (419 kB)
[K     |████████████████████████████████| 419 kB 38.6 MB/s 
[?25hCollecting tensorboard>=2.9.1
  Downloading tensorboard-2.10.0-py3-none-any.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 62.4 MB/s 
Collecting pyDeprecate>=0.3.1
  Downloading pyDeprecate-0.3.2-py3-none-any.whl (10 kB)
Installing collected packages: torchmetrics, tensorboard, pyDeprecate, pytorch-lightning
  Attempting uninstall: tensorboard
    Found existing installation: tensorboard 2.8.0
    Uninstalling tensorboard-2.8.0:
      Successfully uninstalled tensorboard-2.8.0
[31mERROR: pip's dependency resolver does not currently take into account all the pac

In [None]:
# Load the pretrained t5-small tokenizer and register '<sep>' as an extra
# token; it separates the answer from the context/question in QGDataset.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
print('tokenizer len before: ', len(tokenizer))
tokenizer.add_tokens('<sep>')
print('tokenizer len after: ', len(tokenizer))
# Used by QGModel to resize the embedding matrix to the enlarged vocabulary.
TOKENIZER_LEN = len(tokenizer)

# Positional arguments: batch_size=2, source_max_token_len=128, target_max_token_len=64.
# NOTE(review): these differ from BATCH_SIZE / SOURCE_MAX_TOKEN_LEN /
# TARGET_MAX_TOKEN_LEN (16 / 300 / 80) defined in a later cell and used by
# generate() -- confirm which values the checkpoint was trained with.
data_module = QGDataModule(train_df, val_df, test_df, tokenizer, 2, 128, 64)
data_module.setup()

Downloading spiece.model:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

tokenizer len before:  32100
tokenizer len after:  32101


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Pretrained checkpoint loaded by QGModel.
MODEL_NAME = 't5-small'
# Token limits used by generate() for the encoded input and the generated output.
SOURCE_MAX_TOKEN_LEN = 300
TARGET_MAX_TOKEN_LEN = 80

N_EPOCHS = 5
# NOTE(review): BATCH_SIZE is not passed to the data module created earlier
# (which uses a literal batch size of 2).
BATCH_SIZE = 16
LEARNING_RATE = 0.0001

# Fraction (1 == 100 %) of each split to use.
DF_TAKE_PERCENTAGE = 1

# NOTE(review): the TAKE_* counts are only printed in the visible code; the
# dataframes themselves are not truncated here.
TAKE_TRAIN = int(len(train_df) * DF_TAKE_PERCENTAGE)
TAKE_DEV = int(len(val_df) * DF_TAKE_PERCENTAGE)
TAKE_TEST = int(len(test_df) * DF_TAKE_PERCENTAGE)

print('Taking', DF_TAKE_PERCENTAGE * 100, '%')
print(TAKE_TRAIN, 'of', len(train_df))
print(TAKE_DEV, 'of', len(val_df))
print(TAKE_TEST, 'of', len(test_df))

Taking 100 %
4000 of 4000
500 of 500
500 of 500


In [None]:
class QGModel(pl.LightningModule):
    """Lightning wrapper around a T5 conditional-generation model.

    The embedding matrix is resized to ``TOKENIZER_LEN`` because an extra
    token was added to the tokenizer. Each train/validation/test step logs
    its loss under ``train_loss`` / ``val_loss`` / ``test_loss``.
    """

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        # resizing after adding new tokens to the tokenizer
        self.model.resize_token_embeddings(TOKENIZER_LEN)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``."""
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits

    def _loss_for(self, batch, metric_name):
        # Shared body of the three step hooks: forward pass, log, return loss.
        loss, _ = self(batch['input_ids'], batch['attention_mask'], batch['labels'])
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._loss_for(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        return self._loss_for(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        return self._loss_for(batch, 'test_loss')

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=LEARNING_RATE)

In [None]:
import sys

# True when the notebook runs inside Google Colab (the module is preloaded there).
IN_COLAB = 'google.colab' in sys.modules
# Training-related cells are only executed on Colab.
RUN_TRAINING_CELLS = IN_COLAB

# Drive folder used as working directory; relative paths such as
# 'checkpoints/...' are resolved against it once the directory is changed.
EXPERIMENT_NAME = 'HOtPotQA/'
DRIVE_FOLDER_LOCATION = '/content/drive/MyDrive/Shared drive/' + EXPERIMENT_NAME

if IN_COLAB:
    from google.colab import drive

    # NOTE(review): the mount call is commented out, so Google Drive must
    # already be mounted at /content/drive for the paths above to exist.
    #drive.mount('/content/drive', force_remount=True)

if IN_COLAB:
    # Adapted from:  https://robertbrucecarter.com/writing/2020/06/setting-your-working-directory-to-google-drive-in-a-colab-notebook/
    import os 

    def create_and_set_working_directory(path: str):
        """Make ``path`` the current working directory, creating it if needed.

        Missing parent directories are created as well.

        :param path: directory to create (when absent) and change into
        """
        # check if your project folder exists. if not, it will be created.
        if not os.path.isdir(path):
            # makedirs also creates missing parents; exist_ok guards against
            # the folder appearing between the check and the creation.
            os.makedirs(path, exist_ok=True)
            print(path + ' did not exist but was created.')

        # change the OS to use your project folder as the working directory
        os.chdir(path)

        print('Working directory changed to: \n' + path)

    create_and_set_working_directory(DRIVE_FOLDER_LOCATION)
    !pwd

Working directory changed to: 
/content/drive/MyDrive/Shared drive/HOtPotQA/
/content/drive/MyDrive/Shared drive/HOtPotQA


In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint

if RUN_TRAINING_CELLS:
    # Keep only the checkpoint with the lowest validation loss. With
    # save_top_k=-1 every epoch is kept and, because the file name is fixed,
    # later ones get a '-vN' suffix, so 'best-checkpoint.ckpt' would simply be
    # the first checkpoint written rather than the best one.
    checkpoint_callback = ModelCheckpoint(
        dirpath='checkpoints',
        filename='best-checkpoint',
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min'
    )

    trainer = pl.Trainer(
        callbacks=[checkpoint_callback],
        max_epochs=N_EPOCHS,
    )

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Fresh model built from the pretrained weights (downloads them on first use).
model = QGModel()
# model = QGModel.load_from_checkpoint('checkpoints/best-checkpoint-v42.ckpt')

# Training is disabled; the next cell loads a saved checkpoint instead.
#trainer.fit(model, data_module)

Downloading pytorch_model.bin:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [None]:
# Replace the freshly initialised model by the fine-tuned checkpoint. The path
# is relative, so it is resolved against the current working directory.
model = QGModel.load_from_checkpoint('checkpoints/best-checkpoint.ckpt')
# Inference only: freeze the parameters and switch to evaluation mode.
model.freeze()
model.eval()

QGModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32101, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32101, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_features=204

In [None]:
SEP_TOKEN='<sep>'

def generate(qgmodel: QGModel, answer: str, context: str) -> str:
    """Generate text for one (answer, context) pair with greedy decoding.

    The input is encoded as ``"<answer> <sep> <context>"`` using the
    module-level ``tokenizer`` and padded/truncated to ``SOURCE_MAX_TOKEN_LEN``.

    :param qgmodel: model wrapper whose ``.model`` attribute provides ``generate``
    :param answer: answer text the question should lead to
    :param context: passage containing the answer
    :return: the decoded output *including* special tokens; they are kept on
        purpose because callers locate the question after ``<sep>``
    """
    source_encoding = tokenizer(
        '{} {} {}'.format(answer, SEP_TOKEN, context),
        max_length=SOURCE_MAX_TOKEN_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # The inputs must live on the same device as the model weights.
    device = qgmodel.model.device

    generated_ids = qgmodel.model.generate(
        input_ids=source_encoding['input_ids'].to(device),
        attention_mask=source_encoding['attention_mask'].to(device),
        num_beams=1,
        max_length=TARGET_MAX_TOKEN_LEN,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True
    )

    # A list (not a set) keeps the sequences in order and keeps duplicates.
    preds = [
        tokenizer.decode(generated_id, skip_special_tokens=False, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    ]

    return ''.join(preds)

In [None]:
sample_question = test_df.iloc[40]

def show_result(generated: str, answer: str, context:str, original_question: str = ''):
    """Print a generated question next to its answer and context.

    :param generated: text produced by the model
    :param answer: answer the question was generated for
    :param context: passage the question was generated from
    :param original_question: reference question; its line is omitted when empty
    """
    print('Generated: ', generated)
    if original_question:
        print('Original : ', original_question)

    print()
    print('Answer: ', answer)
    print('Context: ', context)
    print('-----------------------------')

generated = generate(model, sample_question['answer'], sample_question['context'])
show_result(generated, sample_question['answer'], sample_question['context'], sample_question['question'])

Generated:  <pad> Scotch Collie<sep> What breed is the long-haired (now known as Rough) Collie and the short-hairing (now known as Smooth) Collies?</s>
Original :  Which dog's ancestors include Gordon and Irish Setters: the Manchester Terrier or the Scotch Collie?

Answer:  Scotch Collie
Conext:  The Scotch Collie is a landrace breed of dog which originated from the highland regions of Scotland. The breed consisted of both the long-haired (now known as Rough) Collie and the short-haired (now known as Smooth) Collie. It is generally believed to have descended from a variety of ancient herding dogs, some dating back to the Roman occupation, which may have included Roman Cattle Dogs, Native Celtic Dogs and Viking Herding Spitzes. Other ancestors include the Gordon and Irish Setters.The Scotch Collie is a landrace breed of dog which originated from the highland regions of Scotland. The breed consisted of both the long-haired (now known as Rough) Collie and the short-haired (now known as Sm

In [None]:
class SQ_Dataset(Dataset):
  """Pairs each row of a feature dataframe with the same row of a target dataframe."""

  def __init__(self,X_val,y_val):
    self.X_val, self.y_val = X_val, y_val

  def __len__(self):
    # The length is defined by the targets.
    return len(self.y_val)

  def __getitem__(self,idx):
    # Both rows are returned as plain dicts (column name -> value).
    features = self.X_val.iloc[idx].to_dict()
    target = self.y_val.iloc[idx].to_dict()
    return features, target


# Evaluation data: features (id, context, answer) paired with the reference question.
myDs = SQ_Dataset(df[['id', 'context', 'answer']], df[['question']])
val_loader = DataLoader(myDs, batch_size=batch_size, shuffle=False)
len(myDs)

# Show the first example as a sanity check.
x, y = myDs[0]
print(y)
print(x['id'])
print(x['context'])
print(x['answer'])

{'question': 'Were Scott Derrickson and Ed Wood of the same nationality?'}
5a8b57f25542995d1e6f1371

yes


In [None]:
# Print the first collated batch only.
for i, (data, labels) in enumerate(val_loader):
    print(data, labels, sep='\n')
    break
 

{'id': ['5a8b57f25542995d1e6f1371', '5a8c7595554299585d9e36b6', '5a85ea095542994775f606a8', '5adbf0a255429947ff17385a', '5a8e3ea95542995a26add48d', '5abd94525542992ac4f382d2', '5a85b2d95542997b5ce40028', '5a87ab905542996e4f3088c1', '5a7bbb64554299042af8f7cc', '5a8db19d5542994ba4e3dd00'], 'context': ['', "Kiss and Tell is a 1945 American comedy film starring then 17-year-old Shirley Temple as Corliss Archer. In the film, two teenage girls cause their respective parents much concern when they start to become interested in boys. The parents' bickering about which girl is the worse influence causes more problems than it solves.", 'Animorphs is a science fantasy series of young adult books written by Katherine Applegate and her husband Michael Grant, writing together under the name K. A. Applegate, and published by Scholastic. It is told in first person, with all six main characters taking turns narrating the books through their own perspectives. Horror, war, dehumanization, sanity, moralit

## Model

In [None]:
#tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap",use_fast=False)
#model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")

In [None]:
#hpqa_tokenizer = AutoTokenizer.from_pretrained("ck46/t5-small-hotpot-qa-qg")
#hpqa_model = AutoModelForSeq2SeqLM.from_pretrained("ck46/t5-small-hotpot-qa-qg")

## Evaluation Script

In [None]:
def compute_bleu(y_pred, y_true):
    """Return the corpus BLEU score of ``y_pred`` against ``y_true``, scaled to 0-100.

    :param y_pred: predictions passed to the 'bleu' metric
    :param y_true: references passed to the 'bleu' metric
    """
    bleu_metric = load_metric('bleu')
    scores = bleu_metric.compute(predictions=y_pred, references=y_true)
    return scores['bleu'] * 100

def Model_Tokenizer(device):
    """Load the pretrained ``t5-small`` model (moved to ``device``) and its tokenizer."""
    checkpoint = "t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    return model.to(device), tokenizer

def evaluation(loader, model, tokenizer, device, max_batches=25):
    """Generate questions for the batches of ``loader`` and print the BLEU score.

    :param loader: yields ``(x, y)`` batches where ``x['answer']`` and
        ``x['context']`` are sequences of strings and ``y['question']`` holds
        the reference questions
    :param model: model wrapper handed to ``generate``
    :param tokenizer: used to split references and predictions into tokens for
        BLEU; its pad/eos tokens are removed from the generated text
    :param device: unused; kept so existing calls keep working
    :param max_batches: evaluate at most this many batches (``None`` for all)
    :return: the BLEU score (0-100), or ``None`` when nothing was evaluated
    """
    special_tokens = [tok for tok in (tokenizer.pad_token, tokenizer.eos_token) if tok]

    def extract_question(text):
        # The model emits "<answer><sep> <question>"; keep what follows the
        # first '<sep>' (or everything when it is missing) and drop the
        # pad / end-of-sequence markers.
        _, found, tail = text.partition('<sep>')
        question = tail if found else text
        for tok in special_tokens:
            question = question.replace(tok, '')
        return question.strip()

    y_true = []
    y_pred = []
    for batch_idx, (x, y) in enumerate(loader):
        if max_batches is not None and batch_idx >= max_batches:
            break

        # Iterate over the batch itself so a shorter final batch is handled.
        generated = [
            generate(model, answer, context)
            for answer, context in zip(x['answer'], x['context'])
        ]

        print("Batch ",batch_idx,"\n\n")
        print("Actual Questions ##########################\n\n")
        for reference in y['question']:
            print(reference)
            # BLEU expects a list of references per prediction.
            y_true.append([tokenizer.tokenize(reference)])
        print()

        print("Generated Questions #######################\n\n")
        for raw_output in generated:
            question = extract_question(raw_output)
            print(question)
            y_pred.append(tokenizer.tokenize(question))
        print()

    if not y_pred:
        print('No examples evaluated.')
        return None

    bleu = compute_bleu(y_pred, y_true)
    print('Bleu Score: {:.2f}'.format(bleu))
    return bleu

# First CUDA device when available, otherwise the CPU.
# NOTE(review): `device` is only passed along; the visible evaluation() code
# never moves the model or the inputs to it.
device = torch.device('cuda:{}'.format(0) if torch.cuda.is_available() else 'cpu')

evaluation(val_loader,model,tokenizer,device)

Batch  0 


Actual Questions ##########################


Were Scott Derrickson and Ed Wood of the same nationality?
What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?
Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?
The director of the romantic comedy "Big Stone Gap" is based in what New York city?
2014 S/S is the debut album of a South Korean boy group that was formed by who?
Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?
The arena where the Lewiston Maineiacs played their home games can seat how many people?
Who is older, Annie Morton or Terry Richardson?
Are Local H and For Against both from the United States?

Generated Questions #######################


Are the two finalists in the America