In [None]:
%%capture
%pip install datasets
%pip install keras-tuner --upgrade
%pip install rouge_score
#%pip install --upgrade spacy
#%pip install torch
%pip install tensorflow
#!pip install tflearn - batchnorm
%pip install datasets
%pip install transformers[sentencepiece]

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelWithLMHead
from datasets import list_datasets, load_dataset, load_metric
import pandas as pd
import spacy
import numpy as np
import json
import sklearn
from sklearn.model_selection import train_test_split, GroupShuffleSplit
import re
import os
import random
import typing
from typing import Any, Tuple, List, NamedTuple
import gensim
import gensim.downloader as gloader
from gensim.models import KeyedVectors
from itertools import chain
from tqdm import tqdm
import datetime
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
import torch
from torch.utils.data import DataLoader
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

## Configs

In [None]:
# Mini-batch size; reused by the tf.data pipeline (via dataset_config) and by the
# torch DataLoader created further down.
batch_size = 10

# Options consumed by SQuAD.__call__.
dataset_config = {
    # Maximum number of (context, question, answer) rows kept from the loaded data.
    'num_examples': 90000,
    # 'num_examples': 100,
    # Fraction of the data (grouped by article title) assigned to the training split.
    'train_size': 0.65,
    # Fraction of the *non-training remainder* assigned to the test split; the rest of
    # the remainder becomes validation. It is not a fraction of the full dataset.
    'test_size': 0.40,
    # `num_words` caps for the Keras tokenizers of contexts and questions.
    'num_words_context': 45000,
    'num_words_question': 28000,
    # Shuffle buffer size used when building the tf.data datasets.
    'buffer_size': 32000,
    'batch_size': batch_size,
    # Seed passed to both GroupShuffleSplit calls.
    'random_seed': 13,
}

# Locations read/written by SQuAD.load_dataset.
path = {
    # Raw SQuAD JSON; only parsed when the pickle cache below does not exist.
    'training_json_path': "./data/train-v1.1.json",
    #'save_pkl_path': "./data/squadv2.pkl",
    # Pickle cache of the flattened dataframe (Google Drive path, Colab-specific).
    'save_pkl_path': "/content/drive/MyDrive/Shared drive/Qgen_seq2seq_keras/data/squad.pkl",
    # NOTE(review): not referenced in the cells shown here — presumably used by a training loop.
    'checkpoint_dir': "./training_checkpoints",
}

# NOTE(review): `temperature` is not read by the evaluation code visible in this notebook.
evaluation_config = {
    'temperature' : 0.7
}

## Dataset Class

In [None]:
class Dataset(NamedTuple):
  """
  Container for a processed dataset split three ways (train / validation / test).

  SQuAD.__call__ builds an instance of this class when both `tokenized` and
  `tensor_type` are true. The name `Dataset` is looked up at call time, so
  rebinding it at module level (for example by importing another class called
  `Dataset`) would break that code path.
  """
  # Reference :- https://github.com/topper-123/Articles/blob/master/New-interesting-data-types-in-Python3.rst
  # Batched training pairs (context ids, question ids).
  train: tf.data.Dataset
  # Batched validation pairs.
  val: tf.data.Dataset
  # Batched test pairs.
  test: tf.data.Dataset

class SQuAD:
  """Builds question-generation data from the SQuAD v1.1 training JSON.

  Pipeline run by `__call__`:
    1. load the flattened dataframe (from a pickle cache or from the raw JSON);
    2. replace every paragraph with the sentence(s) that contain the answer;
    3. clean the text and wrap it in `<SOS>` / `<EOS>` markers;
    4. split by article title into train / validation / test;
    5. optionally tokenize with Keras tokenizers and wrap in `tf.data` datasets.
  """

  def __init__(self):
    self.random_seed = None
    self.squad_df = None
    self.preproc_squad_df = None
    self.tokenizer = None
    self.buffer_size = 0

  def __call__(self, dataset_config, path, tokenized=True, tensor_type=True):
    """Load, preprocess and split SQuAD, optionally tokenizing the result.

    Args:
        dataset_config (dict): must provide 'num_examples', 'train_size', 'test_size',
            'num_words_context', 'num_words_question', 'buffer_size', 'batch_size'
            and 'random_seed'.
        path (dict): must provide 'training_json_path' and 'save_pkl_path'.
        tokenized (bool): when False the raw-text splits are returned.
        tensor_type (bool): only used when `tokenized` is True; selects between
            `tf.data.Dataset` objects (True) and tokenized pandas objects (False).

    Returns:
        tokenized=False:
            X_train, y_train, X_val, y_val, X_test, y_test (text DataFrames / Series).
        tokenized=True, tensor_type=False:
            the same six objects with padded integer id arrays instead of text.
        tokenized=True, tensor_type=True:
            (Dataset, word_to_idx_context, word_to_idx_question) where each
            `word_to_idx_*` is a (train, val, test) tuple of word->index dicts.
    """
    self.random_seed = dataset_config['random_seed']
    self.buffer_size = dataset_config['buffer_size']
    self.batch_size = dataset_config['batch_size']
    self.train_size = dataset_config['train_size']
    self.test_size = dataset_config['test_size']
    self.training_json_path = path['training_json_path']
    self.save_pkl_path = path['save_pkl_path']
    self.max_length_context = 0
    self.max_length_question = 0

    # Load dataset from file
    self.load_dataset(dataset_config['num_examples'])
    # Reduce each context to the answer-bearing sentence(s)
    self.extract_answer()
    # Preprocess context and question
    self.preprocess()
    self.compute_max_length()

    # Perform splitting
    X_train, y_train, X_val, y_val, X_test, y_test = self.split_train_val(self.preproc_squad_df)

    # Tokenizer for the source (context sentences)
    self.tokenizer_context = tf.keras.preprocessing.text.Tokenizer(filters='',
                                                                   oov_token='<unk>',
                                                                   num_words=dataset_config['num_words_context'])
    # Tokenizer for the target (question sentences)
    self.tokenizer_question = tf.keras.preprocessing.text.Tokenizer(filters='',
                                                                   oov_token='<unk>',
                                                                   num_words=dataset_config['num_words_question'])

    if not tokenized:
      return X_train, y_train, X_val, y_val, X_test, y_test

    X_train_tokenized, word_to_idx_train_context = self.__tokenize_context(X_train, test=False)
    y_train_tokenized, word_to_idx_train_question = self.__tokenize_question(y_train, test=False)

    # Pad the other splits to the width obtained on the training split
    self.max_length_context = X_train_tokenized.context.iloc[0].shape[0]
    self.max_length_question = y_train_tokenized.iloc[0].shape[0]

    # The vocabulary is fitted on the training split only. Fitting again on the
    # validation split would rebuild `word_index` (it is re-ranked by frequency on every
    # fit), so the ids already assigned to the training sequences would no longer match
    # the ids used for validation/test. Unseen words map to '<unk>'.
    X_val_tokenized, word_to_idx_val_context = self.__tokenize_context(X_val, test=True)
    y_val_tokenized, word_to_idx_val_question = self.__tokenize_question(y_val, test=True)

    X_test_tokenized, word_to_idx_test_context = self.__tokenize_context(X_test, test=True)
    y_test_tokenized, word_to_idx_test_question = self.__tokenize_question(y_test, test=True)

    word_to_idx_context = (word_to_idx_train_context, word_to_idx_val_context, word_to_idx_test_context)
    word_to_idx_question = (word_to_idx_train_question, word_to_idx_val_question, word_to_idx_test_question)

    if not tensor_type:
      # Returns pd.DataFrame / pd.Series objects (tokenized)
      return X_train_tokenized, y_train_tokenized, X_val_tokenized, y_val_tokenized, X_test_tokenized, y_test_tokenized

    AUTOTUNE = tf.data.AUTOTUNE

    # Only the training split is shuffled; caching happens inside `to_tensor`.
    train_dataset = self.to_tensor(X_train_tokenized, y_train_tokenized, train=True)
    val_dataset = self.to_tensor(X_val_tokenized, y_val_tokenized, train=False)
    test_dataset = self.to_tensor(X_test_tokenized, y_test_tokenized, train=False)

    dataset = Dataset(
        train=train_dataset.prefetch(buffer_size=AUTOTUNE),
        val=val_dataset.prefetch(buffer_size=AUTOTUNE),
        test=test_dataset.prefetch(buffer_size=AUTOTUNE))

    return dataset, word_to_idx_context, word_to_idx_question

  def compute_max_length(self):
    """Set the padding widths to the 99.5th percentile of the whitespace-token counts
    of the preprocessed contexts and questions."""
    context_length = [len(sen.split()) for sen in self.preproc_squad_df.context]
    question_length = [len(sen.split()) for sen in self.preproc_squad_df.question]

    self.max_length_context = int(np.quantile(context_length, 0.995))
    self.max_length_question = int(np.quantile(question_length, 0.995))

  def load_dataset(self, num_examples):
    """Populate `self.squad_df` with at most `num_examples` rows.

    The pickle at `self.save_pkl_path` is used when it exists; otherwise the JSON at
    `self.training_json_path` is flattened to one row per answer (columns: id, title,
    context, question, answer_start, answer) and the full frame is written to the pickle.

    :param num_examples: maximum number of rows to keep
    """
    if os.path.exists(self.save_pkl_path):
      print('File already exists! Loading from .pkl...\n')
      print(f'Dir path {self.save_pkl_path}')
      # NOTE: unpickling can execute arbitrary code; only load caches you created yourself.
      full_df = pd.read_pickle(self.save_pkl_path)
    else:
      print('Loading from .json...\n')
      print(f'Dir path {self.training_json_path}')
      with open(self.training_json_path, encoding='utf-8') as f:
        data = json.load(f)

      records = [
          {"id": qas['id'],
           "title": subject['title'],
           "context": paragraph['context'],
           "question": qas['question'],
           "answer_start": answer['answer_start'],
           "answer": answer['text']}
          for subject in data['data']
          for paragraph in subject['paragraphs']
          for qas in paragraph['qas']
          for answer in qas['answers']
      ]
      full_df = pd.DataFrame(records)
      # Save file
      pd.to_pickle(full_df, self.save_pkl_path)

    # A fresh 0..n-1 index keeps later positional and label-based lookups in agreement.
    self.squad_df = full_df[:num_examples].reset_index(drop=True)

  def preprocess(self):
    """Clean the context and question columns of `self.squad_df` and store the result,
    without the `id` column, in `self.preproc_squad_df`."""
    df = self.squad_df.copy()

    df['context'] = [self.__preprocess_sentence(c, question=False) for c in df.context]
    df['question'] = [self.__preprocess_sentence(q, question=True) for q in df.question]

    # Remove features that are not useful
    self.preproc_squad_df = df.drop(['id'], axis=1)

  def __preprocess_sentence(self, sen, question):
    """Normalise one sentence and wrap it in `<SOS>` / `<EOS>`.

    `question` is accepted for symmetry but currently has no effect: contexts and
    questions receive the same treatment.
    """
    # Creating a space between a word and the punctuation following it
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    sen = re.sub(r"([?.!,¿])", r" \1 ", sen)
    sen = re.sub(r'[" "]+', " ", sen)

    # Replacing everything with space except (a-z, A-Z, 0-9, ".", "?", "!", ",", "¿")
    sen = re.sub(r"[^a-zA-Z0-9?.!,¿]+", " ", sen)

    sen = sen.strip()

    # Start and end tokens let the model know when to start and stop predicting.
    return '<SOS> ' + sen + ' <EOS>'

  def __answer_start_end(self, df):
    """
    Compute character offsets of each answer inside its original paragraph.

    :param df: dataframe with `answer_start` and `answer` columns

    :return: dataframe with columns `start` and `end` (end is exclusive), indexed like `df`.
    """
    starts = list(df.answer_start)
    ends = [start + len(answer) for start, answer in zip(starts, df.answer)]
    return pd.DataFrame({'start': starts, 'end': ends}, index=df.index)

  def split_train_val(self, df):
    """
    Split the dataframe into training, validation and test sets, keeping all rows of
    one article (`title`) in the same split.

    `self.train_size` is the fraction of data used for training; `self.test_size` is the
    fraction of the remaining data used for testing, the rest being validation.

    Args
        :param df: the preprocessed dataframe

    Returns:
        X_train, y_train, X_val, y_val, X_test, y_test where X_* have the columns
        title, context, answer, start, end and y_* hold the questions.

    NOTE: `start` / `end` are offsets into the ORIGINAL paragraph. After
    `extract_answer` and `preprocess` they no longer index into `context`, which is
    why the answer text itself is kept in the `answer` column.
    """
    X = df.drop(['answer_start', 'question'], axis=1).copy()
    idx = self.__answer_start_end(df)
    X['start'] = idx['start']
    X['end'] = idx['end']
    y = df['question']

    # First step: training vs. remaining data
    splitter = GroupShuffleSplit(train_size=self.train_size, n_splits=2, random_state=self.random_seed)
    train_idx, rem_idx = next(splitter.split(X, groups=X['title']))

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_rem, y_rem = X.iloc[rem_idx], y.iloc[rem_idx]

    # Second step: validation vs. test inside the remaining data
    splitter = GroupShuffleSplit(test_size=self.test_size, n_splits=2, random_state=self.random_seed)
    val_idx, test_idx = next(splitter.split(X_rem, groups=X_rem['title']))

    X_val, y_val = X_rem.iloc[val_idx], y_rem.iloc[val_idx]
    X_test, y_test = X_rem.iloc[test_idx], y_rem.iloc[test_idx]

    return X_train, y_train, X_val, y_val, X_test, y_test

  def __tokenize_context(self, X, test):
    """Convert `X.context` to padded id arrays.

    :param X: feature dataframe with a text `context` column
    :param test: when True the tokenizer is NOT fitted on `X`

    :return: (copy of X whose `context` cells are 1-D id arrays, word->index dict)
    """
    context = X.context
    if not test:
      self.tokenizer_context.fit_on_texts(context)
    context_tf = self.tokenizer_context.texts_to_sequences(context)

    pad_kwargs = {'padding': 'post'}
    if self.max_length_context != 0:
      pad_kwargs['maxlen'] = self.max_length_context
    context_tf_pad = tf.keras.preprocessing.sequence.pad_sequences(context_tf, **pad_kwargs)

    # Work on a copy and replace the whole column at once: element-wise chained
    # assignment on a slice is unreliable (SettingWithCopy / copy-on-write).
    X = X.copy()
    X['context'] = pd.Series(list(context_tf_pad), index=X.index, dtype=object)

    # Add the padding
    self.tokenizer_context.word_index['<pad>'] = 0
    self.tokenizer_context.index_word[0] = '<pad>'

    return X, self.tokenizer_context.word_index

  def __tokenize_question(self, y, test):
    """Convert the question series to padded id arrays.

    :param y: series of question strings
    :param test: when True the tokenizer is NOT fitted on `y`

    :return: (new series of 1-D id arrays indexed like `y`, word->index dict)
    """
    if not test:
      self.tokenizer_question.fit_on_texts(y)
    question_tf = self.tokenizer_question.texts_to_sequences(y)

    pad_kwargs = {'padding': 'post'}
    if self.max_length_question != 0:
      pad_kwargs['maxlen'] = self.max_length_question
    question_tf_pad = tf.keras.preprocessing.sequence.pad_sequences(question_tf, **pad_kwargs)

    y_tokenized = pd.Series(list(question_tf_pad), index=y.index, name=y.name, dtype=object)

    # Add the padding
    self.tokenizer_question.word_index['<pad>'] = 0
    self.tokenizer_question.index_word[0] = '<pad>'

    return y_tokenized, self.tokenizer_question.word_index

  def extract_answer(self):
    """
    Replace each paragraph in `self.squad_df.context` with the sentence that contains
    the answer (plus the following sentence when the answer crosses the boundary),
    using the answer offsets shipped with the dataset.

    Sentence boundaries are approximated by assuming one space between sentences, so the
    running character count can drift from the real offsets. When no sentence can be
    matched the whole paragraph is kept, so exactly one context is produced per row.
    """
    df = self.squad_df.copy()
    start_end = self.__answer_start_end(df)

    selected_sentences = []
    for par, start, end in zip(df.context, start_end['start'], start_end['end']):
      sentences = sent_tokenize(par)
      right_sentence = None
      context_characters = 0

      for j, sen in enumerate(sentences):
        sen += ' '
        context_characters += len(sen)
        if start < context_characters:
          has_next = j + 1 < len(sentences)
          if end > context_characters and has_next:
            # the answer is in both the current and the next sentence
            right_sentence = sen + sentences[j + 1]
          else:
            # the answer is completely in the current sentence (or there is no next one)
            right_sentence = sen
          break

      if right_sentence is None:
        # Offsets beyond the approximated length (or an empty paragraph).
        right_sentence = par
      selected_sentences.append(right_sentence)

    self.squad_df['context'] = selected_sentences

  def to_tensor(self, X, y, train=True):
    """Wrap tokenized features/targets in a batched `tf.data.Dataset`.

    :param X: dataframe whose `context` cells are equal-length id arrays
    :param y: series of equal-length id arrays
    :param train: shuffle the examples when True

    :return: dataset of (context_ids, question_ids) int64 batches; incomplete final
        batches are dropped.
    """
    # Reference:- https://www.tensorflow.org/api_docs/python/tf/data/Dataset
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(list(X.context), tf.int64),
         tf.cast(list(y), tf.int64)))
    # Cache BEFORE shuffling: caching a shuffled dataset would replay the first
    # epoch's order forever.
    dataset = dataset.cache()
    if train:
      dataset = dataset.shuffle(self.buffer_size)

    return dataset.batch(self.batch_size, drop_remainder=True)

## Dataset creation

In [None]:
# Build the raw-text splits (tokenized=False): X_* are feature DataFrames, y_* hold the questions.
dataset_creator = SQuAD()

X_trainn, y_trainn, X_vall, y_vall, X_test, y_test= dataset_creator(dataset_config, path, tokenized=False)

File already exists! Loading from .pkl...

Dir path /content/drive/MyDrive/Shared drive/Qgen_seq2seq_keras/data/squad.pkl


In [None]:
# Keep only the first 5000 training rows and renumber them from 0.
X_train = X_trainn.iloc[:5000].reset_index(drop=True)
y_train = y_trainn.iloc[:5000].reset_index(drop=True)

print(X_train)
print(y_train)

                         title  \
0     University_of_Notre_Dame   
1     University_of_Notre_Dame   
2     University_of_Notre_Dame   
3     University_of_Notre_Dame   
4     University_of_Notre_Dame   
...                        ...   
4995             American_Idol   
4996             American_Idol   
4997             American_Idol   
4998             American_Idol   
4999             American_Idol   

                                                context  start  end  
0     <SOS> It is a replica of the grotto at Lourdes...    515  541  
1     <SOS> Immediately in front of the Main Buildin...    188  213  
2     <SOS> Next to the Main Building is the Basilic...    279  296  
3     <SOS> Immediately behind the basilica is the G...    381  420  
4     <SOS> Atop the Main Building s gold dome is a ...     92  126  
...                                                 ...    ...  ...  
4995  <SOS> Casey Abrams , who suffers from ulcerati...    184  196  
4996  <SOS> One of the more pro

## Loader Creation

In [None]:
# Imported under an alias: binding the bare name `Dataset` here would shadow the
# NamedTuple `Dataset` that SQuAD.__call__ instantiates at call time.
from torch.utils.data import Dataset as TorchDataset

class SQ_Dataset(TorchDataset):
  """Map-style torch dataset pairing feature rows with their target question.

  Args:
      X_val: pandas DataFrame of features, one row per example.
      y_val: pandas Series of targets, positionally aligned with `X_val`.

  Raises:
      ValueError: if `X_val` and `y_val` do not have the same length.
  """

  def __init__(self,X_val,y_val):
    if len(X_val) != len(y_val):
      raise ValueError(
          "X_val and y_val must have the same length, got %d and %d" % (len(X_val), len(y_val)))
    self.X_val=X_val
    self.y_val=y_val

  def __len__(self):
    return len(self.y_val)

  def __getitem__(self,idx):
    # Positional lookup; the feature row becomes a plain dict so the default
    # DataLoader collate function can batch it field by field.
    return self.X_val.iloc[idx].to_dict(),self.y_val.iloc[idx]

In [None]:
# Wrap the training subset in a torch dataset and an ordered (non-shuffled) loader,
# then show the first example as a sanity check.
myDs = SQ_Dataset(X_train, y_train)
val_loader = DataLoader(myDs, batch_size=batch_size, shuffle=False)
len(myDs)
x, y = myDs[0]
print(y)
print(x['title'])
print(x['context'])

<SOS> To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France ? <EOS>
University_of_Notre_Dame
<SOS> It is a replica of the grotto at Lourdes , France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858 . <EOS>


In [None]:
# Print only the first collated batch to inspect the structure the DataLoader produces
# (a dict of per-field batches plus the batch of questions).
for i, (data, labels) in enumerate(val_loader):
  print(data)
  break;
 

{'title': ['University_of_Notre_Dame', 'University_of_Notre_Dame', 'University_of_Notre_Dame', 'University_of_Notre_Dame', 'University_of_Notre_Dame', 'University_of_Notre_Dame', 'University_of_Notre_Dame', 'University_of_Notre_Dame', 'University_of_Notre_Dame', 'University_of_Notre_Dame'], 'context': ['<SOS> It is a replica of the grotto at Lourdes , France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858 . <EOS>', '<SOS> Immediately in front of the Main Building and facing it , is a copper statue of Christ with arms upraised with the legend Venite Ad Me Omnes . <EOS>', '<SOS> Next to the Main Building is the Basilica of the Sacred Heart . <EOS>', '<SOS> Immediately behind the basilica is the Grotto , a Marian place of prayer and reflection . <EOS>', '<SOS> Atop the Main Building s gold dome is a golden statue of the Virgin Mary . <EOS>', '<SOS> Begun as a one page journal in September 1876 , the Scholastic magazine is issued twice monthly and claims to b

## Model

In [None]:
# Checkpoint shared by the tokenizer and the model, so the two can never diverge.
QG_CHECKPOINT = "mrm8488/t5-base-finetuned-question-generation-ap"

tokenizer = AutoTokenizer.from_pretrained(QG_CHECKPOINT, use_fast=False)
# `AutoModelWithLMHead` is deprecated in transformers; for an encoder-decoder (T5)
# checkpoint the supported replacement is `AutoModelForSeq2SeqLM`.
model = AutoModelForSeq2SeqLM.from_pretrained(QG_CHECKPOINT)



## Evaluation Script

In [None]:
def compute_bleu(y_pred, y_true):
    """Return the corpus BLEU score scaled to the 0-100 range.

    Args:
        y_pred: list of tokenized predictions (one token list per example).
        y_true: list of reference lists (each example has a list of token lists).
    """
    bleu_metric = load_metric('bleu')
    bleu_metric.add_batch(predictions=y_pred, references=y_true)
    scores = bleu_metric.compute()
    return scores['bleu'] * 100

def Model_Tokenizer(device):
    """Load the `t5-small` seq2seq model (placed on `device`) and its tokenizer.

    Returns:
        (model, tokenizer)
    """
    checkpoint = "t5-small"
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
    seq2seq_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return seq2seq_model, seq2seq_tokenizer

def evaluation(loader, model, tokenizer, device):
    """Generate a question for every example in `loader` and report corpus BLEU.

    Each batch is `(x, y)`: `x` is a dict with a 'context' list (strings wrapped in
    `<SOS>` / `<EOS>`) and either an 'answer' list or 'start' / 'end' offset tensors;
    `y` is the list of reference questions (also wrapped in `<SOS>` / `<EOS>`).

    Args:
        loader: iterable of `(x, y)` batches as described above.
        model: seq2seq model exposing `generate`; it is moved to `device`.
        tokenizer: tokenizer matching `model`.
        device: torch device used for generation.

    Returns:
        float: BLEU score in the 0-100 range (0.0 when the loader yields nothing).
    """
    def _strip_markers(text):
        # Remove the '<SOS> ' / ' <EOS>' wrappers added during preprocessing.
        if text.startswith('<SOS> '):
            text = text[len('<SOS> '):]
        if text.endswith(' <EOS>'):
            text = text[:-len(' <EOS>')]
        return text

    # Inputs are sent to `device`, so the model has to live there as well.
    model = model.to(device)
    model.eval()

    y_true = []
    y_pred = []
    for batch_idx, (x, y) in enumerate(loader):
        # Use the real batch length: the last batch can be shorter than the
        # configured batch size.
        n_items = len(y)
        contexts = x['context']

        if 'answer' in x:
            answers = list(x['answer'])
        else:
            # Fallback: slice the context by offsets. Only meaningful when 'start' /
            # 'end' really index into x['context'].
            st = x['start'].detach().numpy()
            en = x['end'].detach().numpy()
            answers = [contexts[k][st[k]:en[k]] for k in range(n_items)]

        # The prompt must contain the CONTEXT, never the reference question `y`
        # (that would leak the label into the input). No manual "</s>": the
        # tokenizer appends EOS itself (add_special_tokens=True).
        src_sentences = ["answer: %s  context: %s" % (answers[k], _strip_markers(contexts[k]))
                         for k in range(n_items)]
        encoded_input = tokenizer(src_sentences, max_length=128,
                                  padding=True, truncation=True,
                                  return_tensors='pt', add_special_tokens=True).input_ids.to(device)

        # Generate and decode the questions
        outputs = model.generate(encoded_input, max_length=175)
        batch_pred = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        print("Batch ",batch_idx,"\n\n")
        print("Actual Questions ##########################\n\n")
        for reference in y:
            reference = _strip_markers(reference)
            print(reference)
            # BLEU expects a list of references per prediction
            y_true.append([tokenizer.tokenize(reference)])
        print()

        print("Generated Questions #######################\n\n")
        for sentence in batch_pred:
            # The fine-tuned checkpoint prefixes its output with "question: ";
            # strip it only when it is actually there.
            if sentence.startswith('question: '):
                sentence = sentence[len('question: '):]
            print(sentence)
            y_pred.append(tokenizer.tokenize(sentence))
        print()

    if not y_pred:
        print('No examples to evaluate.')
        return 0.0

    bleu = compute_bleu(y_pred, y_true)
    print('Bleu Score: {:.2f}'.format(bleu))
    return bleu

# Use the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device('cuda:{}'.format(0) if torch.cuda.is_available() else 'cpu')

# NOTE: `val_loader` was built from the training subset (X_train / y_train) above.
evaluation(val_loader,model,tokenizer,device)

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Where was the Agucadoura Wave Farm located?
What percentage of Portugal s energy production was from renewable sources?
What is the name of Portugal s national energy transmission company?
What does REN do?
What renewable resource did Portugal generate electricity through before the solar wind revolution?
What is used to pump water uphill in Portugal?
What did the Portuguese government do to encourage rooftop solar panels?

Batch  339 


Actual Questions ##########################


What was the Portuguese population in 2011 ?
What percentage of the Portuguese population in 2011 was female ?
What percentage of the Portuguese population in 2011 was male ?
What is the dominant religion in Portugal ?
What were the Moors who converted to Catholicism known as ?
In what group of people do the Portuguese have their origin ?
When did people first start arriving in the European continent ?
What is the main population source of the