# Package Installations

In [None]:
# Mount Google Drive at /content/drive; later cells read and write the encoded
# datasets and the trained model under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers
!pip install datasets

# QG Model

## Imports

In [None]:
import collections
import datetime
import os
import pickle
import re
import string

import datasets
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import transformers
from datasets import Features, Value, Sequence
from datasets import load_dataset
from transformers import AutoTokenizer, TFT5ForConditionalGeneration

## Hyperparameters

In [None]:
# Text prepended to every context before tokenization (see `encode` and the evaluation cells).
task_prefix = 'extract answers: '
# Learning rate handed to the Adam optimizer.
learning_rate = 3e-4
# Maximum token length of the encoder input; longer inputs are truncated, shorter ones padded.
encoder_max_len = 250
# Maximum token length of the target answer string; also used as max_length for generation.
decoder_max_len = 70
# Number of examples per batch for both the training and the validation pipeline.
batch_size = 4

## Tokenizer & Optimizer

In [None]:
# Pretrained checkpoint name; the same name is used for the tokenizer here and for the model later.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Adam optimizer (legacy Keras implementation) configured with `learning_rate`.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate)

## Datasets

### Check if the datasets are already saved on Drive

In [None]:
from pathlib import Path

# Folder on Drive that holds the encoded datasets loaded in "Case 2".
encoded_ds_dir = Path(r'/content/drive/MyDrive/Datasets/AnswerExtraction')

# Every file that "Case 2" loads. Checking only one of them would send the
# notebook down the "already saved" path even when the other files are missing,
# and the load cells would then fail.
encoded_ds_files = [
    encoded_ds_dir / 'squad-train-encoded.json',
    encoded_ds_dir / 'squad-valid-encoded.json',
    encoded_ds_dir / 'newsqa-train-encoded.json',
    encoded_ds_dir / 'newsqa-valid-encoded.json',
]

# Kept for backward compatibility: the single file the original check looked at.
train_ds_file = encoded_ds_dir / 'squad-valid-encoded.json'

# True only when all encoded datasets are present; otherwise they are rebuilt (Case 1).
is_ds_saved = all(path.is_file() for path in encoded_ds_files)

### Case 1: Datasets **NOT** saved on Drive
-----
**Preprocessing and encoding datasets**

#### NewsQA Preprocessing

**Downloading and loading dataset**

In [None]:
if not is_ds_saved:
  !gdown --folder 1ujEc3UsU73RakkZ9RSYnlfbTyIcyRUAm

In [None]:
if not is_ds_saved:
  # Local folder with the NewsQA files (presumably the one fetched by the gdown cell above).
  newsqa_dataset_dir = "/content/NewsQaDataset"

  # First 90% of the "train" split is used for training ...
  train = load_dataset(
      "newsqa",
      name="combined-json",
      data_dir=newsqa_dataset_dir,
      split="train[:90%]",
  )
  # ... and the last 10% of the same split is held out for validation.
  validation = load_dataset(
      "newsqa",
      name="combined-json",
      data_dir=newsqa_dataset_dir,
      split="train[-10%:]",
  )

In [None]:
if not is_ds_saved:
  print(next(iter(train)))
  print(next(iter(validation)))
  print(len(train))
  print(len(validation))

**Get split counts (train, dev/validation, test)**

In [None]:
# newsqa_dataset_dir = "/content/NewsQaDataset"

# tmp = load_dataset("newsqa", split='train', data_dir=newsqa_dataset_dir, name="combined-json")

# train_count = 0
# validation_count = 0
# test_count = 0

# for example in tmp:
#   if(example['type'] == 'train'):
#     train_count += 1
#   elif(example['type'] == 'dev'):
#     validation_count += 1
#   elif(example['type'] == 'test'):
#     test_count += 1
#   else:
#     print(example['type'])

# print(train_count)
# print(validation_count)
# print(test_count)

**Remove unused columns**

In [None]:
if not is_ds_saved:
  # Drop the id/type columns, flatten the nested "questions" field into dotted
  # column names, keep only the consensus answers, and rename the remaining
  # columns to the names the rest of the notebook uses.
  train_tmp = (
      train
      .remove_columns(['storyId', 'type'])
      .flatten()
      .remove_columns([
          'questions.q',
          'questions.isAnswerAbsent',
          'questions.isQuestionBad',
          'questions.answers',
          'questions.validated_answers',
      ])
      .rename_column("text", "context")
      .rename_column("questions.consensus", "answers")
  )

In [None]:
if not is_ds_saved:
  # Same column cleanup as for the training split: drop the id/type columns,
  # flatten "questions", keep only the consensus answers, and rename columns.
  validation_tmp = (
      validation
      .remove_columns(['storyId', 'type'])
      .flatten()
      .remove_columns([
          'questions.q',
          'questions.isAnswerAbsent',
          'questions.isQuestionBad',
          'questions.answers',
          'questions.validated_answers',
      ])
      .rename_column("text", "context")
      .rename_column("questions.consensus", "answers")
  )

In [None]:
if not is_ds_saved:
  print(train_tmp.features)
  print(validation_tmp.features)

**Remove duplicates, leading and trailing whitespace, and punctuation from answers**

In [None]:
from tqdm import tqdm
from string import punctuation

# string.punctuation is: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
# TODO (ask): open question about the dollar sign $ -- as written, it is stripped from answers like every other punctuation character.

def answersToList(example):
  """Replace the raw answer spans of ``example`` with a list of unique answer strings.

  Each entry of ``example['answers']`` must provide the flags ``noAnswer`` and
  ``badQuestion`` plus the character offsets ``s`` (start) and ``e`` (end) into
  ``example['context']``. Entries flagged as no-answer or bad-question are
  skipped. Every remaining span is cut out of the context and stripped of
  leading/trailing whitespace and punctuation.

  Spans that are empty after cleaning are dropped, and duplicates are removed
  while keeping first-seen order, so the result is deterministic.

  Args:
    example: dict with the keys ``'context'`` (str) and ``'answers'``.

  Returns:
    The same ``example`` dict, with ``example['answers']`` replaced by a list
    of cleaned, unique, non-empty answer strings (possibly empty).
  """
  # A dict is used as an insertion-ordered set.
  unique_answers = {}
  for answer in example['answers']:
    if answer['noAnswer'] or answer['badQuestion']:
      continue
    span = example['context'][answer['s']:answer['e']]
    # The final strip() removes whitespace that only becomes a leading/trailing
    # character once the punctuation is gone, e.g. "(foo )" -> "foo " -> "foo".
    cleaned = span.strip().strip(punctuation).strip()
    if cleaned:
      unique_answers[cleaned] = None
  example['answers'] = list(unique_answers)
  return example

In [None]:
if not is_ds_saved:
  # Apply answersToList to every example; num_proc=16 asks `datasets` to use 16 worker processes.
  newsqa_train_ds = train_tmp.map(answersToList, num_proc=16)
  newsqa_valid_ds = validation_tmp.map(answersToList, num_proc=16)

In [None]:
if not is_ds_saved:
  print(next(iter(newsqa_train_ds)))
  print(next(iter(newsqa_valid_ds)))

**Remove rows with no answers**

> 20 in train and 4 in validation



In [None]:
def getNoAnswerIndices(ds):
  """Return the positions of the examples in ``ds`` that have no usable answer.

  An example counts as "no answer" when its ``'answers'`` list is empty or
  contains nothing but empty strings.
  """
  return [
      position
      for position, row in enumerate(ds)
      # `not any(non-empty)` is True for an empty list as well.
      if not any(text != '' for text in row['answers'])
  ]

In [None]:
def removeNoAnswerIndices(ds, no_answer_indices):
  """Return ``ds`` without the rows whose positions are listed in ``no_answer_indices``.

  Args:
    ds: object supporting ``len()`` and ``select(indices)`` (e.g. a Hugging Face Dataset).
    no_answer_indices: iterable of row positions to drop.

  Returns:
    Whatever ``ds.select`` returns for the remaining positions, in their original order.
  """
  # Set membership keeps the filter linear; testing against the list made it
  # O(len(ds) * len(no_answer_indices)).
  excluded = set(no_answer_indices)
  kept_indices = [index for index in range(len(ds)) if index not in excluded]
  return ds.select(kept_indices)

Train

In [None]:
if not is_ds_saved:
  # Positions of training rows without any usable answer; consumed by the next cell.
  no_answer_indices = getNoAnswerIndices(newsqa_train_ds)
  print(len(no_answer_indices))

  # Debug helpers (disabled): show the cleaned answers and the raw consensus spans of those rows.
  # for index in no_answer_indices:
  #   print(newsqa_train_ds[index]['answers'])

  # for index in no_answer_indices:
  #   print(train[index]['questions']['consensus'])

In [None]:
if not is_ds_saved:
  # NOTE: order-dependent. `no_answer_indices` must be the list just computed for the
  # training split; it is overwritten later for the validation split, and re-running
  # this cell without recomputing it would drop rows at stale positions.
  len_before = len(newsqa_train_ds)
  newsqa_train_ds = removeNoAnswerIndices(newsqa_train_ds, no_answer_indices)
  len_after = len(newsqa_train_ds)
  print(f'Removed {len_before - len_after}')

Validation

In [None]:
if not is_ds_saved:
  # Positions of validation rows without any usable answer (overwrites the training list).
  no_answer_indices = getNoAnswerIndices(newsqa_valid_ds)
  print(len(no_answer_indices))

  # Debug helpers (disabled): show the cleaned answers and the raw consensus spans of those rows.
  # for index in no_answer_indices:
  #   print(newsqa_valid_ds[index]['answers'])

  # for index in no_answer_indices:
  # print(validation[index]['questions']['consensus'])

In [None]:
if not is_ds_saved:
  # NOTE: order-dependent. `no_answer_indices` must be the list just computed for the
  # validation split; re-running this cell without recomputing it would drop rows at
  # stale positions.
  len_before = len(newsqa_valid_ds)
  newsqa_valid_ds = removeNoAnswerIndices(newsqa_valid_ds, no_answer_indices)
  len_after = len(newsqa_valid_ds)
  print(f'Removed {len_before - len_after}')

Simple statistics on the number of answers per context

In [None]:
# newsqa_ds = datasets.concatenate_datasets([newsqa_train_ds, newsqa_valid_ds])

In [None]:
# import pandas as pd
# import numpy as np

# answerLengthsBefore = list()
# answerLengthsAfter = list()
# for example in newsqa_ds:
#   answerLengthsBefore.append(len(example['answers']))
#   answerLengthsAfter.append(len(set(example['answers'])))

# ds = list()
# ds.append(answerLengthsBefore)
# ds.append(answerLengthsAfter)
# df_describe = pd.DataFrame(ds)
# df_describe = df_describe.transpose()
# df_describe.columns = ['answerLengthsBefore', 'answerLengthsAfter']
# print(df_describe.describe())
# print(sum(answerLengthsBefore) / len(answerLengthsBefore))
# print(sum(answerLengthsAfter) / len(answerLengthsAfter))

In [None]:
# def autopct_format(values):
#         def my_format(pct):
#             total = sum(values)
#             val = int(round(pct*total/100.0))
#             return '{:.1f}%\n({v:d})'.format(pct, v=val)
#         return my_format

# s = df_describe['answerLengthsAfter'].value_counts().sort_values(ascending = False)
# top_k = 6
# others = s[top_k:].sum()
# print(s[top_k:].keys())
# s = s.drop(s[top_k:].keys())
# s['Others'] = others
# print(s)
# s.plot.pie(autopct=autopct_format(s),figsize=(5, 5))

#### SQuAD Preprocessing

Load the SQuAD dataset with the Hugging Face `datasets` library

In [None]:
if not is_ds_saved:
  # Load the two SQuAD splits; later cells merge them and re-split by context.
  squad_train_ds, squad_valid_ds = (
      load_dataset('squad', split=split_name)
      for split_name in ('train', 'validation')
  )

In [None]:
# Guarded like the other "Case 1" cells: when the encoded datasets are already on
# Drive, the raw SQuAD splits are never loaded and this cell would raise a NameError.
if not is_ds_saved:
  # Merge both SQuAD splits so every context can be mapped to all of its answers.
  squad_ds = datasets.concatenate_datasets([squad_train_ds, squad_valid_ds])

In [None]:
# `squad_ds` only exists on the "Case 1" path, so the size is reported only there.
if not is_ds_saved:
  print(len(squad_ds))

Mapping every context to its answers

In [None]:
def map_context_to_answers(example, context_answers):
  """Add the answer texts of ``example`` to the set stored for its context.

  ``context_answers`` maps a context string to the set of all answer texts
  seen for it and is updated in place; nothing is returned.
  """
  answer_texts = example['answers']['text']
  # setdefault creates the (empty) set the first time a context is seen.
  context_answers.setdefault(example['context'], set()).update(answer_texts)

In [None]:
from tqdm import tqdm

# context string -> set of every answer text asked about that context
context_answers_squad = {}
# `squad_ds` only exists on the "Case 1" path; without the guard this cell raises a
# NameError when the encoded datasets are loaded from Drive instead.
if not is_ds_saved:
  for example in tqdm(squad_ds):
    map_context_to_answers(example, context_answers_squad)

Converting dictionary back to Huggingface dataset object

In [None]:
import pandas as pd

# Only needed on the "Case 1" path (the encoded datasets are loaded from Drive otherwise).
if not is_ds_saved:
  # One row per context. The answer sets are turned into sorted lists so the
  # column has a plain list type and the answer order is reproducible.
  squad_ds = datasets.Dataset.from_pandas(
      pd.DataFrame(
          [(context, sorted(answers)) for context, answers in context_answers_squad.items()],
          columns=['context', 'answers'],
      )
  )

In [None]:
# `squad_ds` only exists on the "Case 1" path.
if not is_ds_saved:
  # Number of unique contexts after grouping the answers per context.
  print(len(squad_ds))

Simple statistics on the number of answers per context

In [None]:
# import pandas as pd
# import numpy as np

# answerLengthsBefore = list()
# answerLengthsAfter = list()
# for example in squad_ds:
#   answerLengthsBefore.append(len(example['answers']))
#   answerLengthsAfter.append(len(set(example['answers'])))

# ds = list()
# ds.append(answerLengthsBefore)
# ds.append(answerLengthsAfter)
# df_describe = pd.DataFrame(ds)
# df_describe = df_describe.transpose()
# df_describe.columns = ['answerLengthsBefore', 'answerLengthsAfter']
# print(df_describe.describe())
# print(sum(answerLengthsBefore) / len(answerLengthsBefore))
# print(sum(answerLengthsAfter) / len(answerLengthsAfter))

In [None]:
# def autopct_format(values):
#         def my_format(pct):
#             total = sum(values)
#             val = int(round(pct*total/100.0))
#             return '{:.1f}%\n({v:d})'.format(pct, v=val)
#         return my_format

# s = df_describe['answerLengthsAfter'].value_counts().sort_values(ascending = False)
# top_k = 6
# others = s[top_k:].sum()
# print(s[top_k:].keys())
# s = s.drop(s[top_k:].keys())
# s['Others'] = others
# print(s)
# s.plot.pie(autopct=autopct_format(s),figsize=(5, 5))

In [None]:
# Guarded like the other "Case 1" cells: `squad_ds` does not exist when the encoded
# datasets are loaded from Drive.
if not is_ds_saved:
  # 90/10 split by context; the fixed seed makes the split reproducible across runs.
  squad_ds_splits = squad_ds.train_test_split(test_size=0.1, seed=42)
  squad_train_ds = squad_ds_splits["train"]
  squad_valid_ds = squad_ds_splits["test"]

In [None]:
# `squad_train_ds` is only defined at this point on the "Case 1" path.
if not is_ds_saved:
  print(type(squad_train_ds))
  print(len(squad_train_ds))

#### Encoding Datasets

Encoding examples (putting data in proper format for model & tokenizing the data)

In [None]:
def encode(example, encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
    """Tokenize one example into fixed-length model inputs and labels.

    The encoder text is ``task_prefix + 'context: ' + context``; the target text
    is all answers joined with the separator ``' EOA '``.

    Args:
        example: dict with ``'context'`` (str) and ``'answers'`` (iterable of str).
        encoder_max_len: length the encoder input is truncated/padded to.
        decoder_max_len: length the target is truncated/padded to.

    Returns:
        dict with ``'input_ids'``, ``'attention_mask'``, ``'labels'`` and
        ``'decoder_attention_mask'``, each taken from the first (only) row of
        the tokenizer output.
    """
    context = example['context']
    # Answers are separated by ' EOA ' (not by commas); anything that splits the
    # generated text back into answers has to use the same separator.
    answers = ' EOA '.join(example['answers'])

    source_text = task_prefix + 'context: ' + context
    target_text = answers

    # padding='max_length' is the replacement for the deprecated pad_to_max_length=True.
    encoder_inputs = tokenizer(source_text, truncation=True,
                               return_tensors='tf', max_length=encoder_max_len,
                               padding='max_length')
    decoder_inputs = tokenizer(target_text, truncation=True,
                               return_tensors='tf', max_length=decoder_max_len,
                               padding='max_length')

    # The tokenizer returns a batch of one; [0] drops that batch dimension, e.g.
    # (1, encoder_max_len) -> (encoder_max_len,) and (1, decoder_max_len) -> (decoder_max_len,).
    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    # NOTE(review): padded label positions keep the pad token id; confirm whether
    # they should be masked out of the loss.
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]

    outputs = {'input_ids':input_ids, 'attention_mask': input_attention,
               'labels':target_ids, 'decoder_attention_mask':target_attention}
    return outputs

In [None]:
if not is_ds_saved:
  # Add the tokenized columns produced by `encode` to every split of both corpora.
  squad_train_ds = squad_train_ds.map(encode)
  squad_valid_ds = squad_valid_ds.map(encode)
  newsqa_train_ds = newsqa_train_ds.map(encode)
  newsqa_valid_ds = newsqa_valid_ds.map(encode)

#### Save encoded datasets to Drive (requires mounting Drive)

In [None]:
save_folder_path = "/content/drive/MyDrive/Datasets/AnswerExtraction"

if not is_ds_saved:
  # Make sure the target folder exists before writing; on a fresh Drive it does not.
  os.makedirs(save_folder_path, exist_ok=True)

  # NOTE(review): the files are written with a "-new" suffix, while the saved-dataset
  # check and the "Case 2" loaders look for names without it. Presumably this avoids
  # overwriting an existing export -- confirm, and rename the files on Drive before
  # relying on "Case 2".
  squad_train_ds.to_json(os.path.join(save_folder_path, "squad-train-encoded-new.json"))
  squad_valid_ds.to_json(os.path.join(save_folder_path, "squad-valid-encoded-new.json"))
  newsqa_train_ds.to_json(os.path.join(save_folder_path, "newsqa-train-encoded-new.json"))
  newsqa_valid_ds.to_json(os.path.join(save_folder_path, "newsqa-valid-encoded-new.json"))

### Case 2: Datasets saved on Drive
-----
**Loading preprocessed and encoded datasets**

**Load datasets from Drive**

In [None]:
save_folder_path = "/content/drive/MyDrive/Datasets/AnswerExtraction"

SQuAD

In [None]:
if is_ds_saved:
  # Encoded SQuAD files on Drive, keyed by split name.
  squad_data_files = {
      "train": f"{save_folder_path}/squad-train-encoded.json",
      "validation": f"{save_folder_path}/squad-valid-encoded.json",
  }
  squad_train_ds, squad_valid_ds = (
      load_dataset("json", data_files=squad_data_files, split=split_name)
      for split_name in ("train", "validation")
  )

NewsQA

In [None]:
if is_ds_saved:
  # Encoded NewsQA files on Drive, keyed by split name.
  newsqa_data_files = {
      "train": f"{save_folder_path}/newsqa-train-encoded.json",
      "validation": f"{save_folder_path}/newsqa-valid-encoded.json",
  }
  newsqa_train_ds, newsqa_valid_ds = (
      load_dataset("json", data_files=newsqa_data_files, split=split_name)
      for split_name in ("train", "validation")
  )

### Merge Datasets & Convert to TensorFlow PrefetchDataset

In [None]:
# Combine the SQuAD and NewsQA examples into one training set and one validation set.
train_ds = datasets.concatenate_datasets([squad_train_ds, newsqa_train_ds])
valid_ds = datasets.concatenate_datasets([squad_valid_ds, newsqa_valid_ds])

In [None]:
print(len(train_ds))
print(len(valid_ds))

**Convert to TensorFlow PrefetchDataset**

In [None]:
def to_tf_dataset(dataset):
  """Wrap a Hugging Face dataset in a ``tf.data.Dataset`` that yields the encoded columns.

  Note that ``dataset`` is modified in place: its output format is switched to
  TensorFlow tensors restricted to the four encoded columns.
  """
  columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']
  dataset.set_format(type='tensorflow', columns=columns)
  # Every column is a 1-D int32 tensor of unspecified length.
  return_types = {column: tf.int32 for column in columns}
  return_shapes = {column: tf.TensorShape([None]) for column in columns}
  return tf.data.Dataset.from_generator(lambda : dataset, return_types, return_shapes)

In [None]:
tf_train_ds = to_tf_dataset(train_ds)
tf_valid_ds = to_tf_dataset(valid_ds)
#tf_test_ds=to_tf_dataset(test_ds)

In [None]:
tf_train_ds

In [None]:
def create_dataset(dataset, cache_path=None, batch_size=4,
                   buffer_size= 1000, shuffling=True):
    """Turn a ``tf.data.Dataset`` of examples into an endlessly repeating, batched pipeline.

    Args:
        dataset: the ``tf.data.Dataset`` to wrap.
        cache_path: if not None, elements are cached to this file path.
        batch_size: number of examples per batch.
        buffer_size: size of the shuffle buffer (only used when ``shuffling``).
        shuffling: whether to shuffle the examples.

    Returns:
        The transformed dataset. It repeats forever, so callers must bound it
        with ``steps_per_epoch`` / ``validation_steps``.
    """
    if cache_path is not None:
        dataset = dataset.cache(cache_path)
    # Shuffle before repeat: repeating first lets the shuffle buffer mix
    # examples from consecutive passes over the data, blurring epoch boundaries.
    if shuffling:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    # tf.data.AUTOTUNE is the non-experimental name of tf.data.experimental.AUTOTUNE.
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [None]:
# Training data is shuffled; validation data keeps its order. Neither pipeline is cached.
tf_train_ds= create_dataset(tf_train_ds, batch_size=batch_size,
                           shuffling=True, cache_path = None)
tf_valid_ds = create_dataset(tf_valid_ds, batch_size=batch_size,
                           shuffling=False, cache_path = None)

In [None]:
tf_train_ds

In [None]:
# del train_ds, valid_ds, squad_train_ds, squad_valid_ds, newsqa_train_ds, newsqa_valid_ds

## Training

Train the model and save it afterwards (note: no callback that keeps the lowest-`val_loss` checkpoint is configured in the cells below)

In [None]:
# Load the pretrained T5 weights for `model_name`.
model = TFT5ForConditionalGeneration.from_pretrained(model_name) #options: t5-small, t5-base, t5-large, t5-3b, t5-11b
# No loss is passed to compile(); only the optimizer is configured here.
model.compile(optimizer=optimizer)

In [None]:
# Epoch bookkeeping: training starts at `epochs_done` and stops at `total_num_of_epochs`.
epochs_done = 0
total_num_of_epochs = 1

# The tf.data pipelines repeat forever, so the number of batches per epoch is set explicitly.
ntrain, nvalid = len(train_ds), len(valid_ds)
steps = ntrain // batch_size
valid_steps = nvalid // batch_size
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)

model.fit(
    tf_train_ds,
    epochs=total_num_of_epochs,
    steps_per_epoch=steps,
    validation_data=tf_valid_ds,
    validation_steps=valid_steps,
    initial_epoch=epochs_done,
)

In [None]:
model.save_pretrained(f'{model_name}-epochs={total_num_of_epochs}')

In [None]:
saved_model_dir_src = f'{model_name}-epochs={total_num_of_epochs}'
saved_model_dir_dest = "/content/drive/MyDrive/AnswerExtractionModels"

In [None]:
!cp -r {saved_model_dir_src} {saved_model_dir_dest}

# Model Evaluation

## Manual

**Single Example**

In [None]:
context = "In a broad sense, all of computer security is concerned with access control. Indeed, RFC 4949 defines computer security as follows: measures that implement and assure security services in a computer system, particularly those that assure access control service. This chapter deals with a narrower, more specific concept of access control: Access control implements a security policy that specifies who or what (e.g., in the case of a process) may have access to each specific system resource, and the type of access that is permitted in each instance." #@param {type:"string"}
context = context.strip()
input_text = task_prefix + 'context: ' + context

In [None]:
# Tokenize the query the same way as the training inputs (fixed length, truncated).
# padding='max_length' is the replacement for the deprecated pad_to_max_length=True.
encoded_query = tokenizer(input_text, return_tensors='tf', padding='max_length', truncation=True, max_length=encoder_max_len)

input_ids = encoded_query["input_ids"]
attention_mask = encoded_query["attention_mask"]
# NOTE(review): top_p and top_k are sampling parameters, but no do_sample flag is
# passed; confirm against the generate() documentation whether they have any effect here.
generated_answers = model.generate(input_ids, attention_mask=attention_mask, max_length=decoder_max_len, top_p=0.95, top_k=50, repetition_penalty=float(2))
# Only one sequence was generated, so decode row 0.
decoded_answers = tokenizer.decode(generated_answers.numpy()[0], skip_special_tokens=True)

print("Answers: ", decoded_answers)

**Multiple Examples**

In [None]:
# use different length sentences to test batching

sentences = [
  "In a broad sense, all of computer security is concerned with access control. Indeed, RFC 4949 defines computer security as follows: measures that implement and assure security services in a computer system, particularly those that assure access control service. This chapter deals with a narrower, more specific concept of access control: Access control implements a security policy that specifies who or what (e.g., in the case of a process) may have access to each specific system resource, and the type of access that is permitted in each instance."
  , "A subject is an entity capable of accessing objects. Generally, the concept of subject equates with that of process. Any user or application actually gains access to an object by means of a process that represents that user or application. The process takes on the attributes of the user, such as access rights."
]

In [None]:
# Tokenize all sentences as one batch, padded to the longest one. Inputs are
# truncated to encoder_max_len so they match what the model saw during training.
inputs = tokenizer(
    [task_prefix + 'context: ' + sentence for sentence in sentences],
    return_tensors="tf",
    padding=True,
    truncation=True,
    max_length=encoder_max_len,
)

# max_length matches the other generate() calls in this notebook; without it the
# library default applies, which may cut the answer list short.
output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=decoder_max_len,
)

tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

## Automatic

**Inference on training dataset**

In [None]:
# Number of examples passed to model.generate per call.
inference_batch_size = 700

# One decoded answer string per example of train_ds, in dataset order.
extracted_answers = list()
for i in tqdm(range(0, len(train_ds), inference_batch_size)):
  # Slice the rows first and pick the columns afterwards. Indexing the column
  # first (train_ds["input_ids"][i:...]) may load the entire column on every
  # iteration just to keep one batch of it.
  batch = train_ds[i:i + inference_batch_size]
  output_sequences = model.generate(
      input_ids=batch["input_ids"],
      attention_mask=batch["attention_mask"],
      max_length=decoder_max_len,
      top_p=0.95,
      top_k=50,
      repetition_penalty=float(2)
  )
  extracted_answers.extend(
      tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
  )

**Saving inference**

In [None]:
print(len(extracted_answers))

In [None]:
import os

pickle_path = "/content/extracted_answers.pickle"
# This cell sits before the one that writes the pickle, so on a fresh run the file
# may not exist yet; report that instead of failing with FileNotFoundError.
if os.path.exists(pickle_path):
  size_in_mb = os.path.getsize(pickle_path) / 10**6
  print(f'extracted_answers.pickle size: {size_in_mb}MB')
else:
  print(f'{pickle_path} has not been written yet')

In [None]:
# Persist the decoded predictions so the (slow) inference loop does not have to be repeated.
with open('extracted_answers.pickle', 'wb') as f:
    pickle.dump(extracted_answers, f)

In [None]:
!cp "extracted_answers.pickle" "/content/drive/MyDrive/AnswerExtractionModels"

**Loading inference**

In [None]:
!cp "/content/drive/MyDrive/AnswerExtractionModels/extracted_answers.pickle" "extracted_answers.pickle"

In [None]:
# Restore the predictions copied from Drive by the previous cell.
# NOTE: unpickling can execute arbitrary code -- only load files written by this notebook.
with open('extracted_answers.pickle', 'rb') as f:
    extracted_answers = pickle.load(f)

print('extracted_answers is', len(extracted_answers))

**Transform the list of extracted answer strings (one string per context) into a list of lists**

**Ex:**
```
myList = ['car, boat, vehicle']
myListTransformed = [['car', 'boat', 'vehicle']]
```



In [None]:
# `encode` joins the target answers with ' EOA ', so the generated strings have to be
# split on that separator; splitting on ',' left most predictions as one long string
# and cut answers that contain a comma. Surrounding whitespace is stripped per answer.
# NOTE: run this cell once per load -- it expects a list of strings.
extracted_answers = [
    [answer.strip() for answer in answersString.split('EOA')]
    for answersString in extracted_answers
]
print(extracted_answers[0])

**Calculating F1 score**

In [None]:
def normalize_answer(s):
  """Lower text and remove punctuation, articles and extra whitespace."""
  lowered = s.lower()
  # Drop every ASCII punctuation character.
  punctuation_chars = set(string.punctuation)
  without_punct = ''.join(ch for ch in lowered if ch not in punctuation_chars)
  # Replace the stand-alone words a/an/the with a space.
  without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct, flags=re.UNICODE)
  # Collapse whitespace runs and trim both ends.
  return ' '.join(without_articles.split())

def get_tokens(s):
  """Return the whitespace-separated tokens of the normalized answer; [] for an empty/None answer."""
  return normalize_answer(s).split() if s else []

In [None]:
def compute_f1(a_gold, a_pred):
  """Token-level F1 between a gold answer and a predicted answer.

  Returns a tuple ``(f1, precision, recall)``. When either answer has no
  tokens, all three values are 1 if both are empty and 0 otherwise.
  """
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)
  if not gold_toks or not pred_toks:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    agree = int(gold_toks == pred_toks)
    return (agree, agree, agree)
  # Multiset intersection: each shared token counts as often as it occurs in both.
  overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(overlap.values())
  if num_same == 0:
    return (0, 0, 0)
  precision = 1.0 * num_same / len(pred_toks)
  recall = 1.0 * num_same / len(gold_toks)
  f1 = (2 * precision * recall) / (precision + recall)
  return (f1, precision, recall)

def compute_f1_avg(f1_scores, total):
  """Return the sum of ``f1_scores`` divided by ``total``, as a percentage."""
  mean_score = sum(f1_scores) / total
  return mean_score * 100

In [None]:
def compute_em(a_gold, a_pred):
  """Exact match: 1 if both answers are identical after normalization, else 0."""
  gold_norm = normalize_answer(a_gold)
  pred_norm = normalize_answer(a_pred)
  return 1 if gold_norm == pred_norm else 0

def compute_em_avg(em_scores, total):
  """Return the sum of ``em_scores`` divided by ``total``, as a percentage."""
  match_rate = sum(em_scores) / total
  return match_rate * 100

In [None]:
from operator import itemgetter

# One entry per *extracted* answer (not per example), filled by the loop below.
f1_scores = list()
precison_scores = list()
recall_scores = list()
em_scores = list()

# Pair every example's ground-truth answers with the answers extracted for it.
for ground_truth_answers_list, extracted_answers_list in tqdm(list(zip(train_ds['answers'], extracted_answers))):
  for extracted_answer in extracted_answers_list:
    # ***IMPORTANT***
    # - Problem: the model sometimes outputs an empty string when there are no answers.
    # - Temporary fix: when there is no ground-truth answer to compare against, the
    #   default score checks whether the generated answer is an empty string
    #   (no answer) and scores 1 in that case, because ground truth and
    #   prediction then agree that there is nothing to extract.
    # - Actual fix: remove the empty answers from the data and retrain the model.

    # Score the extracted answer against every ground-truth answer and keep the best match (by F1).
    result = [compute_f1(ground_truth_answer, extracted_answer) for ground_truth_answer in ground_truth_answers_list]
    f1_score, precision, recall = max(result, key=itemgetter(0)) if(len(result) != 0) else tuple([int(extracted_answer == '')]*3)
    # f1_score = max([compute_f1(ground_truth_answer, extracted_answer) for ground_truth_answer in ground_truth_answers_list], default= int(extracted_answer == ''))
    em_score = max([compute_em(ground_truth_answer, extracted_answer) for ground_truth_answer in ground_truth_answers_list], default= int(extracted_answer == ''))
    f1_scores.append(f1_score)
    precison_scores.append(precision)
    recall_scores.append(recall)
    em_scores.append(em_score)

In [None]:
# Sanity check: both lists hold one score per extracted answer.
print(len(f1_scores))
print(len(em_scores))

# Peek at the first 100 (F1, EM) pairs.
print(list(zip(f1_scores[:100], em_scores[:100])))

# Averages as percentages; compute_f1_avg is reused for precision and recall.
print('F1 =', compute_f1_avg(f1_scores, len(f1_scores)), '%')
print('EM =', compute_em_avg(em_scores, len(em_scores)), '%')
print('Precision =', compute_f1_avg(precison_scores, len(f1_scores)), '%')
print('Recall =', compute_f1_avg(recall_scores, len(f1_scores)), '%')

Scores before adding the empty-answer condition to the default value (they differ only in the third digit of the percentage):

F1 = 46.97654860625854 %

EM = 32.95584116546556 %