In [1]:
import json
import transformers
import random

  from .autonotebook import tqdm as notebook_tqdm


### Load SQuAD Dataset

In [2]:
from datasets import load_dataset

squad = load_dataset("rajpurkar/squad")

*   Data splitting is done automatically

In [3]:
squad.keys()

dict_keys(['train', 'validation'])

In [4]:
squad_train, squad_valid = squad["train"], squad["validation"]

In [5]:
squad_train[0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

### Save the training and validation to `Jsonlines`
*   Save them and apply augmentation on the training set
*   To keep the validation set the same

In [6]:
def dataset_to_jsonlines(dataset, filename):
    with open(filename, 'w') as f:
        for i in range(len(dataset)):
            item = dataset[i]
            f.write(json.dumps(item) + '\n')
    print(f"{filename} is generated.")

In [None]:
dataset_to_jsonlines(squad_train, "./data/squad_train_vanilla.jsonl")
dataset_to_jsonlines(squad_valid, "./data/squad_valid.jsonl")

squad_train_vanilla.jsonl is generated.
squad_valid.jsonl is generated.


### Data augmentation

In [8]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lloyd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lloyd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Lloyd\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [9]:
import nltk
from nltk.corpus import wordnet as wn 
from nltk.tokenize import word_tokenize 
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

*   Collect a set of verbs and nouns

In [10]:
def get_word_of_type(pos):
    word_set = set()
    for syn in wn.all_synsets(pos):
        for lemma in syn.lemmas():
            word_set.add(lemma.name())
    return list(word_set)

In [11]:
verbs = get_word_of_type('v')
raw_nouns = get_word_of_type('n')

# Some nouns are too long
nouns = set()
for n in raw_nouns:
    if len(n) <= 15 and "_" not in n:
        nouns.add(n)

In [12]:
def tag_sentence(sentence):
    nltk_tokenized = word_tokenize(sentence)
    sentence_tag = pos_tag(nltk_tokenized)
    return sentence_tag

In [13]:
def get_wn_pos(tag):
    if tag.startswith('J'):
        return wn.ADJ  # Adjective
    elif tag.startswith('V'):
        return wn.VERB  # Verb
    elif tag.startswith('N'):
        return wn.NOUN  # Noun
    elif tag.startswith('R'):
        return wn.ADV  # Adverb
    else:
        return wn.NOUN  # Default to Noun

In [14]:
# Find original words to be replaced
def collect_original_word(tagged_sentence, lemmatized_token_list, token):
    # Manually implement indexing all elements
    indices_to_change = []
    for i, l_token in enumerate(lemmatized_token_list):
        if l_token == token:
            indices_to_change.append(i)
    
    original_words = []
    for i in indices_to_change:
        original_words.append(tagged_sentence[i][0])

    return original_words

*   Apply changes to the original text

In [None]:
def apply_modification(squad_instance, verbs_to_choose, nouns_to_choose, change_verbs=None, change_nouns=None):
    lemmatizer = WordNetLemmatizer()
    lemmatized_token_lists = [[], []]
    lemmatized_verb_set = set()
    lemmatized_noun_set = set()

    context = squad_instance["context"]
    question = squad_instance["question"]

    tagged_sentences = [tag_sentence(context), 
                        tag_sentence(question)]
    
    verb_to_skip = ["be", "do", "have"]

    for i, tagged_sentence in enumerate(tagged_sentences):
        for token, tag in tagged_sentence:
            wn_pos = get_wn_pos(tag)
            lemmatized_token = lemmatizer.lemmatize(token, pos = wn_pos)
            lemmatized_token_lists[i].append(lemmatized_token)
             # ignore forms of be and do
            if tag.startswith('V') and not lemmatized_token in verb_to_skip:
                lemmatized_verb_set.add(lemmatized_token)
            if tag.startswith('N'):
                lemmatized_noun_set.add(lemmatized_token)

    # Randomly choose lemmatized verbs and nouns
    if not change_verbs: # by default replace all verbs
        change_verbs = len(lemmatized_verb_set)
    else:
        change_verbs = min(change_verbs, len(lemmatized_verb_set))

    if not change_nouns:
        change_nouns = len(lemmatized_noun_set)
    else:
        change_nouns= min(change_nouns, len(lemmatized_noun_set))

    verb_to_replace = random.sample(sorted(lemmatized_verb_set), change_verbs)
    noun_to_replace = random.sample(sorted(lemmatized_noun_set), change_nouns)

    # Avoid sampling duplicates
    for v in verb_to_replace:
        if v in verbs_to_choose:
            verbs_to_choose.remove(v)
    for n in noun_to_replace:
        if n in nouns_to_choose:
            nouns_to_choose.remove(n)

    new_verbs = random.sample(sorted(verbs_to_choose), change_verbs)
    new_nouns = random.sample(sorted(nouns_to_choose), change_nouns)

    # Apply changes to the original text
    context = squad_instance["context"]
    question = squad_instance["question"]
    answer = squad_instance["answers"]["text"][0]
    answer_id = squad_instance["answers"]["answer_start"][0]
    context_pre, context_post = context[:answer_id], context[answer_id:]

    for old_words, new_words in zip([verb_to_replace, noun_to_replace], [new_verbs, new_nouns]):
        for old_word, new_word in zip(old_words, new_words):
            original_words = []
            for tagged_sentence, lemmatized_token_list in zip(tagged_sentences, lemmatized_token_lists):
                original_words.extend(collect_original_word(tagged_sentence, lemmatized_token_list, old_word))
            for original_word in original_words:
                question = question.replace(original_word, new_word)
                answer = answer.replace(original_word, new_word)
                context_pre = context_pre.replace(original_word, new_word)
                context_post = context_post.replace(original_word, new_word)

    # Construct a new squad instance
    new_squad_instance = squad_instance.copy()
    new_squad_instance["context"] = context_pre + context_post
    new_squad_instance["question"] = question
    new_squad_instance["answers"] = {"text": [answer], "answer_start": [len(context_pre)]}

    return new_squad_instance


In [16]:
new_squad_instance = apply_modification(squad_train[0], verbs, nouns)

*   Check the new instance

In [17]:
squad_train[0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [18]:
new_squad_instance

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the accoucheur has a Catholic Parana. push_back the minah unguent\'s parliamentarian messmate is a parliamentarianen limey of the nitwit peridinian. Immediately in roof of the minah unguent and swish it, is a katabolism limey of craniometry with annotation cover_for with the jolly "admixture thunderclap chinchilla pause". Next to the minah unguent is the molestation of the newsvsuccussionor Trichoceros. Immediately behind the purist is the Ulanova, a Marian manta of Gabriel and Gongora. It is a Aeolian of the symbol-worship at congress, hazan where the nitwit peridinian reputedly aquaplane to sward Pecos Imuran in 1858. At the succussion of the main Guest (and in a direct corbina that act_upon through 3 limeys and the Erythrocebus IOU), is a simple, modern microsporidian limey of peridinian.',
 'question': 'To whom did the nitwit peridinian allegedly aquaplane in 1858 in congress haza

### Generate new datasets

In [19]:
new_epochs = 15
change_verbs = None
change_nouns = None

In [None]:
for i in range(new_epochs):
    new_train = []
    for squad_instance in squad_train:
        new_squad_instance = apply_modification(squad_instance, 
                                                verbs, 
                                                nouns, 
                                                change_verbs=change_verbs, 
                                                change_nouns=change_nouns)
        new_train.append(new_squad_instance)
    
    dataset_to_jsonlines(new_train, f"./data/squad_train_noised_{i}.jsonl")

squad_train_noised_0.jsonl is generated.
squad_train_noised_1.jsonl is generated.
squad_train_noised_2.jsonl is generated.
squad_train_noised_3.jsonl is generated.
squad_train_noised_4.jsonl is generated.
squad_train_noised_5.jsonl is generated.
squad_train_noised_6.jsonl is generated.
squad_train_noised_7.jsonl is generated.
squad_train_noised_8.jsonl is generated.
squad_train_noised_9.jsonl is generated.
squad_train_noised_10.jsonl is generated.
squad_train_noised_11.jsonl is generated.
squad_train_noised_12.jsonl is generated.
squad_train_noised_13.jsonl is generated.
squad_train_noised_14.jsonl is generated.
