In [2]:
from datasets import load_dataset
import nltk
import re
import numpy as np
from tqdm import tqdm
import pickle
import spacy

# Load the `en_core_web_sm` spaCy pipeline once; it is used further down to
# split each SQuAD context into sentences (via `.sents`).
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
def preprocess_tokenize_sentence(sentence):
    """Clean a raw sentence and split it into lowercase word tokens.

    Cleaning steps, in order:
      1. newlines become spaces;
      2. HTML-escaped tags (``&lt;...&gt;``) are removed;
      3. URLs, @mentions, #hashtags and digits are removed;
      4. most punctuation/symbols are replaced by a space;
      5. apostrophes (straight or curly) and colons are deleted, so
         "it's" becomes "its";
      6. slashes become spaces and the text is lowercased.

    The cleaned text is tokenized with ``nltk.word_tokenize`` and only tokens
    that start with a word character are kept.

    Args:
        sentence: Raw text to clean and tokenize.

    Returns:
        A list of lowercase tokens; an empty list for empty input.
    """
    if not sentence:
        # Always hand back a list so callers never have to special-case a str.
        return []

    sentence = re.sub(r'\n', " ", sentence)
    # Must run before the punctuation pass below: that pass strips '&' and ';',
    # after which an escaped tag can no longer be recognised.
    sentence = re.sub(r'&lt;[^&]*&gt;', '', sentence)
    sentence = re.sub(r'(((http|https):\/\/)|www\.)([a-zA-Z0-9]+\.){0,2}[a-zA-Z0-9]+([a-zA-Z0-9\/#%&=\?_\.\-\+]+)', "", sentence)
    sentence = re.sub(r'(@[a-zA-Z0-9_]+)', "", sentence)
    sentence = re.sub(r'(#[a-zA-Z0-9_]+\b)', "", sentence)
    sentence = re.sub(r'\d+', "", sentence)
    # Replaces (with a space) every dash variant, underscore, double quote and
    # sentence punctuation mark, so no separate handling of those is needed.
    sentence = re.sub(r'[\_\$\*\^\(\)\[\]\{\}\=\+\<\>",\&\%\-\—\”\“\–\\\.\?\!;]', " ", sentence)
    # Normalise curly single quotes, then drop apostrophes and colons outright
    # (the only punctuation the pass above leaves behind that should vanish).
    sentence = re.sub(r'[‘’]', "\'", sentence)
    sentence = re.sub(r'[\':]+', "", sentence)
    sentence = re.sub(r'/', " ", sentence)

    sentence = sentence.lower()
    tokens = nltk.word_tokenize(sentence)

    # Keep only tokens that begin with a word character; this discards
    # leftover symbol-only tokens.
    word_pattern = re.compile(r'\b\w+\b')
    return [token for token in tokens if word_pattern.match(token)]

### Wiki_QA

#### Links to the tokenized datasets (pkl)
- Train: https://drive.google.com/file/d/1S5HtkMnENY7KnxHxZG5cn0-tfaCqMfvt/view?usp=sharing
- Validation: https://drive.google.com/file/d/1sEVb9BmZKjG3M8-xcI65WLvtWkshJRU8/view?usp=sharing
- Test: https://drive.google.com/file/d/1-CjZ-j7JMShlXj06gZoySQE9RD_umE79/view?usp=sharing

#### CODE TO GENERATE TOKENIZED DATASETS IS PROVIDED BELOW

In [4]:
# Load the WikiQA dataset through `datasets.load_dataset`; the cells below
# read its "train", "validation" and "test" splits.
wiki_qa_dataset = load_dataset("wiki_qa")

In [17]:
# Inspect one raw training row to see which fields each example carries.
wiki_qa_dataset["train"][11]

{'question_id': 'Q2',
 'question': 'How are the directions of the velocity and force vectors related in a circular motion',
 'document_title': 'Circular motion',
 'answer': "Without this acceleration, the object would move in a straight line, according to Newton's laws of motion .",
 'label': 0}

In [56]:
# Group the flat WikiQA rows by question id, one bucket per split. Each bucket
# maps question_id -> tokenized question, tokenized candidate answers, their
# labels, and the running sum of those labels.
wiki_qa_set = {split: {} for split in ("train", "validation", "test")}

for split, grouped in wiki_qa_set.items():
    for example in wiki_qa_dataset[split]:
        qid = example["question_id"]
        entry = grouped.get(qid)
        if entry is None:
            # First row seen for this question: tokenize the question once.
            entry = grouped[qid] = {
                "question": preprocess_tokenize_sentence(example["question"]),
                "answers": [],
                "labels": [],
                "sum_labels": 0,
            }
        entry["answers"].append(preprocess_tokenize_sentence(example["answer"]))
        entry["labels"].append(example["label"])
        entry["sum_labels"] += example["label"]


def _answerable_questions(grouped):
    """Return the grouped questions whose labels sum to more than zero."""
    return [
        {
            "question": entry["question"],
            "answers": entry["answers"],
            "labels": entry["labels"],
        }
        for entry in grouped.values()
        if entry["sum_labels"] > 0
    ]


wiki_qa_train = _answerable_questions(wiki_qa_set["train"])

wiki_qa_validation = _answerable_questions(wiki_qa_set["validation"])

wiki_qa_test = _answerable_questions(wiki_qa_set["test"])

In [57]:
# Sanity check: print the first four grouped training questions.
for i in range(4):
    print(wiki_qa_train[i])

{'question': ['how', 'are', 'glacier', 'caves', 'formed'], 'answers': [['a', 'partly', 'submerged', 'glacier', 'cave', 'on', 'perito', 'moreno', 'glacier'], ['the', 'ice', 'facade', 'is', 'approximately', 'm', 'high'], ['ice', 'formations', 'in', 'the', 'titlis', 'glacier', 'cave'], ['a', 'glacier', 'cave', 'is', 'a', 'cave', 'formed', 'within', 'the', 'ice', 'of', 'a', 'glacier'], ['glacier', 'caves', 'are', 'often', 'called', 'ice', 'caves', 'but', 'this', 'term', 'is', 'properly', 'used', 'to', 'describe', 'bedrock', 'caves', 'that', 'contain', 'year', 'round', 'ice']], 'labels': [0, 0, 0, 1, 0]}
{'question': ['how', 'much', 'is', 'tablespoon', 'of', 'water'], 'answers': [['this', 'tablespoon', 'has', 'a', 'capacity', 'of', 'about', 'ml'], ['measuring', 'spoons'], ['in', 'the', 'us', 'and', 'parts', 'of', 'canada', 'a', 'tablespoon', 'is', 'the', 'largest', 'type', 'of', 'spoon', 'used', 'for', 'eating', 'from', 'a', 'bowl'], ['in', 'the', 'uk', 'europe', 'and', 'most', 'commonwealt

In [58]:
# Write each tokenized WikiQA split to its own pickle file.
for pkl_path, split_rows in (
    ("wiki_qa_train.pkl", wiki_qa_train),
    ("wiki_qa_validation.pkl", wiki_qa_validation),
    ("wiki_qa_test.pkl", wiki_qa_test),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(split_rows, f)

The dataset is tokenized before storing in pickle form, hence no tokenization is required when you load the dataset.

### SQuAD

#### Links to the tokenized datasets (pkl)
- Train: https://drive.google.com/file/d/1_OSiMwCTvaeoziuK2Ew0U4dHeUfVIK6f/view?usp=sharing
- Validation: https://drive.google.com/file/d/15lPDZODuySawp9HLpRuVssaJ6m6XTPvA/view?usp=sharing


In [25]:
# Load SQuAD through `datasets.load_dataset`; the cells below use its
# "train" and "validation" splits.
squad_dataset = load_dataset("squad")

In [26]:
# Show the available splits with their features and row counts.
squad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

### DON'T TOUCH THIS SECTION UNLESS YOU WANT TO REMAKE THE SQUAD TRAIN DATASET FILE.

In [24]:
# Build one record per SQuAD training example:
#   "question" - the raw question string,
#   "context"  - the context split into sentence strings by spaCy,
#   "labels"   - 1 for every sentence that contains the answer text, else 0.
squad_train_set = []
for example in tqdm(squad_dataset["train"]):
    # Only the first reference answer is used for labelling.
    answer_text = example["answers"]["text"][0]
    sentences = [span.text for span in nlp(example["context"]).sents]
    squad_train_set.append({
        "question": example["question"],
        "context": sentences,
        # Labels are built positionally. Looking a sentence up with
        # list.index() would always hit the first occurrence, so a repeated
        # sentence would never be labelled at its own position.
        # NOTE(review): an answer that straddles a sentence boundary gets no
        # positive label at all - confirm this is acceptable.
        "labels": [int(answer_text in sentence) for sentence in sentences],
    })


  0%|          | 0/87599 [00:00<?, ?it/s]

100%|██████████| 87599/87599 [26:46<00:00, 54.53it/s] 


In [29]:
# Build one record per SQuAD validation example:
#   "question" - the raw question string,
#   "context"  - the context split into sentence strings by spaCy,
#   "labels"   - 1 for every sentence that contains the answer text, else 0.
squad_valid_set = []
for example in tqdm(squad_dataset["validation"]):
    # Only the first reference answer is used for labelling.
    answer_text = example["answers"]["text"][0]
    sentences = [span.text for span in nlp(example["context"]).sents]
    squad_valid_set.append({
        "question": example["question"],
        "context": sentences,
        # Labels are built positionally. Looking a sentence up with
        # list.index() would always hit the first occurrence, so a repeated
        # sentence would never be labelled at its own position.
        # NOTE(review): an answer that straddles a sentence boundary gets no
        # positive label at all - confirm this is acceptable.
        "labels": [int(answer_text in sentence) for sentence in sentences],
    })


100%|██████████| 10570/10570 [07:59<00:00, 22.06it/s]


Tokenizing the dataset

In [48]:
# Replace the raw question/context strings with token lists, in place, for
# both SQuAD splits.
for dataset in (squad_train_set, squad_valid_set):
    # Iterating the list directly (rather than enumerate(...)) lets tqdm read
    # its length and display a total and ETA.
    for elem in tqdm(dataset):
        if not isinstance(elem["question"], str):
            # Already tokenized by an earlier run of this cell; skipping keeps
            # the cell safe to re-run instead of failing on list input.
            continue
        elem["question"] = preprocess_tokenize_sentence(elem["question"])
        elem["context"] = [preprocess_tokenize_sentence(sent) for sent in elem["context"]]

0it [00:00, ?it/s]

87599it [02:46, 526.65it/s]
10570it [00:19, 529.89it/s]


Storing in pickle form

In [49]:
# Write both tokenized SQuAD splits to their pickle files.
for pkl_path, records in (
    ("squad_train.pkl", squad_train_set),
    ("squad_valid.pkl", squad_valid_set),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(records, f)

#### If dataset pkl file exists then continue from here:

In [50]:
# Reload the tokenized SQuAD training split from its pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
with open("squad_train.pkl", "rb") as f:
    squad_train_set = pickle.load(f)

In [51]:
# Reload the tokenized SQuAD validation split. The context manager guarantees
# the file handle is closed once loading finishes.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
with open("squad_valid.pkl", "rb") as f:
    squad_valid_set = pickle.load(f)

In [53]:
# Spot-check the first tokenized training record.
print(squad_train_set[0])

{'question': ['to', 'whom', 'did', 'the', 'virgin', 'mary', 'allegedly', 'appear', 'in', 'in', 'lourdes', 'france'], 'context': [['architecturally', 'the', 'school', 'has', 'a', 'catholic', 'character'], ['atop', 'the', 'main', 'buildings', 'gold', 'dome', 'is', 'a', 'golden', 'statue', 'of', 'the', 'virgin', 'mary'], ['immediately', 'in', 'front', 'of', 'the', 'main', 'building', 'and', 'facing', 'it', 'is', 'a', 'copper', 'statue', 'of', 'christ', 'with', 'arms', 'upraised', 'with', 'the', 'legend', 'venite', 'ad', 'me', 'omnes'], ['next', 'to', 'the', 'main', 'building', 'is', 'the', 'basilica', 'of', 'the', 'sacred', 'heart'], ['immediately', 'behind', 'the', 'basilica', 'is', 'the', 'grotto', 'a', 'marian', 'place', 'of', 'prayer', 'and', 'reflection'], ['it', 'is', 'a', 'replica', 'of', 'the', 'grotto', 'at', 'lourdes', 'france', 'where', 'the', 'virgin', 'mary', 'reputedly', 'appeared', 'to', 'saint', 'bernadette', 'soubirous', 'in'], ['at', 'the', 'end', 'of', 'the', 'main', 'd

In [54]:
# Spot-check the first tokenized validation record.
print(squad_valid_set[0])

{'question': ['which', 'nfl', 'team', 'represented', 'the', 'afc', 'at', 'super', 'bowl'], 'context': [['super', 'bowl', 'was', 'an', 'american', 'football', 'game', 'to', 'determine', 'the', 'champion', 'of', 'the', 'national', 'football', 'league', 'nfl', 'for', 'the', 'season'], ['the', 'american', 'football', 'conference', 'afc', 'champion', 'denver', 'broncos', 'defeated', 'the', 'national', 'football', 'conference', 'nfc', 'champion', 'carolina', 'panthers', 'to', 'earn', 'their', 'third', 'super', 'bowl', 'title'], ['the', 'game', 'was', 'played', 'on', 'february', 'at', 'levis', 'stadium', 'in', 'the', 'san', 'francisco', 'bay', 'area', 'at', 'santa', 'clara', 'california'], ['as', 'this', 'was', 'the', 'th', 'super', 'bowl', 'the', 'league', 'emphasized', 'the', 'golden', 'anniversary', 'with', 'various', 'gold', 'themed', 'initiatives', 'as', 'well', 'as', 'temporarily', 'suspending', 'the', 'tradition', 'of', 'naming', 'each', 'super', 'bowl', 'game', 'with', 'roman', 'numer

QNLI (to be used only for TANDA — it is not recommended in the Cosinet paper)

Further preprocessing of QNLI to be done later

In [55]:
# Load the "qnli" configuration of the "glue" dataset; no further
# preprocessing is applied to it in this notebook yet.
qnli_dataset = load_dataset("glue", "qnli")

In [32]:
# Inspect one raw QNLI training row to see its fields.
qnli_dataset["train"][2500]


{'question': 'Who replaced Stratford Canning after he first resigned as British ambassador to the Ottoman Empire ?',
 'sentence': "In February 1853, the British government of Lord Aberdeen, the prime minister, re-appointed Stratford Canning as British ambassador to the Ottoman Empire.:110 Having resigned the ambassadorship in January, he had been replaced by Colonel Rose as chargé d'affaires.",
 'label': 0,
 'idx': 2500}

#### ConceptNet Numberbatch embeddings: obtained using `numberbatch-en.txt` from https://github.com/commonsense/conceptnet-numberbatch#downloads

#### Link to embeddings (pkl)
https://drive.google.com/file/d/1f2urijZCLm0KYHgnF9bLsR8ABfPmR6Nj/view?usp=sharing

Observing the embeddings

In [13]:
import pandas as pd

In [14]:
# Read the raw Numberbatch text file into a DataFrame indexed by term
# (column 0), for inspection only.
# - skiprows=1 skips the first line of the file (presumably the
#   "<rows> <dims>" header - confirm against the file).
# - keep_default_na=False stops pandas from converting terms that look like
#   missing-value markers (e.g. "nan", "null") into NaN index labels, which
#   would make those terms impossible to look up by name.
txt_embeds = pd.read_csv(
    "numberbatch-en.txt",
    sep=" ",
    skiprows=1,
    header=None,
    index_col=0,
    keep_default_na=False,
)

In [56]:
# Display the embedding table (pandas truncates the rendered rows/columns).
txt_embeds

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
##,0.0295,-0.0405,-0.0341,0.0837,-0.0575,0.0482,-0.0145,0.0019,0.0347,0.0825,...,0.0095,0.0908,0.0549,0.0121,0.0056,0.0443,-0.0217,-0.0239,0.0403,-0.0005
###,0.0202,-0.0249,-0.0653,0.0930,-0.0923,0.0306,-0.0093,0.0224,-0.0334,0.0750,...,-0.0078,0.0819,0.0082,-0.0285,0.0007,-0.0418,0.0893,-0.0575,-0.0580,0.0651
####,0.0521,-0.0262,-0.0881,0.1085,-0.1168,0.0324,0.0084,0.0382,-0.0287,0.1098,...,0.0540,0.0769,0.0234,-0.0080,0.1110,-0.0342,0.0570,-0.0424,-0.0192,0.0599
#####,0.0416,0.0061,-0.0388,0.0175,-0.0617,-0.0043,0.0140,0.0725,-0.0287,0.0469,...,0.0255,0.0511,-0.0039,-0.0399,0.0665,-0.0622,-0.0117,0.0066,-0.0165,0.0645
#####_metres,-0.0018,-0.0410,-0.0531,0.1115,-0.1031,-0.0065,-0.0391,0.0145,0.0391,0.1635,...,-0.0401,-0.0299,-0.0179,-0.0354,0.0237,0.0357,0.0335,0.0124,-0.0084,-0.0431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
⠙_⠉_⠇,-0.0873,-0.1373,-0.0600,0.0142,-0.0113,-0.0224,0.0531,-0.1467,0.0259,-0.1169,...,-0.0158,0.0006,0.0229,0.0023,-0.0089,-0.0462,-0.0386,0.0933,0.0538,-0.0170
⠨_⠴,-0.0140,-0.0175,0.0094,0.0208,-0.0768,-0.0918,0.0447,-0.1958,0.0033,0.0013,...,-0.0409,0.0254,0.0609,0.0043,-0.0302,-0.0257,-0.0403,0.0951,0.0355,-0.0181
形容詞,-0.0356,0.0516,-0.1663,-0.1416,-0.0936,-0.1030,0.1375,-0.1280,-0.0403,-0.0883,...,-0.0665,0.0036,0.0352,0.0578,0.0834,-0.0254,0.0163,-0.0009,-0.0191,-0.0744
江,0.0455,-0.0179,-0.0083,-0.0100,-0.0919,0.0961,0.0035,-0.0000,0.0138,0.0699,...,0.0083,0.0978,-0.1176,0.0280,-0.0135,0.0138,0.0387,0.0450,0.0528,0.0087


In [57]:
# Check the length of the vector stored for a single term.
txt_embeds.loc["the"].shape

(300,)

### RUN THIS SECTION TO STORE THE EMBEDDINGS IN A PICKLE FILE

In [15]:
# Parse the Numberbatch text file into a dict mapping term -> float32 vector.
embeddings = {}

# The file contains non-ASCII terms, so decode it explicitly as UTF-8 rather
# than relying on the platform default; the context manager closes the handle.
with open("numberbatch-en.txt", "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f):
        fields = line.rstrip("\n").split(" ")
        if line_no == 0 and len(fields) == 2:
            # First line with only two fields is taken to be the
            # "<rows> <dims>" header, not an embedding; storing it would add
            # a bogus one-element vector to the dict.
            continue
        word = fields[0]
        embeddings[word] = np.asarray(fields[1:], dtype="float32")

#### Store the embeddings in a pickle file

In [29]:
# Persist the term -> vector dict so later runs can load it from the pickle
# instead of re-parsing the text file.
with open("numberbatch_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

In [17]:
# Spot-check the vector stored for one term.
embeddings["absolute"]

array([-0.1265, -0.0897, -0.1441, -0.0691,  0.0024, -0.0428,  0.1635,
        0.1092,  0.0225, -0.0023, -0.1066,  0.0672,  0.0165, -0.0922,
        0.0828, -0.0673,  0.0214,  0.1012, -0.1139, -0.0741,  0.0493,
        0.0114, -0.0317,  0.1188, -0.0103, -0.052 , -0.0403, -0.0175,
        0.0645,  0.012 , -0.0525, -0.0848,  0.0659, -0.0299,  0.0017,
        0.0488,  0.0374, -0.1365,  0.0575,  0.1142,  0.0156, -0.0408,
        0.0027,  0.0862, -0.0022,  0.0215,  0.0024, -0.0432, -0.057 ,
        0.0345,  0.0279, -0.0385,  0.0017,  0.1389, -0.0318, -0.0322,
       -0.0802, -0.0694, -0.0643,  0.0778,  0.0747, -0.0437, -0.0723,
        0.0352,  0.0326,  0.0429, -0.0019,  0.1215, -0.0162,  0.0679,
        0.0049,  0.115 , -0.1836,  0.1341,  0.0997, -0.0246, -0.0155,
       -0.025 ,  0.0566, -0.0245,  0.0154,  0.0234,  0.0168,  0.021 ,
        0.1137,  0.0786, -0.1198,  0.0025,  0.0605,  0.1162, -0.0091,
       -0.0163, -0.046 , -0.0303, -0.0147, -0.0548,  0.0414, -0.1429,
       -0.0026, -0.0