In [1]:
import json

def read_squad(path):
    # open JSON file and load intro dictionary
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    # initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # check if we need to be extracting from 'answers' or 'plausible_answers'
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

In [2]:
# execute our read SQuAD function for training and validation sets
train_contexts_old, train_questions_old, train_answers_old = read_squad('squad/train-v2.0.json')

In [3]:
with open("train-indexes.json", 'r') as trainfile:
    rand_i_train = json.load(trainfile)
    
with open("test-indexes.json", 'r') as testfile:
    rand_i_test = json.load(testfile)

In [4]:
#Use 90% for train
train_contexts = []
train_questions = []
train_answers = []

for i in rand_i_train:
    train_contexts.append(train_contexts_old[i])
    train_questions.append(train_questions_old[i])
    train_answers.append(train_answers_old[i])

In [5]:
test_contexts = []
test_questions = []
test_answers = []

for i in rand_i_test:
    test_contexts.append(train_contexts_old[i])
    test_questions.append(train_questions_old[i])
    test_answers.append(train_answers_old[i])

In [6]:
def add_end_idx(answers, contexts):
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            # if the answer is not off :)
            answer['answer_end'] = end_idx
        else:
            # this means the answer is off by 1-2 tokens
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n

In [7]:
# and apply the function to our two answer lists
add_end_idx(train_answers, train_contexts)

In [8]:
from transformers import RobertaTokenizerFast, RobertaModel

In [9]:
t_dir = '/scratch/mahmadin/.cache/huggingface/transformers'
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base',cache_dir=t_dir)
roberta_model = RobertaModel.from_pretrained('roberta-base', return_dict=True, add_pooling_layer=False, cache_dir=t_dir)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# tokenize
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)

In [10]:
def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        # end position cannot be found, char_to_token found space, so shift position until found
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [12]:
# apply function to our data
add_token_positions(train_encodings, train_answers)

In [13]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [14]:
roberta_model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [11]:
import json

with open("tokenized-features-final-ints.json", 'r') as f:
    features = json.load(f)

In [16]:
# Select only the last 3 features (removing one of the features each time)
for f in features:
    for each in f:
        each.pop(3)

In [17]:
# Pad features (In case of poping)
for f in features:
    f += [[0,0,0]] * (512 - len(f))

In [12]:
# Pad features
for f in features:
    f += [[0,0,0,0]] * (512 - len(f))

In [13]:
#Use 90% of features
train_features = []
for i in rand_i_train:
    train_features.append(features[i])

In [14]:
test_features = []
for i in rand_i_test:
    test_features.append(features[i])

In [15]:
import torch
from torch import nn

class QANetwork(torch.nn.Module):
    def __init__(self):
        super(QANetwork, self).__init__()
        self.num_labels = 2
        
        # when poping
        #self.hidden_size = 768 + 3
        
        # with 3 features (linear layer)
        #self.hidden_size = 768 + 3
        
        # with 4 features
        self.hidden_size = 768 + 4
        
        self.roberta = roberta_model
        self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)
        
        # for having 3 features (linear layer)
        #self.features_linear_layer = nn.Linear(4, 3)
        #self.features_relu = nn.ReLU()
        #self.features_lstm = nn.LSTM(input_size=4, hidden_size=3)

    def forward(self, input_ids, attention_mask, start_positions=None, end_positions=None, features=None):
        
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            #token_type_ids=token_type_ids,
            #output_attentions=output_attentions,         Include these later if needed
            #output_hidden_states=output_hidden_states,
            #return_dict=return_dict,
        )
        sequence_output = outputs[0]
        
        # for having 3 features (linear layer)
        #features_torch = torch.tensor(features)
        #features_torch = features_torch.type(torch.float)
        #features_linear_output = self.features_linear_layer(features_torch)
        #features_output = self.features_relu(features_linear_output)
        #features_output, (hn, cn) = self.features_lstm(features_torch)
        #sequence_output = torch.cat([sequence_output, features_output], 2)
        
        # for having 4 features (without linear layer)
        sequence_output = torch.cat([sequence_output, features], 2)
        
        # when poping with 3 features
        #sequence_output = torch.cat([sequence_output, features], 2)
        
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()
        
        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
        
        return total_loss, start_logits, end_logits

In [16]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, features):
        self.encodings = encodings
        self.features = features

    def __getitem__(self, idx):
        sub = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        sub['features'] = torch.tensor(self.features[idx])
        return sub

    def __len__(self):
        return len(self.encodings.input_ids)

In [22]:
# build datasets for both our training and validation sets
train_dataset = MyDataset(train_encodings, train_features)

In [17]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

  from cryptography import utils, x509


In [18]:
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [25]:
qa_model = nn.DataParallel(QANetwork().to(device))

In [26]:
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(qa_model.parameters(), lr=5e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)



In [27]:
for epoch in range(3):
    # set model to train mode
    qa_model.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = qa_model(input_ids, attention_mask, start_positions, end_positions, batch_features)
        # extract loss
        loss = outputs[0]  # 0: total loss, 1: start logits, 2: end logits
        # calculate loss for every parameter that needs grad update
        loss.sum().backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.sum().item())

Epoch 0: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7331/7331 [45:58<00:00,  2.66it/s, loss=4.45]  
Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7331/7331 [38:54<00:00,  3.14it/s, loss=0.709] 
Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7331/7331 [35:42<00:00,  3.42it/s, loss=3.71]  


In [19]:
# these functions are heavily influenced by the HF squad_metrics.py script
from nltk.tokenize import word_tokenize
import collections

def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    inputs = tokenizer(truth, return_tensors='pt', add_special_tokens=False)
    truth = tokenizer.decode(inputs['input_ids'][0])

    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    inputs = tokenizer(truth, return_tensors='pt', add_special_tokens=False)
    truth = tokenizer.decode(inputs['input_ids'][0])

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens),int(pred_tokens == truth_tokens),int(pred_tokens == truth_tokens)

    common_tokens = collections.Counter(truth_tokens) & collections.Counter(pred_tokens)
    num_same = sum(common_tokens.values())

    # if there are no common tokens then f1 = 0
    if num_same == 0:
        return 0,0,0

    prec = 1.0 * num_same / len(pred_tokens)
    rec = 1.0 * num_same / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec), prec, rec

def Jaccard_index(context,answer,prediction):

    inputs = tokenizer(answer, return_tensors='pt', add_special_tokens=False)
    gold_answer0 = tokenizer.decode(inputs['input_ids'][0])

    inputs = tokenizer(context, return_tensors='pt', add_special_tokens=False)
    context = tokenizer.decode(inputs['input_ids'][0])

    prediction = normalize_text(prediction)
    gold_answer0 = normalize_text(gold_answer0)

    text=" ".join(word_tokenize(context)).lower()
    gold_answers=" ".join(word_tokenize(gold_answer0)).lower()
    prediction = " ".join(word_tokenize(prediction)).lower()
    if prediction=='':
        pred_set=set()
    else:
        pred_start = text.find(prediction)
        pred_end = len(text) - (text[::-1].find(prediction[::-1]))
        pred_set = set(list(range(pred_start, pred_end)))
        if pred_start==-1 or pred_end==-1:
            pred_set=set()

    if gold_answers=='':
        gold_start = 0
        gold_end = 0
        gold_set=set()
    else:
        gold_start = text.find(gold_answers)
        gold_end = len(text) - (text[::-1].find(gold_answers[::-1]))
        # gold_start = example.answers[0]['answer_start']
        # gold_end = example.answers[0]['answer_end']
        gold_set = set(list(range(gold_start, gold_end)))
        if gold_start==-1 or gold_end==-1:
            gold_set=set()


    intersection=gold_set.intersection(pred_set)
    union=gold_set.union(pred_set)


    intersection_list=list(intersection)
    union_list=list(union)


    intersection_list.sort()
    union_list.sort()

    if not intersection_list:
        intersection_word=''
    else:
        intersection_word=text[intersection_list[0]:intersection_list[-1] + 1]
    if not union_list:
        union_words=''
    else:
        union_words=text[union_list[0]:union_list[-1]+1]

    intersection_word_length=len(word_tokenize(intersection_word))
    union_word_length=len(word_tokenize(union_words))

    if intersection_word_length==0 and union_word_length==0:
        JI=1
    else:
        JI=intersection_word_length/union_word_length

    return JI

## Save Models

In [34]:
# roberta-squad with 4 features
model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [28]:
# roberta-squad with 3 features (linear layer)
model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-with-features-linear-layer'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [27]:
# roberta-squad with 3 features (LSTM)
model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-with-features-lstm-layer'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [29]:
# bert-squad without NER
model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-without-NER-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [29]:
# bert-squad without POS
model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-without-POS-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [34]:
# bert-squad without DEP
model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-without-DEP-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [29]:
# bert-squad without STOP
model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-without-STOP-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

## Use 90% of training set to train and 10% to test

In [20]:
add_end_idx(test_answers, test_contexts)

In [21]:
# tokenize
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

In [22]:
add_token_positions(test_encodings, test_answers)

In [23]:
# build datasets for both our training and validation sets
test_dataset = MyDataset(test_encodings, test_features)

# initialize data loader for training data
test_loader = DataLoader(test_dataset, batch_size=16)

## With 4 Features

In [24]:
# roberta-squad with 4 features

#Load the model
model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set'
qa_model = nn.DataParallel(QANetwork().to(device))
qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []
predicted_answers = []
true_answers = []
contexts = []
questions = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            start_q = input_ids[i].tolist().index(2)+1
            end_q = len(input_ids[i])

            question_tokens = input_ids[i, input_ids[i].tolist().index(2, start_q, end_q)+1: input_ids[i].tolist().index(2, start_q+1, end_q)]
            question = tokenizer.decode(question_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))
            
            predicted_answers.append(predicted_answer)
            true_answers.append(true_answer)
            contexts.append(context)
            questions.append(question)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 815/815 [05:19<00:00,  2.55it/s]


In [75]:
def Average(lst):
    return sum(lst) / len(lst)

In [66]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:81.22675888821398
Precision:83.09100463148701
Recall:84.17201627565164
Exact Match:69.30632289748311
Jaccard Index:79.76349112867409


## With 3 Features Linear Layer

In [34]:
# bert-squad with 3 features (linear layer)

#Load the model
#model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-with-features-linear-layer'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 815/815 [04:34<00:00,  2.97it/s]


In [None]:
def Average(lst):
    return sum(lst) / len(lst)

In [35]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:81.74977158718615
Precision:84.13831367479926
Recall:83.60941314799581
Exact Match:70.10435850214856
Jaccard Index:80.46127518346373


## With 3 Features LSTM

In [25]:
# bert-squad with 3 features (LSTM)

#Load the model
model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-with-features-lstm-layer'
qa_model = nn.DataParallel(QANetwork().to(device))
qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

  self.dropout, self.training, self.bidirectional, self.batch_first)
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 815/815 [04:59<00:00,  2.72it/s]


In [26]:
def Average(lst):
    return sum(lst) / len(lst)

In [27]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:81.0519508002631
Precision:82.72607402174856
Recall:84.13021805981082
Exact Match:68.87661141804789
Jaccard Index:79.83615038433973


## Without NER Without NN

In [34]:
# roberta-squad without NER without nn

#Load the model
#model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-without-NER-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 815/815 [02:59<00:00,  4.54it/s]


In [35]:
def Average(lst):
    return sum(lst) / len(lst)

In [36]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:81.229352291682
Precision:83.405247470436
Recall:83.53333072175167
Exact Match:69.06077348066299
Jaccard Index:79.89225684178194


## Without POS Without NN

In [34]:
# roberta-squad without POS without nn

#Load the model
#model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-without-POS-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 815/815 [03:03<00:00,  4.45it/s]


In [35]:
def Average(lst):
    return sum(lst) / len(lst)

In [36]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:80.83951188068835
Precision:83.75422538894087
Recall:82.51169346252
Exact Match:69.24493554327809
Jaccard Index:79.63116105201948


## Without DEP Without NN

In [35]:
# roberta-squad without DEP without nn

#Load the model
#model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-without-DEP-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 815/815 [03:00<00:00,  4.52it/s]


In [36]:
def Average(lst):
    return sum(lst) / len(lst)

In [37]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:80.91836353504542
Precision:82.5332943965528
Recall:84.07795969200103
Exact Match:68.8305709023941
Jaccard Index:79.48690870460051


## Without STOP Without NN

In [34]:
# roberta-squad without STOP without nn

#Load the model
#model_path = '/scratch/mahmadin/models/roberta-squad-with-features-90-train-set-without-STOP-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 815/815 [02:50<00:00,  4.78it/s]


In [35]:
def Average(lst):
    return sum(lst) / len(lst)

In [36]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:81.41187928218856
Precision:83.52535702285037
Recall:83.80827110241228
Exact Match:69.2756292203806
Jaccard Index:79.87761252108753


## Without the Features

In [25]:
import torch
from torch import nn

class QANetwork(torch.nn.Module):
    def __init__(self):
        super(QANetwork, self).__init__()
        self.num_labels = 2
        self.hidden_size = 768 
        self.roberta = roberta_model
        self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, start_positions=None, end_positions=None, features=None):
        
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            #token_type_ids=token_type_ids,
            #output_attentions=output_attentions,         Include these later if needed
            #output_hidden_states=output_hidden_states,
            #return_dict=return_dict,
        )
        sequence_output = outputs[0]
        
        # Concatenate logits with features
        if features is not None:
            sequence_output = torch.cat([sequence_output, features], 2)
        
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()
        
        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
        
        return total_loss, start_logits, end_logits

In [26]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        sub = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return sub

    def __len__(self):
        return len(self.encodings.input_ids)

In [18]:
# build datasets for both our training and validation sets
train_dataset = MyDataset(train_encodings)

In [19]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [20]:
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [21]:
qa_model_no_features = nn.DataParallel(QANetwork().to(device))

In [22]:
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(qa_model_no_features.parameters(), lr=5e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [23]:
for epoch in range(3):
    # set model to train mode
    qa_model_no_features.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        #batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = qa_model_no_features(input_ids, attention_mask, start_positions, end_positions)
        # extract loss
        loss = outputs[0]  # 0: total loss, 1: start logits, 2: end logits
        # calculate loss for every parameter that needs grad update
        loss.sum().backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.sum().item())

Epoch 0: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7331/7331 [35:52<00:00,  3.41it/s, loss=2.84]  
Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7331/7331 [35:21<00:00,  3.46it/s, loss=4.11]  
Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7331/7331 [35:22<00:00,  3.45it/s, loss=1.74]   


## Saving the Model

In [34]:
model_path = '/scratch/mahmadin/models/roberta-squad-without-features-90-train-set'
#Save the model
torch.save(qa_model_no_features.state_dict(), model_path)

## Use 90% of training set to train and 10% to test

In [79]:
add_end_idx(test_answers, test_contexts)

In [80]:
# tokenize
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

In [27]:
add_token_positions(test_encodings, test_answers)

In [28]:
# build datasets for both our training and validation sets
test_dataset = MyDataset(test_encodings)

# initialize data loader for training data
test_loader = DataLoader(test_dataset, batch_size=16)

In [29]:
#Load the model
model_path = '/scratch/mahmadin/models/roberta-squad-without-features-90-train-set'
qa_model_no_features = nn.DataParallel(QANetwork().to(device))
qa_model_no_features.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model_no_features.eval()


em_scores_without_features = []
f1_scores_without_features = []
precision_scores_without_features = []
recall_scores_without_features = []
JI_scores_without_features = []
predicted_answers_without_features = []
true_answers_without_features = []
contexts_without_features = []
questions_without_features = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        #batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model_no_features(input_ids, attention_mask)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores_without_features.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores_without_features.append(scores[0])
            precision_scores_without_features.append(scores[1])
            recall_scores_without_features.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            start_q = input_ids[i].tolist().index(2)+1
            end_q = len(input_ids[i])

            question_tokens = input_ids[i, input_ids[i].tolist().index(2, start_q, end_q)+1: input_ids[i].tolist().index(2, start_q+1, end_q)]
            question = tokenizer.decode(question_tokens)

            JI_scores_without_features.append(Jaccard_index(context, true_answer, predicted_answer))
            
            predicted_answers_without_features.append(predicted_answer)
            true_answers_without_features.append(true_answer)
            contexts_without_features.append(context)
            questions_without_features.append(question)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 815/815 [05:04<00:00,  2.67it/s]


In [32]:
def Average(lst):
    return sum(lst) / len(lst)

In [33]:
print(f"F1_score:{Average(f1_scores_without_features)*100}")
print(f"Precision:{Average(precision_scores_without_features)*100}")
print(f"Recall:{Average(recall_scores_without_features)*100}")
print(f"Exact Match:{Average(em_scores_without_features)*100}")
print(f"Jaccard Index:{Average(JI_scores_without_features)*100}")

F1_score:81.1516372409529
Precision:83.76271421394067
Recall:83.10689688456485
Exact Match:68.96101903007981
Jaccard Index:79.65136505896876


## Test Cases on F1

In [30]:
counter = 0
for i in range(len(f1_scores)):
    if f1_scores[i] > f1_scores_without_features[i]:
        print(f"Instance: {i}")
        print(f"Question: {questions[i]}")
        print(f"Context: {contexts[i]}")
        print(f"True answer: {true_answers[i]}")
        
        print(f"f1 with features: {f1_scores[i]}")
        print(f"Predicted answer with features: {predicted_answers[i]}")
        
        
        print(f"f1 without features: {f1_scores_without_features[i]}")
        print(f"Predicted answer without features: {predicted_answers_without_features[i]}")
        
        counter+=1
        print("\n")
print(f"number of improvments: {counter}")

Instance: 8
Question: What city was Beyonc√©'s elementary school located in?
Context: Beyonc√© attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when dance instructor Darlette Johnson began humming a song and she finished it, able to hit the high-pitched notes. Beyonc√©'s interest in music and performing continued after winning a school talent show at age seven, singing John Lennon's "Imagine" to beat 15/16-year-olds. In fall of 1990, Beyonc√© enrolled in Parker Elementary School, a music magnet school in Houston, where she would perform with the school's choir. She also attended the High School for the Performing and Visual Arts and later Alief Elsik High School. Beyonc√© was also a member of the choir at St. John's United Methodist Church as a soloist for two years.
True answer:  Fredericksburg,
f1 with features: 1.0
Predicted answer with features:  Fredericksburg,
f1 without features: 0
Predicted an

Instance: 1362
Question: How old was Mabel when she became deaf?
Context: Deciding to give up his lucrative private Boston practice, Bell retained only two students, six-year-old "Georgie" Sanders, deaf from birth, and 15-year-old Mabel Hubbard. Each pupil would play an important role in the next developments. George's father, Thomas Sanders, a wealthy businessman, offered Bell a place to stay in nearby Salem with Georgie's grandmother, complete with a room to "experiment". Although the offer was made by George's mother and followed the year-long arrangement in 1872 where her son and his nurse had moved to quarters next to Bell's boarding house, it was clear that Mr. Sanders was backing the proposal. The arrangement was for teacher and student to continue their work together, with free room and board thrown in. Mabel was a bright, attractive girl who was ten years Bell's junior, but became the object of his affection. Having lost her hearing after a near-fatal bout of scarlet fever clo

Predicted answer without features:  1861.


Instance: 3022
Question: The state ranks second in the production of what type of wood?
Context: Agriculture is a relatively small component of the state's economy and varies greatly due to the varying climate across the state. The state ranked first in Mexico for the production of the following crops: oats, chile verde, cotton, apples, pecans, and membrillo. The state has an important dairy industry with large milk processors throughout the state. Delicias is home to Alpura, the second-largest dairy company in Mexico. The state has a large logging industry ranking second in oak and third in pine in Mexico. The mining industry is a small but continues to produce large amounts of minerals. The state ranked first place in the country for the production of lead with 53,169 metric tons. Chihuahua ranked second in Mexico for zinc at 150,211 metric tons, silver at 580,271 kg, and gold at 15,221.8 kg.
True answer:  oak
f1 with features: 1.0
Predicte

f1 without features: 0.8571428571428571
Predicted answer without features:  southernmost active volcano.


Instance: 4248
Question: What British citizens established Grtviken on Antarctica?
Context: The first semi-permanent inhabitants of regions near Antarctica (areas situated south of the Antarctic Convergence) were British and American sealers who used to spend a year or more on South Georgia, from 1786 onward. During the whaling era, which lasted until 1966, the population of that island varied from over 1,000 in the summer (over 2,000 in some years) to some 200 in the winter. Most of the whalers were Norwegian, with an increasing proportion of Britons. The settlements included Grytviken, Leith Harbour, King Edward Point, Stromness, Husvik, Prince Olav Harbour, Ocean Harbour and Godthul. Managers and other senior officers of the whaling stations often lived together with their families. Among them was the founder of Grytviken, Captain Carl Anton Larsen, a prominent Norwegian whaler



Instance: 6217
Question: What usage that causes worker damage have some countries been reported doing? 
Context: In addition to concerns over subsidies, the cotton industries of some countries are criticized for employing child labor and damaging workers' health by exposure to pesticides used in production. The Environmental Justice Foundation has campaigned against the prevalent use of forced child and adult labor in cotton production in Uzbekistan, the world's third largest cotton exporter. The international production and trade situation has led to "fair trade" cotton clothing and footwear, joining a rapidly growing market for organic clothing, fair fashion or "ethical fashion". The fair trade system was initiated in 2005 with producers from Cameroon, Mali and Senegal.
True answer:  exposure to pesticides
f1 with features: 0.5
Predicted answer with features:  pesticides
f1 without features: 0
Predicted answer without features:  employing child labor


Instance: 6219
Question: What

f1 with features: 0.6666666666666666
Predicted answer with features:  sub-schools,
f1 without features: 0
Predicted answer without features:  non-theistic monism,


Instance: 7727
Question: To what does the concept of atman refer?
Context: According to this school of Vedanta, all reality is Brahman, and there exists nothing whatsoever which is not Brahman. Its metaphysics includes the concept of mƒÅyƒÅ and ƒÅtman. MƒÅyƒÅ connotes "that which exists, but is constantly changing and thus is spiritually unreal". The empirical reality is considered as always changing and therefore "transitory, incomplete, misleading and not what it appears to be". The concept of ƒÅtman is of soul, self within each person, each living being. Advaita Vedantins assert that ƒÅtman is same as Brahman, and this Brahman is within each human being and all life, all living beings are spiritually interconnected, and there is oneness in all of existence. They hold that dualities and misunderstanding of mƒÅyƒÅ as the s

f1 without features: 0
Predicted answer without features: 


Instance: 9840
Question: What is Uttar Pradesh known for?
Context: Rajasthan (/Ààr…ëÀêd í…ôst√¶n/ Hindustani pronunciation: [raÀêd í…ôsÀàtÃ™ ∞aÀên] ( listen); literally, "Land of Kings") is India's largest state by area (342,239 square kilometres (132,139 sq mi) or 10.4% of India's total area). It is located on the western side of the country, where it comprises most of the wide and inhospitable Thar Desert (also known as the "Rajasthan Desert" and "Great Indian Desert") and shares a border with the Pakistani provinces of Punjab to the northwest and Sindh to the west, along the Sutlej-Indus river valley. Elsewhere it is bordered by the other Indian states: Punjab to the north; Haryana and Uttar Pradesh to the northeast; Madhya Pradesh to the southeast; and Gujarat to the southwest. Its features include the ruins of the Indus Valley Civilization at Kalibanga; the Dilwara Temples, a Jain pilgrimage site at Rajasthan's only hill

Predicted answer without features:  bone throwing skill ("kushaya ematsambo")


Instance: 11648
Question: What did translators shape in the course of their work?
Context: Translation has served as a school of writing for many authors. Translators, including monks who spread Buddhist texts in East Asia, and the early modern European translators of the Bible, in the course of their work have shaped the very languages into which they have translated. They have acted as bridges for conveying knowledge between cultures; and along with ideas, they have imported from the source languages, into their own languages, loanwords and calques of grammatical structures, idioms and vocabulary.
True answer:  languages
f1 with features: 1.0
Predicted answer with features:  languages
f1 without features: 0.25
Predicted answer without features:  the very languages into which they have translated.


Instance: 11652
Question: What type of translation is still never accurate and reliable?
Context: Web-based 

In [33]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Beyonc√© attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes.")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

Beyonc√© nsubj attended VERB []
attended ROOT attended VERB [Beyonc√©, School, .]
St. compound Mary PROPN []
Mary poss School PROPN [St., 's]
's case Mary PROPN []
Elementary compound School PROPN []
School dobj attended VERB [Mary, Elementary, in, enrolled]
in prep School PROPN [Fredericksburg]
Fredericksburg pobj in ADP [,, Texas, ,]
, punct Fredericksburg PROPN []
Texas appos Fredericksburg PROPN []
, punct Fredericksburg PROPN []
where advmod enrolled VERB []
she nsubj enrolled VERB []
enrolled relcl School PROPN [where, she, in]
in prep enrolled VERB [classes]
dance compound classes NOUN []
classes pobj in ADP [dance]
. punct attended VERB []


In [34]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("his love life and his early death have made him, in the public consciousness, a leading symbol of the Romantic era.")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])


his poss life NOUN []
love compound life NOUN []
life nsubj made VERB [his, love, and, death]
and cc life NOUN []
his poss death NOUN []
early amod death NOUN []
death conj life NOUN [his, early]
have aux made VERB []
made ROOT made VERB [life, have, him, ,, in, .]
him dobj made VERB []
, punct made VERB []
in prep made VERB [consciousness]
the det consciousness NOUN []
public amod consciousness NOUN []
consciousness pobj in ADP [the, public, ,, symbol]
, punct consciousness NOUN []
a det symbol NOUN []
leading amod symbol NOUN []
symbol appos consciousness NOUN [a, leading, of]
of prep symbol NOUN [era]
the det era NOUN []
Romantic amod era NOUN []
era pobj of ADP [the, Romantic]
. punct made VERB []


In [38]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The parish baptismal record gives his birthday as 22 February 1810, and cites his given names in the Latin form Fridericus Franciscus (in Polish, he was Fryderyk Franciszek)")

for ent in doc.ents:
    print(ent.text, ent.label_)

February 1810 DATE
Latin NORP
Fridericus Franciscus ORG
Polish NORP
Fryderyk Franciszek PERSON


In [39]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The parish baptismal record gives his birthday as 22 February 1810, and cites his given names in the Latin form Fridericus Franciscus (in Polish, he was Fryderyk Franciszek)")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

The det record NOUN []
parish amod record NOUN []
baptismal compound record NOUN []
record nsubj gives VERB [The, parish, baptismal]
gives ccomp was AUX [record, birthday, as, ,, and, cites]
his poss birthday NOUN []
birthday dobj gives VERB [his]
as prep gives VERB [February]
22 nummod February PROPN []
February pobj as ADP [22, 1810]
1810 nummod February PROPN []
, punct gives VERB []
and cc gives VERB []
cites conj gives VERB [names]
his poss names NOUN []
given amod names NOUN []
names dobj cites VERB [his, given, in]
in prep names NOUN [form]
the det form NOUN []
Latin amod form NOUN []
form pobj in ADP [the, Latin, Franciscus]
Fridericus compound Franciscus PROPN []
Franciscus appos form NOUN [Fridericus, (, in]
( punct Franciscus PROPN []
in prep Franciscus PROPN [Polish]
Polish pobj in ADP []
, punct was AUX []
he nsubj was AUX []
was ROOT was AUX [gives, ,, he, Franciszek, )]
Fryderyk compound Franciszek PROPN []
Franciszek attr was AUX [Fryderyk]
) punct was AUX []


In [88]:
counter = 0
for i in range(len(f1_scores)):
    if f1_scores[i] < f1_scores_without_features[i]:
        print(f"Question: {questions[i]}")
        print(f"Context: {contexts[i]}")
        print(f"True answer: {true_answers[i]}")
        
        print(f"f1 with features: {f1_scores[i]}")
        print(f"Predicted answer with features: {predicted_answers[i]}")
        
        
        print(f"f1 without features: {f1_scores_without_features[i]}")
        print(f"Predicted answer without features: {predicted_answers_without_features[i]}")
        
        counter+=1
        
print(f"number of mistakes: {counter}")

Question: What does she get credits for in her music?
Context: She has received co-writing credits for most of the songs recorded with Destiny's Child and her solo efforts. Her early songs were personally driven and female-empowerment themed compositions like "Independent Women" and "Survivor", but after the start of her relationship with Jay Z she transitioned to more man-tending anthems such as "Cater 2 U". Beyonc√© has also received co-producing credits for most of the records in which she has been involved, especially during her solo efforts. However, she does not formulate beats herself, but typically comes up with melodies and ideas during production, sharing them with producers.
True answer:  co-producing credits
f1 with features: 0
Predicted answer with features:  co-writing
f1 without features: 0.07142857142857142
Predicted answer without features:  co-writing credits for most of the songs recorded with Destiny's Child and her solo efforts. Her early songs were personally driv

Context: Computer security, also known as cybersecurity or IT security, is the protection of information systems from theft or damage to the hardware, the software, and to the information on them, as well as from disruption or misdirection of the services they provide. It includes controlling physical access to the hardware, as well as protecting against harm that may come via network access, data and code injection, and due to malpractice by operators, whether intentional, accidental, or due to them being tricked into deviating from secure procedures.
True answer:  data and code injection, and due to malpractice by operators,
f1 with features: 0.5
Predicted answer with features:  network access, data and code injection,
f1 without features: 0.5405405405405406
Predicted answer without features:  controlling physical access to the hardware, as well as protecting against harm that may come via network access, data and code injection, and due to malpractice by operators,
Question: What ne

Question: What don't the tradeoffs of digital piracy support?
Context: According to the same study, even though digital piracy inflicts additional costs on the production side of media, it also offers the main access to media goods in developing countries. The strong tradeoffs that favor using digital piracy in developing economies dictate the current neglected law enforcements toward digital piracy. In China, the issue of digital infringement is not merely legal, but social ‚Äì originating from the high demand for cheap and affordable goods as well as the governmental connections of the businesses which produce such goods.
True answer:  current neglected law enforcements
f1 with features: 0.6666666666666666
Predicted answer with features:  law enforcements
f1 without features: 1.0
Predicted answer without features:  the current neglected law enforcements
Question: Which government lost $422 million in potential tax money?
Context: In 2007, the Institute for Policy Innovation (IPI) rep

True answer:  Channel Island club entered the competition when Guernsey F.C. competed for the first time.
f1 with features: 0.0425531914893617
Predicted answer with features:  other teams from Wales, Ireland and Scotland also took part in the competition, with Glasgow side Queen's Park losing the final to Blackburn Rovers in 1884 and 1885 before being barred from entering by the Scottish Football Association.
f1 without features: 0.1
Predicted answer without features:  Ireland and Scotland also took part in the competition,
Question: What is the most games played a team to reach the final?
Context: Until the 1990s further replays would be played until one team was victorious. Some ties took as many as six matches to settle; in their 1975 campaign, Fulham played a total of 12 games over six rounds, which remains the most games played by a team to reach a final. Replays were traditionally played three or four days after the original game, but from 1991‚Äì92 they were staged at least 10 d

f1 with features: 0
Predicted answer with features: 
f1 without features: 1.0
Predicted answer without features:  free market
Question: Hayek believed that authoritarianism was very different from what?
Context: For Hayek, the supposedly stark difference between authoritarianism and totalitarianism has much importance and Hayek places heavy weight on this distinction in his defence of transitional dictatorship. For example, when Hayek visited Venezuela in May 1981, he was asked to comment on the prevalence of totalitarian regimes in Latin America. In reply, Hayek warned against confusing "totalitarianism with authoritarianism," and said that he was unaware of "any totalitarian governments in Latin America. The only one was Chile under Allende". For Hayek, however, the word 'totalitarian' signifies something very specific: the want to ‚Äúorganize the whole of society‚Äù to attain a ‚Äúdefinite social goal‚Äù ‚Äîwhich is stark in contrast to ‚Äúliberalism and individualism‚Äù.
True answe

True answer:  an unprecedented level
f1 with features: 0.6666666666666666
Predicted answer with features:  unprecedented
f1 without features: 1.0
Predicted answer without features:  unprecedented level
Question: What are the Majority Leader's duties in Westminster?
Context: Unlike in Westminster style legislatures or as with the Senate Majority Leader, the House Majority Leader's duties and prominence vary depending upon the style and power of the Speaker of the House. Typically, the Speaker does not participate in debate and rarely votes on the floor. In some cases, Majority Leaders have been more influential than the Speaker; notably Tom DeLay who was more prominent than Speaker Dennis Hastert. In addition, Speaker Newt Gingrich delegated to Dick Armey an unprecedented level of authority over scheduling legislation on the House floor.
True answer:  vary depending upon the style and power of the Speaker of the House.
f1 with features: 0.10000000000000002
Predicted answer with features