In [38]:
import json

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    Every answer of every question becomes one row, so a question with
    several answers appears several times.

    Args:
        path: Location of the SQuAD JSON file.

    Returns:
        ``(contexts, questions, answers)`` where index ``k`` of each list
        describes the same example. ``answers[k]`` is the answer dict taken
        straight from the file.
    """
    with open(path, 'rb') as handle:
        squad_dict = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in squad_dict['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                # when a question carries 'plausible_answers', read those instead of 'answers'
                field = 'plausible_answers' if 'plausible_answers' in qa.keys() else 'answers'
                found = qa[field]
                # one row per answer; context and question are repeated to stay aligned
                contexts.extend([context] * len(found))
                questions.extend([question] * len(found))
                answers.extend(found)
    return contexts, questions, answers

In [39]:
# run read_squad on the SQuAD v2.0 training file only; the train/test split
# used in this notebook is carved out of these lists further down
train_contexts_old, train_questions_old, train_answers_old = read_squad('squad/train-v2.0.json')

In [40]:
with open("train-indexes.json", 'r') as trainfile:
    rand_i_train = json.load(trainfile)
    
with open("test-indexes.json", 'r') as testfile:
    rand_i_test = json.load(testfile)

In [41]:
# Training split (the "90%" part): keep the examples whose positions are listed in rand_i_train
train_contexts = [train_contexts_old[i] for i in rand_i_train]
train_questions = [train_questions_old[i] for i in rand_i_train]
train_answers = [train_answers_old[i] for i in rand_i_train]

In [42]:
# Held-out split: keep the examples whose positions are listed in rand_i_test
test_contexts = [train_contexts_old[i] for i in rand_i_test]
test_questions = [train_questions_old[i] for i in rand_i_test]
test_answers = [train_answers_old[i] for i in rand_i_test]

In [43]:
def add_end_idx(answers, contexts):
    """Add an exclusive ``answer_end`` character offset to every answer, in place.

    The recorded ``answer_start`` is sometimes one or two characters too far
    to the right. When the text at the recorded position does not equal the
    answer text, the span is shifted left by 1 and then 2 characters and the
    first shift that lines up is kept (``answer_start`` is corrected too).

    If no shift lines up, the recorded start is kept and ``answer_end`` is
    still written, so every answer dict leaves this function with an
    ``answer_end`` key and later steps do not fail with ``KeyError``.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'``; mutated.
        contexts: List of context strings, aligned with ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        # gold_text is the answer we expect to find inside the context
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] != gold_text:
            # the annotation is off by a character or two: try shifting left
            for n in (1, 2):
                if start_idx - n < 0:
                    # a negative start would wrap around to the end of the string
                    break
                if context[start_idx - n:end_idx - n] == gold_text:
                    start_idx -= n
                    end_idx -= n
                    break  # keep the smallest shift that matches

        answer['answer_start'] = start_idx
        answer['answer_end'] = end_idx

In [7]:
# apply the function to the training answers (the test answers are processed further down)
add_end_idx(train_answers, train_contexts)

In [44]:
from transformers import BertTokenizerFast, BertModel

In [45]:
# NOTE(review): user-specific absolute cache path; point t_dir at a local
# directory before re-running this notebook on another machine.
t_dir = '/scratch/mahmadin/.cache/huggingface/transformers'
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased',cache_dir=t_dir)
# add_pooling_layer=False: the pooler weights of the checkpoint are not loaded,
# which is why the warning printed below lists 'bert.pooler.*' as unused.
bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True, add_pooling_layer=False, cache_dir=t_dir)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# tokenize
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)

In [46]:
def add_token_positions(encodings, answers, max_length=None):
    """Convert character-level answer spans to token indices and store them.

    Adds ``'start_positions'`` and ``'end_positions'`` to ``encodings``.

    Args:
        encodings: Tokenizer output exposing ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``
            (exclusive) character offsets, aligned with ``encodings``. When
            ``'answer_end'`` is absent it is derived from ``'text'``.
        max_length: Sentinel index stored for spans that cannot be located.
            Defaults to the notebook-level ``tokenizer.model_max_length``.

    Returns:
        None; ``encodings`` is updated in place.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        char_start = answer['answer_start']
        char_end = answer.get('answer_end')
        if char_end is None:
            # tolerate answers that never received an end offset
            char_end = char_start + len(answer.get('text', ''))

        start_token = encodings.char_to_token(i, char_start)
        if start_token is None:
            # The answer start is not in the encoded input (the passage was
            # truncated). Mark BOTH ends with the sentinel; walking the end
            # backwards here would label an unrelated token before the answer.
            start_positions.append(max_length)
            end_positions.append(max_length)
            continue

        # answer_end is exclusive, so it may sit on whitespace or just past the
        # text; step back one character at a time until a token is found.
        # The walk is bounded by the start character, which is known to map.
        # NOTE(review): when the character right after the answer is not
        # whitespace (e.g. punctuation) the first probe already maps to that
        # following token — kept as in the original labelling; confirm intended.
        end_token = None
        for char_idx in range(char_end, char_start - 1, -1):
            end_token = encodings.char_to_token(i, char_idx)
            if end_token is not None:
                break

        start_positions.append(start_token)
        end_positions.append(max_length if end_token is None else end_token)

    # update the encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [12]:
# apply function to our data
add_token_positions(train_encodings, train_answers)

In [13]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [14]:
bert_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [47]:
import json

with open("tokenized-features-final-ints.json", 'r') as f:
    features = json.load(f)

In [17]:
# Ablation: drop one feature from every token's feature list, in place.
# pop(3) removes the value at index 3 (the 4th feature), leaving the first three;
# change the index to ablate a different feature.
# NOTE(review): not idempotent — running this cell twice removes a second value
# (or raises IndexError once fewer than four values remain).
for f in features:
    for each in f:
        each.pop(3)

In [19]:
# Pad features (after popping one feature): extend every example to 512 rows
# of three zeros (presumably the BERT maximum sequence length — confirm).
# Each padding row is a fresh list. `[[0,0,0]] * k` would repeat ONE shared
# list object k times, so any later in-place edit of a row would hit all of them.
for f in features:
    f += [[0, 0, 0] for _ in range(512 - len(f))]

In [48]:
# Pad features: extend every example to 512 rows of four zeros
# (presumably the BERT maximum sequence length — confirm).
# Each padding row is a fresh list. `[[0,0,0,0]] * k` would repeat ONE shared
# list object k times, so a later in-place edit such as `each.pop(3)` would be
# applied to that single list k times and fail.
for f in features:
    f += [[0, 0, 0, 0] for _ in range(512 - len(f))]

In [49]:
# Feature rows for the training split, selected with the same indices as the text data
train_features = [features[i] for i in rand_i_train]

In [50]:
# Feature rows for the held-out split, selected with the same indices as the text data
test_features = [features[i] for i in rand_i_test]

In [51]:
import torch
from torch import nn

class QANetwork(torch.nn.Module):
    """BERT span-extraction head that also sees hand-crafted per-token features.

    The encoder's last hidden states are concatenated with ``features`` along
    the hidden dimension and fed to one linear layer that emits a start logit
    and an end logit for every token.

    Earlier experiments (not part of the active code path) first projected the
    four features down to three with ``nn.Linear(4, 3)`` + ``nn.ReLU()`` or
    with ``nn.LSTM(input_size=4, hidden_size=3)`` before concatenating.

    Args:
        num_features: Number of feature values per token (4 by default; use 3
            for the runs in which one feature was popped).
        bert: Encoder module. Defaults to the notebook-level ``bert_model``.
            It must expose ``config.hidden_size`` and return the sequence of
            hidden states as element 0 of its output.
    """

    def __init__(self, num_features=4, bert=None):
        super().__init__()
        self.num_labels = 2  # one start logit and one end logit per token
        self.num_features = num_features

        self.bert = bert_model if bert is None else bert
        # encoder width (768 for bert-base) plus the appended features
        self.hidden_size = self.bert.config.hidden_size + num_features
        self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids, start_positions=None, end_positions=None, features=None):
        """Run the encoder and score every token as answer start / answer end.

        Args:
            input_ids, attention_mask, token_type_ids: Encoder inputs.
            start_positions, end_positions: Optional gold token indices. The
                loss is only computed when both are given.
            features: Per-token features, concatenated on dim 2, so the shape
                must match the encoder output on the first two dims and have
                ``num_features`` entries on the last. Required.

        Returns:
            ``(total_loss, start_logits, end_logits)``; ``total_loss`` is
            ``None`` when no gold positions were passed.

        Raises:
            ValueError: If ``features`` is not provided.
        """
        if features is None:
            raise ValueError("QANetwork.forward requires the 'features' tensor")

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        sequence_output = outputs[0]

        # the features arrive as integers; cast explicitly so the concat does
        # not depend on implicit dtype promotion
        features = features.to(device=sequence_output.device, dtype=sequence_output.dtype)
        sequence_output = torch.cat([sequence_output, features], 2)

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # on multi-GPU the gather may add a trailing dimension; drop it
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # positions outside the model input are clamped onto an index that
            # the loss is told to ignore
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        return total_loss, start_logits, end_logits

In [15]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with per-example feature rows.

    Args:
        encodings: Mapping from field name to a per-example sequence; must
            also expose ``input_ids`` as an attribute.
        features: Sequence indexed like the encodings, one entry per example.
    """

    def __init__(self, encodings, features):
        self.features = features
        self.encodings = encodings

    def __len__(self):
        # number of examples = number of encoded input rows
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return every encoding field plus ``'features'`` for example ``idx`` as tensors."""
        item = {}
        for name, column in self.encodings.items():
            item[name] = torch.tensor(column[idx])
        item['features'] = torch.tensor(self.features[idx])
        return item

In [None]:
# build the training dataset (the test dataset is built further down)
train_dataset = MyDataset(train_encodings, train_features)

In [16]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

  from cryptography import utils, x509


In [17]:
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [24]:
qa_model = nn.DataParallel(QANetwork().to(device))

In [25]:
# AdamW optimizer; only the learning rate is set here, so weight decay stays
# at the library default.
# NOTE(review): transformers' AdamW is believed to default to weight_decay=0.0,
# i.e. no decay is actually applied — confirm for the installed version.
optim = AdamW(qa_model.parameters(), lr=5e-5)

# data loader for the training data: batches of 16, reshuffled every epoch
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)



In [26]:
# Fine-tune for three epochs over the training loader.
for epoch in range(3):
    qa_model.train()  # training mode (enables dropout)
    loop = tqdm(train_loader, leave=True)  # progress bar over the batches
    for batch in loop:
        # clear the gradients left over from the previous step
        optim.zero_grad()
        # move every tensor the model needs onto the training device
        model_inputs = {
            name: batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'token_type_ids',
                         'start_positions', 'end_positions', 'features')
        }
        # forward pass; outputs = (loss, start logits, end logits)
        outputs = qa_model(**model_inputs)
        loss = outputs[0]
        # under DataParallel the loss has one entry per replica, hence the sum
        batch_loss = loss.sum()
        batch_loss.backward()
        # apply the parameter update
        optim.step()
        # show epoch number and current loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=batch_loss.item())

Epoch 0: 100%|██████████| 7331/7331 [54:21<00:00,  2.25it/s, loss=5.91]  
Epoch 1: 100%|██████████| 7331/7331 [53:41<00:00,  2.28it/s, loss=5.02]  
Epoch 2: 100%|██████████| 7331/7331 [53:49<00:00,  2.27it/s, loss=1.93]   


In [18]:
# these functions are heavily influenced by the HF squad_metrics.py script
from nltk.tokenize import word_tokenize
import collections

def normalize_text(s):
    """Lower-case ``s``, strip punctuation, drop the articles a/an/the and collapse whitespace."""
    import string, re

    # 1. case-fold
    lowered = s.lower()
    # 2. delete every ASCII punctuation character
    punctuation = set(string.punctuation)
    unpunctuated = "".join(ch for ch in lowered if ch not in punctuation)
    # 3. blank out stand-alone articles
    without_articles = re.sub(r"\b(a|an|the)\b", " ", unpunctuated, flags=re.UNICODE)
    # 4. squeeze runs of whitespace and trim both ends
    return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 if prediction and truth are equal after normalisation, else 0.

    ``truth`` is first encoded and decoded with the notebook-level tokenizer
    (no special tokens) before being normalised and compared.
    """
    truth_ids = tokenizer(truth, return_tensors='pt', add_special_tokens=False)['input_ids'][0]
    round_tripped = tokenizer.decode(truth_ids)

    if normalize_text(prediction) == normalize_text(round_tripped):
        return 1
    return 0

def compute_f1(prediction, truth):
    """Token-overlap F1 between a predicted and a gold answer.

    ``truth`` is first encoded and decoded with the notebook-level tokenizer
    (no special tokens). Both strings are then normalised and split on
    whitespace.

    Returns:
        A ``(f1, precision, recall)`` tuple. If either side has no tokens all
        three values are 1 when both are empty and 0 otherwise.
    """
    encoded = tokenizer(truth, return_tensors='pt', add_special_tokens=False)
    truth = tokenizer.decode(encoded['input_ids'][0])

    truth_tokens = normalize_text(truth).split()
    pred_tokens = normalize_text(prediction).split()

    # an empty side means "no answer": score 1 only if both sides are empty
    if not pred_tokens or not truth_tokens:
        agree = int(pred_tokens == truth_tokens)
        return agree, agree, agree

    # multiset intersection counts each shared token at most as often as it occurs on both sides
    overlap = collections.Counter(truth_tokens) & collections.Counter(pred_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0,0,0

    prec = 1.0 * num_same / len(pred_tokens)
    rec = 1.0 * num_same / len(truth_tokens)
    f1 = 2 * (prec * rec) / (prec + rec)
    return f1, prec, rec

def Jaccard_index(context,answer,prediction):
    """Word-level Jaccard index between the gold and predicted spans in ``context``.

    Both answers are located in the context by substring search, turned into
    sets of character positions, and the score is the word count of the
    intersection slice divided by the word count of the union slice.

    Args:
        context: Passage text.
        answer: Gold answer text.
        prediction: Predicted answer text.

    Returns:
        1 when both slices contain no words, otherwise
        ``intersection_word_length / union_word_length``.
    """

    # round-trip the gold answer and the context through the tokenizer
    # (no special tokens)
    inputs = tokenizer(answer, return_tensors='pt', add_special_tokens=False)
    gold_answer0 = tokenizer.decode(inputs['input_ids'][0])

    inputs = tokenizer(context, return_tensors='pt', add_special_tokens=False)
    context = tokenizer.decode(inputs['input_ids'][0])

    # NOTE(review): the two answers are normalised (articles and punctuation
    # removed) but the context below is not, so an answer that contained an
    # article or punctuation may no longer be found verbatim — confirm intended.
    prediction = normalize_text(prediction)
    gold_answer0 = normalize_text(gold_answer0)

    # word-tokenise, re-join with single spaces and lower-case all three strings
    text=" ".join(word_tokenize(context)).lower()
    gold_answers=" ".join(word_tokenize(gold_answer0)).lower()
    prediction = " ".join(word_tokenize(prediction)).lower()
    if prediction=='':
        pred_set=set()
    else:
        # start = first occurrence; end = exclusive end of the LAST occurrence
        # (found by searching the reversed strings).
        # NOTE(review): if the prediction occurs more than once, the span runs
        # from the first to the last occurrence.
        pred_start = text.find(prediction)
        pred_end = len(text) - (text[::-1].find(prediction[::-1]))
        pred_set = set(list(range(pred_start, pred_end)))
        # not found: find() returned -1, so pred_start is -1 (pred_end is then
        # len(text)+1, never -1; the first test is the one that fires)
        if pred_start==-1 or pred_end==-1:
            pred_set=set()

    if gold_answers=='':
        gold_start = 0
        gold_end = 0
        gold_set=set()
    else:
        # same first-occurrence / last-occurrence search as for the prediction
        gold_start = text.find(gold_answers)
        gold_end = len(text) - (text[::-1].find(gold_answers[::-1]))
        # gold_start = example.answers[0]['answer_start']
        # gold_end = example.answers[0]['answer_end']
        gold_set = set(list(range(gold_start, gold_end)))
        if gold_start==-1 or gold_end==-1:
            gold_set=set()


    # character positions shared by both spans / covered by either span
    intersection=gold_set.intersection(pred_set)
    union=gold_set.union(pred_set)


    intersection_list=list(intersection)
    union_list=list(union)


    intersection_list.sort()
    union_list.sort()

    # slice the text from the smallest to the largest position of each set;
    # for disjoint spans the union slice therefore also covers the gap between them
    if not intersection_list:
        intersection_word=''
    else:
        intersection_word=text[intersection_list[0]:intersection_list[-1] + 1]
    if not union_list:
        union_words=''
    else:
        union_words=text[union_list[0]:union_list[-1]+1]

    # count words, not characters
    intersection_word_length=len(word_tokenize(intersection_word))
    union_word_length=len(word_tokenize(union_words))

    # both empty counts as perfect agreement
    if intersection_word_length==0 and union_word_length==0:
        JI=1
    else:
        JI=intersection_word_length/union_word_length

    return JI

## Save Models

In [28]:
# bert-squad with 4 features
model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [35]:
# bert-squad with 3 features (LSTM)
model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-with-features-lstm-layer'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [28]:
# bert-squad with 3 features (linear layer)
model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-with-features-linear-layer'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [34]:
# bert-squad without NER
model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-without-NER-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [31]:
# bert-squad without POS
model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-without-POS-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [31]:
# bert-squad without DEP
model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-without-DEP-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [31]:
# bert-squad without STOP
model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-without-STOP-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

## Use 90% of training set to train and 10% to test

In [19]:
add_end_idx(test_answers, test_contexts)

In [20]:
# tokenize
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

In [21]:
add_token_positions(test_encodings, test_answers)

In [22]:
# build the held-out test dataset
test_dataset = MyDataset(test_encodings, test_features)

# data loader for the test data: batches of 16, original order (no shuffling)
test_loader = DataLoader(test_dataset, batch_size=16)

## With 4 Features

In [23]:
# bert-squad with 4 features

# Load the model
model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set'
qa_model = nn.DataParallel(QANetwork().to(device))
# map_location lets a checkpoint that was saved on GPU be restored on whatever
# device is available now (including CPU-only machines)
qa_model.load_state_dict(torch.load(model_path, map_location=device))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []
predicted_answers = []
true_answers = []
contexts = []
questions = []

# id of the [SEP] token that closes the context and the question (102 for bert-base-uncased)
sep_token_id = tokenizer.sep_token_id

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # gold token positions, used only to recover the reference answer text
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # forward pass without gold positions, so no loss is computed
        outputs = qa_model(input_ids, attention_mask, token_type_ids, features=batch_features)

        # most likely start / end token per example (outputs[1] / outputs[2] are the logits)
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)

        for i in range(len(input_ids)):
            # an end index before the start index yields an empty prediction
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)

            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)

            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = compute_f1(predicted_answer, true_answer)
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])

            # locate both [SEP] tokens once: [CLS] context [SEP] question [SEP]
            token_list = input_ids[i].tolist()
            first_sep = token_list.index(sep_token_id)
            second_sep = token_list.index(sep_token_id, first_sep + 1)

            # context = tokens between [CLS] and the first [SEP]
            context_tokens = input_ids[i, 1 : first_sep]
            context = tokenizer.decode(context_tokens)

            # question = tokens between the two [SEP] tokens
            question_tokens = input_ids[i, first_sep + 1 : second_sep]
            question = tokenizer.decode(question_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

            predicted_answers.append(predicted_answer)
            true_answers.append(true_answer)
            contexts.append(context)
            questions.append(question)

100%|██████████| 815/815 [05:24<00:00,  2.51it/s]


In [39]:
def Average(lst):
    """Return the arithmetic mean of ``lst`` (ZeroDivisionError if it is empty)."""
    total = sum(lst)
    count = len(lst)
    return total / count

In [32]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:79.50335102746003
Precision:80.8763656486865
Recall:83.19493918959617
Exact Match:67.08870472682628
Jaccard Index:77.96715753583305


## With 3 Features Linear

In [30]:
# bert-squad with 3 features (linear layer)

# NOTE(review): the model loading below is commented out, so this cell scores
# whatever `qa_model` is currently in memory; make sure the matching variant
# was trained or loaded immediately before running it.
#Load the model
#model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-with-features-linear-layer'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


# per-example metric values, averaged in the cells below
em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # gold positions are used only to recover the reference answer text;
        # they are not passed to the model, so no loss is computed
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, token_type_ids, features=batch_features)
        
        # most likely start / end token per example (outputs[1] / outputs[2] are the logits)
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            # an end index before the start index yields an empty prediction
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            # compute_f1 returns (f1, precision, recall)
            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            # context = everything before the first token whose token_type_id is 1.
            # NOTE(review): the slice starts at 0 and stops at the first
            # second-segment token, so it presumably includes [CLS] and the
            # first [SEP]; the 4-feature cell slices 1:<first [SEP]> instead — confirm.
            context_tokens = input_ids[i, 0 : token_type_ids[i].tolist().index(1)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [03:12<00:00,  4.24it/s]


In [31]:
def Average(lst):
    """Return the arithmetic mean of ``lst`` (ZeroDivisionError if it is empty)."""
    total = sum(lst)
    count = len(lst)
    return total / count

In [32]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:78.87534162391772
Precision:81.33581845705147
Recall:81.42041764466012
Exact Match:66.63597298956415
Jaccard Index:77.59461334350488


## With 3 Features LSTM

In [29]:
# bert-squad with 3 features (LSTM)

# NOTE(review): the model loading below is commented out, so this cell scores
# whatever `qa_model` is currently in memory; make sure the matching variant
# was trained or loaded immediately before running it.
#Load the model
#model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-with-features-lstm-layer'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


# per-example metric values, averaged in the cells below
em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # gold positions are used only to recover the reference answer text;
        # they are not passed to the model, so no loss is computed
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, token_type_ids, features=batch_features)
        
        # most likely start / end token per example (outputs[1] / outputs[2] are the logits)
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            # an end index before the start index yields an empty prediction
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            # compute_f1 returns (f1, precision, recall)
            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            # context = everything before the first token whose token_type_id is 1.
            # NOTE(review): the slice starts at 0 and stops at the first
            # second-segment token, so it presumably includes [CLS] and the
            # first [SEP]; the 4-feature cell slices 1:<first [SEP]> instead — confirm.
            context_tokens = input_ids[i, 0 : token_type_ids[i].tolist().index(1)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

  self.dropout, self.training, self.bidirectional, self.batch_first)
100%|██████████| 815/815 [05:01<00:00,  2.70it/s]


In [30]:
def Average(lst):
    """Return the arithmetic mean of ``lst`` (ZeroDivisionError if it is empty)."""
    total = sum(lst)
    count = len(lst)
    return total / count

In [31]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:79.31968941484186
Precision:80.39784172972723
Recall:83.30437786518759
Exact Match:66.9659300184162
Jaccard Index:77.854916365228


## Without NER Without NN

In [39]:
# bert-squad without NER without nn

# NOTE(review): the model loading below is commented out, so this cell scores
# whatever `qa_model` is currently in memory; make sure the matching variant
# was trained or loaded immediately before running it.
#Load the model
#model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-without-NER-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


# per-example metric values, averaged in the cells below
em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # gold positions are used only to recover the reference answer text;
        # they are not passed to the model, so no loss is computed
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, token_type_ids, features=batch_features)
        
        # most likely start / end token per example (outputs[1] / outputs[2] are the logits)
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            # an end index before the start index yields an empty prediction
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            # compute_f1 returns (f1, precision, recall)
            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            # context = everything before the first token whose token_type_id is 1.
            # NOTE(review): the slice starts at 0 and stops at the first
            # second-segment token, so it presumably includes [CLS] and the
            # first [SEP]; the 4-feature cell slices 1:<first [SEP]> instead — confirm.
            context_tokens = input_ids[i, 0 : token_type_ids[i].tolist().index(1)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [03:16<00:00,  4.16it/s]


In [40]:
def Average(lst):
    """Return the arithmetic mean of ``lst`` (ZeroDivisionError if it is empty)."""
    total = sum(lst)
    count = len(lst)
    return total / count

In [41]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:78.65525331795791
Precision:80.69727791946222
Recall:81.49992574511137
Exact Match:66.66666666666666
Jaccard Index:77.52690757692994


## Without POS Without NN

In [36]:
# bert-squad without POS without nn

# NOTE(review): the model loading below is commented out, so this cell scores
# whatever `qa_model` is currently in memory; make sure the matching variant
# was trained or loaded immediately before running it.
#Load the model
#model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-without-POS-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


# per-example metric values, averaged in the cells below
em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # gold positions are used only to recover the reference answer text;
        # they are not passed to the model, so no loss is computed
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, token_type_ids, features=batch_features)
        
        # most likely start / end token per example (outputs[1] / outputs[2] are the logits)
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            # an end index before the start index yields an empty prediction
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            # compute_f1 returns (f1, precision, recall)
            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            # context = everything before the first token whose token_type_id is 1.
            # NOTE(review): the slice starts at 0 and stops at the first
            # second-segment token, so it presumably includes [CLS] and the
            # first [SEP]; the 4-feature cell slices 1:<first [SEP]> instead — confirm.
            context_tokens = input_ids[i, 0 : token_type_ids[i].tolist().index(1)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [03:13<00:00,  4.21it/s]


In [37]:
def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Args:
        lst: a sized collection of numbers (e.g. the per-example metric lists).

    Returns:
        ``sum(lst) / len(lst)``.

    Raises:
        ValueError: if ``lst`` is empty, instead of an opaque ZeroDivisionError.
    """
    if len(lst) == 0:
        raise ValueError("Average() requires at least one value")
    return sum(lst) / len(lst)

In [38]:
# build the report first (each metric is a mean over all test examples, shown as a
# percentage), then emit it one line at a time
report_lines = [
    f"F1_score:{Average(f1_scores)*100}",
    f"Precision:{Average(precision_scores)*100}",
    f"Recall:{Average(recall_scores)*100}",
    f"Exact Match:{Average(em_scores)*100}",
    f"Jaccard Index:{Average(JI_scores)*100}",
]
for report_line in report_lines:
    print(report_line)

F1_score:79.19543919717395
Precision:80.29490419050614
Recall:83.13020745439654
Exact Match:66.87384898710866
Jaccard Index:77.82582362399913


## Without DEP Without NN

In [36]:
# bert-squad without DEP without nn

#Load the model
#model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-without-DEP-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()

# per-example metric values, appended in test-loader order
em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# id of the separator token that closes the context segment of every input
sep_token_id = tokenizer.sep_token_id

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # inference only: no start/end labels are passed to the model here
        outputs = qa_model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['token_type_ids'].to(device),
            features=batch['features'].to(device),
        )

    # outputs[1] / outputs[2] are used as start / end logits: pick the best position
    # per example, then convert everything to plain Python lists once per batch
    # instead of indexing device tensors one example at a time
    answer_start_index = outputs[1].argmax(dim=1).tolist()
    answer_end_index = outputs[2].argmax(dim=1).tolist()
    start_true = batch['start_positions'].reshape(-1).tolist()
    end_true = batch['end_positions'].reshape(-1).tolist()
    token_rows = batch['input_ids'].tolist()

    for i, tokens in enumerate(token_rows):
        # answer spans are inclusive on both ends, hence the + 1
        predicted_answer = tokenizer.decode(tokens[answer_start_index[i] : answer_end_index[i] + 1])
        true_answer = tokenizer.decode(tokens[start_true[i] : end_true[i] + 1])

        em_scores.append(compute_exact_match(predicted_answer, true_answer))

        # compute_f1 is indexed as (f1, precision, recall)
        scores = compute_f1(predicted_answer, true_answer)
        f1_scores.append(scores[0])
        precision_scores.append(scores[1])
        recall_scores.append(scores[2])

        # the context sits between the leading [CLS] (position 0) and the first [SEP];
        # both special tokens are excluded so the decoded context is plain passage text
        context = tokenizer.decode(tokens[1 : tokens.index(sep_token_id)])

        JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [03:12<00:00,  4.24it/s]


In [37]:
def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Args:
        lst: a sized collection of numbers (e.g. the per-example metric lists).

    Returns:
        ``sum(lst) / len(lst)``.

    Raises:
        ValueError: if ``lst`` is empty, instead of an opaque ZeroDivisionError.
    """
    if len(lst) == 0:
        raise ValueError("Average() requires at least one value")
    return sum(lst) / len(lst)

In [38]:
# build the report first (each metric is a mean over all test examples, shown as a
# percentage), then emit it one line at a time
report_lines = [
    f"F1_score:{Average(f1_scores)*100}",
    f"Precision:{Average(precision_scores)*100}",
    f"Recall:{Average(recall_scores)*100}",
    f"Exact Match:{Average(em_scores)*100}",
    f"Jaccard Index:{Average(JI_scores)*100}",
]
for report_line in report_lines:
    print(report_line)

F1_score:78.68515978713462
Precision:80.09770981296782
Recall:82.42343766542058
Exact Match:66.44413750767342
Jaccard Index:77.41336904668348


## Without STOP Without NN

In [36]:
# bert-squad without STOP without nn

#Load the model
#model_path = '/scratch/mahmadin/models/bert-squad-with-features-90-train-set-without-STOP-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()

# per-example metric values, appended in test-loader order
em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# id of the separator token that closes the context segment of every input
sep_token_id = tokenizer.sep_token_id

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # inference only: no start/end labels are passed to the model here
        outputs = qa_model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['token_type_ids'].to(device),
            features=batch['features'].to(device),
        )

    # outputs[1] / outputs[2] are used as start / end logits: pick the best position
    # per example, then convert everything to plain Python lists once per batch
    # instead of indexing device tensors one example at a time
    answer_start_index = outputs[1].argmax(dim=1).tolist()
    answer_end_index = outputs[2].argmax(dim=1).tolist()
    start_true = batch['start_positions'].reshape(-1).tolist()
    end_true = batch['end_positions'].reshape(-1).tolist()
    token_rows = batch['input_ids'].tolist()

    for i, tokens in enumerate(token_rows):
        # answer spans are inclusive on both ends, hence the + 1
        predicted_answer = tokenizer.decode(tokens[answer_start_index[i] : answer_end_index[i] + 1])
        true_answer = tokenizer.decode(tokens[start_true[i] : end_true[i] + 1])

        em_scores.append(compute_exact_match(predicted_answer, true_answer))

        # compute_f1 is indexed as (f1, precision, recall)
        scores = compute_f1(predicted_answer, true_answer)
        f1_scores.append(scores[0])
        precision_scores.append(scores[1])
        recall_scores.append(scores[2])

        # the context sits between the leading [CLS] (position 0) and the first [SEP];
        # both special tokens are excluded so the decoded context is plain passage text
        context = tokenizer.decode(tokens[1 : tokens.index(sep_token_id)])

        JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [03:09<00:00,  4.29it/s]


In [37]:
def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Args:
        lst: a sized collection of numbers (e.g. the per-example metric lists).

    Returns:
        ``sum(lst) / len(lst)``.

    Raises:
        ValueError: if ``lst`` is empty, instead of an opaque ZeroDivisionError.
    """
    if len(lst) == 0:
        raise ValueError("Average() requires at least one value")
    return sum(lst) / len(lst)

In [38]:
# build the report first (each metric is a mean over all test examples, shown as a
# percentage), then emit it one line at a time
report_lines = [
    f"F1_score:{Average(f1_scores)*100}",
    f"Precision:{Average(precision_scores)*100}",
    f"Recall:{Average(recall_scores)*100}",
    f"Exact Match:{Average(em_scores)*100}",
    f"Jaccard Index:{Average(JI_scores)*100}",
]
for report_line in report_lines:
    print(report_line)

F1_score:79.21505290447762
Precision:80.77035263095036
Recall:82.77529386396611
Exact Match:67.01197053406999
Jaccard Index:77.92772613924902


## Without the Features

In [24]:
import torch
from torch import nn

class QANetwork(torch.nn.Module):
    """BERT encoder with a single linear head that scores answer start/end positions.

    The encoder is the module-level ``bert_model`` (defined elsewhere in the notebook).
    ``forward`` returns ``(total_loss, start_logits, end_logits)``; ``total_loss`` is
    ``None`` unless both ``start_positions`` and ``end_positions`` are given.

    NOTE(review): the head is built for inputs of width ``hidden_size`` (768), yet
    ``forward`` widens the encoder output when ``features`` is supplied. In this
    feature-free variant ``features`` is expected to stay ``None`` - confirm before
    passing it.
    """

    def __init__(self):
        super().__init__()
        self.num_labels = 2  # one logit for "start", one for "end"
        self.hidden_size = 768
        self.bert = bert_model
        self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids, start_positions=None, end_positions=None, features=None):
        """Run the encoder and the span head; compute the loss when labels are given.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the encoder.
            start_positions, end_positions: optional gold token indices for the loss.
            features: optional extra per-token features, concatenated to the
                encoder output along dimension 2.

        Returns:
            Tuple ``(total_loss, start_logits, end_logits)``.
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        token_states = encoder_outputs[0]

        # Append the extra features to the encoder's per-token output (not to the logits)
        if features is not None:
            token_states = torch.cat([token_states, features], 2)

        # split the two-column head output into separate start / end score tensors
        start_logits, end_logits = self.qa_outputs(token_states).split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        # no labels: inference call, there is nothing to compute a loss against
        if start_positions is None or end_positions is None:
            return None, start_logits, end_logits

        # positions outside the model input are clamped onto this index, which the
        # loss is told to ignore
        ignored_index = start_logits.size(1)

        def _as_targets(positions):
            # drop a trailing singleton dimension if present, then clamp into range
            if positions.dim() > 1:
                positions = positions.squeeze(-1)
            return positions.clamp(0, ignored_index)

        start_positions = _as_targets(start_positions)
        end_positions = _as_targets(end_positions)

        loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2

        return total_loss, start_logits, end_logits

In [25]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings stored as a mapping of column -> rows.

    ``encodings`` must support ``.items()`` and ``['input_ids']``; both a tokenizer's
    batch encoding and a plain ``dict`` of lists qualify.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per encoding column."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` column."""
        # key lookup (rather than attribute access) matches how __getitem__ reads the
        # encodings and also works for plain dicts
        return len(self.encodings['input_ids'])

In [None]:
# build the dataset for the training split (only the training encodings are wrapped here)
train_dataset = MyDataset(train_encodings)

In [18]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

  from cryptography import utils, x509


In [19]:
# setup GPU/CPU: use CUDA when it is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [20]:
# build the feature-free QA network, move it to the selected device, then wrap it in
# DataParallel
qa_model_no_features = nn.DataParallel(QANetwork().to(device))

In [21]:
# initialize the AdamW optimizer over all model parameters with learning rate 5e-5
# NOTE(review): no weight_decay argument is passed, so the library default applies -
# confirm it is non-zero if weight decay is actually intended here
optim = AdamW(qa_model_no_features.parameters(), lr=5e-5)

# initialize data loader for training data: batches of 16, reshuffled on every pass
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)



In [None]:
# fine-tune for three passes over the training data
for epoch in range(3):
    # set model to train mode
    qa_model_no_features.train()
    # setup loop (we use tqdm for the progress bar); the label is constant for the
    # whole epoch, so it is set once here rather than on every step
    loop = tqdm(train_loader, leave=True, desc=f'Epoch {epoch}')
    for batch in loop:
        # reset gradients accumulated during the previous step
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # forward pass with labels, so the model also returns the loss
        outputs = qa_model_no_features(input_ids, attention_mask, token_type_ids, start_positions, end_positions)
        # 0: total loss, 1: start logits, 2: end logits.
        # The DataParallel wrapper may hand back more than one loss value, so reduce
        # to a scalar once and reuse it for both backprop and logging.
        loss = outputs[0].sum()
        # backpropagate to every parameter that needs a grad update
        loss.backward()
        # update parameters
        optim.step()
        # show the current loss on the progress bar
        loop.set_postfix(loss=loss.item())

## Saving the Models

In [None]:
# destination of the feature-free checkpoint
# NOTE(review): absolute, user-specific path - adjust when running elsewhere
model_path = '/scratch/mahmadin/models/bert-squad-without-features-90-train-set'
#Save the model
# only the state dict (weights) is written, and it is taken from the DataParallel
# wrapper, so loading presumably needs the same wrapping - verify
torch.save(qa_model_no_features.state_dict(), model_path)

## Use 90% of training set to train and 10% to test

In [27]:
# presumably annotates each test answer with its end index within the matching
# context (add_end_idx is defined earlier in the notebook) - TODO confirm
add_end_idx(test_answers, test_contexts)

In [28]:
# tokenize the test set as (context, question) pairs, with truncation and padding enabled
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

In [29]:
# presumably adds the token-level 'start_positions' / 'end_positions' of each answer
# to the encodings (the evaluation loop reads those keys from every batch) - TODO confirm
add_token_positions(test_encodings, test_answers)

In [30]:
# build the dataset for the held-out test split
test_dataset = MyDataset(test_encodings)

# initialize data loader for the test data: batches of 16, no shuffle argument passed
test_loader = DataLoader(test_dataset, batch_size=16)

In [31]:
#Load the model
model_path = '/scratch/mahmadin/models/bert-squad-without-features-90-train-set'
qa_model_no_features = nn.DataParallel(QANetwork().to(device))
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine can still be restored when only the CPU is available
qa_model_no_features.load_state_dict(torch.load(model_path, map_location=device))

# switch model out of training mode
qa_model_no_features.eval()

# per-example results, appended in test-loader order
em_scores_without_features = []
f1_scores_without_features = []
precision_scores_without_features = []
recall_scores_without_features = []
JI_scores_without_features = []
predicted_answers_without_features = []
true_answers_without_features = []
contexts_without_features = []
questions_without_features = []

# id of the separator token that delimits context and question in every input
sep_token_id = tokenizer.sep_token_id

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # inference only: no start/end labels and no extra features are passed
        outputs = qa_model_no_features(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['token_type_ids'].to(device),
        )

    # outputs[1] / outputs[2] are used as start / end logits: pick the best position
    # per example, then convert everything to plain Python lists once per batch
    # instead of indexing device tensors one example at a time
    answer_start_index = outputs[1].argmax(dim=1).tolist()
    answer_end_index = outputs[2].argmax(dim=1).tolist()
    start_true = batch['start_positions'].reshape(-1).tolist()
    end_true = batch['end_positions'].reshape(-1).tolist()
    token_rows = batch['input_ids'].tolist()

    for i, tokens in enumerate(token_rows):
        # answer spans are inclusive on both ends, hence the + 1
        predicted_answer = tokenizer.decode(tokens[answer_start_index[i] : answer_end_index[i] + 1])
        true_answer = tokenizer.decode(tokens[start_true[i] : end_true[i] + 1])

        em_scores_without_features.append(compute_exact_match(predicted_answer, true_answer))

        # compute_f1 is indexed as (f1, precision, recall)
        scores = compute_f1(predicted_answer, true_answer)
        f1_scores_without_features.append(scores[0])
        precision_scores_without_features.append(scores[1])
        recall_scores_without_features.append(scores[2])

        # the slicing below treats each row as: [CLS] context [SEP] question [SEP] ...
        # so locate the two separators once and cut both texts out between them
        first_sep = tokens.index(sep_token_id)
        second_sep = tokens.index(sep_token_id, first_sep + 1)
        context = tokenizer.decode(tokens[1:first_sep])
        question = tokenizer.decode(tokens[first_sep + 1 : second_sep])

        JI_scores_without_features.append(Jaccard_index(context, true_answer, predicted_answer))

        predicted_answers_without_features.append(predicted_answer)
        true_answers_without_features.append(true_answer)
        contexts_without_features.append(context)
        questions_without_features.append(question)

100%|██████████| 815/815 [04:51<00:00,  2.80it/s]


In [48]:
def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Args:
        lst: a sized collection of numbers (e.g. the per-example metric lists).

    Returns:
        ``sum(lst) / len(lst)``.

    Raises:
        ValueError: if ``lst`` is empty, instead of an opaque ZeroDivisionError.
    """
    if len(lst) == 0:
        raise ValueError("Average() requires at least one value")
    return sum(lst) / len(lst)

In [30]:
# build the report first (each metric is a mean over all test examples, shown as a
# percentage), then emit it one line at a time
report_lines = [
    f"F1_score:{Average(f1_scores_without_features)*100}",
    f"Precision:{Average(precision_scores_without_features)*100}",
    f"Recall:{Average(recall_scores_without_features)*100}",
    f"Exact Match:{Average(em_scores_without_features)*100}",
    f"Jaccard Index:{Average(JI_scores_without_features)*100}",
]
for report_line in report_lines:
    print(report_line)

F1_score:78.75893041798673
Precision:80.32300808902055
Recall:82.47230466474944
Exact Match:66.49017802332719
Jaccard Index:77.27342106697196


## Test Cases on F1

In [32]:
# list every test example whose F1 is strictly higher with the features than without,
# and count how many such examples there are
counter = 0
for i in range(len(f1_scores)):
    # skip examples where the feature-augmented model is not strictly better
    if not (f1_scores[i] > f1_scores_without_features[i]):
        continue

    counter += 1
    example_report = (
        f"Instance: {i}",
        f"Question: {questions[i]}",
        f"Context: {contexts[i]}",
        f"True answer: {true_answers[i]}",
        f"f1 with features: {f1_scores[i]}",
        f"Predicted answer with features: {predicted_answers[i]}",
        f"f1 without features: {f1_scores_without_features[i]}",
        f"Predicted answer without features: {predicted_answers_without_features[i]}",
        "\n",
    )
    for report_line in example_report:
        print(report_line)
print(f"number of improvments: {counter}")

Instance: 28
Question: if beyonce won three grammies in 2015, how many was she nominated for?
Context: at the 57th annual grammy awards in february 2015, beyonce was nominated for six awards, ultimately winning three : best r & b performance and best r & b song for " drunk in love ", and best surround sound album for beyonce. she was nominated for album of the year but the award was won by beck for his morning phase album. in august, the cover of the september issue of vogue magazine was unveiled online, beyonce as the cover star, becoming the first african - american artist and third african - american woman in general to cover the september issue. she headlined the 2015 made in america festival in early september and also the global citizen festival later that month. beyonce made an uncredited featured appearance on the track " hymn for the weekend " by british rock band coldplay, on their seventh studio album a head full of dreams ( 2015 ), which saw release in december. on january 

Question: who believes the ecumenical councils intended the cannons to be laws?
Context: the greek - speaking orthodox have collected canons and commentaries upon them in a work known as the pedalion ( greek : πηδαλιον, " rudder " ), so named because it is meant to " steer " the church. the orthodox christian tradition in general treats its canons more as guidelines than as laws, the bishops adjusting them to cultural and other local circumstances. some orthodox canon scholars point out that, had the ecumenical councils ( which deliberated in greek ) meant for the canons to be used as laws, they would have called them nomoi / νομοι ( laws ) rather than kanones / κανονες ( rules ), but almost all orthodox conform to them. the dogmatic decisions of the councils, though, are to be obeyed rather than to be treated as guidelines, since they are essential for the church's unity.
True answer: orthodox canon scholars
f1 with features: 1.0
Predicted answer with features: orthodox canon scholars

f1 without features: 0.5714285714285715
Predicted answer without features: 25 % nickel


Instance: 3150
Question: what makes how one construes oneself now and in the future similar?
Context: the inclusiveness of weinreich's definition ( above ) directs attention to the totality of one's identity at a given phase in time, and assists in elucidating component aspects of one's total identity, such as one's gender identity, ethnic identity, occupational identity and so on. the definition readily applies to the young child, to the adolescent, to the young adult, and to the older adult in various phases of the life cycle. depending on whether one is a young child or an adult at the height of one's powers, how one construes oneself as one was in the past will refer to very different salient experiential markers. likewise, how one construes oneself as one aspires to be in the future will differ considerably according to one's age and accumulated experiences. ( weinreich & saunderson, ( eds ) 2

True answer: secular affairs, particularly science and technology,
f1 with features: 1.0
Predicted answer with features: secular affairs, particularly science and technology,
f1 without features: 0.5
Predicted answer without features: secular affairs,


Instance: 4576
Question: the hvc - 101 lost the rf and adopted which input for use in north america and europe?
Context: video output connections varied from one model of the console to the next. the original hvc - 001 model of the family computer featured only radio frequency ( rf ) modulator output. when the console was released in north america and europe, support for composite video through rca connectors was added in addition to the rf modulator. the hvc - 101 model of the famicom dropped the rf modulator entirely and adopted composite video output via a proprietary 12 - pin " multi - out " connector first introduced for the super famicom / super nintendo entertainment system. conversely, the north american re - released nes - 101 

f1 with features: 1.0
Predicted answer with features: ad 845,
f1 without features: 0
Predicted answer without features: 878. although " somersetshire " was in common use as an alternative name for the county, it went out of fashion in the late 19th century,


Instance: 6308
Question: in what town is yeo valley organic based?
Context: agriculture and food and drink production continue to be major industries in the county, employing over 15, 000 people. apple orchards were once plentiful, and somerset is still a major producer of cider. the towns of taunton and shepton mallet are involved with the production of cider, especially blackthorn cider, which is sold nationwide, and there are specialist producers such as burrow hill cider farm and thatchers cider. gerber products company in bridgwater is the largest producer of fruit juices in europe, producing brands such as " sunny delight " and " ocean spray. " development of the milk - based industries, such as ilchester cheese company and 

Context: while the big bang model is well established in cosmology, it is likely to be refined in the future. little is known about the earliest moments of the universe's history. the penrose – hawking singularity theorems require the existence of a singularity at the beginning of cosmic time. however, these theorems assume that general relativity is correct, but general relativity must break down before the universe reaches the planck temperature, and a correct treatment of quantum gravity may avoid the singularity.
True answer: the penrose – hawking singularity theorems
f1 with features: 1.0
Predicted answer with features: the penrose – hawking singularity theorems
f1 without features: 0.7499999999999999
Predicted answer without features: penrose – hawking


Instance: 8428
Question: what did authors go beyond in the 19th century when writing fiction?
Context: time travel is the concept of moving backwards or forwards to different points in time, in a manner analogous to moving throug

True answer: nashville and memphis.
f1 with features: 1.0
Predicted answer with features: nashville and memphis.
f1 without features: 0.28571428571428575
Predicted answer without features: tri - cities and knoxville,


Instance: 10020
Question: what subject matter is avoided in post - punk music?
Context: post - punk is a heterogeneous type of rock music that emerged in the wake of the punk movement of the 1970s. drawing inspiration from elements of punk rock while departing from its musical conventions and wider cultural affiliations, post - punk music was marked by varied, experimentalist sensibilities and its " conceptual assault " on rock tradition. artists embraced electronic music, black dance styles and the avant - garde, as well as novel recording technology and production techniques. the movement also saw the frequent intersection of music with art and politics, as artists liberally drew on sources such as critical theory, cinema, performance art and modernist literature. acco

Question: which ban was lifted in 1991?
Context: however, by the turn of the 1990s the downward trend was starting to reverse ; england had been successful in the 1990 fifa world cup, reaching the semi - finals. uefa, european football's governing body, lifted the five - year ban on english clubs playing in european competitions in 1990 ( resulting in manchester united lifting the uefa cup winners'cup in 1991 ) and the taylor report on stadium safety standards, which proposed expensive upgrades to create all - seater stadiums in the aftermath of the hillsborough disaster, was published in january of that year.
True answer: five - year ban on english clubs playing in european competitions
f1 with features: 0.7692307692307693
Predicted answer with features: uefa, european football's governing body, lifted the five - year ban on english clubs playing in european competitions
f1 without features: 0
Predicted answer without features: uefa cup winners'cup


Instance: 11112
Question: who insi

Instance: 12094
Question: what happened in 2000 causing a 93 day ordeal?
Context: in the united states, political commentators often refer to the " red states ", which traditionally vote for republican candidates in presidential elections, and " blue states ", which vote for the democratic candidate. this convention is relatively recent : before the 2000 presidential election, media outlets assigned red and blue to both parties, sometimes alternating the allocation for each election. fixed usage was established during the 39 - day recount following the 2000 election, when the media began to discuss the contest in terms of " red states " versus " blue states ".
True answer: recount
f1 with features: 1.0
Predicted answer with features: recount
f1 without features: 0
Predicted answer without features: 


Instance: 12097
Question: who took power of china in 1994?
Context: the communist party of china, founded in 1920, adopted the red flag and hammer and sickle emblem of the soviet union, w

In [37]:
import spacy

# parse one example sentence and print, for every token: its text, its dependency
# label, the text and part-of-speech tag of its head, and the list of its children
nlp = spacy.load("en_core_web_sm")
example_text = "In October 2014, it was announced that Beyoncé with her management company Parkwood Entertainment would be partnering with London-based fashion retailer Topshop, in a new 50/50 split subsidiary business named Parkwood Topshop Athletic Ltd."
doc = nlp(example_text)
for token in doc:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))

In prep announced VERB [October]
October pobj In ADP [2014]
2014 nummod October PROPN []
, punct announced VERB []
it nsubjpass announced VERB []
was auxpass announced VERB []
announced ROOT announced VERB [In, ,, it, was, partnering]
that mark partnering VERB []
Beyoncé nsubj partnering VERB [with]
with prep Beyoncé PROPN [company]
her poss company NOUN []
management compound company NOUN []
company pobj with ADP [her, management, Entertainment]
Parkwood compound Entertainment PROPN []
Entertainment appos company NOUN [Parkwood]
would aux partnering VERB []
be aux partnering VERB []
partnering ccomp announced VERB [that, Beyoncé, would, be, with, ,, in]
with prep partnering VERB [Topshop]
London npadvmod based VERB []
- punct based VERB []
based amod retailer NOUN [London, -]
fashion compound retailer NOUN []
retailer compound Topshop PROPN [based, fashion]
Topshop pobj with ADP [retailer]
, punct partnering VERB []
in prep partnering VERB [business]
a det business NOUN []
new amod bu

In [52]:
import spacy

# Load the "en_core_web_sm" spaCy pipeline and parse a lower-cased,
# pre-tokenized context passage (note the spaces around punctuation).
nlp = spacy.load("en_core_web_sm")
doc = nlp("he was also attracted to the singing student konstancja gładkowska. in letters to woyciechowski, he indicated which of his works, and even which of their passages, were influenced by his fascination with her ; his letter of 15 may 1830 revealed that the slow movement ( larghetto ) of his piano concerto no. 1 ( in e minor ) was secretly dedicated to her")

# One row per token: its text, its dependency label, the text and
# part-of-speech tag of its head token, and the list of its children.
for token in doc:
    print(
        token.text,
        token.dep_,
        token.head.text,
        token.head.pos_,
        list(token.children),
    )

he nsubjpass attracted VERB []
was auxpass attracted VERB []
also advmod attracted VERB []
attracted ROOT attracted VERB [he, was, also, to, .]
to prep attracted VERB [gładkowska]
the det gładkowska PROPN []
singing compound student NOUN []
student compound gładkowska PROPN [singing]
konstancja compound gładkowska PROPN []
gładkowska pobj to ADP [the, student, konstancja]
. punct attracted VERB []
in prep indicated VERB [letters]
letters pobj in ADP [to]
to prep letters NOUN [woyciechowski]
woyciechowski pobj to ADP []
, punct indicated VERB []
he nsubj indicated VERB []
indicated ccomp revealed VERB [in, ,, he, which, ,, influenced]
which dobj indicated VERB [of, ,, and, which]
of prep which PRON [works]
his poss works NOUN []
works pobj of ADP [his]
, punct which PRON []
and cc which PRON []
even advmod which PRON []
which conj which PRON [even, of]
of prep which PRON [passages]
their poss passages NOUN []
passages pobj of ADP [their]
, punct indicated VERB []
were auxpass influenced

In [51]:
# Report every instance whose F1 with features is strictly lower than its F1
# without features, then print how many such instances were found.
counter = 0
for i in range(len(f1_scores)):
    # Skip instances where the feature run is not strictly worse.
    if not (f1_scores[i] < f1_scores_without_features[i]):
        continue

    # Instance identity and gold answer.
    print(f"Instance: {i}")
    print(f"Question: {questions[i]}")
    print(f"Context: {contexts[i]}")
    print(f"True answer: {true_answers[i]}")

    # Result of the run with features.
    print(f"f1 with features: {f1_scores[i]}")
    print(f"Predicted answer with features: {predicted_answers[i]}")

    # Result of the run without features.
    print(f"f1 without features: {f1_scores_without_features[i]}")
    print(f"Predicted answer without features: {predicted_answers_without_features[i]}")

    counter += 1

print(f"number of mistakes: {counter}")

Question: debut song, " killing time " was featured on what movie's sound track?
Context: the group changed their name to destiny's child in 1996, based upon a passage in the book of isaiah. in 1997, destiny's child released their major label debut song " killing time " on the soundtrack to the 1997 film, men in black. the following year, the group released their self - titled debut album, scoring their first major hit " no, no, no ". the album established the group as a viable act in the music industry, with moderate sales and winning the group three soul train lady of soul awards for best r & b / soul album of the year, best r & b / soul or rap new artist, and best r & b / soul single for " no, no, no ". the group released their multi - platinum second album the writing's on the wall in 1999. the record features some of the group's most widely known songs such as " bills, bills, bills ", the group's first number - one single, " jumpin'jumpin'" and " say my name ", which became their 

Question: malyarchuk found czech mtdna lineages were typical of what populations?
Context: in 2008, biochemist boris arkadievich malyarchuk ( russian : борис аркадьевич малярчук ) et al. of the institute of biological problems of the north, russian academy of sciences, magadan, russia, used a sample ( n = 279 ) of czech individuals to determine the frequency of " mongoloid " " mtdna lineages ". malyarchuk found czech mtdna lineages were typical of " slavic populations " with " 1. 8 % " mongoloid mtdna lineage. malyarchuk added that " slavic populations " " almost always " contain mongoloid mtdna lineage. malyarchuk said the mongoloid component of slavic people was partially added before the split of " balto - slavics " in 2, 000 – 3, 000 bc with additional mongoloid mixture occurring among slavics in the last 4, 000 years. malyarchuk said the " russian population " was developed by the " assimilation of the indigenous pre - slavic population of eastern europe by true slavs " with addit

Predicted answer without features: 1991,
Question: how does the classifiacation scheme work on the kinsey scale?
Context: the kinsey scale provides a classification of sexual orientation based on the relative amounts of heterosexual and homosexual experience or psychic response in one's history at a given time. the classification scheme works such that individuals in the same category show the same balance between the heterosexual and homosexual elements in their histories. the position on the scale is based on the relation of heterosexuality to homosexuality in one's history, rather than the actual amount of overt experience or psychic response. an individual can be assigned a position on the scale in accordance with the following definitions of the points of the scale :
True answer: such that individuals in the same category show the same balance between the heterosexual and homosexual elements in their histories.
f1 with features: 0.9375
Predicted answer with features: individuals i

Question: what is a jewish language that has not fallen out of use?
Context: for centuries, jews worldwide have spoken the local or dominant languages of the regions they migrated to, often developing distinctive dialectal forms or branches that became independent languages. yiddish is the judæo - german language developed by ashkenazi jews who migrated to central europe. ladino is the judæo - spanish language developed by sephardic jews who migrated to the iberian peninsula. due to many factors, including the impact of the holocaust on european jewry, the jewish exodus from arab and muslim countries, and widespread emigration from other jewish communities around the world, ancient and distinct jewish languages of several communities, including judæo - georgian, judæo - arabic, judæo - berber, krymchak, judæo - malayalam and many others, have largely fallen out of use.
True answer: judæo - georgian,
f1 with features: 0
Predicted answer with features: yiddish
f1 without features: 0.3636

Question: what is the name of the bom standard?
Context: the same character converted to utf - 8 becomes the byte sequence ef bb bf. the unicode standard allows that the bom " can serve as signature for utf - 8 encoded text where the character set is unmarked ". some software developers have adopted it for other encodings, including utf - 8, in an attempt to distinguish utf - 8 from local 8 - bit code pages. however rfc 3629, the utf - 8 standard, recommends that byte order marks be forbidden in protocols using utf - 8, but discusses the cases where this may not be possible. in addition, the large restriction on possible patterns in utf - 8 ( for instance there cannot be any lone bytes with the high bit set ) means that it should be possible to distinguish utf - 8 from other character encodings without relying on the bom.
True answer: rfc 3629,
f1 with features: 0.5714285714285715
Predicted answer with features: rfc 3629, the utf - 8 standard,
f1 without features: 1.0
Predicted answer 

Context: in january 1977, droney promoted him to first assistant district attorney, essentially making kerry his campaign and media surrogate because droney was afflicted with amyotrophic lateral sclerosis ( als, or lou gehrig's disease ). as first assistant, kerry tried cases, which included winning convictions in a high - profile rape case and a murder. he also played a role in administering the office, including initiating the creation of special white - collar and organized crime units, creating programs to address the problems of rape and other crime victims and witnesses, and managing trial calendars to reflect case priorities. it was in this role in 1978 that kerry announced an investigation into possible criminal charges against then senator edward brooke, regarding " misstatements " in his first divorce trial. the inquiry ended with no charges being brought after investigators and prosecutors determined that brooke's misstatements were pertinent to the case, but were not mater

f1 with features: 0
Predicted answer with features: 28
f1 without features: 1.0
Predicted answer without features: 15
Question: what rules did a german derivatives dealer say could be quite legally circumvented through swaps?
Context: according to der spiegel, credits given to european governments were disguised as " swaps " and consequently did not get registered as debt because eurostat at the time ignored statistics involving financial derivatives. a german derivatives dealer had commented to der spiegel that " the maastricht rules can be circumvented quite legally through swaps, " and " in previous years, italy used a similar trick to mask its true debt with the help of a different us bank. " these conditions had enabled greek as well as many other european governments to spend beyond their means, while meeting the deficit targets of the european union and the monetary union guidelines. in may 2010, the greek government deficit was again revised and estimated to be 13. 6 % which wa