In [1]:
import json

def read_squad(path):
    # open JSON file and load intro dictionary
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    # initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # check if we need to be extracting from 'answers' or 'plausible_answers'
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

In [2]:
# execute our read SQuAD function for training and validation sets
train_contexts_old, train_questions_old, train_answers_old = read_squad('squad/train-v2.0.json')

In [3]:
with open("train-indexes.json", 'r') as trainfile:
    rand_i_train = json.load(trainfile)
    
with open("test-indexes.json", 'r') as testfile:
    rand_i_test = json.load(testfile)

In [4]:
#Use 90% for train
train_contexts = []
train_questions = []
train_answers = []

for i in rand_i_train:
    train_contexts.append(train_contexts_old[i])
    train_questions.append(train_questions_old[i])
    train_answers.append(train_answers_old[i])

In [5]:
test_contexts = []
test_questions = []
test_answers = []

for i in rand_i_test:
    test_contexts.append(train_contexts_old[i])
    test_questions.append(train_questions_old[i])
    test_answers.append(train_answers_old[i])

In [6]:
def add_end_idx(answers, contexts):
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            # if the answer is not off :)
            answer['answer_end'] = end_idx
        else:
            # this means the answer is off by 1-2 tokens
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n

In [7]:
# and apply the function to our two answer lists
add_end_idx(train_answers, train_contexts)

In [8]:
from transformers import LongformerTokenizerFast, LongformerModel

In [9]:
t_dir = '/scratch/mahmadin/.cache/huggingface/transformers'
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096',cache_dir=t_dir)
longformer_model = LongformerModel.from_pretrained('allenai/longformer-base-4096', add_pooling_layer=False, return_dict=True, cache_dir=t_dir)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'longformer.pooler.dense.bias', 'longformer.pooler.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# tokenize
train_encodings = tokenizer(train_contexts, train_questions, max_length=512, truncation=True, padding='max_length')

In [11]:
def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        # end position cannot be found, char_to_token found space, so shift position until found
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [12]:
# apply function to our data
add_token_positions(train_encodings, train_answers)

In [13]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [14]:
longformer_model

LongformerModel(
  (embeddings): LongformerEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(4098, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): LongformerEncoder(
    (layer): ModuleList(
      (0): LongformerLayer(
        (attention): LongformerAttention(
          (self): LongformerSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (query_global): Linear(in_features=768, out_features=768, bias=True)
            (key_global): Linear(in_features=768, out_features=768, bias=True)
            (value_global): Linear(in_features=768, out_features=768, bias=True)
          )
          (o

In [15]:
import json

with open("tokenized-features-final-ints.json", 'r') as f:
    features = json.load(f)

In [16]:
# Select only the last 3 features (removing one of the features each time)
for f in features:
    for each in f:
        each.pop(2)

In [17]:
# Pad features (In case of poping)
for f in features:
    f += [[0,0,0]] * (512 - len(f))

In [16]:
# Pad features
for f in features:
    f += [[0,0,0,0]] * (512 - len(f))

In [18]:
#Use 90% of features
train_features = []
for i in rand_i_train:
    train_features.append(features[i])

In [19]:
test_features = []
for i in rand_i_test:
    test_features.append(features[i])

In [20]:
import torch
from torch import nn

class QANetwork(torch.nn.Module):
    def __init__(self):
        super(QANetwork, self).__init__()
        self.num_labels = 2
        
        # with 3 features (linear layer)
        self.hidden_size = 768 + 3
        
        # with 4 features
        #self.hidden_size = 768 + 4
        
        self.longformer = longformer_model
        self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)
        
        # for having 3 features (linear layer)
        #self.features_linear_layer = nn.Linear(4, 3)
        #self.features_relu = nn.ReLU()
        #self.features_lstm = nn.LSTM(input_size=4, hidden_size=3)

    def forward(self, input_ids, attention_mask, start_positions=None, end_positions=None, features=None):
        
        outputs = self.longformer(
            input_ids,
            attention_mask=attention_mask,
            #token_type_ids=token_type_ids,
            #output_attentions=output_attentions,         Include these later if needed
            #output_hidden_states=output_hidden_states,
            #return_dict=return_dict,
        )
        sequence_output = outputs[0]
        
        # for having 3 features (linear layer)
        #features_torch = torch.tensor(features)
        #features_torch = features_torch.type(torch.float)
        #features_linear_output = self.features_linear_layer(features_torch)
        #features_output = self.features_relu(features_linear_output)
        #features_output, (hn, cn) = self.features_lstm(features_torch)
        #sequence_output = torch.cat([sequence_output, features_output], 2)
        
        # for having 4 features (without linear layer)
        sequence_output = torch.cat([sequence_output, features], 2)
        
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()
        
        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
        
        return total_loss, start_logits, end_logits

In [21]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, features):
        self.encodings = encodings
        self.features = features

    def __getitem__(self, idx):
        sub = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        sub['features'] = torch.tensor(self.features[idx])
        return sub

    def __len__(self):
        return len(self.encodings.input_ids)

In [22]:
# build datasets for both our training and validation sets
train_dataset = MyDataset(train_encodings, train_features)

In [23]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

  from cryptography import utils, x509


In [24]:
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [25]:
qa_model = nn.DataParallel(QANetwork().to(device))

In [26]:
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(qa_model.parameters(), lr=5e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)



In [27]:
for epoch in range(3):
    # set model to train mode
    qa_model.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = qa_model(input_ids, attention_mask, start_positions, end_positions, batch_features)
        # extract loss
        loss = outputs[0]  # 0: total loss, 1: start logits, 2: end logits
        # calculate loss for every parameter that needs grad update
        loss.sum().backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.sum().item())

Epoch 0: 100%|██████████| 7331/7331 [2:35:16<00:00,  1.27s/it, loss=5.46]  
Epoch 1: 100%|██████████| 7331/7331 [2:58:57<00:00,  1.46s/it, loss=3.79]   
Epoch 2: 100%|██████████| 7331/7331 [3:04:09<00:00,  1.51s/it, loss=1.68]   


In [28]:
# these functions are heavily influenced by the HF squad_metrics.py script
from nltk.tokenize import word_tokenize
import collections

def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    inputs = tokenizer(truth, return_tensors='pt', add_special_tokens=False)
    truth = tokenizer.decode(inputs['input_ids'][0])

    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    inputs = tokenizer(truth, return_tensors='pt', add_special_tokens=False)
    truth = tokenizer.decode(inputs['input_ids'][0])

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens),int(pred_tokens == truth_tokens),int(pred_tokens == truth_tokens)

    common_tokens = collections.Counter(truth_tokens) & collections.Counter(pred_tokens)
    num_same = sum(common_tokens.values())

    # if there are no common tokens then f1 = 0
    if num_same == 0:
        return 0,0,0

    prec = 1.0 * num_same / len(pred_tokens)
    rec = 1.0 * num_same / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec), prec, rec

def Jaccard_index(context,answer,prediction):

    inputs = tokenizer(answer, return_tensors='pt', add_special_tokens=False)
    gold_answer0 = tokenizer.decode(inputs['input_ids'][0])

    inputs = tokenizer(context, return_tensors='pt', add_special_tokens=False)
    context = tokenizer.decode(inputs['input_ids'][0])

    prediction = normalize_text(prediction)
    gold_answer0 = normalize_text(gold_answer0)

    text=" ".join(word_tokenize(context)).lower()
    gold_answers=" ".join(word_tokenize(gold_answer0)).lower()
    prediction = " ".join(word_tokenize(prediction)).lower()
    if prediction=='':
        pred_set=set()
    else:
        pred_start = text.find(prediction)
        pred_end = len(text) - (text[::-1].find(prediction[::-1]))
        pred_set = set(list(range(pred_start, pred_end)))
        if pred_start==-1 or pred_end==-1:
            pred_set=set()

    if gold_answers=='':
        gold_start = 0
        gold_end = 0
        gold_set=set()
    else:
        gold_start = text.find(gold_answers)
        gold_end = len(text) - (text[::-1].find(gold_answers[::-1]))
        # gold_start = example.answers[0]['answer_start']
        # gold_end = example.answers[0]['answer_end']
        gold_set = set(list(range(gold_start, gold_end)))
        if gold_start==-1 or gold_end==-1:
            gold_set=set()


    intersection=gold_set.intersection(pred_set)
    union=gold_set.union(pred_set)


    intersection_list=list(intersection)
    union_list=list(union)


    intersection_list.sort()
    union_list.sort()

    if not intersection_list:
        intersection_word=''
    else:
        intersection_word=text[intersection_list[0]:intersection_list[-1] + 1]
    if not union_list:
        union_words=''
    else:
        union_words=text[union_list[0]:union_list[-1]+1]

    intersection_word_length=len(word_tokenize(intersection_word))
    union_word_length=len(word_tokenize(union_words))

    if intersection_word_length==0 and union_word_length==0:
        JI=1
    else:
        JI=intersection_word_length/union_word_length

    return JI

## Save Models

In [35]:
# longformer-squad with 4 features
model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [28]:
# longformer-squad with 3 features (linear layer)
model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set-with-features-linear-layer'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [27]:
# longformer-squad with 3 features (LSTM)
model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set-with-features-lstm-layer'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [40]:
# longformer-squad without NER
model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set-without-NER-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [29]:
# longformer-squad without POS
model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set-without-POS-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

In [29]:
# longformer-squad without DEP
model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set-without-DEP-without-nn'
#Save the model
torch.save(qa_model.state_dict(), model_path)

## Use 90% of training set to train and 10% to test

In [30]:
add_end_idx(test_answers, test_contexts)

In [31]:
# tokenize
test_encodings = tokenizer(test_contexts, test_questions, max_length = 512, truncation=True, padding='max_length')

In [32]:
add_token_positions(test_encodings, test_answers)

In [33]:
# build datasets for both our training and validation sets
test_dataset = MyDataset(test_encodings, test_features)

# initialize data loader for training data
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

## With 4 Features

In [32]:
# longformer-squad with 4 features

#Load the model
#model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [05:15<00:00,  2.58it/s]


In [33]:
def Average(lst):
    return sum(lst) / len(lst)

In [34]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:81.11701040180475
Precision:83.33228319586577
Recall:83.37108555104507
Exact Match:69.08379373848987
Jaccard Index:79.73050379996714


## With 3 Features Linear

In [33]:
# longformer-squad with 3 features (linear layer)

#Load the model
#model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set-with-features-linear-layer'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [05:15<00:00,  2.58it/s]


In [34]:
def Average(lst):
    return sum(lst) / len(lst)

In [35]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:80.29707018959142
Precision:82.47405691525381
Recall:82.90850565721856
Exact Match:68.21669736034377
Jaccard Index:78.95786670231082


## With 3 Features LSTM

In [33]:
# longformer-squad with 3 features (LSTM)

#Load the model
#model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set-with-features-lstm-layer'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

  self.dropout, self.training, self.bidirectional, self.batch_first)
100%|██████████| 815/815 [03:57<00:00,  3.43it/s]


In [34]:
def Average(lst):
    return sum(lst) / len(lst)

In [35]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:81.40162037553972
Precision:82.86560342625857
Recall:84.33357683046255
Exact Match:69.29864947820748
Jaccard Index:80.11588914959358


## Without NER Without NN

In [45]:
# longformer-squad without NER without nn

#Load the model
#model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set-without-NER-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [08:19<00:00,  1.63it/s]


In [46]:
def Average(lst):
    return sum(lst) / len(lst)

In [47]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:80.07841620210333
Precision:82.01973981508998
Recall:82.92920951205754
Exact Match:68.2243707796194
Jaccard Index:79.03660358664233


## Without POS Without NN

In [34]:
# longformer-squad without POS without nn

#Load the model
#model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set-without-POS-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [09:36<00:00,  1.41it/s]


In [35]:
def Average(lst):
    return sum(lst) / len(lst)

In [36]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:80.54945161964143
Precision:83.30231584171175
Recall:82.28255870521193
Exact Match:68.70779619398404
Jaccard Index:79.40271648208133


## Without DEP Without NN

In [34]:
# longformer-squad without DEP without nn

#Load the model
#model_path = '/scratch/mahmadin/models/longformer-squad-with-features-90-train-set-without-DEP-without-nn'
#qa_model = nn.DataParallel(QANetwork().to(device))
#qa_model.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model.eval()


em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
JI_scores = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model(input_ids, attention_mask, features=batch_features)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores.append(scores[0])
            precision_scores.append(scores[1])
            recall_scores.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [15:47<00:00,  1.16s/it]


In [35]:
def Average(lst):
    return sum(lst) / len(lst)

In [36]:
print(f"F1_score:{Average(f1_scores)*100}")
print(f"Precision:{Average(precision_scores)*100}")
print(f"Recall:{Average(recall_scores)*100}")
print(f"Exact Match:{Average(em_scores)*100}")
print(f"Jaccard Index:{Average(JI_scores)*100}")

F1_score:80.7786548160234
Precision:83.97609926901785
Recall:82.01158036987614
Exact Match:68.8842848373235
Jaccard Index:79.33696213130911


## Without the Features

In [16]:
import torch
from torch import nn

class QANetwork(torch.nn.Module):
    def __init__(self):
        super(QANetwork, self).__init__()
        self.num_labels = 2
        self.hidden_size = 768 
        self.longformer = longformer_model
        self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, start_positions=None, end_positions=None, features=None):
        
        outputs = self.longformer(
            input_ids,
            attention_mask=attention_mask,
            #token_type_ids=token_type_ids,
            #output_attentions=output_attentions,         Include these later if needed
            #output_hidden_states=output_hidden_states,
            #return_dict=return_dict,
        )
        sequence_output = outputs[0]
        
        # Concatenate logits with features
        if features is not None:
            sequence_output = torch.cat([sequence_output, features], 2)
        
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()
        
        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
        
        return total_loss, start_logits, end_logits

In [17]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        sub = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return sub

    def __len__(self):
        return len(self.encodings.input_ids)

In [18]:
# build datasets for both our training and validation sets
train_dataset = MyDataset(train_encodings)

In [19]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

  from cryptography import utils, x509


In [20]:
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [21]:
qa_model_no_features = nn.DataParallel(QANetwork().to(device))

In [22]:
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(qa_model_no_features.parameters(), lr=5e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)



In [23]:
for epoch in range(3):
    # set model to train mode
    qa_model_no_features.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        #batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = qa_model_no_features(input_ids, attention_mask, start_positions, end_positions)
        # extract loss
        loss = outputs[0]  # 0: total loss, 1: start logits, 2: end logits
        # calculate loss for every parameter that needs grad update
        loss.sum().backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.sum().item())

Epoch 0: 100%|██████████| 7331/7331 [1:16:30<00:00,  1.60it/s, loss=2.31]
Epoch 1: 100%|██████████| 7331/7331 [1:16:37<00:00,  1.59it/s, loss=3.16] 
Epoch 2: 100%|██████████| 7331/7331 [1:16:43<00:00,  1.59it/s, loss=3.04] 


## Use 90% of training set to train and 10% to test

In [35]:
model_path = '/scratch/mahmadin/models/longformer-squad-without-features-90-train-set'
#Save the model
torch.save(qa_model_no_features.state_dict(), model_path)

In [24]:
add_end_idx(test_answers, test_contexts)

In [25]:
# tokenize
test_encodings = tokenizer(test_contexts, test_questions, max_length = 512, truncation=True, padding='max_length')

In [26]:
add_token_positions(test_encodings, test_answers)

In [32]:
# build datasets for both our training and validation sets
test_dataset = MyDataset(test_encodings)

# initialize data loader for training data
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [33]:
#Load the model
#model_path = '/scratch/mahmadin/models/longformer-squad-without-features-90-train-set'
#qa_model_no_features = nn.DataParallel(QANetwork().to(device))
#qa_model_no_features.load_state_dict(torch.load(model_path))

# switch model out of training mode
qa_model_no_features.eval()


em_scores_without_features = []
f1_scores_without_features = []
precision_scores_without_features = []
recall_scores_without_features = []
JI_scores_without_features = []

# loop through batches
for batch in tqdm(test_loader):
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        #batch_features = batch['features'].to(device)
        # train model on batch and return outputs (incl. loss)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = qa_model_no_features(input_ids, attention_mask)
        
        answer_start_index = outputs[1].argmax(axis=1)
        answer_end_index = outputs[2].argmax(axis=1)
        
        
        for i in range(len(input_ids)):
            predict_answer_tokens = input_ids[i, answer_start_index[i] : answer_end_index[i] + 1]
            predicted_answer = tokenizer.decode(predict_answer_tokens)
            
            true_answer_tokens = input_ids[i, start_true[i] : end_true[i] + 1]
            true_answer = tokenizer.decode(true_answer_tokens)
            
            em_scores_without_features.append(compute_exact_match(predicted_answer, true_answer))

            scores = (compute_f1(predicted_answer, true_answer))
            f1_scores_without_features.append(scores[0])
            precision_scores_without_features.append(scores[1])
            recall_scores_without_features.append(scores[2])
            
            context_tokens = input_ids[i, 1 : input_ids[i].tolist().index(2)]
            context = tokenizer.decode(context_tokens)

            JI_scores_without_features.append(Jaccard_index(context, true_answer, predicted_answer))

100%|██████████| 815/815 [05:10<00:00,  2.63it/s]


In [29]:
def Average(lst):
    return sum(lst) / len(lst)

In [34]:
print(f"F1_score:{Average(f1_scores_without_features)*100}")
print(f"Precision:{Average(precision_scores_without_features)*100}")
print(f"Recall:{Average(recall_scores_without_features)*100}")
print(f"Exact Match:{Average(em_scores_without_features)*100}")
print(f"Jaccard Index:{Average(JI_scores_without_features)*100}")

F1_score:80.74892141975855
Precision:82.03876595563916
Recall:84.19767936079747
Exact Match:68.36249232658072
Jaccard Index:79.38317262440603
