<a href="https://colab.research.google.com/github/Myrto-Iglezou/AI2-project4/blob/master/Question_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## YΣ19 Artificial Intelligence II
# Homework 4

### Iglezou Myrto - 111520170038

# Project Description



Build a BERT-based model that returns an answer, given a user question and a
passage that contains the answer to that question. For this question answering task, we
use the SQuAD 2.0 dataset. We start from the pretrained BERT-base model “bert-base-uncased”
and fine-tune it for the question answering task.


# **Question 3** 

### Load SQuAD 2.0 dataset from drive

In [56]:
import io
import os
from google.colab import drive
import pandas as pd 
import numpy as np
import json
import sys

# Mount Google Drive at /content/drive (force_remount=True is passed on every run).
drive.mount('/content/drive',force_remount=True)

# Add the Drive root folder to the module search path.
sys.path.append('/content/drive/My Drive/')

# Copy the two dataset JSON files and the two helper scripts from Drive into /content.
!cp -r "/content/drive/My Drive/train-v2.0.json" '/content/'
!cp -r "/content/drive/My Drive/dev-v2.0.json" '/content/'
!cp -r "/content/drive/My Drive/utils_squad.py" '/content/'
!cp -r "/content/drive/My Drive/utils_squad_evaluate.py" '/content/'

# Local paths of the copied dataset files.
train_file = '/content/train-v2.0.json'
validation_file = '/content/dev-v2.0.json'

# Parse both JSON files; the resulting objects are consumed by create_squad_examples.
with open(train_file) as f:
    raw_train_data = json.load(f)
with open(validation_file) as f:
    raw_val_data = json.load(f)

Mounted at /content/drive


In [2]:
%%capture
# Install the BERT-related packages; %%capture keeps pip's output out of the notebook.
!pip install pytorch-pretrained-bert pytorch-nlp pytorch_transformers

In [3]:
%%capture
# Install the `transformers` package that the model/tokenizer imports below come from.
!pip install transformers

In [35]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BertForQuestionAnswering
from tokenizers import BertWordPieceTokenizer

# Load the pre-trained tokenizer (vocabulary) and save it to disk so that the
# fast WordPiece tokenizer can be built from the saved vocab.txt.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
# exist_ok=True makes the cell safely re-runnable without a separate existence check.
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the saved vocabulary file (path derived from
# save_path so the directory name is only written once).
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)

# Fixed input length: SquadSample.preprocess pads shorter inputs to this length
# and marks longer ones as invalid.
max_len = 384


In [65]:
class SquadSample:
    """One SQuAD question/answer pair together with the inputs built for the model.

    ``preprocess`` relies on two module-level names: ``tokenizer`` (its
    ``encode(text)`` result must expose ``.ids`` and ``.offsets``) and
    ``max_len`` (the fixed length every input sequence is padded to).
    """

    def __init__(self, context, question, basic_answer, more_answers, start_idx):
        """Store the raw fields; derived fields are filled in by ``preprocess``.

        Args:
            context: passage text that contains the answer.
            question: question text.
            basic_answer: text of the reference answer used as training target.
            more_answers: list with the texts of all reference answers.
            start_idx: character offset of ``basic_answer`` inside ``context``.
        """
        self.context = context
        self.question = question
        self.basic_answer = basic_answer
        self.more_answers = more_answers
        self.start_idx = start_idx
        self.end_idx = None
        self.start_idx_token = start_idx
        self.end_idx_token = None
        self.offsets = None
        self.input_ids = None
        self.attention_mask = None
        self.token_type_ids = None
        self.validExample = True

    def preprocess(self):
        """Tokenize the sample and compute model inputs and token-level targets.

        Sets ``validExample`` to ``False`` and returns early when the answer
        span does not fit inside the context, when no context token overlaps
        the answer, or when the encoded input is longer than ``max_len``.
        On an early return the fields computed after that point are left as
        they were.
        """
        # Collapse every run of whitespace to a single space.
        # NOTE(review): start_idx is not adjusted after this normalization, so it
        # can drift when the raw context has repeated whitespace before the
        # answer -- confirm against the dataset.
        self.context = " ".join(str(self.context).split())
        self.question = " ".join(str(self.question).split())
        self.basic_answer = " ".join(str(self.basic_answer).split())

        # end_idx is the exclusive character end of the answer span.
        self.end_idx = self.start_idx + len(self.basic_answer)
        # end_idx == len(context) means the answer is the last thing in the
        # context, which is a valid span; only a real overrun (or a negative
        # start) is rejected.
        if self.start_idx < 0 or self.end_idx > len(self.context):
            self.validExample = False
            return

        encoded_context = tokenizer.encode(self.context)

        # Indices of the context tokens whose character span overlaps the
        # answer span [start_idx, end_idx). Zero-width offsets never overlap.
        answer_token_ids = [
            idx
            for idx, (tok_start, tok_end) in enumerate(encoded_context.offsets)
            if max(tok_start, self.start_idx) < min(tok_end, self.end_idx)
        ]
        if not answer_token_ids:
            self.validExample = False
            return

        # Targets to predict: first and last token of the answer.
        self.start_idx_token = answer_token_ids[0]
        self.end_idx_token = answer_token_ids[-1]
        self.offsets = encoded_context.offsets

        encoded_question = tokenizer.encode(self.question)
        # The first id of the question encoding is dropped before concatenation
        # (presumably the leading [CLS] token -- confirm).
        question_ids = encoded_question.ids[1:]

        # Model inputs: context first (segment 0), then question (segment 1).
        self.input_ids = encoded_context.ids + question_ids
        self.attention_mask = [1] * len(self.input_ids)
        self.token_type_ids = [0] * len(encoded_context.ids) + [1] * len(question_ids)

        padding_length = max_len - len(self.input_ids)
        if padding_length < 0:
            # Too long for the fixed input size: skip the sample.
            self.validExample = False
            return

        # Right-pad all three sequences with zeros up to max_len.
        padding = [0] * padding_length
        self.input_ids = self.input_ids + padding
        self.attention_mask = self.attention_mask + padding
        self.token_type_ids = self.token_type_ids + padding

In [49]:
def create_squad_examples(raw_data):
    """Build preprocessed ``SquadSample`` objects from a parsed SQuAD JSON object.

    Args:
        raw_data: dict with a ``"data"`` list; each item has ``"paragraphs"``,
            each paragraph a ``"context"`` and a ``"qas"`` list whose entries
            carry ``"question"`` and ``"answers"``.

    Returns:
        List of ``SquadSample`` objects on which ``preprocess()`` has already
        been called. Questions with an empty ``"answers"`` list are skipped.
    """
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                answers = qa["answers"]
                if not answers:
                    # No reference answer: there is no span to learn from. Skipping
                    # here also avoids re-appending the previously built sample.
                    continue
                # The first answer is the training target; all answer texts are
                # kept for evaluation.
                answer_text = answers[0]["text"]
                all_answers = [answer["text"] for answer in answers]
                start_char_idx = answers[0]["answer_start"]
                squad_eg = SquadSample(context, qa["question"], answer_text,
                                       all_answers, start_char_idx)
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples

In [72]:
def create_inputs_targets(squad_examples):
    """Stack the fields of all valid samples into NumPy arrays.

    Args:
        squad_examples: iterable of objects exposing ``validExample``,
            ``input_ids``, ``attention_mask``, ``token_type_ids``,
            ``start_idx_token`` and ``end_idx_token``.

    Returns:
        ``(x, y)`` where ``x`` is ``[input_ids, attention_mask, token_type_ids]``
        and ``y`` is ``[start_idx_token, end_idx_token]``; every entry is an
        ``int64`` array with one row/element per valid sample.
    """
    keys = ("input_ids", "attention_mask", "token_type_ids",
            "start_idx_token", "end_idx_token")
    dataset_dict = {key: [] for key in keys}
    for item in squad_examples:
        if item.validExample is True:
            for key in keys:
                dataset_dict[key].append(getattr(item, key))
    # All fields are integer ids/indices. float16 cannot represent integers
    # above 2048 exactly, which would silently change vocabulary ids, so an
    # integer dtype is required here.
    for key in keys:
        dataset_dict[key] = np.array(dataset_dict[key], dtype=np.int64)
    x = [dataset_dict["input_ids"], dataset_dict["attention_mask"], dataset_dict["token_type_ids"]]
    y = [dataset_dict["start_idx_token"], dataset_dict["end_idx_token"]]
    return x, y

In [66]:
# Build the (already preprocessed) SquadSample list for the training split.
data = create_squad_examples(raw_train_data)

# Same for the validation split; the validation loop reads these samples back.
val_data = create_squad_examples(raw_val_data)

In [67]:
# Tabular preview of the training samples: one row per SquadSample, one column
# per instance attribute. Note that `train_data` is rebound to a TensorDataset
# in a later cell.
train_data = pd.DataFrame.from_records([vars(line) for line in data])
train_data[["context","question","basic_answer"]].head()

Unnamed: 0,context,question,basic_answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s


In [74]:
# x_* = [input_ids, attention_mask, token_type_ids], y_* = [start token, end token],
# built from the valid samples only.
x_train, y_train = create_inputs_targets(data)

x_eval, y_eval = create_inputs_targets(val_data)

In [69]:
# NOTE(review): of these four settings only batch_size is referenced by the
# cells visible in this notebook (DataLoader construction); the other three
# are not used by the code shown here.
doc_stride = 64
max_seq_length = 128
max_query_length = 32
batch_size = 16

In [78]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Training set as tensors, in the order the training loop unpacks a batch:
# input ids, attention mask, token type ids, start token index, end token index.
train_input_ids, train_attention_mask, train_token_type_ids = x_train
train_start_tokens, train_end_tokens = y_train

train_data = TensorDataset(
    torch.tensor(train_input_ids, dtype=torch.int64),
    torch.tensor(train_attention_mask, dtype=torch.float),
    torch.tensor(train_token_type_ids, dtype=torch.int64),
    torch.tensor(train_start_tokens, dtype=torch.int64),
    torch.tensor(train_end_tokens, dtype=torch.int64),
)

# Batches are drawn in random order during training.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

In [79]:
# Validation set as tensors, same field order as the training set.
eval_input_ids, eval_attention_mask, eval_token_type_ids = x_eval
eval_start_tokens, eval_end_tokens = y_eval

eval_data = TensorDataset(
    torch.tensor(eval_input_ids, dtype=torch.int64),
    torch.tensor(eval_attention_mask, dtype=torch.float),
    torch.tensor(eval_token_type_ids, dtype=torch.int64),
    torch.tensor(eval_start_tokens, dtype=torch.int64),
    torch.tensor(eval_end_tokens, dtype=torch.int64),
)

# Sequential order keeps the batches aligned with the order of the valid samples.
eval_sampler = SequentialSampler(eval_data)
validation_dataloader = DataLoader(
    eval_data,
    sampler=eval_sampler,
    batch_size=batch_size,
)

# A second random-order loader over the training set, kept under its own name;
# the training loop iterates `train_dataloader`.
train_sampler = RandomSampler(train_data)
train_data_loader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

In [80]:
import torch
# Pick the compute device: CUDA when torch reports it as available, otherwise CPU.
train_on_gpu = torch.cuda.is_available()
device = 'cuda' if train_on_gpu else 'cpu'

print('Training on GPU.' if train_on_gpu else 'No GPU available, training on CPU.')

No GPU available, training on CPU.


In [82]:
# Load the pretrained checkpoint with a question-answering head and move it to
# the selected device.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model = model.to(device=device)

# (name, parameter) pairs, used to split parameters into optimizer groups.
param_optimizer = list(model.named_parameters())

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [83]:
# Parameters whose name contains one of these substrings get no weight decay.
# 'LayerNorm.weight' matches the parameter naming shown in the checkpoint log
# above; 'gamma'/'beta' are kept for checkpoints that use those names.
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
optimizer_grouped_parameters = [
    # torch optimizers read the per-group key 'weight_decay'; any other key
    # (such as 'weight_decay_rate') is carried along but never applied.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# AdamW applies the decay decoupled from the gradient update, so the per-group
# 'weight_decay' values above act as true weight decay rather than an L2 term.
optimizer = torch.optim.AdamW(lr=1e-5, betas=(0.9, 0.98), eps=1e-9, params=optimizer_grouped_parameters)

In [85]:
import string

def normalize_text(text):
    """Normalize an answer string for exact-match comparison.

    Lowercases the text, removes every character in ``string.punctuation``,
    replaces the standalone words "a", "an" and "the" with a space, and
    collapses all whitespace runs to single spaces.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    # Local import: `re` is not imported at notebook level.
    import re

    # Build the lookup set once instead of once per character.
    punctuation = set(string.punctuation)

    text = text.lower()
    text = "".join(ch for ch in text if ch not in punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text, flags=re.UNICODE)
    return " ".join(text.split())

In [None]:
epochs = 1

for epoch in range(1, epochs + 1):
    # ============================================ TRAINING ============================================================
    print("Training epoch ", str(epoch))

    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)

        # Field order matches the TensorDataset built for training.
        inputs = {'input_ids':       batch[0],
                  'attention_mask':  batch[1],
                  'token_type_ids':  batch[2],
                  'start_positions': batch[3],
                  'end_positions':   batch[4]}

        optimizer.zero_grad()

        # The first output is used as the training loss.
        outputs = model(**inputs)
        loss = outputs[0]

        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        nb_tr_steps += 1

    # max(..., 1) avoids a division by zero when the loader yields no batches.
    print(f"\nTraining loss={tr_loss / max(nb_tr_steps, 1):.4f}")

    # ============================================ VALIDATION ==========================================================
    model.eval()
    current_query = 0
    correct_ans = 0
    # The validation tensors were built from the valid samples only and are read
    # sequentially, so the i-th prediction belongs to the i-th valid sample.
    valid_examples = [x for x in val_data if x.validExample is True]
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)

        input_ids, attention_mask, token_type_ids, start_positions, end_positions = batch

        with torch.no_grad():
            start_logits, end_logits = model(input_ids=input_ids,
                                             attention_mask=attention_mask,
                                             token_type_ids=token_type_ids,
                                             return_dict=False)

        pred_start, pred_end = start_logits.cpu().numpy(), end_logits.cpu().numpy()

        for start, end in zip(pred_start, pred_end):
            squad_eg = valid_examples[current_query]
            current_query += 1
            # offsets hold the character span of each context token; the context
            # tokens come first in the input, so a token index maps to them directly.
            offsets = squad_eg.offsets
            start = np.argmax(start)
            end = np.argmax(end)
            if start >= len(offsets):
                # Predicted start lies outside the context tokens: counted as wrong.
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                pred_ans = squad_eg.context[pred_char_start:]
            # Exact match against any reference answer after normalization.
            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [normalize_text(x) for x in squad_eg.more_answers]
            if normalized_pred_ans in normalized_true_ans:
                correct_ans += 1
    # max(..., 1) avoids a division by zero on an empty validation set.
    acc = correct_ans / max(len(y_eval[0]), 1)

    print(f"\nAccuracy score={acc:.2f}")

Training epoch  1


## References



*   https://github.com/nlpyang/pytorch-transformers/tree/master/examples
*  https://github.com/flogothetis/SQuAD-QueryAnswering-BERT-Keras/blob/main/SQuAD_QuestionAnswering_Bert.ipynb
*  https://github.com/dredwardhyde/bert-examples/blob/main/bert_squad_pytorch.py?fbclid=IwAR1VGhZx6MsVlOha3lDX_uC8PASSDu9ECKceD2XCHGSetKhldgay0F8SirY
* https://colab.research.google.com/drive/1Zp2_Uka8oGDYsSe5ELk-xz6wIX8OIkB7?fbclid=IwAR1zl-nOBSOdYA4H-WY-ba6AJ--hRHM2OZhgK3DrQ1SLfavln5M-k-r4jJ4#scrollTo=j3_CAQUf2asD

