In [1]:
# Download the raw QuAC v0.2 train/validation files (one-off step; uncomment on first run).
# !mkdir data
# !wget -P data/ https://s3.amazonaws.com/my89public/quac/train_v0.2.json --no-check-certificate
# !wget -P data/ https://s3.amazonaws.com/my89public/quac/val_v0.2.json --no-check-certificate

# Preprocessing: run the toolbox script on each raw file. The two output files
# (data/quac_train.json, data/quac_dev.json) are the ones loaded later in this notebook.
!python3 toolbox/download_process_quac.py --quac_file data/train_v0.2.json --output_file data/quac_train.json
!python3 toolbox/download_process_quac.py --quac_file data/val_v0.2.json --output_file data/quac_dev.json

In [2]:
import pandas as pd
from datasets import Dataset
from datasets.dataset_dict import DatasetDict

import torch.nn as nn
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, DataCollatorWithPadding
from transformers import BertTokenizerFast, LongformerTokenizerFast
from transformers import BertForQuestionAnswering, LongformerForQuestionAnswering

In [3]:
def add_history(qid, data, i, n_history=None):
    """Build the question text for turn ``i``, prefixed with previous dialog turns.

    The result is the space-joined sequence
    ``q[i-k] a[i-k] ... q[i-1] a[i-1] q[i]`` where ``k`` is the number of
    previous turns to include.

    Args:
        qid: Question id whose trailing digits are the turn number inside the
            dialog (e.g. ``'..._q#10'`` -> turn 10).
        data: Indexable collection of single-turn records; each record is read
            as ``record['paragraphs'][0]['qas'][0]`` with ``'question'`` and
            ``'answers'[0]['text']`` fields.
        i: Position of the current turn in ``data``.
        n_history: Maximum number of previous turns to prepend, or ``'all'``
            for every earlier turn of the dialog. When ``None`` (default) the
            notebook-level ``n_history`` variable is used (0 if undefined).

    Returns:
        str: History turns (question then answer text, oldest first) followed
        by the current question.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    if n_history is None:
        # Backward compatible with the original hidden-global behaviour.
        n_history = globals().get('n_history', 0)

    # Parse the whole trailing number. Reading only qid[-1] turned turn 10/11
    # into 0/1 and silently dropped (or shortened) their history.
    trailing_digits = re.search(r'(\d+)$', qid)
    turn_number = int(trailing_digits.group(1)) if trailing_digits else 0

    if n_history == 'all':
        n_prev = turn_number
    else:
        n_prev = min(max(int(n_history), 0), turn_number)
    # Never reach before the start of `data` (negative indices would wrap around).
    n_prev = min(n_prev, i)

    history = []
    for offset in range(n_prev, 0, -1):
        previous = data[i - offset]['paragraphs'][0]['qas'][0]
        history.append(previous['question'])
        history.append(previous['answers'][0]['text'])

    history.append(data[i]['paragraphs'][0]['qas'][0]['question'])
    return ' '.join(history)
        

In [4]:
def add_end_pos(answers, docs):
    """Add an ``'answer_end'`` character offset to every answer dict, in place.

    Args:
        answers: List of dicts with ``'answer_start'`` (int) and ``'text'`` (str).
        docs: List of context strings, parallel to ``answers``.

    Returns:
        The same ``answers`` list. ``answer['answer_end']`` is the exclusive
        end offset when ``context[answer_start:answer_end] == text``, and
        ``None`` when the answer text is not found at ``answer_start``.
    """
    for answer, context in zip(answers, docs):
        start = answer['answer_start']
        end_pos = start + len(answer['text'])
        if context[start:end_pos] == answer['text']:
            answer['answer_end'] = end_pos
        else:
            # The original built an Exception here without raising it, leaving
            # the key missing. The chunking code checks for answer_end being
            # None to label unanswerable questions, so record that explicitly
            # instead of relying on a missing key.
            answer['answer_end'] = None
    return answers

def extract_info(data):
    """Flatten single-turn dialog records into parallel column lists.

    Every record must hold exactly one paragraph with exactly one QA pair
    (enforced by the assertions). The question column contains the output of
    ``add_history`` rather than the raw question, and every answer gets an
    ``'answer_end'`` field via ``add_end_pos``.

    Args:
        data: Iterable, indexable collection of dialog records.

    Returns:
        dict: Column name -> list, one entry per record, with keys
        ``questions, docs, answers, ids, is_impossible, yesno, followups,
        answer_candidates``.
    """
    columns = {
        'questions': [],
        'docs': [],
        'answers': [],
        'ids': [],
        'is_impossible': [],
        'yesno': [],
        'followups': [],
        'answer_candidates': [],
    }

    for position, dialog in enumerate(data):
        assert len(dialog)==1
        assert len(dialog['paragraphs'])==1
        assert len(dialog['paragraphs'][0]['qas'])==1

        paragraph = dialog['paragraphs'][0]
        qa = paragraph['qas'][0]

        # Document / context for this turn.
        columns['docs'].append(paragraph['context'])

        # Question text, extended with dialog history for this position.
        question_with_history = add_history(qa['id'], data, position)

        columns['ids'].append(qa['id'])
        columns['questions'].append(question_with_history)
        columns['answers'].append(qa['answers'][0])
        columns['is_impossible'].append(qa['is_impossible'])
        columns['yesno'].append(qa['yesno'])
        columns['followups'].append(qa['followup'])
        columns['answer_candidates'].append(qa['answer_candidates'])

    columns['answers'] = add_end_pos(columns['answers'], columns['docs'])
    return columns


def load_process_data(train_dir, val_dir):
    """Load the preprocessed train/validation JSON files into a ``DatasetDict``.

    Args:
        train_dir: Path to the training JSON file (its ``'data'`` field is used).
        val_dir: Path to the validation JSON file (its ``'data'`` field is used).

    Returns:
        DatasetDict with ``'train'`` and ``'validation'`` splits built from
        the columns produced by ``extract_info``.
    """
    split_paths = (('train', train_dir), ('validation', val_dir))

    # Same stage order as before: read both files, then extract, then wrap.
    raw_records = {name: pd.read_json(path)['data'] for name, path in split_paths}
    extracted = {name: extract_info(records) for name, records in raw_records.items()}
    return DatasetDict({name: Dataset.from_dict(cols) for name, cols in extracted.items()})

In [5]:
def add_token_positions(val, answers):
    """Convert character-level answer spans into token-level start/end labels.

    Args:
        val: Tokenizer output exposing ``char_to_token(batch_index, char_index,
            sequence_index=...)``.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``
            character offsets into the context (sequence index 1).

    Returns:
        tuple[list, list]: ``(start_positions, end_positions)``, one entry per
        answer. When ``char_to_token`` returns ``None`` (the answer was
        truncated away) the position is set to ``tokenizer.model_max_length``
        (the module-level ``tokenizer``).

    Raises:
        ValueError: If an answer has no usable ``'answer_end'`` (0 or None).
            The original only constructed an ``Exception`` without raising it
            and appended nothing, which left ``end_positions`` shorter than
            ``start_positions`` and misaligned every later label.
    """
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        answer_end = answer['answer_end']
        if not answer_end:
            raise ValueError(
                f"answer {i} has no usable 'answer_end' ({answer_end!r}); "
                "cannot compute its end token"
            )

        # char_to_token can return None when the character is not in the encoded context.
        start_token = val.char_to_token(i, answer['answer_start'], sequence_index=1)
        # answer_end is exclusive, so the last answer character is at answer_end - 1.
        end_token = val.char_to_token(i, answer_end - 1, sequence_index=1)

        # None means the answer passage has been truncated; fall back to a
        # sentinel position (known to be a crude approach).
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    return start_positions, end_positions

max_length = 384 # Maximum length of one feature (question plus context chunk); passed as `max_length` to the tokenizer in encode().
doc_stride = 128 # Allowed overlap between two consecutive chunks when a long context has to be split; passed as `stride` in encode().

def add_token_positions_chunking(tokenized_examples, all_answers):
    """Compute start/end token labels for every chunk of a tokenized batch.

    Args:
        tokenized_examples: Tokenizer output that contains
            ``"overflow_to_sample_mapping"``, ``"offset_mapping"`` and
            ``"input_ids"`` and exposes ``sequence_ids(i)``.
        all_answers: Per-example answer dicts with ``"answer_start"`` and
            ``"answer_end"`` character offsets; ``"answer_end"`` is ``None``
            for questions without an answer span.

    Returns:
        tuple[list, list]: ``(start_positions, end_positions)`` with one entry
        per chunk. Chunks whose example has no answer, or whose context window
        does not fully contain the answer, are labelled with the index of the
        CLS token (read from the module-level ``tokenizer``).
    """
    start_positions = []
    end_positions = []

    sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    offset_mapping = tokenized_examples["offset_mapping"]

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]

        # Impossible answers are labelled with the index of the CLS token.
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Sequence id per token: 0 == question, 1 == context (other values for special tokens).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can produce several chunks; find the example this chunk came from.
        sample_index = sample_mapping[i]
        answers = all_answers[sample_index]

        # Questions without an answer span carry answer_end None and get the CLS label.
        # `is None` is the idiomatic identity check (was `== None`).
        if answers["answer_end"] is None:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answers["answer_start"]
            end_char = answers["answer_end"]

            # First token that belongs to the context (sequence id 1).
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # Last token that belongs to the context in this chunk.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Answer not fully inside this chunk's context window -> CLS label.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Advance past every token starting at or before start_char;
                # the answer's first token is the one just before the stop point.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)
                # Walk back past every token ending at or after end_char;
                # the answer's last token is the one just after the stop point.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)
    return start_positions, end_positions

def encode(examples):
    """Tokenize a batch of question/context pairs into overlapping chunks with answer labels.

    Only the context (second sequence) is truncated; contexts that do not fit
    in ``max_length`` are split into several features overlapping by
    ``doc_stride``. Token-level ``start_positions`` / ``end_positions`` are
    computed per chunk and added to the returned encoding.
    """
    tokenizer_options = dict(
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )
    chunked = tokenizer(examples["questions"], examples["docs"], **tokenizer_options)

    starts, ends = add_token_positions_chunking(chunked, examples["answers"])
    chunked.update({'start_positions': starts, 'end_positions': ends})
    return chunked

In [6]:
# Preprocessed QuAC files written by the preprocessing commands in the first cell.
train_dir = 'data/quac_train.json' 
val_dir = 'data/quac_dev.json' 

# Number of previous question/answer turns add_history() prepends to each
# question; with 0 only the current question is used. Must be set before
# load_process_data() runs because add_history() reads this module-level name.
n_history = 0
dataset = load_process_data(train_dir, val_dir)

In [7]:
# Module-level tokenizer: encode(), the add_token_positions* helpers and
# predictTheAnswer() all read this global name directly.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', cache_dir="bert_base/")


In [8]:
# Tokenize both splits in batches. encode() can return more rows than it
# receives (one per context chunk), so all original columns are dropped.
train =  dataset["train"].map(encode, batched=True, remove_columns=dataset["train"].column_names)
val =  dataset["validation"].map(encode, batched=True, remove_columns=dataset["validation"].column_names)

  0%|          | 0/84 [00:00<?, ?ba/s]

KeyboardInterrupt: 

In [54]:
def prepare_model(trainSet,valSet,tokenizer):
    """Create a ``Trainer`` that fine-tunes ``bert-base-uncased`` for extractive QA.

    Args:
        trainSet: Tokenized training dataset (with start/end position labels).
        valSet: Tokenized validation dataset.
        tokenizer: Tokenizer used for dynamic padding and saved with checkpoints.

    Returns:
        Trainer: Configured with step-based evaluation/checkpointing every
        10000 steps, best-model reloading and early stopping (patience 3).
        Training is not started here.
    """
    run_label = "bert-base-uncased-finetuned-quac-chunking"

    qa_model = BertForQuestionAnswering.from_pretrained('bert-base-uncased', cache_dir="bert_base/")
    qa_model.cuda()

    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print('preparing model...')

    # Options that are currently left disabled: gradient_accumulation_steps=2,
    # fp16=True, warmup_ratio=0.02.
    training_args = TrainingArguments(
        run_label,
        run_name=run_label,
        report_to="wandb",
        # Evaluation and checkpointing share one step schedule.
        evaluation_strategy="steps",
        eval_steps=10000,
        save_strategy="steps",
        save_steps=10000,
        save_total_limit=3,
        load_best_model_at_end=True,
        logging_steps=5000,
        # Optimisation settings.
        learning_rate=2e-5,
        weight_decay=0.01,
        adafactor=True,
        num_train_epochs=30,
        # Batching.
        per_device_train_batch_size=10,
        per_device_eval_batch_size=10,
        group_by_length=True,
    )

    early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
    trainer = Trainer(
        model=qa_model,
        args=training_args,
        train_dataset=trainSet,
        eval_dataset=valSet,
        tokenizer=tokenizer,
        data_collator=padding_collator,
        callbacks=[early_stopping],
    )

    print('ready to train!')
    return trainer

In [55]:
# Build the trainer from the tokenized splits and the module-level tokenizer.
trainer_quac = prepare_model(train,val,tokenizer)

# Start fine-tuning; checkpoints are written under the output directory set in prepare_model().
trainer_quac.train()

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at bert_base/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache at bert_base/a8041bf617d7f94

preparing model...
ready to train!


***** Running training *****
  Num examples = 83568
  Num Epochs = 30
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 250710
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


RuntimeError: CUDA out of memory. Tried to allocate 120.00 MiB (GPU 0; 10.74 GiB total capacity; 6.75 GiB already allocated; 83.19 MiB free; 6.81 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
import wandb
# Close the active Weights & Biases run (training reports to wandb).
wandb.finish()

In [27]:
import torch
def predictTheAnswer(text,question,model):
    """Predict an answer span for ``question`` inside ``text`` with a QA model.

    Args:
        text: Context passage to search for the answer.
        question: Question string (first sequence of the encoded pair).
        model: Question-answering model whose first two outputs are the start
            and end logits. Inputs are moved to ``model.device``.

    Returns:
        str: Text decoded from the tokens between the highest-scoring start
        and end positions (empty when the end falls before the start).
    """
    # Truncate only the context so the question is never cut, consistent with
    # encode(). The input is a single window of at most 512 tokens (no chunking).
    inputs = tokenizer.encode_plus(
        question, text, return_tensors='pt', max_length=512, truncation="only_second"
    ).to(model.device)  # follow the model's device instead of hard-coding 'cuda'

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    # The batch holds a single example, so argmax over the flattened logits is
    # the token index.
    answer_start = int(torch.argmax(outputs[0]))
    answer_end = int(torch.argmax(outputs[1])) + 1  # exclusive end

    # NOTE(review): training labels unanswerable chunks with the CLS index; a
    # CLS prediction is returned here as the literal CLS token text. Confirm
    # that this is what the evaluation script expects.
    answer_ids = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
    return answer

In [28]:
def prediction(contents,model):
    """Run ``predictTheAnswer`` for every example and collect answers by question id.

    Args:
        contents: Column-addressable collection (e.g. a ``datasets.Dataset`` or
            a dict of lists) with parallel ``'docs'``, ``'ids'`` and
            ``'questions'`` columns.
        model: Question-answering model passed through to ``predictTheAnswer``.

    Returns:
        dict: Question id -> predicted answer string.
    """
    # Fetch each column once. Indexing contents['docs'][i] inside the loop
    # re-read the whole column on every iteration, and len(contents) is the
    # number of keys (not rows) when a plain dict is passed.
    docs = contents['docs']
    ids = contents['ids']
    questions = contents['questions']

    predAnswers = {}
    for qid, text, question in zip(ids, docs, questions):
        predAnswers[qid] = predictTheAnswer(text, question, model)
    return predAnswers

In [32]:
# Load the fine-tuned weights from the step-20000 checkpoint directory and move the model to the GPU.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased-finetuned-quac-chunking/checkpoint-20000')
model.cuda()

loading configuration file bert-base-uncased-finetuned-quac-Q0-chunking/checkpoint-20000/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file bert-base-uncased-finetuned-quac-Q0-chunking/checkpoint-20000/pytorch_model.bin
All model checkpoint weights were used when initializing BertForQuestionAnswering.

All the weights 

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [33]:
# Predict on the untokenized validation split (prediction() reads its 'docs', 'ids' and 'questions' columns).
predAnswers=prediction(dataset['validation'],model)

In [34]:
import json
# Write the id -> answer mapping as one JSON object; results.txt is the file passed to the evaluation script.
with open('results.txt', 'w') as convert_file:
     convert_file.write(json.dumps(predAnswers))

In [None]:
# Run the evaluate-v2.0.py script on the dev file and the predictions written to results.txt.
!python3 evaluate-v2.0.py ./data/quac_dev.json results.txt