# Initialization dataset & tokenizer

In [76]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [77]:
quac = load_dataset("quac")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Reusing dataset quac (/home/jelle-van-der-lee/.cache/huggingface/datasets/quac/plain_text/1.1.0/4170258e7e72d7c81bd6441b3f3489ea1544f0ff226ce61e22bb00c6e9d01fb6)


  0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/jelle-van-der-lee/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-

# Data Preprocessing


In [78]:
import pandas as pd

quac_new = quac.remove_columns(('background', 'followups', 'yesnos', 'orig_answers', 'wikipedia_page_title'))

quac_train = pd.DataFrame.from_dict(quac_new["train"])
quac_val = pd.DataFrame.from_dict(quac_new["validation"])

def remove_CA(row):
    answers = row['answers']
    questions = row['questions']
    turn_ids = row['turn_ids']
    index = 0
    index_to_remove = []
    for answer in answers['texts']:
        if (answer[0]=='CANNOTANSWER'):
            index_to_remove.append(index)
        index += 1
    new_answers_texts = [j for i, j in enumerate(answers['texts']) if i not in index_to_remove]
    new_answers_starts = [j for i, j in enumerate(answers['answer_starts']) if i not in index_to_remove]
    new_answers = {'texts' : new_answers_texts, 'answer_starts' : new_answers_starts}
    new_questions = [j for i, j in enumerate(questions) if i not in index_to_remove]
    new_turn_ids = [j for i, j in enumerate(turn_ids) if i not in index_to_remove]
    new_context = row['context'].rsplit(' ', 1)[0]
    return row['dialogue_id'], row['section_title'], new_context, new_turn_ids, new_questions, new_answers
     

quac_train = quac_train.apply(lambda row: remove_CA(row), axis=1, result_type='broadcast')
quac_val = quac_val.apply(lambda row: remove_CA(row), axis=1, result_type='broadcast')

def define_prevQA(row):
    prev_QA = []
    prev_QA_2 = []
    for i in range(len(row['questions'])):
        if i == 0:
            prev_QA = prev_QA + [ [ [ [] , [] ] ] ]
        else:
            prev_QA_2 = prev_QA_2 + [ [ [row['questions'][i-1]] , [row['answers']['texts'][i-1]] ] ]
            prev_QA = prev_QA + [ prev_QA_2 ]

    return prev_QA

quac_train['prev_QA'] = quac_train.apply(lambda row: define_prevQA(row), axis=1)
quac_val['prev_QA'] = quac_val.apply(lambda row: define_prevQA(row), axis=1)

def explode(df):
    df['tmp']=df.apply(lambda row: list(zip(row['questions'],row['turn_ids'],row['texts'],row['answer_starts'],row['prev_QA'])), axis=1) 
    df=df.explode('tmp')
    df[['questions','turn_ids', 'texts', 'answer_starts', 'prev_QA']]=pd.DataFrame(df['tmp'].tolist(), index=df.index)
    df.drop(columns='tmp', inplace=True)
    return df

quac_train = quac_train.join(pd.DataFrame(quac_train.pop('answers').values.tolist()))
quac_val = quac_val.join(pd.DataFrame(quac_val.pop('answers').values.tolist()))

quac_train = explode(quac_train)
quac_val = explode(quac_val)

quac_train = quac_train.reset_index(drop=True)
quac_val = quac_val.reset_index(drop=True)

quac_train.rename(columns={'questions': 'question', 'texts': 'text', 'answer_starts': 'answer_start'}, inplace=True)
quac_val.rename(columns={'questions': 'question', 'texts': 'text', 'answer_starts': 'answer_start'}, inplace=True)

answer_columns = ['text', 'answer_start']

quac_train['answers'] = quac_train[answer_columns].to_dict(orient='records')
quac_val['answers'] = quac_val[answer_columns].to_dict(orient='records')

quac_train = quac_train.drop(columns=answer_columns)
quac_val = quac_val.drop(columns=answer_columns)

  return array(a, dtype, copy=False, order=order)


In [79]:
quac_val.drop(quac_val[quac_val['section_title']== 'Entry into politics'].index, inplace = True)
quac_val.reset_index()
def turn_id(row):
    if ((row['turn_ids'][-1:])=='0'):
        return 0
    else:
        return int(len(row['prev_QA']))
    
quac_train['turn_ids'] = quac_train.apply(lambda row: turn_id(row), axis=1)
quac_val['turn_ids'] = quac_val.apply(lambda row: turn_id(row), axis=1)

quac_train['turn_ids'] = quac_train['turn_ids'] + 1
quac_val['turn_ids'] = quac_val['turn_ids'] + 1
quac_train.rename(columns={'turn_ids': 'question_no'}, inplace=True)
quac_val.rename(columns={'turn_ids': 'question_no'}, inplace=True)

# Fine-tune model

In [80]:
pad_on_right = tokenizer.padding_side == "right"

max_length = 384
doc_stride = 128
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [81]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i] 
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [82]:
def add_history(row):
    question_no = row["question_no"]
    prev_QA = row["prev_QA"]
    if (question_no == 1):
        return (row["question"])
    try: 
        prev_ans = prev_QA[question_no - 2][1][0][0]
    except: 
        return (row["question"])
    prev_question = prev_QA[question_no - 2][0][0]
    return (prev_question + " " + prev_ans + " " + row["question"])

quac_train['question'] = quac_train.apply(lambda row: add_history(row), axis=1)
quac_val['question'] = quac_val.apply(lambda row: add_history(row), axis=1)


# def add_history(row):
#     question_no = row["question_no"]
#     prev_QA = row["prev_QA"]
#     if (question_no == 1):
#         return (row["question"])
#     elif (question_no == 2):
#         try: 
#             prev_ans = prev_QA[question_no - 2][1][0][0]
#         except: 
#             return (row["question"])
#         prev_question = prev_QA[question_no - 2][0][0]
#         return (prev_question + " " + prev_ans + " " + row["question"])
#     else:
#         try: 
#             prev_ans_1 = prev_QA[question_no - 2][1][0][0]
#             prev_ans_2 = prev_QA[question_no - 3][1][0][0]
#         except: 
#             return (row["question"])
#         prev_question_1 = prev_QA[question_no - 2][0][0] 
#         prev_question_2 = prev_QA[question_no - 3][0][0] 
#     #     print(prev_ans)
#         return (prev_question_2 + " " + prev_ans_2 + " " + prev_question_1 + " " + prev_ans_1 + " " + row["question"])

# quac_train['question'] = quac_train.apply(lambda row: add_history(row), axis=1)
# quac_val['question'] = quac_val.apply(lambda row: add_history(row), axis=1)

In [83]:
from datasets import Dataset
quac_train_dataset = quac_train.drop(['question_no', 'prev_QA'], axis=1)
quac_val_dataset = quac_val.drop(['question_no', 'prev_QA'], axis=1)

quac_train = quac_train.drop(['question_no', 'prev_QA'], axis=1)
quac_val = quac_val.drop(['question_no', 'prev_QA'], axis=1)

quac_train.rename(columns={'dialogue_id': 'id', 'section_title': 'title'}, inplace=True)
quac_val.rename(columns={'dialogue_id': 'id', 'section_title': 'title'}, inplace=True)

quac_train = quac_train.reset_index(drop=True)
quac_val = quac_val.reset_index(drop=True)

quac_train_dataset = Dataset.from_pandas(quac_train)
quac_val_dataset = Dataset.from_pandas(quac_val)

In [84]:
from datasets.dataset_dict import DatasetDict

quac_new = DatasetDict({'train': quac_train_dataset, 'validation': quac_val_dataset})
quac_new

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 69109
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5868
    })
})

# Tokenize & train model

In [85]:
# tokenized_quac = quac_new.map(preprocess_function_squad, batched=True)
# tokenized_quac = quac_new.map(preprocess_function, batched=True, remove_columns=quac_new["train"].column_names)
tokenized_quac = quac_new.map(prepare_train_features, batched=True, remove_columns=quac_new["train"].column_names)

  0%|          | 0/70 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [86]:
from transformers import DefaultDataCollator, EarlyStoppingCallback, IntervalStrategy

data_collator = DefaultDataCollator()

from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model_base = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/jelle-van-der-lee/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/be

In [87]:
import wandb
wandb.login()

%env WANDB_PROJECT=bsc_AI_thesis

env: WANDB_PROJECT=bsc_AI_thesis


In [88]:
import torch 
model_base.half()
for layer in model_base.modules():
    if isinstance(layer, torch.nn.BatchNorm2d):
        layer.float()

In [None]:
# !pip install rouge_score
# !pip install evaluate

# import evaluate
# import numpy as np
# def compute_metrics(eval_preds):
#     metric = evaluate.load("rouge")
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [89]:
args = TrainingArguments(
    "bert-base-uncased-finetuned-quac-1QA",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=30,
    weight_decay=0.01,
    save_total_limit = 2,
    load_best_model_at_end=True,
#     metric_for_best_model = 'f1',
    report_to="wandb",
    run_name="bert-base-uncased-finetuned-quac-1QA",
)

trainer_quac = Trainer(
    model=model_base,
    args=args,
    train_dataset=tokenized_quac["train"],
    eval_dataset=tokenized_quac["validation"],
    tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)],
    data_collator=data_collator,
#     compute_metrics=compute_metrics,
)
trainer_quac.train()

PyTorch: setting up devices
***** Running training *****
  Num examples = 156477
  Num Epochs = 30
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 391200
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,
4,0.0,
5,0.0,
6,0.0,


***** Running Evaluation *****
  Num examples = 15044
  Batch size = 12
Saving model checkpoint to bert-base-uncased-finetuned-quac-1QA/checkpoint-13040
Configuration saved in bert-base-uncased-finetuned-quac-1QA/checkpoint-13040/config.json
Model weights saved in bert-base-uncased-finetuned-quac-1QA/checkpoint-13040/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-quac-1QA/checkpoint-13040/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-quac-1QA/checkpoint-13040/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15044
  Batch size = 12
Saving model checkpoint to bert-base-uncased-finetuned-quac-1QA/checkpoint-26080
Configuration saved in bert-base-uncased-finetuned-quac-1QA/checkpoint-26080/config.json
Model weights saved in bert-base-uncased-finetuned-quac-1QA/checkpoint-26080/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-quac-1QA/checkpoint-26080/tokenizer_config.json
Speci

TrainOutput(global_step=78240, training_loss=0.01944774791506902, metrics={'train_runtime': 10229.9357, 'train_samples_per_second': 458.88, 'train_steps_per_second': 38.241, 'total_flos': 1.8399119671700582e+17, 'train_loss': 0.01944774791506902, 'epoch': 6.0})

In [90]:
wandb.finish()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/runtime,▄▂▂▁█▂
eval/samples_per_second,▅▇▇█▁▇
eval/steps_per_second,▅▇▇█▁▇
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁

0,1
eval/loss,
eval/runtime,46.3119
eval/samples_per_second,324.841
eval/steps_per_second,27.077
train/epoch,6.0
train/global_step,78240.0
train/learning_rate,2e-05
train/loss,0.0
train/total_flos,1.8399119671700582e+17
train/train_loss,0.01945


In [None]:
# from huggingface_hub import notebook_login

# notebook_login()