# Initialization dataset & tokenizer

In [1]:
# !pip install datasets
# !pip install transformers

from datasets import load_dataset
from transformers import AutoTokenizer



In [28]:
quac = load_dataset("quac")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

Reusing dataset quac (/home/jelle-van-der-lee/.cache/huggingface/datasets/quac/plain_text/1.1.0/4170258e7e72d7c81bd6441b3f3489ea1544f0ff226ce61e22bb00c6e9d01fb6)


  0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

# Data Preprocessing


In [3]:
import pandas as pd

quac_new = quac.remove_columns(('background', 'followups', 'yesnos', 'orig_answers', 'wikipedia_page_title'))

quac_train = pd.DataFrame.from_dict(quac_new["train"])
quac_val = pd.DataFrame.from_dict(quac_new["validation"])

In [4]:
def define_prevQA(row):
    prev_QA = []
    prev_QA_2 = []
    for i in range(len(row['questions'])):
        if i == 0:
            prev_QA = prev_QA + [ [ [ [] , [] ] ] ]
        else:
            prev_QA_2 = prev_QA_2 + [ [ [row['questions'][i-1]] , [row['answers']['texts'][i-1]] ] ]
            prev_QA = prev_QA + [ prev_QA_2 ]

    return prev_QA

quac_train['prev_QA'] = quac_train.apply(lambda row: define_prevQA(row), axis=1)
quac_val['prev_QA'] = quac_val.apply(lambda row: define_prevQA(row), axis=1)

In [5]:
def explode(df):
    df['tmp']=df.apply(lambda row: list(zip(row['questions'],row['turn_ids'],row['texts'],row['answer_starts'],row['prev_QA'])), axis=1) 
    df=df.explode('tmp')
    df[['questions','turn_ids', 'texts', 'answer_starts', 'prev_QA']]=pd.DataFrame(df['tmp'].tolist(), index=df.index)
    df.drop(columns='tmp', inplace=True)
    return df

In [6]:
quac_train = quac_train.join(pd.DataFrame(quac_train.pop('answers').values.tolist()))
quac_val = quac_val.join(pd.DataFrame(quac_val.pop('answers').values.tolist()))

# quac_train = quac_train.explode(['questions','turn_ids', 'texts', 'answer_starts', 'prev_QA'])
# quac_val = quac_val.explode(['questions','turn_ids', 'texts', 'answer_starts', 'prev_QA'])
quac_train = explode(quac_train)
quac_val = explode(quac_val)

quac_train = quac_train.reset_index(drop=True)
quac_val = quac_val.reset_index(drop=True)

quac_train.rename(columns={'questions': 'question', 'texts': 'text', 'answer_starts': 'answer_start'}, inplace=True)
quac_val.rename(columns={'questions': 'question', 'texts': 'text', 'answer_starts': 'answer_start'}, inplace=True)

In [7]:
answer_columns = ['text', 'answer_start']

quac_train['answers'] = quac_train[answer_columns].to_dict(orient='records')
quac_val['answers'] = quac_val[answer_columns].to_dict(orient='records')

quac_train = quac_train.drop(columns=answer_columns)
quac_val = quac_val.drop(columns=answer_columns)

In [8]:
# Change turn ids (long unique number) to question nr of dialogue 
quac_train['turn_ids'] = quac_train['turn_ids'].str[-1:] 
quac_val['turn_ids'] = quac_val['turn_ids'].str[-1:]

quac_train['turn_ids'] = quac_train['turn_ids'].astype('int')
quac_val['turn_ids'] = quac_val['turn_ids'].astype('int')

quac_train['turn_ids'] = quac_train['turn_ids'] + 1
quac_val['turn_ids'] = quac_val['turn_ids'] + 1
quac_train.rename(columns={'turn_ids': 'question_no'}, inplace=True)
quac_val.rename(columns={'turn_ids': 'question_no'}, inplace=True)

# Fine-tune model

In [26]:
from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_max_length=512)

# tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased", model_max_length=512)

In [10]:
pad_on_right = tokenizer.padding_side == "right"

In [11]:
max_length = 384
doc_stride = 128
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [12]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i] 
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [13]:
i = 0
for value in quac_train["answers"]:
    if (value["text"]==['CANNOTANSWER']):
        quac_train = quac_train.drop(i)
    # print(value)
    i += 1

j = 0
for value in quac_val["answers"]:
    if (value["text"][0]=='CANNOTANSWER'):
        quac_val = quac_val.drop(j)
    # print(value)
    j += 1
    
quac_train = quac_train.reset_index(drop=True)
quac_val = quac_val.reset_index(drop=True)

In [14]:
def add_history(row):
    question_no = row["question_no"]
    prev_QA = row["prev_QA"]
    if (question_no == 1):
        return (row["question"])
    elif (question_no == 2):
        try: 
            prev_ans = prev_QA[question_no - 2][1][0][0]
        except: 
            return (row["question"])
        prev_question = prev_QA[question_no - 2][0][0]
        return (prev_question + " " + prev_ans + " " + row["question"])
    else:
        try: 
            prev_ans_1 = prev_QA[question_no - 2][1][0][0]
            prev_ans_2 = prev_QA[question_no - 3][1][0][0]
        except: 
            return (row["question"])
        prev_question_1 = prev_QA[question_no - 2][0][0] 
        prev_question_2 = prev_QA[question_no - 3][0][0] 
    #     print(prev_ans)
        return (prev_question_2 + " " + prev_ans_2 + " " + prev_question_1 + " " + prev_ans_1 + " " + row["question"])

quac_train['question'] = quac_train.apply(lambda row: add_history(row), axis=1)
quac_val['question'] = quac_val.apply(lambda row: add_history(row), axis=1)

In [15]:
quac_train['question'][7]

'Do they speak any other languages? Malayalam is derived from old Tamil and Sanskrit in the 6th century. any literary items of interest? Malayalam literature is ancient in origin. The oldest literature works in Malayalam, distinct from the Tamil tradition, How old is their literature?'

In [16]:
from datasets import Dataset
quac_train_dataset = quac_train.drop(['question_no', 'prev_QA'], axis=1)
quac_val_dataset = quac_val.drop(['question_no', 'prev_QA'], axis=1)

quac_train = quac_train.drop(['question_no', 'prev_QA'], axis=1)
quac_val = quac_val.drop(['question_no', 'prev_QA'], axis=1)

quac_train.rename(columns={'dialogue_id': 'id', 'section_title': 'title'}, inplace=True)
quac_val.rename(columns={'dialogue_id': 'id', 'section_title': 'title'}, inplace=True)

quac_train_dataset = Dataset.from_pandas(quac_train)
quac_val_dataset = Dataset.from_pandas(quac_val)

In [17]:
quac_train = quac_train.reset_index(drop=True)
quac_val = quac_val.reset_index(drop=True)

quac_train

In [18]:
quac_train_test = quac_train_dataset.train_test_split(test_size=0.01)
quac_val_test = quac_val_dataset.train_test_split(test_size=0.01)

In [19]:
from datasets.dataset_dict import DatasetDict

quac_new = DatasetDict({'train': quac_train_dataset, 'validation': quac_val_dataset})
quac_test = DatasetDict({'train': quac_train_test["test"], 'validation': quac_val_test["test"]})
quac_val["answers"][1]

{'text': ['she gave birth to her daughter Sofia.',
  'in 1975 and in November she gave birth to her daughter Sofia.',
  'her daughter Sofia.',
  'in November she gave birth to her daughter Sofia.'],
 'answer_start': [104, 80, 122, 92]}

# Tokenize & train model

In [20]:
# tokenized_quac = quac_new.map(preprocess_function_squad, batched=True)
# tokenized_quac = quac_new.map(preprocess_function, batched=True, remove_columns=quac_new["train"].column_names)
tokenized_quac = quac_new.map(prepare_train_features, batched=True, remove_columns=quac_new["train"].column_names)

  0%|          | 0/70 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [21]:
# tokenized_quac_test = quac_test.map(preprocess_function_squad, batched=True)
# tokenized_quac_test = quac_test.map(preprocess_function, batched=True, remove_columns=quac_test["train"].column_names)
# tokenized_quac_test = quac_test.map(prepare_train_features, batched=True, remove_columns=quac_test["train"].column_names)



In [22]:
from transformers import DefaultDataCollator, EarlyStoppingCallback, IntervalStrategy

data_collator = DefaultDataCollator()

from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model_base = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_max_length=512)

# model_base = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [24]:
model_base.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [24]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /home/jelle-van-der-lee/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [25]:
# !pip3 install wandb
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjellemvdl[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [26]:
%env WANDB_PROJECT=bsc_AI_thesis

env: WANDB_PROJECT=bsc_AI_thesis


In [24]:
import torch
# torch.cuda.empty_cache()
foo = torch.tensor([1,2,3])
foo = foo.to('cuda')

In [35]:
quac_train['question'][2]

'Where is Malayali located? 30,803,747 speakers of Malayalam in Kerala, making up 93.2% of the total number of Malayalam speakers in India, What other languages are spoken there? 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. What else is this place known for?'

In [70]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     loggings_steps=50,
#     push_to_hub=True
# )

# model_name = model_checkpoint.split("/")[-1]
model_name = ("bert-base-uncased").split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-quac-2QA-History-v3",
#     evaluation_strategy = "epoch",
    evaluation_strategy = IntervalStrategy.STEPS,
    learning_rate=3e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=30,
    eval_steps = 3125,
    save_steps = 6250,
    weight_decay=0.01,
    push_to_hub=True,
#     metric_for_best_model = 'f1',
    save_total_limit = 2,
#     load_best_model_at_end=True,
    report_to="wandb",
    run_name="bert-base-uncased-finetuned-quac-2QA-History-v3"
#     label_names = ["start_positions", "end_positions"]
    #     logging_steps=50,
)

# trainer_squad = Trainer(
#     model=model_base,
#     args=training_args,
#     train_dataset=tokenized_squad["train"],
#     eval_dataset=tokenized_squad["validation"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
# )
# trainer_squad.train()

trainer_quac = Trainer(
    model=model_base,
    args=args,
    train_dataset=tokenized_quac["train"],
    eval_dataset=tokenized_quac["validation"],
    tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)],
    data_collator=data_collator
)
trainer_quac.train()

# trainer_test = Trainer(
#     model=model_base,
#     args=training_args,
#     train_dataset=tokenized_quac_test["train"],
#     eval_dataset=tokenized_quac_test["validation"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
# )

# trainer_test = Trainer(
#     model_base,
#     args,
#     train_dataset=tokenized_quac_test["train"],
#     eval_dataset=tokenized_quac_test["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
# )
# trainer_test.train()

Cloning https://huggingface.co/Jellevdl/bert-base-uncased-finetuned-quac-2QA-History into local empty directory.
***** Running training *****
  Num examples = 168114
  Num Epochs = 30
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 420300
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
3125,2.0759,1.982329
6250,1.883,1.842735
9375,1.8134,1.852739
12500,1.7859,1.784662
15625,1.5483,1.801316
18750,1.5399,1.816233
21875,1.5801,1.775361
25000,1.521,1.78512
28125,1.5542,1.991099
31250,1.1556,2.101952


***** Running Evaluation *****
  Num examples = 16104
  Batch size = 12
***** Running Evaluation *****
  Num examples = 16104
  Batch size = 12
Saving model checkpoint to bert-base-uncased-finetuned-quac-2QA-History/checkpoint-6250
Configuration saved in bert-base-uncased-finetuned-quac-2QA-History/checkpoint-6250/config.json
Model weights saved in bert-base-uncased-finetuned-quac-2QA-History/checkpoint-6250/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-quac-2QA-History/checkpoint-6250/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-quac-2QA-History/checkpoint-6250/special_tokens_map.json
tokenizer config file saved in bert-base-uncased-finetuned-quac-2QA-History/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-quac-2QA-History/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 16104
  Batch size = 12
***** Running Evaluation *****
  Num examples = 16104
  Batch size = 12
Sa

TrainOutput(global_step=37500, training_loss=1.6019872591145834, metrics={'train_runtime': 14272.9971, 'train_samples_per_second': 353.354, 'train_steps_per_second': 29.447, 'total_flos': 8.818530372758938e+16, 'train_loss': 1.6019872591145834, 'epoch': 2.68})

In [71]:
wandb.finish()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▅▂▃▁▂▂▁▁▆█▇▅
eval/runtime,▇█▆▅▇▇▂▁▃▃▂▂
eval/samples_per_second,▂▁▃▄▂▂▇█▆▆▇▇
eval/steps_per_second,▁▁▃▄▂▂▇█▇▆▇█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▇▆▅▅▅▄▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.95209
eval/runtime,144.5526
eval/samples_per_second,111.406
eval/steps_per_second,9.284
train/epoch,2.68
train/global_step,37500.0
train/learning_rate,3e-05
train/loss,1.2203
train/total_flos,8.818530372758938e+16
train/train_loss,1.60199


In [None]:
pt_model = AutoTokenizer.from_pretrained("bert-base-uncased-finetuned-quac/checkpoint-29000", from_tf=True)
pt_model.save_pretrained("bert-base-uncased-finetuned-quac/final")

In [None]:
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased-finetuned-quac/checkpoint-29000")
# pt_model.push_to_hub("my-awesome-org/my-awesome-model")


In [None]:
model.push_to_hub("bert-base-uncased-finetuned-quac-1")