In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 28.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 62.3 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 65.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [3]:
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering

In [4]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 30.9 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 66.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 69.6 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 72.9 MB/s 
Installing collected packages: urllib3, xxhash, responses, multiprocess, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24

In [6]:
from torchvision import datasets
from datasets import load_dataset, load_metric
import seaborn
from transformers import AutoModelForQuestionAnswering,DistilBertForQuestionAnswering,TrainingArguments, Trainer
from transformers import default_data_collator
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric,DatasetDict
from transformers import BertTokenizer, BertForQuestionAnswering

import torch
from tqdm.auto import tqdm
import collections

In [7]:
def prepare_dataset(dataset_name,
                    split=False,
                    number_of_train_data=False,
                    number_of_validation_data=False,
                    list_of_columns_for_remove=False,
                    characters_to_ignore=False):
    
    
    ###############       Dataset Reduction because of Resource Limitation      #####################
    if split == True:
        train_data = load_dataset(dataset_name,split=f'train[:{number_of_train_data}]')
        val_data = load_dataset(dataset_name,split=f'validation[:{number_of_validation_data}]')
        dataset = DatasetDict({
                            'train':train_data,
                            'validation':val_data,
                             })
    else:
        dataset = load_dataset(dataset_name)
    ################     Remove additional Columns      ###########################
    if list_of_columns_for_remove == True:
        dataset = dataset.remove_columns(list_of_columns_for_remove)
    
    ################     Remove useless Characters      ###########################    
    if characters_to_ignore ==True:
        def special_characters_removal(data):
            data["text"] = re.sub(characters_to_ignore, '', data["text"]).lower()
            return data
        dataset = dataset.map(special_characters_removal)
        
    return dataset


def prepare_train_features(data,
                          max_length = 512,
                          doc_stride = 128 ):
    data["question"] = [q.lstrip() for q in data["question"]]
    tokenized_data = tokenizer(
                      data["question" if pad_on_right else "context"],
                      data["context" if pad_on_right else "question"],
                      truncation="only_second" if pad_on_right else "only_first",
                      max_length=max_length,
                      stride=doc_stride,
                      return_overflowing_tokens=True,
                      return_offsets_mapping=True,
                      padding="max_length")

    sample_mapping = tokenized_data.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_data.pop("offset_mapping")
    tokenized_data["start_positions"] = []
    tokenized_data["end_positions"] = []
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_data["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_data.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = data["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            tokenized_data["start_positions"].append(cls_index)
            tokenized_data["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_data["start_positions"].append(cls_index)
                tokenized_data["end_positions"].append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_data["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_data["end_positions"].append(token_end_index + 1)
    return tokenized_data

def compute_metrics(pred): 
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    exact_match_metric_results = exact_match_metric.compute(predictions=preds, references=labels)

    return {"Exact_match": exact_match_metric_results['exact_match']}


def prepare_history(history,one_of_evaluation_metrics):
    history = pd.DataFrame(history)
    global list_of_selected_index
    list_of_selected_index = []
    for i in range(history.shape[0]):
        if ~np.isnan(history.loc[i,one_of_evaluation_metrics]):
            list_of_selected_index.append(i) 
    for index in list_of_selected_index:
        history.loc[index,'loss'] = history.loc[index-1,'loss']
        history.loc[index,'learning_rate'] = history.loc[index-1,'learning_rate']
    history = history.loc[list_of_selected_index]
    return history


def plot_metric(dataframe,
                columns_for_plot,
                metric_names,
                question_name,
                line_width = 4,
                ylim=False,
                ylim_range = (0,0.5),
                fig_size = (20,10)):
    
    steps = [str(x) for x in list(dataframe['step'].values)]
    ## Plort 
    number_of_plots = len(columns_for_plot)
    num_of_cols = 2
    num_of_rows = number_of_plots//2

    if number_of_plots == 1: fig,ax = plt.subplots(1,figsize=fig_size)
    else: fig,ax = plt.subplots(num_of_rows,num_of_cols,figsize=fig_size)
    plt.rcParams["font.family"] = "Times New Roman"  
    for plot_index in range(number_of_plots):
        current_row = plot_index % 2
        current_col = plot_index //2
        
        if number_of_plots == 1: curr_ax = ax
        elif number_of_plots == 2: curr_ax = ax[plot_index]
        else: curr_ax = ax[current_row,current_col]
            
        for label in curr_ax.xaxis.get_ticklabels():   
            label.set_color('black')
            label.set_rotation(0)
            label.set_fontsize(10)
        curr_ax.xaxis.set_major_locator(plt.MaxNLocator(round(len(steps)/3)))
        curr_ax.plot(steps ,
                     dataframe[columns_for_plot[plot_index]],
                     color='#380282',
                     linewidth=line_width)

        curr_ax.tick_params(axis='x', labelrotation = 45)
        curr_ax.spines["top"].set_visible(False)
        curr_ax.spines["right"].set_visible(False)
        curr_ax.set_title(f'{metric_names[plot_index]} @ Question : {question_name}',fontsize=20,fontweight='bold')
        curr_ax.grid(alpha=0.3, zorder=0, linewidth=1)
        curr_ax.set_xlabel('Step Number',fontsize=20) 
        curr_ax.set_ylabel(metric_names[plot_index],fontsize=20) 
        if ylim==True:
            curr_ax.set_ylim(ylim_range)
        
    fig.tight_layout(pad=3)
    plt.show()
    
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None 
                        or offset_mapping[end_index] == []
                        or offset_mapping[start_index] == []
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer

    return predictions

def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples


def Metrics_evaluation():
    for batch in trainer.get_eval_dataloader():
        break
    batch = {k: v.to(trainer.args.device) for k, v in batch.items()}
    with torch.no_grad():
        output = trainer.model(**batch)
    output.keys()

    n_best_size = 20

    start_logits = output.start_logits[0].cpu().numpy()
    end_logits = output.end_logits[0].cpu().numpy()
    # Gather the indices the best start/end logits:
    start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
    end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
    valid_answers = []
    for start_index in start_indexes:
        for end_index in end_indexes:
            if start_index <= end_index: # We need to refine that test to check the answer is inside the context
                valid_answers.append({"score": start_logits[start_index] + end_logits[end_index],
                                      "text": ""})


    global max_length,max_answer_length,doc_stride

    max_length = 512
    max_answer_length = 30
    doc_stride = 128 

    validation_features = QA_dataset["validation"].map(
                                        prepare_validation_features,
                                        batched=True,
                                        remove_columns=QA_dataset["validation"].column_names)


    raw_predictions = trainer.predict(validation_features)

    start_logits = output.start_logits[0].cpu().numpy()
    end_logits = output.end_logits[0].cpu().numpy()
    offset_mapping = validation_features[0]["offset_mapping"]
    # The first feature comes from the first example. For the more general case, we will need to be match the example_id to
    # an example index
    context = QA_dataset["validation"][0]["context"]

    # Gather the indices the best start/end logits:
    start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
    end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
    valid_answers = []
    for start_index in start_indexes:
        for end_index in end_indexes:
            # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
            # to part of the input_ids that are not in the context.
            if (
                start_index >= len(offset_mapping)
                or end_index >= len(offset_mapping)
                or offset_mapping[start_index] is None
                or offset_mapping[end_index] is None
                or offset_mapping[end_index] == []
                or offset_mapping[start_index] == []
            ):
                continue
            # Don't consider answers with a length that is either < 0 or > max_answer_length.
            if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                continue
            if start_index <= end_index: # We need to refine that test to check the answer is inside the context
                start_char = offset_mapping[start_index][0]
                end_char = offset_mapping[end_index][1]
                valid_answers.append(
                    {"score": start_logits[start_index] + end_logits[end_index],
                     "text": context[start_char: end_char]})

    valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
    global squad_v2
    squad_v2 = True
    final_predictions = postprocess_qa_predictions(QA_dataset["validation"], validation_features, raw_predictions.predictions)
    metric = load_metric("squad_v2" if squad_v2 else "squad")

    if squad_v2:
        formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
    else:
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in QA_dataset["validation"]]
    return metric.compute(predictions=formatted_predictions, references=references)



def plot_metric(dataframe,
                columns_for_plot,
                metric_names,
                question_name,
                line_width = 4,
                ylim=False,
                ylim_range = (0,0.5),
                fig_size = (20,10)):
    
    steps = [str(x) for x in list(dataframe['step'].values)]
    ## Plort 
    number_of_plots = len(columns_for_plot)
    num_of_cols = 2
    num_of_rows = number_of_plots//2

    if number_of_plots == 1: fig,ax = plt.subplots(1,figsize=fig_size)
    else: fig,ax = plt.subplots(num_of_rows,num_of_cols,figsize=fig_size)
    plt.rcParams["font.family"] = "Times New Roman"  
    for plot_index in range(number_of_plots):
        current_row = plot_index % 2
        current_col = plot_index //2
        
        if number_of_plots == 1: curr_ax = ax
        elif number_of_plots == 2: curr_ax = ax[plot_index]
        else: curr_ax = ax[current_row,current_col]
            
        for label in curr_ax.xaxis.get_ticklabels():   
            label.set_color('black')
            label.set_rotation(0)
            label.set_fontsize(10)
        curr_ax.xaxis.set_major_locator(plt.MaxNLocator(round(len(steps)/3)))
        curr_ax.plot(steps ,
                     dataframe[columns_for_plot[plot_index]],
                     color='#380282',
                     linewidth=line_width)

        curr_ax.tick_params(axis='x', labelrotation = 45)
        curr_ax.spines["top"].set_visible(False)
        curr_ax.spines["right"].set_visible(False)
        curr_ax.set_title(f'{metric_names[plot_index]} @ Question : {question_name}',fontsize=20,fontweight='bold')
        curr_ax.grid(alpha=0.3, zorder=0, linewidth=1)
        curr_ax.set_xlabel('Step Number',fontsize=20) 
        curr_ax.set_ylabel(metric_names[plot_index],fontsize=20) 
        if ylim==True:
            curr_ax.set_ylim(ylim_range)
        
    fig.tight_layout(pad=3)
    plt.show()

In [None]:
QA_dataset = prepare_dataset(dataset_name = 'squad_v2',
                             split=True,
                             number_of_train_data = 60000,
                             number_of_validation_data = 10000,
                             list_of_columns_for_remove = False,
                             characters_to_ignore = False) 





# Check Length of Context Sentences

lst = []
for sentence in QA_dataset['train']['context']:
    lst.append(len(sentence))
    
plt.figure(figsize=(20,8))
seaborn.histplot(lst)


model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
pad_on_right = tokenizer.padding_side == "right"
tokenized_datasets = QA_dataset.map(prepare_train_features, batched=True, remove_columns=QA_dataset["train"].column_names)


# Define Model
model = BertForQuestionAnswering.from_pretrained(model_checkpoint)
data_collator = default_data_collator
batch_size = 16
args = TrainingArguments(
                    "my_bert_model",
                    evaluation_strategy = "steps",
                    learning_rate=3e-5,
                    per_device_train_batch_size=batch_size,
                    per_device_eval_batch_size=batch_size,
                    num_train_epochs=12,
                    weight_decay=0.1,
                    push_to_hub=False,
                    save_strategy='no',
                    report_to='none')

exact_match_metric = load_metric("exact_match")
squad_metric = load_metric("squad")
f1_metric = load_metric("f1")


    


trainer = Trainer(
                model,
                args,
                train_dataset=tokenized_datasets["train"],
                eval_dataset=tokenized_datasets["validation"],
                compute_metrics=compute_metrics,
                data_collator=data_collator,
                tokenizer=tokenizer)
trainer.train()


# After Training
bert_history = trainer.state.log_history
history_list_bert = prepare_history(bert_history,
                          one_of_evaluation_metrics='eval_Exact_match')

plot_metric(history_list_bert,
           ['loss',],
           ['Train Loss'],
           '2 (Bert Model)',
           line_width = 3,
           ylim=True,
           ylim_range= (0,2),
           fig_size = (16,8))

Metrics_evaluation()

plot_metric(history_list_bert,
           ['loss'],
           ['Train Loss'],
           'Bert',
           line_width = 3,
           ylim=True,
           ylim_range= (0,2),
           fig_size = (16,10))

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

  

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.




Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/60 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

Downloading builder script:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

***** Running training *****
  Num examples = 60055
  Num Epochs = 12
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 45048
  Number of trainable parameters = 108893186


Step,Training Loss,Validation Loss,Exact Match
500,2.1862,1.43764,0.0
1000,1.5229,1.302136,0.0
1500,1.3495,1.312595,0.0


***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16


Step,Training Loss,Validation Loss,Exact Match
500,2.1862,1.43764,0.0
1000,1.5229,1.302136,0.0
1500,1.3495,1.312595,0.0
2000,1.2495,1.499415,0.0
2500,1.1828,1.289581,0.0
3000,1.1157,1.314077,0.0
3500,1.0925,1.246048,0.0
4000,0.9323,1.569363,0.0
4500,0.7681,1.282002,0.0
5000,0.7568,1.351493,0.0


***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
***** Running Evaluation *****
  Num examples = 10096
  Batch size = 16
