In [None]:
from transformers import AutoTokenizer


# All the words that we are going to use are in the context column.
def get_training_corpus_question_answering(datasets):
    """Yield the "context" column of the train split in 1000-row batches.

    Args:
        datasets: Mapping whose "train" entry supports ``len`` and row
            slicing, the slice being indexable by column name.

    Yields:
        The "context" values of each consecutive slice of up to 1000 rows.
    """
    train_split = datasets["train"]
    batch_size = 1000
    for start in range(0, len(train_split), batch_size):
        stop = start + batch_size
        yield train_split[start:stop]["context"]


# All the words that we are going to use are in the tokens column.
def get_training_corpus_token_classification(datasets):
    """Yield the "tokens" entry of each train example, one at a time.

    Args:
        datasets: Mapping whose "train" entry supports ``len`` and integer
            indexing, each example being indexable by "tokens".

    Yields:
        The "tokens" value of every example, in order.
    """
    train_split = datasets["train"]
    total = len(train_split)
    position = 0
    while position < total:
        yield train_split[position]["tokens"]
        position += 1


# All the words that we are going to use are in the text column.
def get_training_corpus_summarization(datasets):
    """Yield the "text" column of the train split in 1000-row batches.

    Args:
        datasets: Mapping whose "train" entry supports ``len`` and row
            slicing, the slice being indexable by column name.

    Yields:
        The "text" values of each consecutive slice of up to 1000 rows.
    """
    chunk = 1000
    train_split = datasets["train"]
    n_rows = len(train_split)
    lower = 0
    while lower < n_rows:
        yield train_split[lower:lower + chunk]["text"]
        lower += chunk


# All the words that we are going to use are in the text column.
def get_training_corpus_text_classification(datasets):
    """Yield the "text" column of the train split in 1000-row batches.

    Args:
        datasets: Mapping whose "train" entry supports ``len`` and row
            slicing, the slice being indexable by column name.

    Yields:
        The "text" values of each consecutive slice of up to 1000 rows.
    """
    train_split = datasets["train"]
    step = 1000
    batch_starts = range(0, len(train_split), step)
    for first_row in batch_starts:
        batch = train_split[first_row:first_row + step]
        yield batch["text"]


# All the words that we are going to use are in the tokens column.
def get_training_corpus_token_split_sentence(datasets):
    """Yield the "tokens" entry of each train example, one at a time.

    Args:
        datasets: Mapping whose "train" entry supports ``len`` and integer
            indexing, each example being indexable by "tokens".

    Yields:
        The "tokens" value of every example, in order.
    """
    train_split = datasets["train"]
    for row_idx in range(len(train_split)):
        example = train_split[row_idx]
        yield example["tokens"]


# Get a tuned tokenizer, given its dataset name, for the given datasets.
def get_tokenizer(datasets, dataset_name):
    """Return the task tokenizer, retraining it on ``datasets`` if needed.

    The tokenizer saved under ``colab_data_path + checkpoint`` as
    ``<dataset_name>_tokenizer_<task_type>`` is loaded when available.
    If loading fails, the base ``checkpoint`` tokenizer is loaded,
    retrained on the train split (fast tokenizers only) and saved to that
    location, overwriting any previous files.

    Reads the notebook globals ``task_type``, ``checkpoint`` and
    ``colab_data_path``.

    Args:
        datasets: Mapping with a "train" split; only read when a new
            tokenizer has to be trained.
        dataset_name: Prefix of the saved tokenizer's directory name.

    Returns:
        The loaded or freshly trained tokenizer.

    Raises:
        ValueError: If ``task_type`` is not one of the supported tasks.
    """
    import os  # Local import: replaces the IPython-only ``!mkdir`` magic.

    new_tokenizer_name = dataset_name + "_tokenizer_" + task_type
    tokenizer_root = colab_data_path + checkpoint
    tokenizer_dir = tokenizer_root + "/" + new_tokenizer_name

    # Pick the corpus generator and the tokenizer options once per task.
    # The token-level tasks load their tokenizer with add_prefix_space=True.
    if task_type == question_answering:
        corpus_builder = get_training_corpus_question_answering
        tokenizer_kwargs = {}
    elif task_type == summarization:
        corpus_builder = get_training_corpus_summarization
        tokenizer_kwargs = {}
    elif task_type == text_classification:
        corpus_builder = get_training_corpus_text_classification
        tokenizer_kwargs = {}
    elif task_type == token_classification:
        corpus_builder = get_training_corpus_token_classification
        tokenizer_kwargs = {"add_prefix_space": True}
    elif task_type == token_split_sentence:
        corpus_builder = get_training_corpus_token_split_sentence
        tokenizer_kwargs = {"add_prefix_space": True}
    else:
        raise ValueError(f"Unsupported task_type: {task_type!r}")

    # If we have the tokenizer load it from files, else create it.
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                  **tokenizer_kwargs)
    except Exception:
        # Deliberately broad: any failure to load the saved tokenizer
        # means it has to be rebuilt from the base checkpoint.
        tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                                  **tokenizer_kwargs)
        training_corpus = corpus_builder(datasets)

        # We use a large enough size of vocabulary size.
        # Training dataset has like 14000 different words.
        if tokenizer.is_fast:
            tokenizer = tokenizer.train_new_from_iterator(training_corpus,
                                                          52000)
        else:
            # If the tokenizer is not fast, then the tokenizer is old,
            # and probably won't work. The untrained tokenizer is still
            # saved below, as before.
            print("THE TOKENIZER IS NOT FAST!")

        # This overwrites previous files, take care!
        os.makedirs(tokenizer_root, exist_ok=True)
        tokenizer.save_pretrained(tokenizer_dir)

    return tokenizer


In [None]:
from datasets import DatasetDict


# Helper function of get_tokenized_datasets for question_answering task.
def preprocess_function(examples):
    """Tokenize question/context pairs and label the answer token spans.

    Uses the notebook-global ``tokenizer``. Each pair is padded to 384
    tokens and only the context (second sequence) is truncated.

    Args:
        examples: Batch mapping with "question" and "context" lists and,
            optionally, "answers". Each answer holds parallel
            "answer_start" and "text" lists; only the first entry is used.

    Returns:
        The tokenizer output without "offset_mapping". When "answers" is
        present, "start_positions" and "end_positions" are added. An
        answer that is missing or not fully inside the (possibly
        truncated) context is labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    if "answers" not in examples:
        return inputs

    start_positions = []
    end_positions = []
    pairs = zip(offset_mapping, examples["answers"])
    for i, (offset, answer) in enumerate(pairs):
        if not answer["answer_start"] or not answer["text"]:
            # Unanswerable example: nothing to locate.
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        start_token, end_token = _find_answer_token_span(
            offset, inputs.sequence_ids(i), start_char, end_char)
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


def _find_answer_token_span(offset, sequence_ids, start_char, end_char):
    """Return (start, end) token indices of an answer, or (0, 0)."""
    n_tokens = len(sequence_ids)

    # Find the start and end of the context (sequence id 1), never
    # scanning past the end of the sequence.
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    if context_end < context_start:
        # No context token survived tokenization.
        return 0, 0

    # If the answer is not fully inside the context, label it (0, 0).
    # Both ends must fit; otherwise the end label would land on the
    # token that follows the context.
    if offset[context_start][0] > start_char or \
       offset[context_end][1] < end_char:
        return 0, 0

    # Otherwise it's the start and end token positions.
    idx = context_start
    while idx <= context_end and offset[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    idx = context_end
    while idx >= context_start and offset[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


# Helper function of get_tokenized_datasets for token_classification task.
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align "ner_tags" to the sub-tokens.

    Uses the notebook-global ``tokenizer``.

    Args:
        examples: Batch mapping with a "tokens" list of word lists and,
            optionally, a parallel "ner_tags" list of per-word labels.

    Returns:
        The tokenizer output. When "ner_tags" is present a "labels" entry
        is added: the first sub-token of each word carries the word's
        label, every other position (special tokens, later sub-tokens)
        carries -100.
    """
    encoded = tokenizer(examples["tokens"],
                        truncation=True,
                        is_split_into_words=True)

    if "ner_tags" not in examples:
        return encoded

    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        # Map tokens to their respective word.
        row = []
        last_seen = None
        for current in encoded.word_ids(batch_index=batch_idx):
            # Only the first token of a word is labelled; special tokens
            # (word id None) and later sub-tokens get -100.
            starts_new_word = current is not None and current != last_seen
            row.append(word_tags[current] if starts_new_word else -100)
            last_seen = current
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded


# Prefix prepended to every input text so that the model knows it is a
# summarization task (see preprocess_function_summarization).
prefix = "summarize: "


# Helper function of get_tokenized_datasets for summarization task.
def preprocess_function_summarization(examples):
    """Tokenize prefixed documents and, if present, their summaries.

    Uses the notebook globals ``tokenizer`` and ``prefix``.

    Args:
        examples: Batch mapping with a "text" list and, optionally, a
            "summary" list.

    Returns:
        The tokenized inputs (truncated to 1024 tokens). When "summary"
        is present, its token ids (truncated to 128 tokens) are stored
        under "labels".
    """
    prefixed_docs = [prefix + document for document in examples["text"]]
    model_inputs = tokenizer(prefixed_docs, max_length=1024, truncation=True)

    if "summary" not in examples:
        return model_inputs

    target_encoding = tokenizer(text_target=examples["summary"],
                                max_length=128,
                                truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


# Helper function of get_tokenized_datasets for text classification task.
def preprocess_text_classification(examples):
    """Tokenize the "text" column with truncation enabled.

    Uses the notebook-global ``tokenizer``.

    Args:
        examples: Batch mapping with a "text" list.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


# It gets the tokenized datasets from the given dataset(s).
def get_tokenized_datasets(datasets):
    """Tokenize ``datasets`` with the preprocessing of the current task.

    Reads the notebook-global ``task_type`` to choose the preprocessing
    function, applies it in batched mode and prints the result.

    Args:
        datasets: A ``DatasetDict`` (its "train" split provides the column
            names) or a single dataset exposing ``column_names``.

    Returns:
        The mapped dataset(s). The original columns are removed for every
        task except text classification.

    Raises:
        ValueError: If ``task_type`` is not one of the supported tasks.
    """
    # If datasets is instance of DatasetDict, it will contain train dataset.
    if isinstance(datasets, DatasetDict):
        column_names = datasets["train"].column_names
    # else it will be only one dataset.
    else:
        column_names = datasets.column_names

    map_kwargs = {"batched": True, "remove_columns": column_names}
    if task_type == question_answering:
        preprocess = preprocess_function
    elif task_type in (token_classification, token_split_sentence):
        preprocess = tokenize_and_align_labels
    elif task_type == summarization:
        preprocess = preprocess_function_summarization
    elif task_type == text_classification:
        preprocess = preprocess_text_classification
        # Text classification keeps the original columns.
        # NOTE(review): presumably so the label column survives - confirm.
        del map_kwargs["remove_columns"]
    else:
        # Fail early instead of hitting an unbound variable below.
        raise ValueError(f"Unsupported task_type: {task_type!r}")

    tokenized_dataset = datasets.map(preprocess, **map_kwargs)

    print(tokenized_dataset)
    return tokenized_dataset


In [None]:
import evaluate
import numpy as np


# Metrics for token_classification task.
def compute_metrics_token_classification(p):
    """Compute overall precision, recall, F1 and accuracy with seqeval.

    Uses the notebook globals ``label_list`` and ``seqeval``.

    Args:
        p: Pair ``(predictions, labels)``; predictions are reduced with an
            argmax over axis 2.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from
        the "overall_*" entries of the seqeval result.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from the evaluation.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions,
                              references=true_labels)
    metric_names = ("precision", "recall", "f1", "accuracy")
    return {name: results["overall_" + name] for name in metric_names}


# ROUGE metric, loaded once and used by compute_metrics_summarization.
rouge = evaluate.load("rouge")


# Metrics for summarization task.
def compute_metrics_summarization(eval_pred):
    """Compute ROUGE scores and the mean generated length.

    Uses the notebook globals ``tokenizer`` and ``rouge``.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also be a tuple whose first item holds the
            generated ids.

    Returns:
        Dict of the ROUGE results plus "gen_len" (mean number of non-pad
        tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs next to the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding marker, not a token id: it cannot be decoded and
    # must not count towards the generated length.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions,
                                           skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds,
                           references=decoded_labels,
                           use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != pad_id)
                       for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}


# Accuracy metric, loaded once and used by
# compute_metrics_text_classification.
accuracy = evaluate.load("accuracy")


# Metrics for text classification task.
def compute_metrics_text_classification(eval_pred):
    """Compute the accuracy of the argmax predictions.

    Uses the notebook-global ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``; predictions are reduced
            with an argmax over axis 1.

    Returns:
        The result of ``accuracy.compute`` for the batch.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    metric_result = accuracy.compute(predictions=predicted_classes,
                                     references=references)
    return metric_result


In [1]:
from transformers import DataCollatorWithPadding, TrainingArguments
from transformers import AutoModelForQuestionAnswering, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer


# If create_new_trainer is True, it will replace previous trainers.
def get_trainer(dataset_name,
                tokenized_datasets,
                create_new_trainer,
                cause_model,
                num_train_epochs=7):
    """Build the trainer of the current task, training it when needed.

    Reads the notebook globals ``task_type``, ``checkpoint``,
    ``colab_data_path``, ``tokenizer`` and the label mappings.

    A saved model is loaded from ``colab_data_path + checkpoint`` unless
    ``create_new_trainer`` is true or loading fails. In either of those
    cases the base ``checkpoint`` is loaded, trained and saved, replacing
    any previously saved model.

    Args:
        dataset_name: Prefix of the saved model and output directory names.
        tokenized_datasets: Mapping with "train" and "test" splits.
        create_new_trainer: If true, always train a new model.
        cause_model: For question answering and summarization, selects
            the "_cause" (true) or "_effect" (false) model name.
        num_train_epochs: Number of training epochs.  # IMPORTANT SETTING

    Returns:
        The configured trainer (already trained if a new model was made).

    Raises:
        ValueError: If ``task_type`` is not one of the supported tasks.
    """
    import os  # Local import: replaces the IPython-only ``!mkdir`` magic.

    supported_tasks = (question_answering, token_classification,
                       summarization, text_classification,
                       token_split_sentence)
    if task_type not in supported_tasks:
        raise ValueError(f"Unsupported task_type: {task_type!r}")

    suffix = _trainer_name_suffix(cause_model)
    trainer_name = dataset_name + "_trainer_" + task_type + suffix
    model_root = colab_data_path + checkpoint + "/"
    saved_model_dir = model_root + trainer_name

    # If we have the trainer load it from files, else we have to create it.
    model = None
    if not create_new_trainer:
        try:
            model = _load_task_model(saved_model_dir)
        except Exception:
            # Deliberately broad: any load failure means retraining.
            create_new_trainer = True

    if create_new_trainer:
        model = _load_task_model(checkpoint)

    if task_type == summarization:
        # Pass the model object (not its name) so the collator can use
        # the model to prepare decoder inputs.
        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                               model=model)
    elif task_type in (token_classification, token_split_sentence):
        data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
    else:
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    output_dir_training_args = model_root + dataset_name + \
        "_training_args_" + task_type + suffix

    # Settings shared by every task.
    shared_args = dict(
        output_dir=output_dir_training_args,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
    )
    trainer_kwargs = dict(
        model=model,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    if task_type == question_answering:
        # Question answering is trained without a metrics function.
        training_args = TrainingArguments(**shared_args)
        trainer_cls = Trainer
    elif task_type == summarization:
        training_args = Seq2SeqTrainingArguments(
            save_total_limit=3,
            predict_with_generate=True,
            fp16=True,
            **shared_args,
        )
        trainer_cls = Seq2SeqTrainer
        trainer_kwargs["compute_metrics"] = compute_metrics_summarization
    else:
        # Classification tasks checkpoint every epoch and reload the
        # best model at the end.
        training_args = TrainingArguments(
            save_strategy="epoch",
            load_best_model_at_end=True,
            **shared_args,
        )
        trainer_cls = Trainer
        if task_type == text_classification:
            trainer_kwargs["compute_metrics"] = \
                compute_metrics_text_classification
        else:
            trainer_kwargs["compute_metrics"] = \
                compute_metrics_token_classification

    trainer = trainer_cls(args=training_args, **trainer_kwargs)

    if create_new_trainer:
        trainer.train()
        # This overwrites previous files, take care!
        os.makedirs(model_root, exist_ok=True)
        trainer.save_model(saved_model_dir)

    return trainer


def _trainer_name_suffix(cause_model):
    """Return the suffix appended to the trainer and output-dir names."""
    if task_type == question_answering or task_type == summarization:
        return "_cause" if cause_model else "_effect"
    return "_cause_effect"


def _load_task_model(source):
    """Load the model class of the current task from ``source``."""
    if task_type == question_answering:
        return AutoModelForQuestionAnswering.from_pretrained(source)
    if task_type == summarization:
        return AutoModelForSeq2SeqLM.from_pretrained(source)
    if task_type == token_classification:
        return AutoModelForTokenClassification.from_pretrained(
            source,
            num_labels=3,
            id2label=id2label,
            label2id=label2id
        )
    if task_type == text_classification:
        return AutoModelForSequenceClassification.from_pretrained(
            source,
            num_labels=3,
            id2label=id2label_text,
            label2id=label2id_text
        )
    if task_type == token_split_sentence:
        return AutoModelForTokenClassification.from_pretrained(
            source,
            num_labels=2,
            id2label=id2label_split_sentence,
            label2id=label2id_split_sentence
        )
    raise ValueError(f"Unsupported task_type: {task_type!r}")
