In [None]:
# Mount Drive directly from the Colab Folder Selector.

!pip install -q -U datasets evaluate accelerate seqeval rouge_score
!pip install -q -U transformers[sentencepiece,torch]
!pip install -q -U unidecode

# Necessary import for saving correctly
# (the error was found with summarization t5 small, after training).
import locale
def getpreferredencoding(do_setlocale=True):
    """Always report UTF-8 as the preferred encoding.

    ``do_setlocale`` is accepted so the signature matches the function
    being replaced; its value is ignored.
    """
    return "UTF-8"


# Monkey-patch the locale module so lookups through it return UTF-8.
setattr(locale, "getpreferredencoding", getpreferredencoding)


# Global path variables: the project folder inside the mounted Drive
# and the sub-folder where the notebook keeps its saved data.
drive_path = "drive/MyDrive/tfg-juncodelasheras/"
colab_data_path = f"{drive_path}colab_saved_data/"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Task identifier and base model checkpoint known to this notebook.
question_answering = "question_answering"
model_tinyroberta = "deepset/tinyroberta-squad2"

# Task and checkpoint that are currently in use.
task_type, checkpoint = question_answering, model_tinyroberta

# Repository names of the fine-tuned cause and effect checkpoints.
hub_checkpoint_cause, hub_checkpoint_effect = (
    "mi_tinyROBERTA_cause",
    "mi_tinyROBERTA_effect",
)


In [None]:
from datasets import Dataset

# Sentence from which the cause and the effect will be predicted.
context_to_do_predictions = (
    "La bolsa ha subido tres puntos porque la auditoría ha sido exitosa"
)

# Single-example dataset built from the context above.
# The question is the same for the cause and for the effect dataset.
dataset = dict(
    context=[context_to_do_predictions],
    question=["Escribe la causa"],
)

cause_datasets = Dataset.from_dict(dataset)
effect_datasets = Dataset.from_dict(dataset)


In [None]:
from transformers import AutoTokenizer
from datasets import DatasetDict
from pathlib import Path
import torch


# Helper function of get_tokenized_datasets for question_answering task.
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs for question answering.

    Uses the module-level ``tokenizer``. Each question is stripped of
    surrounding whitespace and encoded together with its context, padded
    to ``max_length=384``; ``truncation="only_second"`` is requested so
    that only the second sequence (the context) may be truncated.

    When the batch has an ``"answers"`` column, the first answer of every
    example (``answer_start[0]`` / ``text[0]``) is converted from character
    offsets to token indices and stored under ``"start_positions"`` and
    ``"end_positions"``. Without that column no labels are added.

    Args:
        examples: Batch with ``"question"`` and ``"context"`` columns and,
            optionally, an ``"answers"`` column.

    Returns:
        The tokenizer output without ``"offset_mapping"``, plus the two
        label lists when answers were provided.
    """
    questions = [q.strip() for q in examples["question"]]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # The offsets are only needed to compute the labels below, so they are
    # removed from the returned encoding.
    offset_mapping = inputs.pop("offset_mapping")
    if 'answers' in examples:
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            answer = answers[i]
            # Character span of the first listed answer (end is exclusive).
            start_char = answer["answer_start"][0]
            end_char = answer["answer_start"][0] + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context.
            # Context tokens are the ones whose sequence id is 1.
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1

            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1

            context_end = idx - 1

            # If the answer is not fully inside the context, label it (0, 0).
            # NOTE(review): this condition is true only when the answer lies
            # entirely before or entirely after the tokenized context; an
            # answer that overlaps the context partially goes to the else
            # branch. Confirm that this is intended.
            if offset[context_start][0] > end_char or \
               offset[context_end][1] < start_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions.
                # Walk forward past every token starting at or before
                # start_char; the previous token is the start label.
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1

                start_positions.append(idx - 1)
                # Walk backward past every token ending at or after
                # end_char; the following token is the end label.
                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1

                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions

    return inputs


# Tokenize a single dataset or a DatasetDict with preprocess_function.
def get_tokenized_datasets(datasets):
    """Apply ``preprocess_function`` to ``datasets`` in batches.

    The original columns are removed from the result. For a
    ``DatasetDict`` the columns to remove are read from its ``"train"``
    split; any other object is treated as one dataset.
    """
    source = datasets["train"] if isinstance(datasets, DatasetDict) \
        else datasets
    columns_to_drop = source.column_names

    return datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=columns_to_drop,
    )


In [None]:
from transformers import AutoModelForQuestionAnswering, Trainer
from transformers import DataCollatorWithPadding


# Build a Trainer around the fine-tuned cause or effect model.
def get_trainer(dataset_name,
                tokenized_datasets,
                cause_model):
    """Load a fine-tuned question answering model and wrap it in a Trainer.

    Args:
        dataset_name: Not used; kept so that existing calls keep working.
        tokenized_datasets: Not used; kept so that existing calls keep
            working.
        cause_model: Truthy to load the cause checkpoint
            (``hub_checkpoint_cause``), falsy to load the effect one
            (``hub_checkpoint_effect``).

    Returns:
        A ``Trainer`` holding the loaded model, the module-level
        ``tokenizer`` and a ``DataCollatorWithPadding`` built from it.
    """
    hub_checkpoint = hub_checkpoint_cause if cause_model \
        else hub_checkpoint_effect

    model = AutoModelForQuestionAnswering \
        .from_pretrained("Juncodh/" + hub_checkpoint)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    return trainer


In [None]:
# It is the name of the dataset that was used when the model was trained.
original_dataset_path = "dataset/training_subtask_es.csv"
dataset_name = Path(original_dataset_path).stem

new_tokenizer_name = dataset_name + "_tokenizer_" + task_type
# Alternative: load the tokenizer from the saved Colab data instead.
# tokenizer = AutoTokenizer.from_pretrained(colab_data_path +
#                                          checkpoint + "/" +
#                                          new_tokenizer_name)

# The cause and the effect model share this tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Juncodh/" + hub_checkpoint_cause)

tokenized_cause_datasets = get_tokenized_datasets(cause_datasets)
tokenized_effect_datasets = get_tokenized_datasets(effect_datasets)

# trainer_name is not read by get_trainer; it is kept as a module-level
# label of the trainer being built.
trainer_name = dataset_name + "_trainer_" + task_type
trainer_name += "_cause"
# The cause trainer receives the cause dataset.
trainer_cause = get_trainer(dataset_name, tokenized_cause_datasets, True)

trainer_name = dataset_name + "_trainer_" + task_type
trainer_name += "_effect"
trainer_effect = get_trainer(dataset_name, tokenized_effect_datasets, False)




Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/879 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/326M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/326M [00:00<?, ?B/s]

In [None]:
from unidecode import unidecode


# Normalize a string so that the correct string and the predicted string
# can be compared with more precision.
def normalize_str(s):
    """Return ``s`` cleaned, lower-cased, passed through unidecode, stripped."""
    # Left and right double quotes become standard double quotes.
    for curly_quote in ('”', '“'):
        s = s.replace(curly_quote, '"')

    # An odd number of " characters would make the csv parser fail,
    # so in that case the last one is dropped.
    if s.count('"') % 2:
        before, _, after = s.rpartition('"')
        s = before + after

    # Remove the separator character and the marker strings.
    for unwanted in (';', '[CLS]', '[SEP]', 'summarize:'):
        s = s.replace(unwanted, '')

    return unidecode(s.lower()).strip()


# Helper function of get_predictions for question_answering task.
def get_predictions_question_answering(tokenized_datasets, cause_model):
    """Predict the most probable start and end token of every example.

    Args:
        tokenized_datasets: Tokenized examples passed to ``predict``.
        cause_model: Truthy to predict with ``trainer_cause`` (the Cause),
            falsy to predict with ``trainer_effect`` (the Effect).

    Returns:
        ``[start predicted tokens], [end predicted tokens]``.
    """
    def most_probable_token(logits):
        # Softmax over the last axis, then the index of the top entry.
        probabilities = torch.nn.functional.softmax(
            torch.from_numpy(logits), dim=-1)
        return probabilities.argmax(dim=-1).tolist()

    trainer = trainer_cause if cause_model else trainer_effect
    predictions, _, _ = trainer.predict(tokenized_datasets)
    start_logits, end_logits = predictions

    start_predicted_token = most_probable_token(start_logits)
    end_predicted_token = most_probable_token(end_logits)
    return start_predicted_token, end_predicted_token


In [None]:
def get_predictions():
    """Predict the cause and the effect text of the first example.

    Each model is run on its own tokenized dataset
    (``tokenized_cause_datasets`` for the cause model,
    ``tokenized_effect_datasets`` for the effect model), and the predicted
    token span is sliced from that same dataset, decoded with the
    module-level ``tokenizer`` and passed through ``normalize_str``.

    Returns:
        ``(str_predicted_cause, str_predicted_effect)``.
    """
    def predict_span(tokenized_datasets, cause_model):
        # Run one model and decode the span predicted for example 0.
        start_tokens, end_tokens = get_predictions_question_answering(
            tokenized_datasets, cause_model)
        # The predicted end token is inclusive, hence the + 1.
        span_tokens = tokenized_datasets[0]["input_ids"][
            start_tokens[0]:end_tokens[0] + 1]
        return normalize_str(tokenizer.decode(span_tokens))

    str_predicted_cause = predict_span(tokenized_cause_datasets, True)
    str_predicted_effect = predict_span(tokenized_effect_datasets, False)
    return str_predicted_cause, str_predicted_effect


In [None]:
# Run get_predictions on the context defined above; the trailing tuple
# expression makes the two predicted strings the output of the cell.
str_predicted_cause, str_predicted_effect = get_predictions()
str_predicted_cause, str_predicted_effect


NameError: name 'get_predictions' is not defined

In [None]:
# This connects the notebook to the Hugging Face Hub.
# !huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# This uploads the models and the tokenizer to the Hugging Face Hub.

# trainer_cause.model.push_to_hub(hub_checkpoint_cause)
# tokenizer.push_to_hub(hub_checkpoint_cause)

# trainer_effect.model.push_to_hub(hub_checkpoint_effect)
# This is not necessary, since it is the same tokenizer as the cause.
# tokenizer.push_to_hub(hub_checkpoint_effect)
