In [1]:
# Prerequisite: mount Google Drive from the Colab folder selector before
# running this notebook.

# Install the third-party packages used by this notebook.
# NOTE(review): no versions are pinned and -U always upgrades to the latest
# release, so the installed environment can differ from one run to the next.
!pip install -q -U datasets evaluate accelerate seqeval rouge_score
!pip install -q -U transformers[sentencepiece,torch]
!pip install -q -U unidecode

# Force UTF-8 as the preferred encoding so that saving works correctly
# (the error was first seen with summarization on t5-small, after training).
import locale


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally; do_setlocale is accepted but ignored."""
    return "UTF-8"


# Replace the standard-library lookup with the fixed-answer version above.
setattr(locale, "getpreferredencoding", getpreferredencoding)

# Download the NLTK "punkt" data, which is required for task_evaluate.py
# to work correctly.
import nltk
nltk.download('punkt')

# Global path variables (both are relative paths ending in "/").
# NOTE(review): assumes Drive is mounted under ./drive — confirm.
drive_path = "drive/MyDrive/tfg-juncodelasheras/"
colab_data_path = drive_path + "colab_saved_data/"

# Run the unit tests of task_evaluate.py to check that it works correctly.
!python3 -m unittest {drive_path}dataset/task_evaluate.py


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


..............
----------------------------------------------------------------------
Ran 14 tests in 0.054s

OK


In [2]:
# Some models are commented out because they are shared by several
# architectures and are already defined in an earlier section.

# ---------------------------------------------------------------------
# Task name and candidate checkpoints for question answering.
question_answering = "question_answering"
# List of tested models.
model_distilbert = "distilbert-base-uncased-distilled-squad"
model_tinyroberta = "deepset/tinyroberta-squad2"
# roberta takes a lot longer and gives the same results as tinyroberta.
model_roberta = "deepset/roberta-base-squad2"
model_bert = "deepset/bert-base-cased-squad2"
model_mobilebert = "csarron/mobilebert-uncased-squad-v2"
model_albert = "twmkn9/albert-base-v2-squad2"
model_minilm = "deepset/minilm-uncased-squad2"
model_roberta_bne = "BSC-LT/roberta-base-bne-sqac"

# Trained with a Korean dataset. Don't use this model (but it works).
# model_koelectra = "monologg/koelectra-small-v2-distilled-korquad-384"


# List of models that do not work (not enough CUDA GPU memory on Google Colab).
model_mdeberta = "timpal0l/mdeberta-v3-base-squad2"
model_longformer = "mrm8488/longformer-base-4096-finetuned-squadv2"
model_roberta_bne_large = "BSC-LT/roberta-large-bne-sqac" # Needs more than a T4 GPU.

# ---------------------------------------------------------------------
# Task name, tag set and candidate checkpoints for token classification.
token_classification = "token_classification"
# Tag meanings:
#   O             - the token does not correspond to any entity.
#   cause_answer  - the token is part of the cause.
#   effect_answer - the token is part of the effect.
custom_ner_tags = ["O", "cause_answer", "effect_answer"]
# Both lookup tables follow the tag list: the id of a tag is its position.
id2label = dict(enumerate(custom_ner_tags))
label2id = {tag: idx for idx, tag in enumerate(custom_ner_tags)}

# List of tested models (the commented ones are defined in a section above).
# model_distilbert = "distilbert-base-uncased-distilled-squad"
# model_albert = "twmkn9/albert-base-v2-squad2"
# model_roberta = "deepset/roberta-base-squad2"
model_funnel = "funnel-transformer/small"
model_fnet = "google/fnet-base"

# List of models that do not work (too heavy).
model_bloom = "bigscience/bloom"

# ---------------------------------------------------------------------
# Task name and candidate checkpoints for summarization.
summarization = "summarization"
model_t5_small = "google-t5/t5-small"
model_t5_small_fine_tuned = "Falconsai/text_summarization"
model_bart_base = "facebook/bart-base"
model_mvp = "RUCAIBox/mvp"  # It needs an A100 GPU (30 GB).
model_bert_summarization = (
    "mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization"
)


# List of models that do not work (GPU too small, or CUDA error).
model_led_base = "allenai/led-base-16384"
# Weird CUDA error: device-side assert triggered.
model_blenderbot = "facebook/blenderbot-400M-distill"
# Needs an A100 for training, but the model can't be saved on a 15 GB Drive,
# and it needs more GPU to predict.
model_pegasus = "google/pegasus-xsum"
# It has no fast tokenizer.
model_blenderbot_small = "facebook/blenderbot_small-90M"
# It has no fast tokenizer.
model_marianmt = "Helsinki-NLP/opus-mt-en-roa"
# It has no fast tokenizer.
model_m2m = "facebook/m2m100_418M"
# It has no fast tokenizer.
model_plbart = "uclanlp/plbart-base"
model_bart_small = "lucadiliello/bart-small"
# Needs more GPU in training.
model_nases = "ELiRF/NASES"

# ---------------------------------------------------------------------
# Task name, class names and checkpoint for text classification.
text_classification = "text_classification"

# Class meanings:
#   Nothing - the text is neither a cause nor an effect.
#   Cause   - the text is a cause.
#   Effect  - the text is an effect.
custom_text = ["Nothing", "Cause", "Effect"]
# Both lookup tables follow the class list: the id of a class is its position.
id2label_text = dict(enumerate(custom_text))
label2id_text = {name: idx for idx, name in enumerate(custom_text)}

model_distilbert_base_uncased = "distilbert-base-uncased"


# ---------------------------------------------------------------------
# Two-step architecture: first run token_split_sentence with the desired
# checkpoint, then run text_classification with
# model_distilbert_base_uncased.
token_split_sentence = "token_split_sentence"
# Tag meanings:
#   0 - the token is not the end of a segment.
#   1 - the token is the end of a segment.
custom_tags_split_sentence = ["0", "1"]
# Both lookup tables follow the tag list: the id of a tag is its position.
id2label_split_sentence = dict(enumerate(custom_tags_split_sentence))
label2id_split_sentence = {
    tag: idx for idx, tag in enumerate(custom_tags_split_sentence)
}

# List of tested models (all of them are defined in the sections above).
# model_distilbert = "distilbert-base-uncased-distilled-squad"
# model_albert = "twmkn9/albert-base-v2-squad2"
# model_roberta = "deepset/roberta-base-squad2"  # Needs a lot of storage.
# model_funnel = "funnel-transformer/small"  # Needs a lot of storage.
# model_fnet = "google/fnet-base"


# ---------------------------------------------------------------------

# Active configuration: the task and the model checkpoint currently in use.
# task_type is compared against the task-name constants by the cells below.
# NOTE(review): checkpoint is presumably read by the helper notebooks loaded
# with %run — confirm.
task_type = question_answering
checkpoint = model_tinyroberta


In [3]:
# This part loads two dataset, one for the cause and anohter for the effect.
%run "drive/MyDrive/tfg-juncodelasheras/colab_notebooks/load_dataset.ipynb"


# This will be divided into two datasets, train and test.
original_dataset_path = "dataset/training_subtask_es.csv"

if task_type == question_answering or task_type == summarization:
    cause_datasets = get_datasets(original_dataset_path,
                                  split_dataset=True,
                                  cause_model=True,
                                  have_answers=True)
    effect_datasets = get_datasets(original_dataset_path,
                                   split_dataset=True,
                                   cause_model=False,
                                   have_answers=True)
elif task_type == token_classification or task_type == text_classification \
      or task_type == token_split_sentence:
    cause_effect_datasets = get_datasets(original_dataset_path,
                                         split_dataset=True,
                                         cause_model=True,
                                         have_answers=True)


DatasetDict({
    train: Dataset({
        features: ['context', 'Cause', 'question', 'answers'],
        num_rows: 1796
    })
    test: Dataset({
        features: ['context', 'Cause', 'question', 'answers'],
        num_rows: 200
    })
})
DatasetDict({
    train: Dataset({
        features: ['context', 'Effect', 'question', 'answers'],
        num_rows: 1796
    })
    test: Dataset({
        features: ['context', 'Effect', 'question', 'answers'],
        num_rows: 200
    })
})


In [4]:
# This parts get the tokenizer and the trainer.
from pathlib import Path


%run "drive/MyDrive/tfg-juncodelasheras/colab_notebooks/load_model.ipynb"


# Variables only used in token_classification.
if task_type == token_classification or task_type == token_split_sentence:
    label_list = cause_effect_datasets["train"] \
                 .features[f"ner_tags"].feature.names
    example = cause_effect_datasets["train"][0]
    seqeval = evaluate.load("seqeval")
    labels = [label_list[i] for i in example[f"ner_tags"]]

# This will be divided into two datasets, train and test.
# After that, it will be tokenized,
# and the tokenizer will add input_ids and attention_mask columns.
original_dataset_path = "dataset/training_subtask_es.csv"
dataset_name = Path(original_dataset_path).stem

# cause and effect datasets have the same tokenizer.
# Here cause and effect datasets are two different datasets.
if task_type == question_answering or task_type == summarization:
    tokenizer = get_tokenizer(cause_datasets, dataset_name=dataset_name)

    tokenized_cause_datasets = get_tokenized_datasets(cause_datasets)
    tokenized_effect_datasets = get_tokenized_datasets(effect_datasets)

    trainer_cause = get_trainer(dataset_name,
                                tokenized_cause_datasets,
                                create_new_trainer=False,
                                cause_model=True)
    trainer_effect = get_trainer(dataset_name,
                                 tokenized_effect_datasets,
                                 create_new_trainer=False,
                                 cause_model=False)
# Here only one dataset is needed.
elif task_type == token_classification or task_type == text_classification \
      or task_type == token_split_sentence:
    tokenizer = get_tokenizer(cause_effect_datasets, dataset_name=dataset_name)

    tokenized_cause_effect_datasets = \
        get_tokenized_datasets(cause_effect_datasets)

    trainer_cause_effect = get_trainer(dataset_name,
                                       tokenized_cause_effect_datasets,
                                       create_new_trainer=False,
                                       cause_model=True)


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/858k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/516k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Map:   0%|          | 0/1796 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1796
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 200
    })
})


Map:   0%|          | 0/1796 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1796
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 200
    })
})


pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 144.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 81.06 MiB is free. Process 3373 has 14.67 GiB memory in use. Of the allocated memory 14.45 GiB is allocated by PyTorch, and 82.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# This part predicts the datasets in the split_sentence architecture.
%run "drive/MyDrive/tfg-juncodelasheras/colab_notebooks/load_prediction.ipynb"


if task_type == token_split_sentence:
    original_dataset_path = "dataset/formatted_test_set_540.csv"
    original_list_split_sentence = token_split_sentence_create_list()
else if task_type == text_classification:
    list_split_sentence = original_list_split_sentence
    list_split_sentence = do_predictions_split_sentence()

    original_dataset_path = "dataset/formatted_test_set_540.csv"
    write_split_sentence_to_file()  # BE CAREFUL UNCOMMENTING THIS LINE.

# Evaluate the predictions made, and get its metrics.
!python3 {drive_path}dataset/task_evaluate.py from-folder {drive_path}dataset/input {drive_path}dataset/output


In [None]:
# This part predicts the datasets.
%run "drive/MyDrive/tfg-juncodelasheras/colab_notebooks/load_prediction.ipynb"


# Dataset that the predictions are made on.
original_dataset_path = "dataset/formatted_test_set_540.csv"

# This overwrites previous files, take care!
# Comment out this call if the dataset does not contain answers.
do_predictions_with_answers(original_dataset_path)
# This overwrites previous files, take care!
do_predictions_without_answers(original_dataset_path)

# Evaluate the predictions made, and get their metrics.
!python3 {drive_path}dataset/task_evaluate.py from-folder {drive_path}dataset/input {drive_path}dataset/output


Dataset({
    features: ['context', 'Cause', 'question', 'answers'],
    num_rows: 516
})
Dataset({
    features: ['context', 'Effect', 'question', 'answers'],
    num_rows: 516
})
Dataset({
    features: ['context', 'Cause', 'Effect', 'question'],
    num_rows: 516
})


Map:   0%|          | 0/516 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 516
})


INFO   | * Loading reference data
INFO   | * Loading prediction data
INFO   | Load Data: check data set length = True
INFO   | Load Data: check data set ref. text = True
F1: 0.755693
Recall: 0.757716
Precision: 0.758091
ExactMatch: 0.406977


In [None]:
# Remove all predicted data.
# This overwrites previous files, take care!
def delete_predicted_data():
    !rm {drive_path}dataset/input/ref/*
    !rm {drive_path}dataset/input/res/*
    return


# Destructive call, kept commented out on purpose.
# delete_predicted_data()  # BE CAREFUL UNCOMMENTING THIS LINE.

# List the reference files currently present.
!ls {drive_path}dataset/input/ref

# Show the first lines of the reference (ref) and result (res) files.
!head {drive_path}dataset/input/ref/formatted_test_set_540.csv
!head {drive_path}dataset/input/res/formatted_test_set_540.csv

# Optional: line, word and byte counts of both files.
# !wc {drive_path}dataset/input/ref/formatted_test_set_540.csv
# !wc {drive_path}dataset/input/res/formatted_test_set_540.csv


formatted_test_set_540.csv
Index;Text;Cause;Effect
0;atresmedia ha superado la recesion publicitaria y economica gracias a su capacidad para adaptarse a un entorno socioeconomico y legislativo en permanente cambio, al diseno de nuevas lineas de negocio, a la mejora en la gestion y a una gran dosis de innovacion.;gracias a su capacidad para adaptarse a un entorno socioeconomico y legislativo en permanente cambio, al diseno de nuevas lineas de negocio, a la mejora en la gestion y a una gran dosis de innovacion;atresmedia ha superado la recesion publicitaria y economica
1;la instruccion de un procedimiento adecuado a las circunstancias del caso, en el que se actuara siempre on independencia y pleno respeto del derecho de audiencia y de la presuncion de inocencia de cualquier persona afectada. -la indemnidad de cualquier denunciante como consecuencia de la presentacion de instancias o denuncias de buena fe al comite. 353 cuentas anuales sciif en las principales normas para la prevencion de

In [None]:
# If there is an odd number of " characters,
# the last one has to be removed to avoid an error in the csv parser.
def normalize_str_test(s):
    """Return s cleaned up for use as one field of a ';'-separated csv.

    Curly double quotes are turned into straight ones, the last double
    quote is dropped when their total count is odd, and every ';' is
    replaced by ','.
    """
    # Normalize left and right double quotes to standard double quotes.
    straight_quotes = str.maketrans({'”': '"', '“': '"'})
    text = s.translate(straight_quotes)
    if text.count('"') % 2:
        # An odd count guarantees at least one quote, so rpartition finds it.
        before, _, after = text.rpartition('"')
        text = before + after
    return text.replace(";", ",")


# Return the text found between <tag> and </tag> in text,
# or "" when either of the two markers is missing.
def _extract_tagged(text, tag):
    open_tag, close_tag = f"<{tag}>", f"</{tag}>"
    start = text.find(open_tag)
    end = text.find(close_tag)
    if start == -1 or end == -1:
        # str.find returns -1 when nothing is found; slicing with it would
        # silently produce an unrelated piece of text.
        return ""
    return text[start + len(open_tag):end]


# It creates formatted_test_set_540.csv from test_set_540.csv.
def create_valid_test_set_540():
    """Convert dataset/test_set_540.csv into dataset/formatted_test_set_540.csv.

    Each non-blank input line is split into an index (the text before the
    first ';') and a context. The cause and the effect are taken from the
    <cause>...</cause> and <effect>...</effect> markers of the context, the
    markers are removed from the context, and the three texts go through
    normalize_str_test. The output has the header "Index;Text;Cause;Effect"
    followed by one row per input line, with the three texts written
    between double quotes.

    Both files live under drive_path and are handled as UTF-8.
    This overwrites the output file, take care!
    """
    rows = []
    # The encoding is explicit because the data contains non-ASCII characters.
    with open(drive_path + "dataset/test_set_540.csv", "r",
              encoding="utf-8") as my_file:
        for line in my_file:
            s = line.strip()
            if not s:
                # A blank line would otherwise become a row of empty fields.
                continue
            separator = s.find(";")
            indice = s[0:separator]
            # Skips the ';' and the two characters that follow it.
            # NOTE(review): presumably a fixed prefix in the raw file —
            # confirm against test_set_540.csv.
            context = s[separator + 3:]
            cause = _extract_tagged(context, "cause")
            effect = _extract_tagged(context, "effect")
            for marker in ("<cause>", "</cause>", "<effect>", "</effect>"):
                context = context.replace(marker, "")
            rows.append((indice,
                         normalize_str_test(context),
                         normalize_str_test(cause),
                         normalize_str_test(effect)))

    # This overwrites previous files, take care!
    with open(drive_path + "dataset/formatted_test_set_540.csv", "w",
              encoding="utf-8") as my_file:
        my_file.write("Index;Text;Cause;Effect\n")
        for indice, context, cause, effect in rows:
            my_file.write(f'{indice};"{context}";"{cause}";"{effect}"\n')


# create_valid_test_set_540()  # BE CAREFUL UNCOMMENTING THIS LINE.
