In [None]:
from datasets import load_dataset, load_from_disk, ClassLabel
from datasets import Sequence, Dataset
from pathlib import Path


# Return a one-element list holding the first index at which the string s2
# appears in the string s1 (-1 if it does not appear).
def get_start_index(s1, s2):
    """Return ``[index]`` of the first occurrence of ``s2`` inside ``s1``.

    The position is wrapped in a single-element list; it is ``-1`` when
    ``s2`` does not occur in ``s1``.
    """
    first_position = s1.find(s2)
    return [first_position]


# Convert raw_dataset into question_answering_dataset.
# cause_model is True if the model will predict cause.
# have_answers is True if the dataset contains the real cause and
# the real effect.
def create_question_answering_dataset(raw_dataset, cause_model, have_answers):
    """Add the columns needed for extractive question answering.

    A "question" column is always added; every row receives the same
    question text, whichever of cause/effect is being predicted.

    When ``have_answers`` is true an "answers" column is added as well.
    Each entry is a dict with:
      * 'answer_start': one-element list with the character offset of the
        answer inside row['context'] (-1 when it is not found).
      * 'text': one-element list with the answer string.
    The answer is read from the 'Cause' column when ``cause_model`` is
    true, otherwise from the 'Effect' column.

    Returns the dataset with the extra column(s).
    """
    questions = ["Escribe la causa" for _ in range(len(raw_dataset))]
    raw_dataset = raw_dataset.add_column("question", questions)

    if not have_answers:
        return raw_dataset

    # Column holding the span the model has to extract.
    target = 'Cause' if cause_model else 'Effect'
    answers = [
        {
            'answer_start': get_start_index(row['context'], row[target]),
            'text': [row[target]],
        }
        for row in raw_dataset
    ]
    return raw_dataset.add_column("answers", answers)


# Split the words of the string s with space and symbol delimiters.
def tokenize_by_words(s):
    """Split ``s`` into words, keeping punctuation marks as separate tokens.

    Each of the symbols ``. , ; ( ) " : -`` is surrounded with spaces so it
    becomes a token of its own, and the result is split on whitespace.

    Returns a list of token strings (empty for an empty string).
    """
    # A single translation pass replaces the former chain of ``replace``
    # calls. Every delimiter maps to itself padded with spaces; the former
    # chain rewrote "(" as ")", so opening parentheses were lost.
    padded = {ord(symbol): " " + symbol + " " for symbol in '.,;)":(-'}
    return s.translate(padded).split()


# Return the index where list2 starts as a sublist of list1,
# or -1 if it is not a sublist.
def get_start_index_token(list1, list2):
    """Return the first index at which ``list2`` occurs inside ``list1``.

    ``list2`` must appear as a contiguous run of equal elements.

    Returns -1 when there is no such run, when ``list2`` is longer than
    ``list1``, or when ``list2`` is empty.
    """
    span = len(list2)
    # An empty list2 is reported as "not found".
    if span == 0:
        return -1

    # Only starts that leave room for the whole of list2 are tried, so a
    # partial match near the end of list1 can no longer index past the end.
    for start in range(len(list1) - span + 1):
        if all(list1[start + k] == list2[k] for k in range(span)):
            return start
    return -1


# Convert raw_dataset into token_classification_dataset.
# have_answers is True if the dataset contains the real cause and
# the real effect.
def create_token_classification_dataset(raw_dataset, have_answers):
    """Add the columns needed for token classification.

    A "tokens" column with the word-level tokens of row['context'] is
    always added.

    When ``have_answers`` is true a "ner_tags" column is added too, with
    one integer per context token:
      * 0 - default tag,
      * 1 - token inside the span matching row['Cause'],
      * 2 - token inside the span matching row['Effect'].
    The effect is tagged after the cause, so it wins where both overlap.
    The column is finally cast to a sequence of class labels named after
    ``custom_ner_tags`` (defined outside this function).

    Returns the dataset with the extra column(s).
    """
    tokens_column = [tokenize_by_words(row['context']) for row in raw_dataset]
    raw_dataset = raw_dataset.add_column("tokens", tokens_column)

    if not have_answers:
        return raw_dataset

    ner_tags_column = []
    for row in raw_dataset:
        context_tokens = tokenize_by_words(row['context'])
        tags = [0] * len(context_tokens)
        # Tag the cause first and the effect second.
        for tag, column in ((1, 'Cause'), (2, 'Effect')):
            answer_tokens = tokenize_by_words(row[column])
            start = get_start_index_token(context_tokens, answer_tokens)
            for offset in range(len(answer_tokens)):
                tags[start + offset] = tag
        ner_tags_column.append(tags)

    raw_dataset = raw_dataset.add_column("ner_tags", ner_tags_column)
    label_feature = Sequence(ClassLabel(names=custom_ner_tags))
    return raw_dataset.cast_column("ner_tags", label_feature)


# Convert raw_dataset into summarization_dataset.
# cause_model is True if the model will predict cause.
# have_answers is True if the dataset contains the real cause and
# the real effect.
def create_summarization_dataset(raw_dataset, cause_model, have_answers):
    """Rename columns to the names used for summarization.

    The "context" column is always renamed to "text". When
    ``have_answers`` is true, the "Cause" column (if ``cause_model``) or
    the "Effect" column (otherwise) is renamed to "summary".

    Returns the dataset with the renamed column(s).
    """
    renamed = raw_dataset.rename_columns({"context": "text"})
    if not have_answers:
        return renamed

    answer_column = "Cause" if cause_model else "Effect"
    return renamed.rename_columns({answer_column: "summary"})


# Convert raw_dataset into text_classification_dataset.
# have_answers is True if the dataset contains the real cause and
# the real effect.
def create_text_classification_dataset(raw_dataset, have_answers):
    """Build a fragment-level text classification dataset.

    When ``have_answers`` is false the input dataset is returned untouched.

    Otherwise every row contributes, in this order:
      * its 'Cause' text, labelled label2id_text["Cause"];
      * its 'Effect' text, labelled label2id_text["Effect"];
      * every non-empty piece of 'context' left after cutting out the
        cause and then the effect, labelled label2id_text["Nothing"].
    ``label2id_text`` is defined outside this function.

    Returns a new Dataset with the columns "text" and "labels".
    """
    if not have_answers:
        return raw_dataset

    texts = []
    labels = []
    for row in raw_dataset:
        cause, effect = row['Cause'], row['Effect']
        texts.extend([cause, effect])
        labels.extend([label2id_text["Cause"], label2id_text["Effect"]])

        # Whatever surrounds the cause and the effect is "Nothing".
        for outside_cause in row['context'].split(cause):
            for fragment in outside_cause.split(effect):
                if fragment != '':
                    texts.append(fragment)
                    labels.append(label2id_text["Nothing"])

    # The tutorial says label, but with distilbert,
    # the name has to be labels.
    return Dataset.from_dict({"text": texts, "labels": labels})


# Convert raw_dataset into token_split_sentence_dataset.
# have_answers is True if the dataset contains the real cause and
# the real effect.
def create_token_split_sentence_dataset(raw_dataset, have_answers):
    """Add the columns needed for the sentence-splitting token task.

    A "tokens" column with the word-level tokens of row['context'] is
    always added.

    When ``have_answers`` is true a "ner_tags" column is added too, with
    one integer per context token. Everything is 0 except, for both the
    cause span and the effect span:
      * the token right before the span (when the span does not start at
        the first token), and
      * the last token of the span,
    which are set to 1. The column is finally cast to a sequence of class
    labels named after ``custom_tags_split_sentence`` (defined outside
    this function).

    Returns the dataset with the extra column(s).
    """
    tokens_column = [tokenize_by_words(row['context']) for row in raw_dataset]
    raw_dataset = raw_dataset.add_column("tokens", tokens_column)

    if not have_answers:
        return raw_dataset

    boundary_tags_column = []
    for row in raw_dataset:
        context_tokens = tokenize_by_words(row['context'])
        tags = [0] * len(context_tokens)
        # Same marking for the cause span and then for the effect span.
        for column in ('Cause', 'Effect'):
            answer_tokens = tokenize_by_words(row[column])
            start = get_start_index_token(context_tokens, answer_tokens)
            # Token just before the span, if there is one.
            if start >= 1:
                tags[start - 1] = 1
            # Last token of the span.
            tags[start + len(answer_tokens) - 1] = 1
        boundary_tags_column.append(tags)

    raw_dataset = raw_dataset.add_column("ner_tags", boundary_tags_column)
    label_feature = Sequence(ClassLabel(names=custom_tags_split_sentence))
    return raw_dataset.cast_column("ner_tags", label_feature)


# Function to filter corrupted data.
# Return False if the row is corrupted, else True.
def data_corrupted_filter(example):
    """Tell whether a row is usable; meant to be passed to ``filter``.

    A row is rejected (False) when any of the following holds:
      * 'context' is None;
      * 'Cause' or 'Effect' is None;
      * 'Cause' or 'Effect' is not a substring of 'context';
      * the tokens of 'Cause' or 'Effect' do not appear as a contiguous
        run in the tokens of 'context'.
    Otherwise the row is kept (True).
    """
    context = example['context']
    if context is None:
        return False

    # Cheap checks first. A missing answer is treated as corrupted rather
    # than reaching the ``in`` test, which raises TypeError for None.
    for column in ("Cause", "Effect"):
        answer = example[column]
        if answer is None or answer not in context:
            return False

    # The answers must also line up with token boundaries. The context is
    # tokenized once and reused for both answers.
    context_tokens = tokenize_by_words(context)
    for column in ("Cause", "Effect"):
        answer_tokens = tokenize_by_words(example[column])
        if get_start_index_token(context_tokens, answer_tokens) == -1:
            return False

    return True


# If split_dataset is True, it will split the dataset into train and
# test datasets.
# If cause_model is True, it will return a dataset to predict the Cause,
# else it will predict the Effect.
# have_answers is True if the dataset contains the real cause and
# the real effect.
def get_datasets(original_dataset_path,
                 split_dataset,
                 cause_model,
                 have_answers):
    """Return the dataset for the current task, building it if needed.

    The dataset is first looked up on disk under
    ``colab_data_path + "datasets/" + <name>``. If it cannot be loaded, it
    is built from the CSV at ``drive_path + original_dataset_path``, saved
    to that location and returned.

    Parameters:
      original_dataset_path: path of the source CSV, relative to
        ``drive_path``; its stem is the prefix of the cached name.
      split_dataset: if true, the built dataset is split into train and
        test parts (10% test).
      cause_model: if true the dataset targets the Cause, otherwise the
        Effect (only relevant for question answering and summarization).
      have_answers: true if the data contains the real cause and effect.

    Module-level names read here and defined outside this function:
    ``task_type``, ``question_answering``, ``summarization``,
    ``token_classification``, ``text_classification``,
    ``token_split_sentence``, ``colab_data_path`` and ``drive_path``.

    The resulting dataset is printed and returned.
    """
    # Tasks that predict one answer at a time (cause OR effect).
    single_answer_task = task_type in (question_answering, summarization)

    new_dataset_name = Path(original_dataset_path).stem
    new_dataset_name += "_dataset_" + task_type
    if not have_answers:
        new_dataset_name += "_predictions"
    elif single_answer_task:
        new_dataset_name += "_cause" if cause_model else "_effect"
    elif task_type in (token_classification,
                       text_classification,
                       token_split_sentence):
        new_dataset_name += "_cause_effect"

    datasets_dir = colab_data_path + "datasets/"
    dataset_path = datasets_dir + new_dataset_name

    # If we have the dataset load it from files, else create it.
    # Any loading failure is deliberately treated as "not cached yet".
    try:
        datasets = load_from_disk(dataset_path)
    except Exception:
        # Read the dataset from the file.
        raw_datasets = load_dataset("csv",
                                    data_files=drive_path +
                                    original_dataset_path,
                                    delimiter=';',
                                    on_bad_lines='skip')

        # There is only one dataset, train.
        raw_dataset = raw_datasets["train"]

        # Remove Index column.
        raw_dataset = raw_dataset.remove_columns(["Index"])

        # Rename Text to context.
        raw_dataset = raw_dataset.rename_columns({"Text": "context"})

        # Filter rows in case the data is corrupted.
        raw_dataset = raw_dataset.filter(data_corrupted_filter)

        # Single-answer tasks only keep the column they have to predict.
        if have_answers and single_answer_task:
            unused_column = "Effect" if cause_model else "Cause"
            raw_dataset = raw_dataset.remove_columns([unused_column])

        if task_type == question_answering:
            raw_dataset = create_question_answering_dataset(raw_dataset,
                                                            cause_model,
                                                            have_answers)
        elif task_type == token_classification:
            raw_dataset = create_token_classification_dataset(raw_dataset,
                                                              have_answers)
        elif task_type == summarization:
            raw_dataset = create_summarization_dataset(raw_dataset,
                                                       cause_model,
                                                       have_answers)
        elif task_type == text_classification:
            raw_dataset = create_text_classification_dataset(raw_dataset,
                                                             have_answers)
        elif task_type == token_split_sentence:
            raw_dataset = create_token_split_sentence_dataset(raw_dataset,
                                                              have_answers)

        if split_dataset:
            # Before split, it is randomly shuffled.
            datasets = raw_dataset.train_test_split(test_size=0.1)
        else:
            datasets = raw_dataset

        # This overwrites previous files, take care!
        # Plain Python replacement for the notebook-only "!mkdir -p", so
        # the function also works outside IPython.
        Path(datasets_dir).mkdir(parents=True, exist_ok=True)
        datasets.save_to_disk(dataset_path)

    print(datasets)
    return datasets
