In [None]:
from unidecode import unidecode
import torch
from pathlib import Path
from torch.nn.utils.rnn import pad_sequence


# Directory where the reference (correct answer) CSV files are written.
correct_data_path = drive_path + "dataset/input/ref/"
# Directory where the predicted answer CSV files are written.
predicted_data_path = drive_path + "dataset/input/res/"

# Create the dataset directory structure on the Drive;
# `mkdir -p` leaves directories that already exist untouched.
# Here we will store our correct answer data.
!mkdir -p {correct_data_path}
# Here we will store the predicted answer data.
!mkdir -p {predicted_data_path}
# Here the computed metrics for our predictions will be stored.
!mkdir -p {drive_path}dataset/output


def get_predictions_question_answering(tokenized_datasets, cause_model):
    """Helper of get_predictions for the question_answering task.

    Args:
        tokenized_datasets: Tokenized dataset handed to the trainer's
            predict() method.
        cause_model: True to predict the Cause (uses trainer_cause),
            False to predict the Effect (uses trainer_effect).

    Returns:
        A pair ([start predicted tokens], [end predicted tokens]): the
        argmax over the last axis of the start logits and of the end logits.
    """
    trainer = trainer_cause if cause_model else trainer_effect
    predictions, _, _ = trainer.predict(tokenized_datasets)
    start_logits, end_logits = predictions

    def most_likely_token(logits):
        # Turn the numpy logits into probabilities and keep, for every row,
        # the position with the highest probability.
        probabilities = torch.nn.functional.softmax(torch.from_numpy(logits),
                                                    dim=-1)
        return probabilities.argmax(dim=-1).tolist()

    return most_likely_token(start_logits), most_likely_token(end_logits)


def _longest_label_span(labels, number_to_match):
    """Return (first, last) of the widest span labelled number_to_match.

    It is a Kadane-style scan: every token equal to number_to_match adds 1
    to a running score and every other token subtracts 1. While the score
    stays non-negative the current span is kept open, so short runs of other
    labels inside a span are tolerated; when it drops below zero the span
    start is reset. A span is only recorded when both its first and its
    last token equal number_to_match.

    Returns (0, 0) when no token equals number_to_match.
    """
    best_first = 0
    best_last = 0
    # -1 means "nothing recorded yet", so that a span made of one single
    # token (width 0) can still be recorded instead of being discarded.
    best_width = -1

    current_sum = 0
    current_first = 0
    for j, label in enumerate(labels):
        if label == number_to_match:
            current_sum += 1
        else:
            current_sum -= 1
            if current_sum < 0:
                current_sum = 0
                current_first = j

        # A span must start on a matching token.
        if labels[current_first] != number_to_match:
            current_first = j

        if labels[current_first] == number_to_match \
           and label == number_to_match \
           and j - current_first > best_width:
            best_first = current_first
            best_last = j
            best_width = j - current_first

    return best_first, best_last


def get_predictions_token_classification(tokenized_datasets, cause_model):
    """Helper of get_predictions for the token_classification task.

    Args:
        tokenized_datasets: Tokenized dataset handed to
            trainer_cause_effect.predict().
        cause_model: True to look for the Cause span (label id 1),
            False to look for the Effect span (label id 2).

    Returns:
        A pair ([start predicted tokens], [end predicted tokens]) with one
        inclusive token span per dataset entry. An entry where no token has
        the requested label gets the span (0, 0).
    """
    predictions, _, _ = trainer_cause_effect.predict(tokenized_datasets)
    predictions_argmax = torch.argmax(torch.from_numpy(predictions), dim=2)

    number_to_match = 1 if cause_model else 2

    start_predicted_token = []
    end_predicted_token = []
    # Work on plain Python lists: indexing tensor elements one by one inside
    # the scan is much slower and gives the same integers.
    for labels in predictions_argmax.tolist():
        if labels:
            # The first token is [CLS], so it has to be 0.
            labels[0] = 0

        first_index, last_index = _longest_label_span(labels, number_to_match)
        start_predicted_token.append(first_index)
        end_predicted_token.append(last_index)

    return start_predicted_token, end_predicted_token


# Use the first CUDA GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def get_predictions_summarization(tokenized_datasets, cause_model):
    """Helper of get_predictions for the summarization task.

    Args:
        tokenized_datasets: Tokenized dataset; its "input_ids" column is
            padded and fed to model.generate() in batches.
        cause_model: True to generate with trainer_cause.model (Cause),
            False to generate with trainer_effect.model (Effect).

    Returns:
        A 2-D tensor with one row of generated token ids per dataset entry.
        Rows are right-padded to a common length and still have to be
        decoded by the caller.
    """
    model = (trainer_cause if cause_model else trainer_effect).model

    # pad_sequence pads with 0 by default, which is only correct for models
    # whose pad token id is 0. Use the model's own pad token id when it is
    # configured and keep 0 as the fallback.
    pad_token_id = getattr(model.config, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = 0

    # Add padding to the input_ids to homogenize the length, and build the
    # matching attention mask (1 = real token, 0 = padding) so that the
    # padding is ignored during generation.
    sequences = [torch.tensor(ids) for ids in tokenized_datasets["input_ids"]]
    input_ids_padded = pad_sequence(sequences,
                                    batch_first=True,
                                    padding_value=pad_token_id)
    attention_mask_padded = pad_sequence([torch.ones_like(sequence)
                                          for sequence in sequences],
                                         batch_first=True,
                                         padding_value=0)

    # Send the inputs and the model to the same device.
    model = model.to(device)
    input_ids_tensor = input_ids_padded.to(device)
    attention_mask_tensor = attention_mask_padded.to(device)

    # Number of entries generated per call. According to the original note,
    # 100 works fine for bart_base and mvp, but not for pegasus.
    iteration_step = 40
    generated_batches = []
    for k in range(0, input_ids_tensor.shape[0], iteration_step):
        if k > 0:
            # Useful to know whether the model can pass GPU limits or not.
            print("Iteration", k)
        generated_batches.append(
            model.generate(input_ids_tensor[k:k + iteration_step],
                           attention_mask=attention_mask_tensor[
                               k:k + iteration_step],
                           max_new_tokens=100,
                           do_sample=False))

    # Pad every batch to the same number of columns and concatenate once,
    # instead of re-padding and re-copying the accumulated result each time.
    max_columns = max(batch.size(1) for batch in generated_batches)
    outputs = torch.cat(
        [torch.nn.functional.pad(batch,
                                 (0, max_columns - batch.size(1)),
                                 value=pad_token_id)
         for batch in generated_batches],
        dim=0)

    return outputs


def get_predictions(tokenized_datasets, cause_model):
    """Return the predictions according to the current task_type.

    Forwards both arguments to the helper of the active task
    (question_answering, token_classification or summarization) and returns
    its result unchanged. Returns None for any other task_type.
    """
    if task_type == question_answering:
        predictor = get_predictions_question_answering
    elif task_type == token_classification:
        predictor = get_predictions_token_classification
    elif task_type == summarization:
        predictor = get_predictions_summarization
    else:
        return None
    return predictor(tokenized_datasets, cause_model)


def normalize_str(s):
    """Normalize s so correct and predicted strings compare more precisely.

    Steps, in order: curly double quotes become standard double quotes; an
    unpaired last double quote is dropped; the ';' separator and the
    '[CLS]', '[SEP]' and 'summarize:' markers are removed; the text is
    lowercased, passed through unidecode and stripped of surrounding
    whitespace.
    """
    # Normalize left and right double quotes to standard double quotes.
    for curly_quote in ('”', '“'):
        s = s.replace(curly_quote, '"')

    # With an odd number of '"' characters the last one has to be removed
    # to not have an error in the csv parser.
    if s.count('"') % 2 == 1:
        before_quote, _, after_quote = s.rpartition('"')
        s = before_quote + after_quote

    # Some strings contain the separator character (and model markers),
    # so they have to be removed.
    for unwanted in (';', '[CLS]', '[SEP]', 'summarize:'):
        s = s.replace(unwanted, '')

    lowered = s.lower()
    return unidecode(lowered).strip()


def do_predictions_with_answers(original_dataset_path):
    """Write the reference answers of a dataset in the expected CSV format.

    It does not use any model and no predictions are done: the context, the
    Cause and the Effect of every entry are normalized and written to
    correct_data_path + <dataset file stem> + ".csv" as
    "Index;Text;Cause;Effect" rows.

    Args:
        original_dataset_path: Path of the dataset that contains answers.

    Raises:
        ValueError: If task_type is not one of question_answering,
            summarization, token_classification or token_split_sentence.
            It is raised before the output file is created.
    """
    # Validate the task first. Otherwise an unsupported task_type would
    # truncate the output file and then fail on an unbound variable.
    if task_type == question_answering or task_type == summarization:
        separate_datasets = True
    elif task_type == token_classification or \
            task_type == token_split_sentence:
        separate_datasets = False
    else:
        raise ValueError("Unsupported task_type for "
                         "do_predictions_with_answers: " + repr(task_type))

    # Summarization datasets use their own column names.
    if task_type == summarization:
        context_key, cause_key, effect_key = "text", "summary", "summary"
    else:
        context_key, cause_key, effect_key = "context", "Cause", "Effect"

    dataset_to_predict = Path(original_dataset_path).stem
    if separate_datasets:
        # One dataset per model; the context is the same in the two datasets.
        cause_datasets = get_datasets(original_dataset_path,
                                      split_dataset=False,
                                      cause_model=True,
                                      have_answers=True)
        effect_datasets = get_datasets(original_dataset_path,
                                       split_dataset=False,
                                       cause_model=False,
                                       have_answers=True)
    else:
        # A single dataset carries both the Cause and the Effect.
        cause_datasets = effect_datasets = get_datasets(original_dataset_path,
                                                        split_dataset=False,
                                                        cause_model=False,
                                                        have_answers=True)
    size_dataset = len(cause_datasets)

    with open(correct_data_path + dataset_to_predict + ".csv", 'w',
              encoding="utf-8") as correct_file:
        correct_file.write("Index;Text;Cause;Effect\n")

        for i in range(size_dataset):
            cause_row = cause_datasets[i]
            effect_row = effect_datasets[i]

            str_context = normalize_str(cause_row[context_key])
            correct_cause = normalize_str(cause_row[cause_key])
            correct_effect = normalize_str(effect_row[effect_key])

            correct_file.write(";".join([str(i), str_context,
                                         correct_cause, correct_effect])
                               + "\n")
    return


def do_predictions_without_answers(original_dataset_path):
    """Predict Cause and Effect for a dataset without answers.

    It uses the models of the current task_type and writes one
    "Index;Text;Cause;Effect" row per entry to
    predicted_data_path + <dataset file stem> + ".csv".

    Args:
        original_dataset_path: Path of the dataset to predict.

    Raises:
        ValueError: If task_type is not one of question_answering,
            token_classification or summarization. It is raised before any
            dataset is loaded or any file is created.
    """
    # Validate the task first. Otherwise an unsupported task_type would
    # create the output file and then fail on an unbound variable.
    if task_type == question_answering or task_type == token_classification:
        predicts_spans = True
    elif task_type == summarization:
        predicts_spans = False
    else:
        raise ValueError("Unsupported task_type for "
                         "do_predictions_without_answers: " + repr(task_type))

    dataset_to_predict_name = Path(original_dataset_path).stem
    # NOTE(review): the dataset is built with cause_model=True and the same
    # tokenized dataset is also used for the Effect predictions - confirm
    # that this is intended for every task.
    dataset_to_predict = get_datasets(original_dataset_path,
                                      split_dataset=False,
                                      cause_model=True,
                                      have_answers=False)

    tokenized_dataset_to_predict = get_tokenized_datasets(dataset_to_predict)

    cause_predictions = get_predictions(tokenized_dataset_to_predict,
                                        cause_model=True)
    effect_predictions = get_predictions(tokenized_dataset_to_predict,
                                         cause_model=False)
    if predicts_spans:
        # Span tasks return ([start tokens], [end tokens]).
        start_predicted_cause_token, end_predicted_cause_token = \
            cause_predictions
        start_predicted_effect_token, end_predicted_effect_token = \
            effect_predictions

    with open(predicted_data_path + dataset_to_predict_name + ".csv", 'w',
              encoding="utf-8") as predicted_file:
        predicted_file.write("Index;Text;Cause;Effect\n")

        for i in range(len(tokenized_dataset_to_predict)):
            if predicts_spans:
                str_context = dataset_to_predict[i]["context"]
                input_ids = tokenized_dataset_to_predict[i]["input_ids"]
                # The predicted end token is inclusive, hence the + 1.
                str_predicted_cause = tokenizer.decode(
                    input_ids[start_predicted_cause_token[i]:
                              end_predicted_cause_token[i] + 1])
                str_predicted_effect = tokenizer.decode(
                    input_ids[start_predicted_effect_token[i]:
                              end_predicted_effect_token[i] + 1])
            else:
                str_context = dataset_to_predict[i]["text"]
                str_predicted_cause = tokenizer.decode(
                    cause_predictions[i], skip_special_tokens=True)
                str_predicted_effect = tokenizer.decode(
                    effect_predictions[i], skip_special_tokens=True)

            predicted_file.write(
                ";".join([str(i),
                          normalize_str(str_context),
                          normalize_str(str_predicted_cause),
                          normalize_str(str_predicted_effect)]) + "\n")
    return


In [None]:
def do_split_sentence(predictions_argmax, tokenized_dataset_to_predict):
    """Split one tokenized sentence at the positions predicted as label 1.

    Args:
        predictions_argmax: Predicted label of every token position of the
            entry; a position with label 1 closes the current segment.
        tokenized_dataset_to_predict: One tokenized dataset entry; only its
            "input_ids" are read.

    Returns:
        The list of decoded and normalized segments. Every segment spans at
        least two tokens: a label 1 on the first token of a segment does not
        split, and a single remaining token at the end is not emitted.
        An empty "input_ids" gives an empty list.
    """
    input_ids = tokenized_dataset_to_predict["input_ids"]
    split_sentence = []
    # Without tokens there is nothing to split; the original code failed
    # here because its loop variable was never bound.
    if len(input_ids) == 0:
        return split_sentence

    def decode_segment(first, last):
        # Decode the tokens first..last (both inclusive) and normalize them.
        return normalize_str(tokenizer.decode(input_ids[first:last + 1]))

    segment_start = 0
    for position in range(len(input_ids)):
        if predictions_argmax[position] == 1 and position > segment_start:
            split_sentence.append(decode_segment(segment_start, position))
            segment_start = position + 1

    # Whatever is left after the last split point is the final segment.
    last_position = len(input_ids) - 1
    if last_position > segment_start:
        split_sentence.append(decode_segment(segment_start, last_position))
    return split_sentence


def token_split_sentence_create_list():
    """Build the split-sentence rows for the token_split_sentence task.

    The dataset at the global original_dataset_path is loaded without
    answers, tokenized and run through trainer_cause_effect; every entry is
    then split with do_split_sentence.

    Returns:
        A list with one row per dataset entry. In each row, the first
        element is the context (needed in the result for the tests) and the
        rest of the elements are the split sentence.
    """
    dataset_to_predict = get_datasets(original_dataset_path,
                                      split_dataset=False,
                                      cause_model=True,
                                      have_answers=False)
    tokenized_dataset_to_predict = get_tokenized_datasets(dataset_to_predict)

    raw_predictions, _, _ = \
        trainer_cause_effect.predict(tokenized_dataset_to_predict)
    predictions_argmax = torch.argmax(torch.from_numpy(raw_predictions),
                                      dim=2)

    return [
        [dataset_to_predict[i]["context"]]
        + do_split_sentence(row_labels, tokenized_dataset_to_predict[i])
        for i, row_labels in enumerate(predictions_argmax)
    ]


In [None]:
def get_predictions_text_classification(text):
    """Predict which class the string text has.

    It is only for the text classification task and uses
    trainer_cause_effect.model.

    Args:
        text: The string to classify.

    Returns:
        The label that model.config.id2label assigns to the class with the
        highest logit.
    """
    # Send the model and the inputs to the same device; leaving the inputs
    # on the CPU while the model is on the GPU makes the forward pass fail.
    model = trainer_cause_effect.model.to(device)
    # Inference must not depend on training-only layers such as dropout.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt").to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = logits.argmax().item()
    return model.config.id2label[predicted_class_id]


def calculate_text_classification_precission(
        original_dataset_path="dataset/formatted_test_set_540.csv"):
    """Measure how often the text classification model predicts the label.

    It is only for testing the model, and it is not used in production.
    It prints the number of correct predictions, the dataset size and their
    ratio (0.0 for an empty dataset, instead of dividing by zero).

    Args:
        original_dataset_path: Dataset with answers to evaluate on. The
            default is the test set the function originally hard-coded.
    """
    cause_effect_datasets = get_datasets(original_dataset_path,
                                         split_dataset=False,
                                         cause_model=False,
                                         have_answers=True)
    size_dataset = len(cause_effect_datasets)

    correct_predictions = 0
    for i in range(size_dataset):
        entry = cause_effect_datasets[i]
        prediction = get_predictions_text_classification(entry["text"])
        if id2label_text[entry["labels"]] == prediction:
            correct_predictions += 1

    ratio = correct_predictions / size_dataset if size_dataset else 0.0
    print(correct_predictions,
          size_dataset,
          ratio)
    return


def do_predictions_split_sentence():
    """Classify the split sentences with the text classification model.

    Every row of the global list_split_sentence (created with the token
    classification model as [context, split sentence, ...]) is replaced in
    place by [context, cause, effect], all three normalized:

    * cause is the first split sentence classified as 'Cause';
    * effect is the first split sentence classified as 'Effect';
    * when the model does not predict one of them, the longest split
      sentence is taken instead (the last one among equally long ones),
      or "" when the row has no split sentence at all.

    NOTE: because the rows are overwritten in place, calling this function
    a second time on the same list would treat the stored cause and effect
    as split sentences.

    Returns:
        The (mutated) global list_split_sentence.
    """
    for i in range(len(list_split_sentence)):
        str_context = list_split_sentence[i][0]
        candidates = list_split_sentence[i][1:]

        # Remember the label of every sentence so that the 'Effect' search
        # does not classify again what the 'Cause' search already did.
        label_cache = {}

        def first_with_label(wanted_label):
            for split_sentence in candidates:
                if split_sentence not in label_cache:
                    label_cache[split_sentence] = \
                        get_predictions_text_classification(split_sentence)
                if label_cache[split_sentence] == wanted_label:
                    return split_sentence
            return ""

        # Find the cause and the effect with the text_classification model.
        str_cause = first_with_label('Cause')
        str_effect = first_with_label('Effect')

        # Fallback: the longest split_sentence. Iterating in reverse keeps
        # the last one among equally long sentences, like a stable sort
        # followed by [-1]. A row without split sentences falls back to "".
        if candidates:
            longest_split_sentence = max(reversed(candidates), key=len)
        else:
            longest_split_sentence = ""
        if str_cause == "":
            str_cause = longest_split_sentence
        if str_effect == "":
            str_effect = longest_split_sentence

        # Normalize the output strings.
        list_split_sentence[i] = [normalize_str(str_context),
                                  normalize_str(str_cause),
                                  normalize_str(str_effect)]

    return list_split_sentence


In [None]:
def write_split_sentence_to_file():
    """Write the predictions of the split sentence to the predicted file.

    Reads the globals original_dataset_path and list_split_sentence (rows of
    [context, cause, effect]) and writes one "Index;Text;Cause;Effect" row
    per entry to predicted_data_path + <dataset file stem> + ".csv".
    """
    dataset_to_predict_name = Path(original_dataset_path).stem
    output_path = predicted_data_path + dataset_to_predict_name + ".csv"

    with open(output_path, 'w') as predicted_file:
        predicted_file.write("Index;Text;Cause;Effect\n")

        for i, row in enumerate(list_split_sentence):
            fields = [str(i), row[0], row[1], row[2]]
            predicted_file.write(";".join(fields) + "\n")
    return
