In [76]:
from transformers import AutoTokenizer, RobertaForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch
import json
import time

In [25]:
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

In [26]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 124056578
all model parameters: 124056578
percentage of trainable model parameters: 100.00%


In [54]:
dataset = load_dataset("json", data_files="Dados/dataset_SEDS.json")

Não são elegíveis para o programa aqueles que recebem o Bolsa Família, do Governo Federal.


In [None]:
data_list = [{'context': context, 'question': question, 'answer': answer} for context, question, answer in zip(dataset['train']['context'], dataset['train']['question'], dataset['train']['answer'])]
train_data, test_data = train_test_split(data_list, test_size = 0.25, random_state = 42)

In [None]:
index = 3
teste_context = test_data[index]['context']
teste_pergunta = test_data[index]['question']

In [46]:
prompt = f"""
Context of the Question.

{teste_context}

"""

inputs = tokenizer(teste_pergunta, teste_context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
resposta = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(f'Question:\n{teste_pergunta}\n')
print(dash_line)
print(f'MODEL ANSWER - ZERO SHOT:\n{resposta}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Context of the Question.

O Dignidade beneficia com R$ 300 mensais pessoas que tenham entre 60 anos e 64 anos 11 meses e 29 dias em situação de pobreza ou de extrema pobreza.


Question:
O que é o Programa Dignidade?

---------------------------------------------------------------------------------------------------
MODEL ANSWER - ZERO SHOT:
 R$ 300 mensais pessoas


In [47]:
start_probs = torch.nn.functional.softmax(outputs.start_logits, dim=1)
end_probs = torch.nn.functional.softmax(outputs.end_logits, dim=1)

start_one_hot = torch.nn.functional.one_hot(torch.tensor([answer_start_index]), num_classes=start_probs.size(1))
end_one_hot = torch.nn.functional.one_hot(torch.tensor([answer_end_index]), num_classes=end_probs.size(1))

start_loss = torch.nn.functional.binary_cross_entropy(start_probs, start_one_hot.float())
end_loss = torch.nn.functional.binary_cross_entropy(end_probs, end_one_hot.float())

total_loss = start_loss + end_loss

total_loss_value = total_loss.item()

rounded_loss = round(total_loss_value, 2)

print(f'Loss: {rounded_loss}')

Loss: 0.04


In [73]:
def tokenize_function(example):
    start_prompt = 'Answer the question.\n\n'
    end_prompt = '\n\nAnswer: '
    prompt = [start_prompt + question + end_prompt for question in example["question"]]
    inputs = tokenizer(prompt, [example["context"]] * len(example["question"]), padding="max_length", truncation=True, return_tensors="pt")
    example['input_ids'] = inputs.input_ids
    example['attention_mask'] = inputs.attention_mask
    
    start_positions = []
    end_positions = []
    for answer in example["answer"]:
        start_pos = example["context"].find(answer)
        if start_pos == -1:  # Se a resposta não for encontrada no contexto
            start_positions.append(0)
            end_positions.append(0)
        else:
            end_pos = start_pos + len(answer) - 1
            start_positions.append(start_pos)
            end_positions.append(end_pos)
    
    example['start_positions'] = start_positions
    example['end_positions'] = end_positions
    
    return example

# Aplicar a função tokenize_function ao conjunto de dados de treinamento
tokenized_train_data = [tokenize_function(example) for example in train_data]
#tokenized_train_data = [example for i, example in enumerate(tokenized_train_data) if i % 100 == 0]
tokenized_test_data = [tokenize_function(example) for example in test_data]
#tokenized_test_data = [example for i, example in enumerate(tokenized_test_data) if i % 100 == 0]

In [74]:
print(f"Shapes of the datasets:")
print(f"Number of examples in the training dataset: {len(tokenized_train_data)}")
print(f"Number of examples in the test dataset: {len(tokenized_test_data)}")

Shapes of the datasets:
Number of examples in the training dataset: 9
Number of examples in the test dataset: 4
DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'answer', 'input_ids', 'labels'],
        num_rows: 1
    })
})


In [78]:
from accelerate import Accelerator, DataLoaderConfiguration

dataloader_config = DataLoaderConfiguration(
    dispatch_batches=None,
    split_batches=False,
    even_batches=True,
    use_seedable_sampler=True
)

accelerator = Accelerator(dataloader_config=dataloader_config)

In [79]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model = model,  # Altere para o seu modelo original
    args = training_args,
    train_dataset = tokenized_train_data
)

In [80]:
trainer.train()

RuntimeError: stack expects each tensor to be equal size, but got [43, 512] at entry 0 and [52, 512] at entry 1