In [277]:
import json
from datasets import Dataset
from pathlib import Path
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [278]:
path = Path("treino_ia.json")
with open(path, "rb") as arq:
    leitura = arq.read()

In [279]:
json_de_treino = json.loads(leitura)
dataset = Dataset.from_list(json_de_treino)

In [280]:
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
def tokenize(example):
    return tokenizer(example["input"], truncation=True, padding="max_length", max_length=128)
tokenized_dataset = dataset.map(tokenize)

Map: 100%|██████████| 122/122 [00:00<00:00, 4356.31 examples/s]


In [None]:
label2id = {label: idx for idx, label in enumerate(set(dataset["label"]))}
id2label = {v: k for k, v in label2id.items()}

In [282]:
def encode_label(example):
    example["label"] = label2id[example["label"]]
    return example

tokenized_dataset = tokenized_dataset.map(encode_label)

Map: 100%|██████████| 122/122 [00:00<00:00, 15246.11 examples/s]


In [283]:
model = AutoModelForSequenceClassification.from_pretrained(
    "neuralmind/bert-base-portuguese-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [284]:
training_args = TrainingArguments(
    output_dir="./resultados",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [285]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # se quiser, pode separar
)

In [286]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=80, training_loss=3.0747629165649415, metrics={'train_runtime': 137.712, 'train_samples_per_second': 4.43, 'train_steps_per_second': 0.581, 'total_flos': 40132721948160.0, 'train_loss': 3.0747629165649415, 'epoch': 5.0})

In [492]:
entrada = "Quero um personagem difícil"
tokens = tokenizer(entrada, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
output = model(**tokens)
pred_id = output.logits.argmax(dim=1).item()
print(f"Recomendação: {id2label[pred_id]}")

Recomendação: Juri
