In [None]:
# Directory that holds the train/test CSV files read by this notebook.
src_path = "dataset_path/"
# Location of the pretrained checkpoint used for both tokenizer and model.
model_path = "model_path/"
# Directory passed to the training arguments as the output location.
output_path = "output_path/"

In [None]:
from datasets import load_dataset

# Both splits live as CSV files directly under `src_path`.
train_file = f"{src_path}train.csv"
test_file = f"{src_path}test.csv"

# Map each split name to its file so both are loaded into one dataset object.
data_files = dict(train=train_file, test=test_file)
dataset = load_dataset("csv", data_files=data_files)

# Last expression of the cell: show the loaded splits.
dataset

In [None]:
from emoji import demojize
from transformers import AutoTokenizer



def _demojize_pt(text):
    """Run `emoji.demojize` on ``text`` with the language fixed to Portuguese."""
    return demojize(text, language='pt')


# Load the tokenizer shipped with the checkpoint; casing is kept as-is.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    do_lower_case=False,
    normalization=True,
)

# NOTE(review): presumably the tokenizer calls `demojizer` during its
# normalization step (BERTweet-style) — confirm for the checkpoint in use.
tokenizer.demojizer = _demojize_pt

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` column of a batch with truncation enabled.

    Args:
        examples: Batch of dataset rows; only ``examples["text"]`` is read.

    Returns:
        Whatever the tokenizer call returns for that batch of texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Apply the tokenizer to every split, processing rows in batches.
tokenized = dataset.map(
    preprocess_function,
    batched=True,
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer, so examples are padded when
# batches are assembled rather than at tokenization time.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [None]:
import numpy as np
from datasets import load_metric

# Load the four evaluation metrics in one pass, in the same order as before.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm the pinned version.
accuracy, f1, recall, precision = (
    load_metric(metric_name)
    for metric_name in ("accuracy", "f1", "recall", "precision")
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1/recall/precision for evaluation.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of class scores per example; ``labels`` holds the reference
            class ids.

    Returns:
        dict mapping ``"accuracy"``, ``"f1"``, ``"recall"`` and
        ``"precision"`` to plain float scores.
    """
    scores, labels = eval_pred
    # Pick the highest-scoring class for every example.
    predictions = np.argmax(scores, axis=1)

    def _score(metric, key, **kwargs):
        # `metric.compute` returns a one-entry dict such as {"f1": 0.83}.
        # Unwrap it so the caller receives a scalar instead of a nested dict,
        # which the Trainer cannot log or compare as a metric value.
        result = metric.compute(
            predictions=predictions, references=labels, **kwargs
        )
        return float(result[key])

    return {
        "accuracy": _score(accuracy, "accuracy"),
        "f1": _score(f1, "f1", average='macro'),
        "recall": _score(recall, "recall", average='macro'),
        "precision": _score(precision, "precision", average='macro'),
    }


In [None]:
# Class id -> human-readable label for the three target classes.
id2label = {
    0: "neutro",
    1: "ofensivo",
    2: "discursOdio",
}

# Inverse mapping (label -> class id), derived so the two cannot drift apart.
label2id = {label: class_id for class_id, label in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a 3-class classification head and attach the
# label mappings so they are stored in the model configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)


In [None]:
training_args = TrainingArguments(
    # Where the run writes its outputs.
    output_dir=output_path,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, retaining at most two checkpoints.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
)

# Wire the model, data, tokenizer, collator and metric function together.
# The "test" split doubles as the evaluation set during training.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Start fine-tuning; the return value is shown as the cell output.
trainer.train()