In [None]:
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
from datasets import load_metric
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import LlamaForSequenceClassification
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

In [None]:
# --- Run configuration: placeholder values, fill in before running ---
src_path = "dataset_path/"        # folder holding the {data}_*.csv split files
model_save_path = "model_path/"   # handed to TrainingArguments as output_dir
output_path = "output_path/"      # folder reserved for generated outputs

# Corpus to train on; options: hatebr, toldbr, olidbr
data = ""

In [None]:
# Checkpoint id to fine-tune (placeholder). Options:
#   PORTULAN/gervasio-7b-portuguese-ptbr-decoder
#   maritaca-ai/sabia-7b
model = ""

In [None]:
# Resolve the CSV file of each split for the selected corpus.
train_file = f"{src_path}/{data}_train_balanced.csv"
val_file = f"{src_path}/{data}_val.csv"
test_file = f"{src_path}/{data}_test.csv"

# Load the three CSVs into one dataset object keyed by split name.
split_files = {"train": train_file, "validation": val_file, "test": test_file}
dataset = load_dataset("csv", data_files=split_files)

# Bare last expression: lets the notebook display the loaded splits.
dataset

In [None]:
# Load the tokenizer that matches the chosen checkpoint, keeping the original
# casing and asking for the non-fast implementation.
# NOTE(review): padding/truncation/max_length are given at load time here; the
# cells that actually tokenize pass their own truncation settings again —
# confirm these load-time values have any effect.
tokenizer = AutoTokenizer.from_pretrained(
    model,
    do_lower_case=False,
    use_fast=False,
    padding=True,
    truncation=True,
    max_length=512,
)

In [None]:
def preprocess_function(examples):
  """Tokenize the ``"text"`` entry of a batch of examples.

  Args:
    examples: Mapping whose ``"text"`` entry holds the raw inputs.

  Returns:
    The result of calling the module-level ``tokenizer`` on those inputs with
    truncation enabled and a 512-token cap. No padding is requested here.
  """
  raw_texts = examples["text"]
  encoded = tokenizer(raw_texts, truncation=True, max_length=512)
  return encoded


# Tokenize every split, handing preprocess_function whole batches of rows.
tokenized = dataset.map(function=preprocess_function, batched=True)

In [None]:
# Collator built around the tokenizer; it is given to the Trainer so that
# padding happens when batches are assembled rather than at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Load the four metric objects that compute_metrics relies on, in the same
# order as before: accuracy, f1, recall, precision.
accuracy, f1, recall, precision = (
    load_metric(metric_name)
    for metric_name in ("accuracy", "f1", "recall", "precision")
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 / recall / precision.

    This function is handed to the Trainer as its ``compute_metrics`` hook.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of class scores per example; the predicted class is the
            argmax over axis 1.

    Returns:
        dict mapping ``"accuracy"``, ``"f1"``, ``"recall"`` and
        ``"precision"`` to plain scalar values.
    """
    class_scores, labels = eval_pred
    predicted = np.argmax(class_scores, axis=1)

    def _scalar(metric, key, **compute_kwargs):
        # Each metric's compute() returns a one-entry dict such as
        # {"accuracy": 0.9}. Unwrap it so the Trainer receives scalars rather
        # than nested dicts (nested values cannot be logged or used for
        # best-model selection).
        result = metric.compute(
            predictions=predicted, references=labels, **compute_kwargs
        )
        return result[key]

    return {
        "accuracy": _scalar(accuracy, "accuracy"),
        "f1": _scalar(f1, "f1", average='macro'),
        "recall": _scalar(recall, "recall", average='macro'),
        "precision": _scalar(precision, "precision", average='macro'),
    }

In [None]:
# Class id -> class name for the three target classes.
id2label = {
    0: "neutro",
    1: "ofensivo",
    2: "discursOdio",
}

# Reverse lookup (class name -> class id), derived from the table above.
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# LoRA adapter settings for the sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [None]:
# `model` starts out as the checkpoint id (a string) and is rebound below to
# the loaded network. Resolve the id first so this cell can be re-run: on a
# second run `model` is already the network, and its config carries the id it
# was loaded from.
_checkpoint_id = model if isinstance(model, str) else model.config.name_or_path

# Three-class classifier; the label tables are stored in the model config.
model = LlamaForSequenceClassification.from_pretrained(
    _checkpoint_id,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
    device_map="auto",
)

In [None]:
# Wrap the loaded classifier with the LoRA adapters described by lora_config.
model_peft = get_peft_model(model=model, peft_config=lora_config)

In [None]:
# Report the trainable-parameter summary of the LoRA-wrapped model.
model_peft.print_trainable_parameters()

In [None]:
# Shell escape: show the GPU status after the model has been loaded.
!nvidia-smi

In [None]:
# Reuse the end-of-sequence id as the padding id, on the tokenizer (used by the
# padding collator) and on the model config alike.
tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id

In [None]:
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir=model_save_path,
    # Optimisation settings.
    optim="adamw_torch",
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_checkpointing=True,
    # Evaluate and checkpoint once per epoch, keep a single checkpoint, and
    # restore the one with the best evaluation loss when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Stop training after 3 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Train the LoRA-wrapped model on the train split, evaluating on validation.
trainer = Trainer(
    model=model_peft,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Launch fine-tuning.
trainer.train()

In [None]:
class TestDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable of per-example
            values (tensors or plain sequences).
        labels: Indexable of per-example labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, label under ``'labels'``."""
        # torch.as_tensor passes existing tensors through as-is and converts
        # plain Python values. torch.tensor() on an existing tensor
        # copy-constructs it and emits a UserWarning for every field of every
        # item fetched.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Raw test inputs and their gold labels, materialised as plain lists.
texts = list(dataset['test']['text'])
labels = list(dataset['test']['label'])

# Use the same 512-token cap as preprocess_function so that test inputs are
# truncated exactly like the train/validation inputs; a smaller cap here would
# evaluate the model on shorter inputs than it was trained and validated on.
encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

# Create a test dataset
test_dataset = TestDataset(encodings, labels)

In [None]:
# Evaluate the trained model on the held-out test split and show the result.
test_metrics = trainer.evaluate(test_dataset)
print("Test Metrics:", test_metrics, sep="\n")

In [None]:
# Run inference on the test split and fetch the gold labels to compare against.
test_predictions = trainer.predict(test_dataset)
test_labels = dataset['test']['label']

# Predicted class = argmax over the class scores of each example.
predicted_classes = test_predictions.predictions.argmax(axis=1)
confusion_mat = confusion_matrix(test_labels, predicted_classes)
print("Test Confusion Matrix:", confusion_mat, sep="\n")

In [None]:
# One row per test example: the predicted class id (argmax over class scores).
respostas = pd.DataFrame({
    "predictions": test_predictions.predictions.argmax(axis=1),
})

# `model` is the checkpoint id (a string) at the top of the notebook but is
# rebound to the loaded network before this cell runs, so `model + ".csv"`
# raised TypeError. Recover the id in either case.
checkpoint_id = model if isinstance(model, str) else model.config.name_or_path

# Checkpoint ids look like "org/name"; flatten the separators so the result is
# a single file name instead of a path into a directory that may not exist.
csv_name = checkpoint_id.strip("/").replace("/", "_").replace(os.sep, "_") + ".csv"

# Write the predictions into the configured output folder, creating it if needed.
os.makedirs(output_path, exist_ok=True)
respostas.to_csv(os.path.join(output_path, csv_name), index=False)