In [None]:
import pandas as pd

# Index of the pre-generated train/test split to load.
dataset_number = 1

# Corpus to use; must be one of the keys of DATASETS below.
dataset_name = 'Lacafe'

# Per-corpus settings: the text column, the label column, and the train/test
# CSV path templates ({n} is replaced by dataset_number). Switching corpus is a
# one-line change of dataset_name instead of (un)commenting parallel blocks.
DATASETS = {
    'Lacafe': {
        'text': 'txt',
        'label': 'has_anger',
        'train': './Datasets/Lacafe/df_dataset_train_{n}.csv',
        'test': './Datasets/Lacafe/df_dataset_test_{n}.csv',
    },
    # Fortuna (Updated)
    'FortunaUpdated': {
        'text': 'text',
        'label': 'hatespeech_comb',
        'train': './Datasets/FortunaUpdated/2019-05-28_portuguese_hate_speech_binary_classification_train_{n}.csv',
        'test': './Datasets/FortunaUpdated/2019-05-28_portuguese_hate_speech_binary_classification_test_{n}.csv',
    },
    'OffComBR': {
        'text': 'text',
        'label': 'offensive',
        'train': './Datasets/OffComBR/OffComBR2_train_{n}.csv',
        'test': './Datasets/OffComBR/OffComBR2_test_{n}.csv',
    },
    'HateBR': {
        'text': 'instagram_comments',
        'label': 'offensive_language',
        'train': './Datasets/HateBR/HateBR_train_{n}.csv',
        'test': './Datasets/HateBR/HateBR_test_{n}.csv',
    },
}

# Fail early with a clear message rather than a bare KeyError on a typo.
if dataset_name not in DATASETS:
    raise ValueError(
        f'Unknown dataset_name {dataset_name!r}; expected one of {sorted(DATASETS)}'
    )
dataset_config = DATASETS[dataset_name]

# Names consumed by the following cells.
text = dataset_config['text']
label = dataset_config['label']
train_data = pd.read_csv(dataset_config['train'].format(n=dataset_number))
test_data = pd.read_csv(dataset_config['test'].format(n=dataset_number))

In [None]:
from datasets import Dataset, DatasetDict

# Wrap both splits in a DatasetDict without rebinding `train_data` /
# `test_data`: they stay DataFrames, so re-running this cell never hands an
# already-converted Dataset back to `Dataset.from_pandas`.
raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(train_data),
    'test': Dataset.from_pandas(test_data),
})

# Only the last expression of a cell is rendered, so show the DatasetDict,
# which contains both splits.
raw_datasets

In [None]:
# Basic data handling: pull the text and label columns out of each split

train_split = raw_datasets['train']
test_split = raw_datasets['test']

train_texts, train_labels = train_split[text], train_split[label]
test_texts, test_labels = test_split[text], test_split[label]

# Report how many entries each extracted column holds.
for caption, column in (
    ("TrainTexts Length: ", train_texts),
    ("TrainLabels Length: ", train_labels),
    ("TestTexts Length: ", test_texts),
    ("TestLabels Length: ", test_labels),
):
    print(caption, len(column))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training split for validation, stratified on the label.
# The inputs are read from `raw_datasets` (not from `train_texts`, which this
# cell overwrites) so that re-running the cell does not split an already
# reduced training set again, and `random_state` is fixed so the same
# partition is produced on every run.
full_train_texts = raw_datasets['train'][text]
full_train_labels = raw_datasets['train'][label]

train_texts, val_texts, train_labels, val_labels = train_test_split(
    full_train_texts,
    full_train_labels,
    test_size=.1,
    stratify=full_train_labels,
    random_state=42,
)

In [None]:
# Sizes of the training and validation partitions produced by the split.
for caption, column in (
    ("TrainTexts Length: ", train_texts),
    ("TrainLabels Length: ", train_labels),
    ("ValidationTexts Length: ", val_texts),
    ("ValidationLabels Length: ", val_labels),
):
    print(caption, len(column))

In [None]:
# Class balance per partition. Summing the labels counts the positives
# (assumes labels are 0/1 integers -- TODO confirm against the CSV columns).
train_positives = sum(train_labels)
val_positives = sum(val_labels)

print('Quantidade classes positivas (treino): ', train_positives)
print('Quantidade classes negativas (treino): ', len(train_labels) - train_positives)
print('Quantidade classes positivas (validação): ', val_positives)
print('Quantidade classes negativas (validação): ', len(val_labels) - val_positives)

In [None]:
# Load the tokenizer

from transformers import AutoTokenizer

# Checkpoint whose tokenizer is loaded. Other checkpoints that can be
# swapped in here:
#   'bert-base-multilingual-cased'
#   'dccuchile/bert-base-spanish-wwm-cased'
tokenizer_checkpoint = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Tokenize the datasets

# Options shared by all three splits: truncate to at most 512 tokens and
# pad within each call.
encode_options = {'truncation': True, 'padding': True, 'max_length': 512}

train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

In [None]:
# Wrap the tokenized splits back into instances of a Dataset class

import torch

class HSDataset(torch.utils.data.Dataset):
    """Torch dataset pairing per-example encodings with their labels.

    Args:
        encodings: Object exposing ``.items()`` that yields
            ``(field name, per-example sequence)`` pairs; each sequence is
            indexed by example position. In this notebook it is the
            tokenizer output.
        labels: Indexable, sized collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        The dict has one entry per encoding field plus a ``'labels'`` entry.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Build one HSDataset per split, pairing each split's encodings with its labels.
train_dataset, val_dataset, test_dataset = (
    HSDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the binary sequence classifier from the same checkpoint the tokenizer
# was loaded from. Reading the name from `tokenizer.name_or_path` instead of
# repeating the literal means switching checkpoints in the tokenizer cell
# cannot leave the model on a different, mismatched vocabulary.
model = AutoModelForSequenceClassification.from_pretrained(
    tokenizer.name_or_path,
    num_labels=2,
)

In [None]:
from transformers import TrainingArguments

# Training configuration: 5 epochs, batches of 4 per device for both training
# and evaluation, with evaluation set to the "epoch" strategy. Outputs go to
# the "test_trainer" directory.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
)

In [None]:
import numpy as np
from datasets import load_metric

# Load the four metric objects used by compute_metrics.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package -- confirm against the
# installed version before upgrading.
metric_acc, metric_f1, metric_precision, metric_recall = (
    load_metric(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def cma(labels, predictions):
    """Count confusion-matrix cells for binary labels, with 1 as the positive class.

    Args:
        labels: Indexable, sized collection of reference labels.
        predictions: Indexable collection of predicted labels, read at the
            same positions as ``labels``.

    Returns:
        Dict with integer counts under the keys ``'true_positive'``,
        ``'true_negative'``, ``'false_positive'`` and ``'false_negative'``.
        A matching pair counts as a true positive only when the label is 1;
        a mismatching pair counts as a false positive only when the label is 0.
    """
    tp = tn = fp = fn = 0
    for position in range(len(labels)):
        truth = labels[position]
        if truth == predictions[position]:
            if truth == 1:
                tp += 1
            else:
                tn += 1
        elif truth == 0:
            # Reference is negative but the prediction disagrees.
            fp += 1
        else:
            fn += 1
    return {
        'true_positive': tp,
        'true_negative': tn,
        'false_positive': fp,
        'false_negative': fn,
    }

def compute_metrics(eval_pred):
    """Turn raw model outputs into the evaluation metrics for one pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``; presumably
            the object the Trainer passes to this callback.

    Returns:
        Dict with ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``
        scaled by 100, ``'total'`` (number of predictions) and ``'cm'`` (the
        confusion-matrix counts from ``cma``).
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predictions = np.argmax(logits, axis=-1)

    # Each scorer reports its value under its own metric name.
    scorers = (
        ('accuracy', metric_acc),
        ('precision', metric_precision),
        ('recall', metric_recall),
        ('f1', metric_f1),
    )
    result = {
        metric_name: scorer.compute(predictions=predictions, references=labels)[metric_name] * 100
        for metric_name, scorer in scorers
    }
    result['total'] = len(predictions)
    result['cm'] = cma(labels, predictions)
    return result

In [None]:
from transformers import Trainer

# Fine-tune with the Hugging Face Trainer, using the validation split as the
# evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

# Final evaluation on the test split; as the last expression of the cell the
# returned metrics are displayed.
test_metrics = trainer.evaluate(test_dataset)
test_metrics