In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and model paths used in this
# notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Train for multilabel classification

In [None]:
# Read the unified corpus from Drive.
corpus_path = "/content/drive/MyDrive/toxic/unified-hate.csv"
df = pd.read_csv(corpus_path)

# Every column except "text" is treated as a label column; the resulting matrix
# keeps the CSV's column order.
labels = df.drop("text", axis=1).values
texts = df["text"].tolist()

In [None]:
# Tokenizer for the same pretrained checkpoint id that the classifier is loaded from.
tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-large-portuguese-cased")

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for multi-label training.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection whose ``idx``-th entry is the label vector
            of ``texts[idx]``; it is converted to a float tensor.
        tokenizer: Callable invoked with ``padding='max_length'``,
            ``truncation=True`` and ``return_tensors="pt"``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for item ``idx``."""
        text = self.texts[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt")
        # Only the leading batch axis (size 1, one text per call) is removed. A bare
        # squeeze() would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

In [None]:
# 80/20 train/validation split with a fixed seed so the split is repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a TweetDataset (default max_length).
val_dataset = TweetDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)
train_dataset = TweetDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)

In [None]:
# The classification head needs one output per label column. Derive the count from
# the label matrix (every df column except "text") instead of hard-coding it, so
# the head cannot drift from the CSV schema.
model = BertForSequenceClassification.from_pretrained("neuralmind/bert-large-portuguese-cased",
                                                      num_labels=labels.shape[1],
                                                      problem_type="multi_label_classification")


In [None]:
# Per-label positive-class weights for BCEWithLogitsLoss.
#
# The weight vector is applied position-wise to the logits, so the columns must
# be taken in the same order as `labels` (every df column except "text") rather
# than from a separately hard-coded list.
class_columns = df.columns.drop("text").tolist()
class_labels = df[class_columns]

# Number of positive examples for each label.
num_examples_per_class = torch.tensor(class_labels.sum().values, dtype=torch.float)

# Total number of rows. This must be the dataset size, not the sum of positives
# across labels, so that (total - positives) is the per-label negative count.
total_examples = torch.tensor(float(len(class_labels)))

# pos_weight = negatives / positives for each label. The clamp avoids a division
# by zero for a label that has no positive example.
pos_weight = (total_examples - num_examples_per_class) / num_examples_per_class.clamp(min=1.0)

In [None]:
def compute_metrics(pred):
    """Macro-averaged precision, recall and F1 for multi-label predictions.

    Args:
        pred: Pair ``(logits, labels)``. Logits are passed through a sigmoid and
            a label is predicted positive when its probability exceeds 0.6.

    Returns:
        Dict with the keys ``precision``, ``recall`` and ``f1``.
    """
    raw_logits, gold = pred
    probabilities = torch.sigmoid(torch.tensor(raw_logits))
    decisions = (probabilities > 0.6).int().numpy()
    scores = precision_recall_fscore_support(gold, decisions, average="macro", zero_division=0)
    # scores is (precision, recall, f1, support); support is not reported.
    return dict(zip(("precision", "recall", "f1"), scores[:3]))

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is BCE-with-logits weighted by the module-level ``pos_weight``."""

    def compute_loss(self, model, inputs, return_outputs=False, *args, **kwargs):
        """Compute the class-weighted multi-label loss for one batch.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose ``.logits``.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
            *args, **kwargs: Accepted and ignored, so extra arguments passed by the
                caller do not break the override.

        Returns:
            The loss tensor, or ``(loss, model_outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels").float()
        outputs = model(**inputs)
        logits = outputs.logits

        pos_weight_tensor = pos_weight.to(logits.device)

        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
        loss = loss_fct(logits, labels)

        # Return the full model output rather than the bare logits tensor. Trainer's
        # evaluation path treats a non-dict second element as a sequence whose first
        # entry is the loss and slices it with [1:]. On a raw logits tensor that
        # drops the first sample of every batch.
        return (loss, outputs) if return_outputs else loss

In [None]:
# Training configuration for the multi-label run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint on the same (per-epoch) schedule, then reload the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
)

In [None]:
# Wire the model, data splits, and metric function into the weighted-loss trainer.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Fine-tune the multi-label classifier with the trainer configured above.
trainer.train()

In [None]:
# Evaluate on the trainer's eval dataset and keep the returned metrics.
eval_results = trainer.evaluate()

In [None]:
# Run inference on the validation split and print a per-label report.
test_predictions = trainer.predict(val_dataset)
logits = test_predictions.predictions
labels = test_predictions.label_ids

# Same decision rule as compute_metrics: sigmoid probability above 0.6.
probs = torch.sigmoid(torch.tensor(logits))
preds = (probs > 0.6).int().numpy()

# Label names come from the df column order (every column except "text"), which is
# the order the label matrix was built in. A hand-written list can silently attach
# the wrong name to a column.
print(classification_report(labels, preds,
                            target_names=df.columns.drop("text").tolist(),
                            zero_division=0))


In [None]:
# The trainer was built without a tokenizer, so save_model() stores only the
# model. Save the tokenizer to the same directory so that
# BertTokenizer.from_pretrained(<this directory>) works when the model is reloaded.
trainer.save_model("/content/drive/MyDrive/toxic/bert-large-toxic-classifier")
tokenizer.save_pretrained("/content/drive/MyDrive/toxic/bert-large-toxic-classifier")

In [None]:
# --- Inference configuration -------------------------------------------------
model_path = "/content/drive/MyDrive/toxic/bert-large-toxic-classifier"
# NOTE(review): both paths are empty placeholders and must be filled in before running.
csv_file = ""
output_csv = ""

# NOTE(review): this order must match the label-column order used at training time - TODO confirm.
label_names = ["lgbtqphobia", "obscene", "insult", "racism", "sexism", "xenophobia", "ideology"]

# --- Model and tokenizer -----------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

# --- Input data --------------------------------------------------------------
df = pd.read_csv(csv_file)

def classify_text(text):
    """Return the per-label 0/1 decisions for a single text.

    The text is tokenized (truncated to 512 tokens), run through the module-level
    ``model`` on ``device`` without gradients, and each label is marked 1 when
    its sigmoid probability exceeds 0.6.

    Args:
        text: The text to classify.

    Returns:
        List of 0/1 ints, one per model output.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        raw_logits = model(**encoded).logits

    probabilities = torch.sigmoid(raw_logits.squeeze().cpu())
    return (probabilities > 0.6).int().numpy().tolist()

# Missing comments are read by pandas as NaN (a float), which the tokenizer cannot
# handle. Classify them as empty strings instead of crashing mid-run.
df["predictions"] = df["comment"].fillna("").astype(str).apply(classify_text)

# One column per label. Reusing df's index keeps the rows aligned in the concat
# below even if df does not have a default 0..n-1 index.
predictions_df = pd.DataFrame(df["predictions"].to_list(), columns=label_names, index=df.index)

df = pd.concat([df.drop(columns=["predictions"]), predictions_df], axis=1)

df.to_csv(output_csv, index=False)

In [None]:
# Saved multi-label classifier to evaluate.
model_path = "/content/drive/MyDrive/toxic/bert-large-toxic-classifier"

# Reload the corpus; every column except "text" is a label column.
df = pd.read_csv("/content/drive/MyDrive/toxic/unified-hate.csv")
label_names = df.columns.drop("text").tolist()
labels = df.drop(columns=["text"]).values
texts = df["text"].tolist()

# Load tokenizer and model, and put the model on the available device in eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()
model.to(device)

class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with float labels.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection whose ``idx``-th entry is the label vector
            of ``texts[idx]``; it is converted to a float tensor.
        tokenizer: Callable invoked with ``padding='max_length'``,
            ``truncation=True`` and ``return_tensors="pt"``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for item ``idx``."""
        text = self.texts[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt")
        # Only the leading batch axis (size 1, one text per call) is removed. A bare
        # squeeze() would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

# Same split parameters (test_size=0.2, random_state=42) as the training cell.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
val_dataset = TweetDataset(val_texts, val_labels, tokenizer)

batch_size = 32
all_preds = []
all_true = []

# Batched inference over the raw validation texts; gradients are not needed.
with torch.no_grad():
    for start in range(0, len(val_texts), batch_size):
        stop = start + batch_size
        encodings = tokenizer(
            val_texts[start:stop],
            truncation=True,
            padding=True,
            return_tensors="pt",
            max_length=128,
        )

        outputs = model(
            input_ids=encodings["input_ids"].to(device),
            attention_mask=encodings["attention_mask"].to(device),
        )

        # A label is predicted positive when its sigmoid probability exceeds 0.6.
        decisions = torch.sigmoid(outputs.logits) > 0.6
        all_preds.append(decisions.int().cpu().numpy())
        all_true.append(val_labels[start:stop])

# Stack the per-batch arrays into (num_examples, num_labels) matrices.
y_pred = np.concatenate(all_preds, axis=0)
y_true = np.concatenate(all_true, axis=0)

# One classification report per label column.
for column, label in enumerate(label_names):
    print(f"\nMétricas para: {label}")
    print(classification_report(y_true[:, column], y_pred[:, column], zero_division=0))


# Train for binary classification

In [None]:
# Load the binary corpus: comment text in "comentario", target in "label_final".
df = pd.read_csv("/content/drive/MyDrive/toxic/HateBR.csv")
texts, labels = (df[column].tolist() for column in ("comentario", "label_final"))

In [None]:
# Tokenizer for the same pretrained checkpoint id that the binary classifier is loaded from.
tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-large-portuguese-cased")

In [None]:
class BinaryTweetDataset(Dataset):
    """Map-style dataset that tokenizes one text per item with a single integer label.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of class ids; each is converted to a
            ``torch.long`` scalar tensor.
        tokenizer: Callable invoked with ``padding='max_length'``,
            ``truncation=True`` and ``return_tensors="pt"``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and the long ``labels`` scalar for item ``idx``."""
        text = self.texts[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt")
        # Only the leading batch axis (size 1, one text per call) is removed. A bare
        # squeeze() would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': label
        }

In [None]:
# 80/20 train/validation split with a fixed seed so the split is repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a BinaryTweetDataset (default max_length).
val_dataset = BinaryTweetDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)
train_dataset = BinaryTweetDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)

In [None]:
# Pretrained checkpoint with a two-output classification head
# (the evaluation report names the classes "Não Ofensivo" / "Ofensivo").
model = BertForSequenceClassification.from_pretrained(
    "neuralmind/bert-large-portuguese-cased",
    num_labels=2)

In [None]:
def compute_metrics(pred):
    """Accuracy plus binary precision, recall and F1 for argmax predictions.

    Args:
        pred: Pair ``(logits, labels)``; the predicted class is the argmax over
            the last logits axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    raw_logits, gold = pred
    predicted = torch.tensor(raw_logits).argmax(dim=-1).numpy()

    metrics = {"accuracy": accuracy_score(gold, predicted)}
    prf = precision_recall_fscore_support(gold, predicted, average="binary", zero_division=0)
    # prf is (precision, recall, f1, support); support is not reported.
    metrics.update(zip(("precision", "recall", "f1"), prf[:3]))
    return metrics

In [None]:
# Training configuration for the binary run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./binary_results",
    logging_dir="./binary_logs",
    logging_steps=10,
    # Evaluate and checkpoint on the same (per-epoch) schedule, then reload the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    num_train_epochs=5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
)


# Stock Trainer (the model's built-in loss is used) for the binary classifier.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Fine-tune the binary classifier with the trainer configured above.
trainer.train()

In [None]:
# Persist both the model and the tokenizer to the same Drive directory so the
# pair can later be reloaded with from_pretrained() from that single path.
model.save_pretrained("/content/drive/MyDrive/toxic/binary_model")
tokenizer.save_pretrained("/content/drive/MyDrive/toxic/binary_model")

In [None]:
# Reload the saved binary model and tokenizer from Drive.
model_path = "/content/drive/MyDrive/toxic/binary_model"

tokenizer = BertTokenizer.from_pretrained(model_path)
# NOTE: the freshly loaded model is not moved to any device in this cell.
model = BertForSequenceClassification.from_pretrained(model_path)

# Switch to inference mode.
model.eval()

In [None]:
# Directory the binary model and tokenizer were saved to.
model_path = "/content/drive/MyDrive/toxic/binary_model"

In [None]:
def predict(text):
    """Return the predicted class id for a single text.

    The text is tokenized (truncated to 128 tokens) and passed through the
    module-level ``model`` without gradients. The result is the argmax over
    the logits.

    Args:
        text: The text to classify.

    Returns:
        The predicted class index as a Python int.
    """
    encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Send the inputs to whichever device the model currently lives on. Without
    # this the call fails with a device mismatch as soon as the model is on GPU.
    encoding = {key: value.to(model.device) for key, value in encoding.items()}

    with torch.no_grad():
        output = model(**encoding)

    prediction = torch.argmax(output.logits, dim=-1).item()

    return prediction


In [None]:
# Predict on the validation split with the trainer and print a two-class report.
test_predictions = trainer.predict(val_dataset)
logits, labels = test_predictions.predictions, test_predictions.label_ids

# Predicted class = index of the largest logit.
preds = torch.tensor(logits).argmax(dim=-1).numpy()

print(classification_report(labels, preds, target_names=["Não Ofensivo", "Ofensivo"]))

In [None]:
# --- Paths -------------------------------------------------------------------
model_path = "/content/drive/MyDrive/toxic/binary_model"
input_file = "/content/classify_bolsonaro.csv"
output_file = "/content/drive/MyDrive/toxic/classified_bolsonaro.csv"

# --- Model and tokenizer, placed on the available device in eval mode ---------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()
model.to(device)

def predict(text):
    """Return the predicted class id for a single text.

    The text is tokenized (truncated to 128 tokens), moved to ``device`` and
    passed through the module-level ``model`` without gradients.

    Args:
        text: The text to classify.

    Returns:
        The argmax class index as a Python int.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        class_logits = model(**on_device).logits

    return torch.argmax(class_logits, dim=-1).item()


# Only the first 100,000 rows of the input file are classified here.
df = pd.read_csv(input_file, nrows=100_000)

# Missing comments become empty strings so every row can be tokenized.
df = df.fillna({"comment": ""})

# One forward pass per comment; the predicted class id goes in "offensive".
df["offensive"] = df["comment"].map(predict)

df.to_csv(output_file, index=False)


In [None]:
output_sampled_file = "/content/drive/MyDrive/toxic/offensive_sample_bolsonaro.csv"

# Keep only the rows predicted as offensive (class 1).
df = df[df["offensive"] == 1]

# Sampling without replacement raises ValueError when n exceeds the number of
# rows, and the filtered frame can never hold more than the rows that were
# classified. Cap n at the population size so the cell takes "up to 100,000".
df_sampled = df.sample(n=min(100_000, len(df)), random_state=42)

df_sampled.to_csv(output_sampled_file, index=False)

In [None]:
# --- Paths -------------------------------------------------------------------
model_path = "/content/drive/MyDrive/toxic/binary_model"
input_file = "/content/drive/MyDrive/toxic/BOLSONARO.csv"
output_file = "/content/drive/MyDrive/toxic/BOLSONARO_classified.csv"

# --- Model and tokenizer, placed on the available device in eval mode ---------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()
model.to(device)

class CommentDataset(Dataset):
    """Label-free map-style dataset that tokenizes one text per item.

    Args:
        texts: Indexable collection of texts; each item is passed through ``str()``.
        tokenizer: Callable invoked with ``padding='max_length'``,
            ``truncation=True`` and ``return_tensors='pt'``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.texts = texts

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return every tensor the tokenizer produced, without its leading batch axis."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        return item

df = pd.read_csv(input_file)

# Missing texts are read as NaN, and CommentDataset's str() would turn them into
# the literal string "nan", which then gets classified as if it were a comment.
# Feed an empty string instead, as the single-text cell does with fillna("").
dataset = CommentDataset(df["text"].fillna("").tolist(), tokenizer, max_length=128)
# shuffle=False keeps the predictions in the same order as the rows of df.
loader = DataLoader(dataset, batch_size=32, shuffle=False)

predictions = []

with torch.no_grad():
    for batch in tqdm(loader, desc="Classificando"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=1)
        # Store plain Python ints rather than NumPy scalars.
        predictions.extend(preds.cpu().tolist())

df["offensive"] = predictions

df.to_csv(output_file, index=False)
