In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import os
from transformers import BertModel, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [None]:
train_df = pd.read_excel("base de dados/Tweets-processado.xlsx")

In [9]:
# Removendo colunas inúteis
train_df.drop(labels=['UserTags', 'sentimento'], axis=1, inplace=True)

# Reorganizando colunas
train_df = train_df[['texto', 'alegria', 'tristeza', 'raiva', 'medo',
                     'nojo', 'surpresa', 'confianca', 'antecipacao']]

target_list = ['alegria', 'tristeza', 'raiva', 'medo',
               'nojo', 'surpresa', 'confianca', 'antecipacao']

In [10]:
# Hiperparâmetros
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 4
LEARNING_RATE = 1e-05

In [11]:
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)

In [12]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['texto']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

In [13]:
# Dividindo corretamente o dataset em conjunto de treinamento e validação
train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=200)

# Resetando o índice dos DataFrames
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

In [14]:
train_data_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [15]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [16]:
device

device(type='cuda')

In [17]:
def load_ckp(checkpoint_fpath, model, optimizer):
    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    try:
        torch.save(state, checkpoint_path)
        if is_best:
            torch.save(state, best_model_path)
    except Exception as e:
        print(f"Erro ao salvar o checkpoint: {e}")

In [18]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        # self.model = AutoModelForPreTraining.from_pretrained('neuralmind/bert-base-portuguese-cased')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 8)

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [19]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [20]:
val_targets=[]
val_outputs=[]

In [21]:
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    try:
        torch.save(state, checkpoint_path)
        if is_best:
            torch.save(state['state_dict'], best_model_path)  # só os pesos
    except Exception as e:
        print(f"Erro ao salvar o checkpoint: {e}")


In [22]:
def train_model(n_epochs, training_loader, validation_loader, model, optimizer, checkpoint_path, best_model_path):

    # Verificar se o diretório existe
    if not os.path.exists(os.path.dirname(checkpoint_path)):
        os.makedirs(os.path.dirname(checkpoint_path))

    # Inicializar o rastreador para a perda mínima de validação
    valid_loss_min = np.inf

    for epoch in range(1, n_epochs+1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print(f'############# Época {epoch}: Início do Treinamento #############')

        # Armazenar alvos e previsões de treino
        train_targets = []
        train_outputs = []

        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)

            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            # Armazenar alvos e previsões de treino
            train_targets.extend(targets.cpu().detach().numpy().tolist())
            train_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print(f'############# Época {epoch}: Fim do Treinamento #############')

        print(f'############# Época {epoch}: Início da Validação #############')

        model.eval()

        val_targets = []
        val_outputs = []

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)

                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss += loss.item()

                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print(f'############# Época {epoch}: Fim da Validação #############')

        # Calcular perdas médias
        if len(training_loader) > 0:
            train_loss = train_loss / len(training_loader)
        else:
            train_loss = float('inf')

        if len(validation_loader) > 0:
            valid_loss = valid_loss / len(validation_loader)
        else:
            valid_loss = float('inf')

        # Binarizar saídas para cálculos de métricas (assume classificação binária)
        train_outputs = np.array(train_outputs) > 0.5
        val_outputs = np.array(val_outputs) > 0.5

        # Calcular métricas para treino
        train_accuracy = accuracy_score(train_targets, train_outputs)
        train_precision = precision_score(train_targets, train_outputs, zero_division=0, average='micro')
        train_recall = recall_score(train_targets, train_outputs, zero_division=0, average='micro')
        train_f1 = f1_score(train_targets, train_outputs, zero_division=0, average='micro')

        # Calcular métricas para validação
        val_accuracy = accuracy_score(val_targets, val_outputs)
        val_precision = precision_score(val_targets, val_outputs, zero_division=0, average='micro')
        val_recall = recall_score(val_targets, val_outputs, zero_division=0, average='micro')
        val_f1 = f1_score(val_targets, val_outputs, zero_division=0, average='micro')

        # Printar estatísticas de treino/validação e métricas
        print(f'Época: {epoch} \tPerda Média de Treinamento: {train_loss:.6f} \tPerda Média de Validação: {valid_loss:.6f}')
        print(f'Métricas de Treino - Acurácia: {train_accuracy:.4f} \tPrecisão: {train_precision:.4f} \tRecall: {train_recall:.4f} \tF1 Score: {train_f1:.4f}')
        print(f'Métricas de Validação - Acurácia: {val_accuracy:.4f} \tPrecisão: {val_precision:.4f} \tRecall: {val_recall:.4f} \tF1 Score: {val_f1:.4f}')

        # Criar variável de checkpoint e adicionar dados importantes
        checkpoint = {
                'epoch': epoch + 1,
                'valid_loss_min': valid_loss,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
        }

        # Salvar checkpoint
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)

        # Salvar o modelo se a perda de validação tiver diminuído
        if valid_loss <= valid_loss_min:
            print(f'A perda de validação diminuiu ({valid_loss_min:.6f} --> {valid_loss:.6f}). Salvando o modelo ...')
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss

        print(f'############# Época {epoch} Concluída #############\n')

    return model

In [23]:
ckpt_path = 'modelos/checkpoint.pth'
best_model_path = 'modelos/weights.pt'

In [24]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

############# Época 1: Início do Treinamento #############
############# Época 1: Fim do Treinamento #############
############# Época 1: Início da Validação #############
############# Época 1: Fim da Validação #############
Época: 1 	Perda Média de Treinamento: 0.477333 	Perda Média de Validação: 0.367613
Métricas de Treino - Acurácia: 0.0266 	Precisão: 0.4319 	Recall: 0.0581 	F1 Score: 0.1024
Métricas de Validação - Acurácia: 0.1953 	Precisão: 0.9728 	Recall: 0.2686 	F1 Score: 0.4210
A perda de validação diminuiu (inf --> 0.367613). Salvando o modelo ...
############# Época 1 Concluída #############

############# Época 2: Início do Treinamento #############
############# Época 2: Fim do Treinamento #############
############# Época 2: Início da Validação #############
############# Época 2: Fim da Validação #############
Época: 2 	Perda Média de Treinamento: 0.262822 	Perda Média de Validação: 0.181437
Métricas de Treino - Acurácia: 0.5412 	Precisão: 0.9458 	Recall: 0.6407 	F1 Scor

In [None]:
# Carregar os dados de teste
test_df = pd.read_excel("base de dados/FrasesChatgpt.xlsx")

# Remover colunas inúteis dos dados de teste
test_df.drop(labels=['sentimento'], axis=1, inplace=True)

# Reorganizar colunas
test_df = test_df[['texto', 'alegria', 'tristeza', 'raiva', 'medo',
                   'nojo', 'surpresa', 'confianca', 'antecipacao']]

In [25]:
# Criar dataset e dataloader para o conjunto de teste
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
# Função para carregar o melhor modelo salvo
def load_best_model(model, best_model_path):
    model.load_state_dict(torch.load(best_model_path))
    model.to(device)
    model.eval()
    return model

# Carregar o melhor modelo
best_model = BERTClass()
best_model = load_best_model(best_model, best_model_path)
best_model.to(device)

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [27]:
# Avaliar o modelo no conjunto de teste
def evaluate_model(test_loader, model):
    model.eval()
    test_targets = []
    test_outputs = []

    with torch.no_grad():
        for batch_idx, data in enumerate(test_loader):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)

            test_targets.extend(targets.cpu().detach().numpy().tolist())
            test_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

    test_outputs = np.array(test_outputs) > 0.5

    accuracy = accuracy_score(test_targets, test_outputs)
    precision = precision_score(test_targets, test_outputs, zero_division=0, average='micro')
    recall = recall_score(test_targets, test_outputs, zero_division=0, average='micro')
    f1 = f1_score(test_targets, test_outputs, zero_division=0, average='micro')

    print(f'Acurácia no conjunto de teste: {accuracy:.4f}')
    print(f'Precisão no conjunto de teste: {precision:.4f}')
    print(f'Recall no conjunto de teste: {recall:.4f}')
    print(f'F1 Score no conjunto de teste: {f1:.4f}')

In [None]:
evaluate_model(test_data_loader, best_model)

In [29]:
def predict(model, data_loader):
    model.eval()
    outputs = []
    targets = []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            target = batch['targets'].to(device, dtype=torch.float)

            output = model(ids, mask, token_type_ids)
            outputs.extend(torch.sigmoid(output).cpu().detach().numpy().tolist())
            targets.extend(target.cpu().detach().numpy().tolist())

    return outputs, targets

In [30]:
# Realizar previsões no conjunto de dados de teste
test_outputs, test_targets = predict(best_model, test_data_loader)

# Converter as previsões e os rótulos verdadeiros em arrays binários
test_outputs_bin = np.array(test_outputs) > 0.5
test_targets_bin = np.array(test_targets)

# Nomes das colunas de emoção (ajuste conforme necessário)
emotion_columns = ['alegria', 'tristeza', 'raiva', 'medo', 'nojo', 'surpresa', 'confianca', 'antecipacao']

# Converter previsões e rótulos verdadeiros para DataFrames
df_prediction = pd.DataFrame(test_outputs_bin, columns=emotion_columns)
df_true_labels = pd.DataFrame(test_targets_bin, columns=emotion_columns)

# Calcular métricas para cada emoção
for column in emotion_columns:
    accuracy = accuracy_score(df_true_labels[column], df_prediction[column])
    precision = precision_score(df_true_labels[column], df_prediction[column])
    recall = recall_score(df_true_labels[column], df_prediction[column])
    f1 = f1_score(df_true_labels[column], df_prediction[column])

    print(f"Métricas para a emoção '{column}':")
    print(f"Acurácia: {accuracy:.4f}")
    print(f"Precisão: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\n")

Métricas para a emoção 'alegria':
Acurácia: 0.7000
Precisão: 0.4474
Recall: 0.8500
F1 Score: 0.5862


Métricas para a emoção 'tristeza':
Acurácia: 0.8875
Precisão: 1.0000
Recall: 0.5500
F1 Score: 0.7097


Métricas para a emoção 'raiva':
Acurácia: 0.8625
Precisão: 0.9091
Recall: 0.5000
F1 Score: 0.6452


Métricas para a emoção 'medo':
Acurácia: 0.8750
Precisão: 1.0000
Recall: 0.5000
F1 Score: 0.6667


Métricas para a emoção 'nojo':
Acurácia: 0.9000
Precisão: 0.8333
Recall: 0.7500
F1 Score: 0.7895


Métricas para a emoção 'surpresa':
Acurácia: 0.9125
Precisão: 1.0000
Recall: 0.6500
F1 Score: 0.7879


Métricas para a emoção 'confianca':
Acurácia: 0.7625
Precisão: 0.6000
Recall: 0.1500
F1 Score: 0.2400


Métricas para a emoção 'antecipacao':
Acurácia: 0.7000
Precisão: 0.3333
Recall: 0.2632
F1 Score: 0.2941


