In [None]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss, Dropout, Linear, BatchNorm1d
from tqdm import tqdm
import torch.nn as nn
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the pre-processed question/intent dataset
dataset = pd.read_parquet('/content/drive/MyDrive/Dados/gpt_100k_processed_data.parquet')

# Model inputs: every 'processed_pergunta' entry is joined into one
# whitespace-separated string; targets are the raw 'Intencao' values.
X = [' '.join(tokens) for tokens in dataset['processed_pergunta']]
y = dataset['Intencao']

# Map the intent values to integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(
    X,
    y_encoded,
    test_size=0.4,
    random_state=42,
)

# Load the BERTimbau tokenizer
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Tokenize the texts and bundle them with their labels as a TensorDataset
def tokenize_data(texts, labels, tokenizer, max_len):
    """Tokenize ``texts`` in one batched call and pair them with ``labels``.

    Args:
        texts: Iterable of strings to encode.
        labels: Sequence of labels, one per text, convertible by ``torch.tensor``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that, when
            called with a list of strings and ``return_tensors='pt'``, returns a
            mapping holding ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Every sequence is truncated / padded to exactly this length.

    Returns:
        ``TensorDataset(input_ids, attention_masks, labels)``; the first two
        tensors have one row of ``max_len`` entries per text.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """
    texts = list(texts)

    # Fail fast: without this check a mismatch is only noticed after the whole
    # (expensive) tokenization pass has finished.
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length "
            f"(got {len(texts)} texts and {len(labels)} labels)"
        )

    # Nothing to encode: return an empty, correctly shaped dataset instead of
    # handing an empty batch to the tokenizer.
    if not texts:
        no_rows = torch.empty((0, max_len), dtype=torch.long)
        return TensorDataset(no_rows, no_rows.clone(), torch.tensor(labels))

    # A single batched call replaces the per-text `encode_plus` loop (that
    # method is deprecated in favour of calling the tokenizer directly) and
    # avoids concatenating one small tensor per example.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    return TensorDataset(
        encoded['input_ids'],
        encoded['attention_mask'],
        torch.tensor(labels),
    )

# Maximum sequence length (in tokens) every example is padded/truncated to
max_len = 128

# Compute device: GPU when one is available, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenized train and test datasets
train_dataset = tokenize_data(texts=X_train, labels=y_train_encoded, tokenizer=tokenizer, max_len=max_len)
test_dataset = tokenize_data(texts=X_test, labels=y_test_encoded, tokenizer=tokenizer, max_len=max_len)

# DataLoaders: only the training batches are shuffled
batch_size = 128
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model architecture: BERT encoder + BatchNorm + Dropout + linear classifier,
# with L1/L2 penalties on the classifier parameters added to the training loss
class BertWithRegularization(nn.Module):
    """BERT encoder followed by BatchNorm -> Dropout -> Linear classifier.

    When ``labels`` are passed to ``forward`` the returned loss is the
    cross-entropy plus ``l1 * sum(L1 norms) + l2 * sum(L2 norms)`` taken over
    the classifier's parameters (weight and bias). The L2 term uses the plain
    (non-squared) norm of each parameter.

    Args:
        bert_model: Encoder module called as
            ``bert_model(input_ids=..., attention_mask=...)``; element ``[1]``
            of its output is used as the sentence representation.
        num_labels: Number of target classes.
        dropout_prob: Dropout probability applied before the classifier.
        l1: Weight of the L1 penalty.
        l2: Weight of the L2 penalty.
    """

    def __init__(self, bert_model, num_labels, dropout_prob=0.75, l1=1e-07, l2=1e-08):
        super().__init__()
        self.bert = bert_model
        # Take the feature width from the encoder config when it exposes one so
        # encoders of other sizes work; fall back to 768 otherwise.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.dropout = Dropout(dropout_prob)
        self.batchnorm = BatchNorm1d(hidden_size)
        self.classifier = Linear(hidden_size, num_labels, bias=True)
        self.l1 = l1
        self.l2 = l2

    def _regularization(self):
        """Return ``l1 * sum(L1 norms) + l2 * sum(L2 norms)`` of the classifier parameters."""
        params = list(self.classifier.parameters())
        # vector_norm flattens each parameter, matching the former
        # `torch.norm(param, p)` calls; the sums live on the parameters' own
        # device, so no module-level `device` variable is needed.
        l1_reg = sum(torch.linalg.vector_norm(param, ord=1) for param in params)
        l2_reg = sum(torch.linalg.vector_norm(param, ord=2) for param in params)
        return self.l1 * l1_reg + self.l2 * l2_reg

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and classification head.

        Returns:
            ``logits`` when ``labels`` is None, otherwise ``(loss, logits)``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE: for a Hugging Face BertModel, index 1 is the pooler output
        # (the [CLS] hidden state after the pooler's dense + tanh layer), not
        # the raw [CLS] embedding.
        pooled = outputs[1]
        pooled = self.batchnorm(pooled)
        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)

        # Inference path: skip the loss and penalty computation entirely
        if labels is None:
            return logits

        ce_loss = CrossEntropyLoss()(logits, labels)
        loss = ce_loss + self._regularization()
        return (loss, logits)

# Seed PyTorch's random number generators so the classifier initialization,
# dropout masks and DataLoader shuffling are repeatable between runs (GPU
# kernels may still introduce small non-deterministic differences).
torch.manual_seed(42)

# Load the pretrained BERTimbau encoder
bert_model = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Wrap the encoder with the additional classification layers
model = BertWithRegularization(bert_model=bert_model, num_labels=len(label_encoder.classes_))
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Per-epoch accuracy and loss history
train_accuracies = []
test_accuracies = []
train_losses = []
test_losses = []

# Number of training epochs
epochs = 5

# Early-stopping state. The patience must be smaller than `epochs`: with the
# previous value (5, equal to the epoch count) the counter could reach at most
# 4, so early stopping could never trigger.
early_stopping_patience = 2
best_test_loss = float('inf')
patience_counter = 0

# Train / evaluate loop with early stopping on the held-out loss.
# NOTE(review): the "test" split is used for the early-stopping decision, so
# the reported test metrics are not a fully independent estimate; consider a
# separate validation split.

# Built once instead of once per evaluation batch
eval_criterion = CrossEntropyLoss()

for epoch in range(epochs):
    # ---- Training ----
    model.train()
    total_loss_train = 0.0
    correct_train = 0
    total_train = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = outputs
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so a shorter final batch does not
        # skew the epoch average.
        n_batch = b_labels.size(0)
        total_loss_train += loss.item() * n_batch

        # Training accuracy
        preds = torch.argmax(logits, dim=1)
        correct_train += (preds == b_labels).sum().item()
        total_train += n_batch

    train_accuracy = correct_train / total_train
    train_loss = total_loss_train / total_train
    train_accuracies.append(train_accuracy)
    train_losses.append(train_loss)

    # ---- Evaluation on the test dataset ----
    model.eval()
    total_loss_test = 0.0
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            # Without labels the model returns only the logits; the test loss
            # is plain cross-entropy (no regularization terms).
            logits = model(b_input_ids, attention_mask=b_input_mask)
            loss = eval_criterion(logits, b_labels)

            n_batch = b_labels.size(0)
            total_loss_test += loss.item() * n_batch

            # Test accuracy
            preds = torch.argmax(logits, dim=1)
            correct_test += (preds == b_labels).sum().item()
            total_test += n_batch

    test_accuracy = correct_test / total_test
    test_loss = total_loss_test / total_test
    test_accuracies.append(test_accuracy)
    test_losses.append(test_loss)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Training Loss: {train_loss:.3f}, Training Accuracy: {train_accuracy:.3f}")
    print(f"Test Loss: {test_loss:.3f}, Test Accuracy: {test_accuracy:.3f}")

    # ---- Early stopping ----
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= early_stopping_patience:
        print(f"Early stopping triggered at epoch {epoch+1}")
        break

Epoch 1/5: 100%|██████████| 530/530 [04:42<00:00,  1.87it/s]


Epoch 1/5
Training Loss: 0.916, Training Accuracy: 0.747
Test Loss: 0.041, Test Accuracy: 0.990


Epoch 2/5: 100%|██████████| 530/530 [04:42<00:00,  1.87it/s]


Epoch 2/5
Training Loss: 0.058, Training Accuracy: 0.988
Test Loss: 0.016, Test Accuracy: 0.995


Epoch 3/5: 100%|██████████| 530/530 [04:42<00:00,  1.87it/s]


Epoch 3/5
Training Loss: 0.024, Training Accuracy: 0.994
Test Loss: 0.013, Test Accuracy: 0.995


Epoch 4/5: 100%|██████████| 530/530 [04:42<00:00,  1.87it/s]


Epoch 4/5
Training Loss: 0.018, Training Accuracy: 0.995
Test Loss: 0.013, Test Accuracy: 0.995


Epoch 5/5: 100%|██████████| 530/530 [04:42<00:00,  1.87it/s]


Epoch 5/5
Training Loss: 0.016, Training Accuracy: 0.995
Test Loss: 0.012, Test Accuracy: 0.995


In [None]:
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from torch.nn import Dropout, Linear, BatchNorm1d
from sklearn.preprocessing import LabelEncoder
import pickle

# Select the compute device (GPU when available, otherwise CPU)
def get_device():
    """Return ``torch.device("cuda")`` if CUDA is available, else ``torch.device("cpu")``."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

# Inference-time model: BERT encoder + BatchNorm + Dropout + linear classifier
class BertWithRegularization(nn.Module):
    """BERT encoder followed by BatchNorm -> Dropout -> Linear classifier.

    This version only produces logits. ``l1`` and ``l2`` are stored as
    attributes but are not used by ``forward``.

    Args:
        bert_model: Encoder module called as
            ``bert_model(input_ids=..., attention_mask=...)``; element ``[1]``
            of its output is used as the sentence representation.
        num_labels: Number of target classes.
    """

    def __init__(self, bert_model, num_labels):
        super().__init__()
        self.bert = bert_model
        # Take the feature width from the encoder config when it exposes one so
        # encoders of other sizes work; fall back to 768 otherwise.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.dropout = Dropout(0.75)
        self.batchnorm = BatchNorm1d(hidden_size)
        self.classifier = Linear(hidden_size, num_labels, bias=True)
        self.l1 = 1e-07
        self.l2 = 1e-08

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of encoded inputs."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE: for a Hugging Face BertModel, index 1 is the pooler output
        # (the [CLS] hidden state after the pooler's dense + tanh layer), not
        # the raw [CLS] embedding.
        pooled = outputs[1]
        pooled = self.batchnorm(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

# Load the fine-tuned model
def load_model(model_path, num_labels):
    """Rebuild the classifier and load fine-tuned weights from ``model_path``.

    Args:
        model_path: Path to a checkpoint holding the model's ``state_dict``.
        num_labels: Number of target classes the checkpoint was trained with.

    Returns:
        The model with the checkpoint weights loaded, on CPU and in eval mode.
        Move it to another device with ``.to(device)``.
    """
    bert_model = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
    model = BertWithRegularization(bert_model=bert_model, num_labels=num_labels)

    # map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
    # host; weights_only=True restricts unpickling to tensors and plain
    # containers, so the file cannot run arbitrary code (and avoids the
    # torch.load FutureWarning).
    state_dict = torch.load(model_path, map_location='cpu', weights_only=True)
    model.load_state_dict(state_dict)

    # Return the model ready for inference: dropout disabled and BatchNorm
    # using its running statistics (in train mode BatchNorm1d rejects a batch
    # containing a single example).
    model.eval()
    return model

# Load the tokenizer
def load_tokenizer(tokenizer_path):
    """Return the ``BertTokenizer`` loaded from ``tokenizer_path``."""
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    return tokenizer

# Load the label encoder
def load_label_encoder(label_encoder_path):
    """Unpickle and return the object stored at ``label_encoder_path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading, so
    only point this at files from a trusted source.
    """
    with open(label_encoder_path, 'rb') as handle:
        return pickle.load(handle)

# Prediction function
def predict_intention(input_text, model, tokenizer, max_len, label_encoder, device):
    """Predict the intent label for a single piece of text.

    Args:
        input_text: Text to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns logits with one row per input; expected to already be
            on ``device``.
        tokenizer: Callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` tensors when ``return_tensors='pt'``.
        max_len: Maximum number of tokens; longer inputs are truncated.
        label_encoder: Object whose ``inverse_transform`` maps class ids back
            to labels.
        device: Device the encoded inputs are moved to.

    Returns:
        The decoded label of the highest-scoring class.
    """
    # Call the tokenizer directly (`encode_plus` is deprecated). A single
    # sequence needs no padding, so only truncation is requested; this avoids
    # running the encoder over padding positions.
    encoded = tokenizer(
        input_text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # Eval mode: dropout off and BatchNorm on running statistics
    model.eval()
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(logits, dim=1)

    predicted_label = label_encoder.inverse_transform(preds.cpu().numpy())[0]
    return predicted_label

# Locations of the saved artifacts
model_path = '/content/drive/MyDrive/Dados/models/nlu/modelo_bert_classificacao.pt'
tokenizer_path = '/content/drive/MyDrive/Dados/models/nlu/tokenizer_bert.pt'
label_encoder_path = '/content/drive/MyDrive/Dados/models/nlu/label_encoder.pkl'

# Compute device
device = get_device()

# The label encoder is loaded first because it determines the number of labels
label_encoder = load_label_encoder(label_encoder_path)

# Tokenizer and model; the model is moved to the selected device
tokenizer = load_tokenizer(tokenizer_path)
model = load_model(model_path, num_labels=len(label_encoder.classes_)).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  model.load_state_dict(torch.load(model_path))


In [None]:
input_text = "Como posso criar conta para meu filho?" # @param {type: 'string'}
# Classify the text, truncating the input to 128 tokens
predicted_intention = predict_intention(
    input_text=input_text,
    model=model,
    tokenizer=tokenizer,
    max_len=128,
    label_encoder=label_encoder,
    device=device,
)
print(f"Intenção prevista: {predicted_intention}")

Intenção prevista: Cadastro de beneficiario
