In [None]:
import torch
import torch.nn as nn
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader, Dataset
from google.colab import drive
import os
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import SubsetRandomSampler, DataLoader


# Mount Google Drive so the dataset folder is reachable from the Colab runtime.
drive.mount('/content/drive')

# Folder that holds the poem and review text files.
main_dir = '/content/drive/My Drive/Trabalho 2 IA'

# Use the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizers for the two multilingual checkpoints, keyed by a short model name.
tokenizers = dict(
    bert=AutoTokenizer.from_pretrained("bert-base-multilingual-cased"),
    distilbert=AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased"),
)

# Matching pretrained encoders, moved to `device` and stored under the same keys.
models = dict(
    bert=AutoModel.from_pretrained("bert-base-multilingual-cased").to(device),
    distilbert=AutoModel.from_pretrained("distilbert-base-multilingual-cased").to(device),
)

# Função para carregar os dados
def load_data(main_dir):
    """Read the poem and review text files and return their contents with labels.

    Every ``.txt`` file inside the ``poema`` and ``crítica`` subfolders of
    ``main_dir`` is read as UTF-8. Files whose content is empty after stripping
    surrounding whitespace are skipped.

    Unlike a ``chdir``-based approach, this function builds paths with
    ``os.path.join`` and therefore leaves the process working directory
    untouched. File names are visited in sorted order so that the position of
    each sample in the returned lists is stable across runs.

    Args:
        main_dir: Path of the folder that contains the two subfolders.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``labels[i]`` is 0
        when ``texts[i]`` came from ``poema`` and 1 when it came from
        ``crítica``.

    Raises:
        ValueError: If no non-empty ``.txt`` file was found.
        OSError: Propagated from ``os.listdir``/``open`` when a subfolder or
            file cannot be read.
    """
    texts, labels = [], []
    # The position of the subfolder in this tuple is its class label.
    for label, subfolder in enumerate(("poema", "crítica")):
        folder = os.path.join(main_dir, subfolder)
        for filename in sorted(os.listdir(folder)):
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(folder, filename), "r", encoding="utf-8") as file:
                content = file.read().strip()
            if content:
                texts.append(content)
                labels.append(label)
    # texts and labels always grow together, so checking one is enough.
    if not texts:
        raise ValueError("Nenhum dado foi encontrado. Verifique o caminho e os arquivos.")
    return texts, labels

texts, labels = load_data(main_dir)

# NOTE(review): this module-level tokenizer is not read by any code visible in
# this notebook (the functions below look up their own tokenizer by model
# name); it is kept only so the global name stays defined.
tokenizer = tokenizers["distilbert"]

# Fixed seed for the splits: for a given ordering of `labels`, the same samples
# end up in train/validation/test on every run. It does NOT seed PyTorch, so
# weight initialisation and batch order remain random.
SPLIT_SEED = 42

# Split sample indices into train (70%) and a temporary pool (30%),
# stratified so each part keeps the class proportions.
train_idx, temp_idx = train_test_split(
    np.arange(len(labels)),
    test_size=0.3,
    shuffle=True,
    stratify=labels,
    random_state=SPLIT_SEED,
)

# Halve the temporary pool into validation (15%) and test (15%).
valid_idx, test_idx = train_test_split(
    temp_idx,
    test_size=0.5,
    shuffle=True,
    stratify=np.array(labels)[temp_idx],
    random_state=SPLIT_SEED,
)

# Samplers that restrict a DataLoader over the full dataset to one split each.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)
test_sampler = SubsetRandomSampler(test_idx)


class TextDataset(Dataset):
    """Dataset that tokenizes one text on access and pairs it with its label.

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable used as
            ``tokenizer(text, return_tensors="pt", padding="max_length",
            truncation=True, max_length=...)`` whose result exposes
            ``input_ids`` and ``attention_mask`` attributes.
        max_length: Length every sequence is padded/truncated to
            (defaults to 128, the value previously hard-coded).
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for sample ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also collapse the sequence axis when max_length == 1.
        input_ids = encoded.input_ids.squeeze(0)
        attention_mask = encoded.attention_mask.squeeze(0)
        # Explicit integer dtype: the label is later fed to CrossEntropyLoss as a class index.
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_ids, attention_mask, label

class TextClassifier(nn.Module):
    """Encoder followed by dropout and a linear classification head.

    Args:
        bert_model: Encoder module called as
            ``bert_model(input_ids=..., attention_mask=...)`` whose output
            exposes ``last_hidden_state``.
        num_classes: Number of output logits (default 2).
        dropout: Dropout probability applied before the head (default 0.3).

    The head's input width is taken from ``bert_model.config.hidden_size``
    when that attribute exists, and falls back to 768 (the previously
    hard-coded width) otherwise.
    """

    def __init__(self, bert_model, num_classes=2, dropout=0.3):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder itself so encoders of other widths work.
        encoder_config = getattr(bert_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids and attention masks."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence summary
        # (presumably the [CLS] token for these BERT-style tokenizers).
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        return self.fc(pooled_output)

# Treino e Avaliação
def _evaluate(model, loader, criterion):
    """Run `model` over `loader` without gradients.

    Returns:
        ``(mean_batch_loss, true_labels, predicted_labels)`` where the two
        label lists hold one entry per sample seen.
    """
    model.eval()
    losses, y_true, y_pred = [], [], []
    with torch.no_grad():
        for batch_input_ids, batch_attention_mask, batch_labels in loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_input_ids, batch_attention_mask)
            losses.append(criterion(outputs, batch_labels).item())

            y_true.extend(batch_labels.cpu().numpy())
            y_pred.extend(torch.argmax(outputs, dim=1).cpu().numpy())
    return np.mean(losses), y_true, y_pred


def train_and_evaluate(model_name, epochs=4, lr=1e-4, batch_size=16, patience_time=30):
    """Fine-tune one encoder as a poem/review classifier and report test metrics.

    The checkpoint with the lowest validation loss is written to Drive during
    training and is reloaded before the test set is scored, so the reported
    test metrics belong to the saved model rather than to the last epoch.

    Args:
        model_name: Key into the module-level ``tokenizers`` / ``models`` dicts.
        epochs: Maximum number of training epochs.
        lr: Adam learning rate.
        batch_size: Batch size for all three loaders.
        patience_time: Epochs without validation improvement before stopping.
            With the defaults (30 > 4 epochs) early stopping cannot trigger,
            and neither can the scheduler (its patience of 5 also exceeds 4).

    Returns:
        None. Progress and metrics are printed.
    """
    print(f"\nTreinando modelo: {model_name}")
    tokenizer = tokenizers[model_name]
    # NOTE: the encoder object stored in `models` is wrapped, not copied, so
    # training updates that shared module in place.
    bert_model = models[model_name]

    model = TextClassifier(bert_model).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

    # One dataset over all samples; each sampler restricts a loader to its split.
    dataset = TextDataset(texts, labels, tokenizer)
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    val_loader = DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    test_loader = DataLoader(dataset, batch_size=batch_size, sampler=test_sampler)

    best_model_path = f'/content/drive/My Drive/Trabalho 2 IA/best_model_{model_name}.pth'
    lowest_val_loss = float('inf')
    patience_counter = 0
    checkpoint_saved = False

    for epoch in range(epochs):
        # Training pass
        model.train()
        train_losses = []
        for batch_input_ids, batch_attention_mask, batch_labels in train_loader:
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(batch_input_ids, batch_attention_mask)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)

        # Validation pass
        avg_val_loss, y_true, y_pred = _evaluate(model, val_loader, criterion)
        accuracy = accuracy_score(y_true, y_pred)

        # Keep the checkpoint with the lowest validation loss seen so far.
        if avg_val_loss < lowest_val_loss:
            lowest_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
            print(f"Melhor modelo salvo ({model_name})! Val Loss: {avg_val_loss:.4f}")
        else:
            patience_counter += 1

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")
        scheduler.step(avg_val_loss)

        # Checked after logging so the final epoch's metrics are always shown.
        if patience_counter >= patience_time:
            print(f"Early stopping ativado para {model_name}.")
            break

    # Score the test set with the best checkpoint, not the last-epoch weights.
    if checkpoint_saved:
        model.load_state_dict(torch.load(best_model_path, map_location=device))

    print(f"\nAvaliação do modelo: {model_name}")
    _, true_labels, predictions = _evaluate(model, test_loader, criterion)

    print("Accuracy:", accuracy_score(true_labels, predictions))
    print(classification_report(true_labels, predictions, target_names=["Poema", "Crítica"]))


def debug():
    """Print sanity checks for the train/validation/test split.

    Reports the size of each split, the number of indices shared between each
    pair of splits (expected to be 0), and the label distribution per split.
    Reads the module-level ``train_idx``, ``valid_idx``, ``test_idx`` and
    ``labels``.
    """
    from collections import Counter

    print(f"Tamanho do treino: {len(train_idx)}")
    print(f"Tamanho da validação: {len(valid_idx)}")
    print(f"Tamanho do teste: {len(test_idx)}")

    print(f"Interseção Treino/Validação: {len(set(train_idx).intersection(valid_idx))}")
    print(f"Interseção Treino/Teste: {len(set(train_idx).intersection(test_idx))}")
    print(f"Interseção Validação/Teste: {len(set(valid_idx).intersection(test_idx))}")

    # Count labels through the index arrays; there are no per-split label lists.
    print(f"\nDistribuição dos rótulos (treino): {Counter(labels[i] for i in train_idx)}")
    print(f"Distribuição dos rótulos (validação): {Counter(labels[i] for i in valid_idx)}")
    print(f"Distribuição dos rótulos (teste): {Counter(labels[i] for i in test_idx)}")

# Fine-tune and evaluate the multilingual BERT classifier.
train_and_evaluate("bert")
# Uncomment to also fine-tune the DistilBERT variant.
# NOTE(review): the prediction cell further down is called with "distilbert",
# whose checkpoint is only written when this line is enabled.
#train_and_evaluate("distilbert")




# Uncomment to print the split sanity checks.
#debug()

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]


Treinando modelo: bert
Melhor modelo salvo (bert)! Val Loss: 0.2650
Epoch 1/4, Train Loss: 0.5405, Val Loss: 0.2650, Accuracy: 0.9333
Epoch 2/4, Train Loss: 0.3250, Val Loss: 0.3539, Accuracy: 0.9000
Epoch 3/4, Train Loss: 0.5857, Val Loss: 0.6841, Accuracy: 0.5333
Melhor modelo salvo (bert)! Val Loss: 0.1705
Epoch 4/4, Train Loss: 0.4214, Val Loss: 0.1705, Accuracy: 0.9000

Avaliação do modelo: bert
Accuracy: 0.9333333333333333
              precision    recall  f1-score   support

       Poema       1.00      0.87      0.93        15
     Crítica       0.88      1.00      0.94        15

    accuracy                           0.93        30
   macro avg       0.94      0.93      0.93        30
weighted avg       0.94      0.93      0.93        30



In [None]:
def predict_text(text, model_name):
    """Classify a single text with the saved checkpoint for `model_name`.

    Args:
        text: Raw string to classify.
        model_name: Key into the module-level ``tokenizers`` / ``models``
            dicts; it also selects which ``best_model_<model_name>.pth``
            checkpoint is loaded, so the weights always match the encoder.

    Returns:
        ``"Poema"`` when the predicted class is 0, otherwise ``"Crítica"``.
    """
    tokenizer = tokenizers[model_name]
    # NOTE: the encoder from `models` is wrapped, not copied, so loading the
    # checkpoint below overwrites that shared module's weights.
    model = TextClassifier(models[model_name]).to(device)

    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
    checkpoint_path = f'/content/drive/My Drive/Trabalho 2 IA/best_model_{model_name}.pth'
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()

    # Tokenize exactly as during training (padded/truncated to 128 tokens).
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128).to(device)

    with torch.no_grad():
        outputs = model(inputs["input_ids"], inputs["attention_mask"])
        predicted_class = torch.argmax(outputs, dim=1).item()

    return "Poema" if predicted_class == 0 else "Crítica"

# Classify a sample sentence using the "distilbert" entry and print the predicted label.
texto = "Olá, sou o jota e isso é um teste"
resultado = predict_text(texto, "distilbert")
print(f"\nO texto foi classificado como: {resultado}")

  model.load_state_dict(torch.load(f'/content/drive/My Drive/Trabalho 2 IA/best_model_distilbert.pth'))



O texto foi classificado como: Poema
