In [3]:
import numpy as np
import pandas as pd
import transformers
from transformers import BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
from torch import nn, optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader

## Pré-processamento

### Lendo o arquivo

In [4]:
# Reading Data
df = pd.read_csv("../Lucas Braz/data.csv")
encoder = LabelEncoder()
labels = encoder.fit_transform(df.Sentiment)

### Removendo linhas com valores nulos ou vazios

In [5]:
# Remover linhas com valores nulos ou vazios
df = df.dropna()

## Tonekenização

In [6]:
# Init tokenizer
# Bert has its own tokenizer which also include some special tokens
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Spliting data
X_train, X_test, y_train, y_test = train_test_split(df.Sentence.values, labels, test_size=.2, stratify=df.Sentiment.values)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

## BERT

In [None]:
# Data generator
# Inherit Dataset from torch which need to implement __len__ and __getitem__ function

class Datagen(Dataset):
    def __init__(self, sentence, target, tokenizer, max_len):
        self.sentence = sentence
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.sentence)
    def __getitem__(self, idx):
        sentence = self.sentence[idx]
        bert_encoding = tokenizer.encode_plus(
            sentence,
            max_length = self.max_len,
            add_special_tokens = True, # include special tokens
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True, # return attention mask which is required during training 
            return_token_type_ids = False,
            return_tensors = 'pt' # pt is pytorch format tensor
        )
        return {
            "input_ids": torch.squeeze(bert_encoding["input_ids"]),
            "attention_mask": torch.squeeze(bert_encoding["attention_mask"]),
            "target": torch.tensor(self.target[idx], dtype=torch.long)
        }
train_data = Datagen(X_train, y_train, tokenizer, 50)
test_data = Datagen(X_test, y_test, tokenizer, 50)
# DataLoader created batch generator and have prefatch ability
train_dataloader = DataLoader(train_data, batch_size=16, num_workers=2)
test_dataloader = DataLoader(test_data, batch_size=15, num_workers=2)

### Definindo o modelo

In [7]:
# Model
class SentimentModel(nn.Module):
    def __init__(self, n_class):
        # Here we are initializing some nn.Module attributes and functions
        super(SentimentModel, self).__init__()
        # Initializing bert model
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_class)
        self.softmax = nn.Softmax(dim=1)
    def forward(self, input_ids, attention_mask):
        # bert model gives two outputs, sequenced output and pooled output
        # Sequence output is last layer output for each token in sentence usualy used for similarity task
        # Pooled output is cls token(starting token for each sentence) output from model used for classification task
        _, pooled_output = self.bert(input_ids, attention_mask, return_dict=False)
        output = self.drop(pooled_output)
        output = self.out(output)
        return self.softmax(output)
model = SentimentModel(n_class=3)
# Moving model to GPU
model = model.to(torch.device('cuda'))
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
scheduler = ReduceLROnPlateau(optimizer, 'min')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def check_acc(preds, labels):
    correct = 0
    for pred, label in zip(preds, labels):
        if pred == label:
            correct += 1
    return correct/len(preds)

In [9]:
def accuracy(dataloader, model):
    model.eval()
    batch_acc = 0
    for inputs in dataloader:
        to_cuda = lambda x: x.to(torch.device('cuda'), non_blocking=True).long()
        input_ids, attention_mask, target = list(map(to_cuda, inputs.values()))

        outputs = model(input_ids, attention_mask)
        prediction = outputs.argmax(dim=1)
        acc = check_acc(prediction, target)
        batch_acc += acc
    return batch_acc/len(dataloader)

### Treinamento

In [1]:
# Training
for epoch in range(5):
    # Telling model to start training mode for dropout and BN
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for steps, inputs in enumerate(train_dataloader):
        to_cuda = lambda x: x.to(torch.device('cuda'), non_blocking=True).long()
        # Moving variables to GPU
        input_ids, attention_mask, target = list(map(to_cuda, inputs.values()))
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, target)

        # Initializing gradient to 0 so that it don't add up previous gradient        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log the training statistics
        total_loss += loss.item()
        _, predicted_labels = torch.max(outputs, 1)
        correct_predictions += (predicted_labels == target).sum().item()
        total_samples += target.size(0)

        if steps % 10 == 0:  # Print every 10 batches
            print(f"Epoch: {epoch}, Batch: {steps}, Loss: {loss.item():.4f}")

    # Calculate and print average training loss and accuracy for the epoch
    avg_loss = total_loss / (steps + 1)
    accuracy_epoch = correct_predictions / total_samples
    print(f"Epoch: {epoch}, Avg Loss: {avg_loss:.4f}, Accuracy: {accuracy_epoch:.4f}")

    # Validation accuracy
    val_accuracy = accuracy(test_dataloader, model)
    print(f"Epoch: {epoch}, Validation Accuracy: {val_accuracy:.4f}")

    # Adjust learning rate based on validation accuracy
    scheduler.step(val_accuracy)

NameError: name 'model' is not defined

In [46]:
# Saving model
torch.save(model.state_dict(), "bert_v1_semOtim.pth")

# BERT com otimização e CV

## Parte 1: Imports e Leitura dos Dados

In [47]:
import numpy as np
import pandas as pd
import transformers
from transformers import BertModel
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import torch
from torch import nn, optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

# Leitura dos dados
df = pd.read_csv("../Lucas Braz/data.csv")
encoder = LabelEncoder()
labels = encoder.fit_transform(df.Sentiment)


## Parte 2: Pré-processamento e Tokenização

In [48]:
# Remover linhas com valores nulos ou vazios
df = df.dropna()

# Tokenização
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
X_train, X_test, y_train, y_test = train_test_split(df.Sentence.values, labels, test_size=.2, stratify=df.Sentiment.values)


## Parte 3: Definição da Classe de Dataset

In [49]:
class Datagen(Dataset):
    def __init__(self, sentence, target, tokenizer, max_len):
        self.sentence = sentence
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, idx):
        sentence = self.sentence[idx]
        bert_encoding = tokenizer.encode_plus(
            sentence,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        return {
            "input_ids": torch.squeeze(bert_encoding["input_ids"]),
            "attention_mask": torch.squeeze(bert_encoding["attention_mask"]),
            "target": torch.tensor(self.target[idx], dtype=torch.long)
        }

## Parte 4: Configuração do TensorBoard e Parâmetros do Modelo

In [50]:
# Configuração do TensorBoard
writer = SummaryWriter()

# Modelo
class SentimentModel(nn.Module):
    def __init__(self, n_class):
        super(SentimentModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_class)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids, attention_mask, return_dict=False)
        output = self.drop(pooled_output)
        output = self.out(output)
        return self.softmax(output)

model = SentimentModel(n_class=3)
model = model.to(torch.device('cuda'))
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
scheduler = ReduceLROnPlateau(optimizer, 'min')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Parte 5: Funções Auxiliares e Treinamento com Validação Cruzada

In [None]:
def check_acc(preds, labels):
    correct = 0
    for pred, label in zip(preds, labels):
        if pred == label:
            correct += 1
    return correct/len(preds)

def accuracy(dataloader, model):
    model.eval()
    batch_acc = 0
    for inputs in dataloader:
        to_cuda = lambda x: x.to(torch.device('cuda'), non_blocking=True).long()
        input_ids, attention_mask, target = list(map(to_cuda, inputs.values()))

        outputs = model(input_ids, attention_mask)
        prediction = outputs.argmax(dim=1)
        acc = check_acc(prediction, target)
        batch_acc += acc
    return batch_acc/len(dataloader)

In [52]:
# Treinamento com validação cruzada
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
num_epochs = 5  # Substitua este valor pelo número desejado de épocas

for fold, (train_index, test_index) in enumerate(skf.split(df.Sentence.values, labels)):
    print(f"Fold {fold + 1}/{skf.get_n_splits()}")

    X_train, X_test = df.Sentence.values[train_index], df.Sentence.values[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    train_data = Datagen(X_train, y_train, tokenizer, 50)
    test_data = Datagen(X_test, y_test, tokenizer, 50)

    train_dataloader = DataLoader(train_data, batch_size=16, num_workers=2)
    test_dataloader = DataLoader(test_data, batch_size=15, num_workers=2)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for steps, inputs in enumerate(train_dataloader):
            to_cuda = lambda x: x.to(torch.device('cuda'), non_blocking=True).long()
            input_ids, attention_mask, target = list(map(to_cuda, inputs.values()))
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        average_loss = total_loss / len(train_dataloader)
        writer.add_scalar(f"Train/Average_Loss_Fold{fold}", average_loss, epoch)

        train_acc = accuracy(train_dataloader, model)
        test_acc = accuracy(test_dataloader, model)

        print(f"Fold {fold + 1}/{skf.get_n_splits()}\t Epoch: {epoch}\t Train accuracy: {train_acc}\t Test accuracy: {test_acc}")

        writer.add_scalar(f"Train/Accuracy_Fold{fold}", train_acc, epoch)
        writer.add_scalar(f"Test/Accuracy_Fold{fold}", test_acc, epoch)

        scheduler.step(average_loss)

# Salvando o modelo treinado
torch.save(model.state_dict(), "sentiment_model.pth")

# Fechando o TensorBoard
writer.close()


Fold 1/5
Fold 1/5	 Epoch: 0	 Train accuracy: 0.8180460750853242	 Test accuracy: 0.756349206349206
Fold 1/5	 Epoch: 1	 Train accuracy: 0.8191126279863481	 Test accuracy: 0.7486568986568983
Fold 1/5	 Epoch: 2	 Train accuracy: 0.863481228668942	 Test accuracy: 0.7434676434676433
Fold 1/5	 Epoch: 3	 Train accuracy: 0.8607081911262798	 Test accuracy: 0.7383394383394383
Fold 1/5	 Epoch: 4	 Train accuracy: 0.875	 Test accuracy: 0.7357753357753355
Fold 2/5
Fold 2/5	 Epoch: 0	 Train accuracy: 0.8538822525597269	 Test accuracy: 0.8631257631257628
Fold 2/5	 Epoch: 1	 Train accuracy: 0.8598549488054608	 Test accuracy: 0.8562881562881558
Fold 2/5	 Epoch: 2	 Train accuracy: 0.8705204778156996	 Test accuracy: 0.873443223443223
Fold 2/5	 Epoch: 3	 Train accuracy: 0.871160409556314	 Test accuracy: 0.8708791208791206
Fold 2/5	 Epoch: 4	 Train accuracy: 0.8071672354948806	 Test accuracy: 0.815201465201465
Fold 3/5
Fold 3/5	 Epoch: 0	 Train accuracy: 0.8617747440273038	 Test accuracy: 0.8723865877712024
F