# Importar os dados

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import optuna
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds_test = pd.read_csv(r"C:\Users\filip\Desktop\Mestrado\2semestre\TMCD\Trabalho\Dataset\amazon_reviews_test.csv")
ds_train = pd.read_csv(r"C:\Users\filip\Desktop\Mestrado\2semestre\TMCD\Trabalho\Dataset\amazon_reviews_train.csv")

print(ds_train.count())
print(ds_test.count())

sentiment    48902
review       48902
dtype: int64
sentiment    2417
review       2417
dtype: int64


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer
)

import optuna
from transformers import pipeline

In [4]:
# 1. Carrega o modelo como pipeline
pipe = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment", device=0)

# 2. Aplica o modelo às reviews (atenção à performance!)
def get_star(text):
    result = pipe(text, truncation=True, max_length=128)[0]
    label = result['label']  # exemplo: '4 stars'
    return int(label[0]) - 1  # retorna 0–4

Device set to use cpu


In [6]:
# === 1. Carregamento e pré-processamento dos dados ===
ds_train['label'] = ds_train['review'].apply(get_star)

train_df, val_df = train_test_split(ds_train, test_size=0.2, stratify=ds_train['label'], random_state=42)
train_dataset = Dataset.from_pandas(train_df[['review', 'label']])
val_dataset = Dataset.from_pandas(val_df[['review', 'label']])

In [16]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    # Tokeniza as reviews e preserva os rótulos no dataset
    tokenized_batch = tokenizer(batch["review"], padding="max_length", truncation=True, max_length=128)
    tokenized_batch["label"] = batch["label"]  # Adiciona o campo 'label' manualmente
    return tokenized_batch

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# Remove texto original, já está tokenizado
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask","label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask","label"])

Map: 100%|██████████| 39121/39121 [00:04<00:00, 7915.78 examples/s]
Map: 100%|██████████| 9781/9781 [00:01<00:00, 8489.21 examples/s]


In [18]:
# === 3. model_init ===
def model_init():
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    for param in model.bert.parameters():
        param.requires_grad = False  # congela a BERT
    return model


def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    preds_bin = [0 if p <= 1 else 1 for p in preds]  # 1-2 estrelas = negativo
    labels = preds_bin  # como não temos rótulos binários reais, usamos as mesmas predições
    return {
        "accuracy": accuracy_score(labels, preds_bin),
        "f1": f1_score(labels, preds_bin)
    }

In [19]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="no",
    logging_strategy="epoch",
    load_best_model_at_end=False,
    metric_for_best_model="f1"
)

In [20]:
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [21]:
# === 7. Espaço de Hiperparâmetros ===
def model_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 3),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.2)
    }


In [22]:
print(train_dataset[0])

{'label': tensor(3), 'input_ids': tensor([  101,   151, 11531,   143, 10772, 20044, 18336, 19341, 27553, 15713,
        37181, 10110, 26333,   143, 13983, 12850, 28573,   119, 10372, 51503,
        90190, 11770, 17675, 17654, 91236, 68612, 10107,   119, 10103, 61304,
        10803, 11408, 10110, 12112, 17675, 17654, 10127, 10398,   143, 12050,
        19209,   119, 11526, 10197, 10127, 14666, 16359, 10855, 10144, 17503,
        42266,   143, 20306, 11602, 15724, 10125, 12787, 87911,   119, 10103,
        44960, 10127, 63400, 10114, 11523,   151, 11008, 33831, 10871, 35263,
        10139,   143, 24242, 28531, 10525, 10700,   119,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,  

In [23]:
best_run = trainer.hyperparameter_search(
    direction="maximize",
    hp_space=model_hp_space,
    n_trials=10  # leve!
)

print("Melhores hiperparâmetros encontrados:")
print(best_run.hyperparameters)

[I 2025-04-25 23:15:33,335] A new study created in memory with name: no-name-14e94257-cb5c-45d7-a2d7-82240edc375b


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4758,0.346008,1.0,1.0
2,0.4064,0.32204,1.0,1.0


[I 2025-04-26 01:49:01,278] Trial 0 finished with value: 2.0 and parameters: {'learning_rate': 1.1278972692767592e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.14519556451995608}. Best is trial 0 with value: 2.0.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4503,0.313683,1.0,1.0
2,0.3783,0.290278,1.0,1.0


[I 2025-04-26 04:08:47,665] Trial 1 finished with value: 2.0 and parameters: {'learning_rate': 1.651342500518463e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.12115635908987131}. Best is trial 0 with value: 2.0.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4865,0.36048,1.0,1.0
2,0.4191,0.336381,1.0,1.0


[I 2025-04-26 06:41:52,015] Trial 2 finished with value: 2.0 and parameters: {'learning_rate': 1.4752043773263689e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2, 'weight_decay': 0.1300687560296922}. Best is trial 0 with value: 2.0.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4549,0.320018,1.0,1.0
2,0.3831,0.295927,1.0,1.0


[I 2025-04-26 09:21:11,214] Trial 3 finished with value: 2.0 and parameters: {'learning_rate': 2.3752190241347427e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2, 'weight_decay': 0.018934848631755187}. Best is trial 0 with value: 2.0.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3953,0.249643,1.0,1.0
2,0.3282,0.230565,1.0,1.0


[I 2025-04-26 11:54:43,338] Trial 4 finished with value: 2.0 and parameters: {'learning_rate': 3.749315534698745e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.17741665649529104}. Best is trial 0 with value: 2.0.


Epoch,Training Loss,Validation Loss


[W 2025-04-26 12:25:07,964] Trial 5 failed with parameters: {'learning_rate': 2.1378367365068535e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.17456015049061568} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\filip\AppData\Local\Programs\Python\Python313\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "c:\Users\filip\AppData\Local\Programs\Python\Python313\Lib\site-packages\transformers\integrations\integration_utils.py", line 254, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\filip\AppData\Local\Programs\Python\Python313\Lib\site-packages\transformers\trainer.py", line 2245, in train
    return inner_training_loop(
        args=args,
    ...<2 lines>...
        ignore_keys_for_eval=ignore_keys_for_eval,
    )
  File "

KeyboardInterrupt: 