In [None]:
# Instalar bibliotecas necessárias
!pip install transformers datasets torch scikit-learn

import inspect

from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline

In [None]:
# Load the IMDb dataset
dataset = load_dataset("imdb")

# Split into training, validation and test sets.
# The original training set is split exactly ONCE (with a fixed seed): calling
# `train_test_split` twice shuffles independently each time, so the resulting
# "train" and "validation" subsets would overlap and leak validation examples
# into training. The seed also makes the split reproducible across kernel restarts.
train_val_split = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_data = train_val_split["train"]
val_data = train_val_split["test"]
test_data = dataset["test"]

In [None]:
# Tokenizer and model configuration: `model_name` is the pretrained checkpoint
# identifier, reused below when the classification model is loaded.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization of the review texts
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: mapping with a ``"text"`` entry (passed in by ``Dataset.map``).

    Returns:
        The output of the module-level ``tokenizer`` for those texts.
    """
    raw_texts = examples["text"]
    encoded = tokenizer(raw_texts, truncation=True, padding="max_length", max_length=128)
    return encoded

# Tokenize every split in batches and drop the raw "text" column afterwards
map_options = {"batched": True, "remove_columns": ["text"]}
train_data = train_data.map(tokenize_function, **map_options)
val_data = val_data.map(tokenize_function, **map_options)
test_data = test_data.map(tokenize_function, **map_options)

# Prepare the data for the Trainer: return the listed columns in torch format
tensor_columns = ["input_ids", "attention_mask", "label"]
for split_data in (train_data, val_data, test_data):
    split_data.set_format(type="torch", columns=tensor_columns)

# Model configuration: sequence classifier loaded from the same checkpoint, with 2 labels
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Custom metric function handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores) — presumably the
            ``EvalPrediction`` the Trainer passes in; confirm against caller.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score along the last axis.
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned by sklearn (support) is not reported.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    precision, recall, f1 = prf_scores[:3]

    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        precision=precision,
        recall=recall,
        f1=f1,
    )

In [None]:
# Training configuration
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (and eventually dropped the old name). Because the install above is unpinned,
# pick whichever keyword the installed version actually accepts.
_training_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kw = (
    "eval_strategy" if "eval_strategy" in _training_args_params else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",          # Folder where results/checkpoints are saved
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=16, # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for validation
    num_train_epochs=3,             # Number of epochs
    weight_decay=0.01,              # Weight decay regularization
    save_strategy="epoch",          # Save the model every epoch
    logging_dir="./logs",           # Directory for logs
    report_to="none",               # Disable external reporting integrations (W&B and others)
    **{_eval_strategy_kw: "epoch"}, # Evaluate every epoch
)

# Trainer initialization
# Likewise, the Trainer's `tokenizer` argument was renamed to `processing_class`
# in newer releases; use the name the installed version supports.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kw = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,  # Custom metrics computed at each evaluation
    **{_tokenizer_kw: tokenizer},
)

# Fine-tuning (model training)
trainer.train()

In [None]:
# Final evaluation on the held-out test split
results = trainer.evaluate(eval_dataset=test_data)
print(results)

In [None]:
# Save the fine-tuned model and its tokenizer side by side in the same directory
save_dir = "./fine_tuned_bert"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Inference with the fine-tuned model, reloaded from the saved directory
sentiment_analyzer = pipeline("sentiment-analysis", model="./fine_tuned_bert", tokenizer="./fine_tuned_bert")

# Maps the labels emitted by the pipeline to readable sentiment names
label_map = {"LABEL_0": "NEGATIVE", "LABEL_1": "POSITIVE"}

# Sample texts to classify
texts = [
    "This movie was fantastic!",
    "I hated every minute of this film.",
    "The plot was okay, but the acting was superb.",
    "I wouldn't recommend this to anyone.",
    "It was a decent film, not too bad but not great either.",
    "Absolutely amazing! A masterpiece.",
    "Terrible, just terrible. A waste of time.",
    "The visuals were stunning, but the story lacked depth.",
    "One of the best movies I’ve ever seen!",
    "It’s not my kind of movie, but it was well-made.",
]

# Run all texts through the pipeline in one call, then print one formatted line per text
predictions = sentiment_analyzer(texts)
for text, prediction in zip(texts, predictions):
    readable_label = label_map[prediction["label"]]
    print(f"text: {text}, label: {readable_label}, score: {prediction['score']:.4f}")
