# Importar os dados

In [None]:
import os
from pathlib import Path

import pandas as pd

# Folder that holds the two review CSV files. Set the AMAZON_REVIEWS_DIR
# environment variable to run the notebook on another machine; when it is
# unset (or empty) the original author's local folder is used as a fallback.
DATA_DIR = Path(
    os.environ.get("AMAZON_REVIEWS_DIR")
    or r"C:\Users\filip\Desktop\Mestrado\2semestre\TMCD\Trabalho\Dataset"
)

ds_test = pd.read_csv(DATA_DIR / "amazon_reviews_test.csv")
ds_train = pd.read_csv(DATA_DIR / "amazon_reviews_train.csv")

# Non-null count per column, training split first and test split second.
print(ds_train.count())
print(ds_test.count())

sentiment    48902
review       48902
dtype: int64
sentiment    2417
review       2417
dtype: int64


# MODELOS PRÉ-TREINADOS COM FINE-TUNING

### siebert/sentiment-roberta-large-english

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "siebert/sentiment-roberta-large-english"

# Tokenizer first, then the sequence-classification model with two output labels.
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [None]:
# Encode the string sentiment as an integer class id (negative -> 0, positive -> 1).
label_ids = ds_train['sentiment'].map({'negative': 0, 'positive': 1})

# `.map` turns every value that is missing from the dict into NaN. Fail loudly here
# instead of letting NaN labels reach the stratified split and the model.
unmapped = ds_train.loc[label_ids.isna(), 'sentiment']
if not unmapped.empty:
    raise ValueError(
        f"{len(unmapped)} rows have a sentiment other than 'negative'/'positive': "
        f"{unmapped.unique()[:5].tolist()}"
    )
ds_train['label'] = label_ids.astype('int64')

# Hold out 20% of the training data for validation, stratified on the label and with a
# fixed seed so the split is reproducible.
train_df, val_df = train_test_split(ds_train, test_size=0.2, stratify=ds_train['label'], random_state=42)

# preserve_index=False keeps the shuffled pandas index from being stored as an extra
# `__index_level_0__` column in the resulting datasets.
train_dataset = Dataset.from_pandas(train_df[['review', 'label']], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[['review', 'label']], preserve_index=False)

# Checkpoint id and its tokenizer, loaded through the Auto* class.
model_id = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize(batch, max_length=128):
    """Tokenize the ``review`` texts of a batch into fixed-length inputs.

    Args:
        batch: Mapping with a ``"review"`` entry holding the text(s) to encode.
            In this notebook it is supplied by ``Dataset.map(..., batched=True)``.
        max_length: Length passed to the tokenizer for padding and truncation.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        batch["review"],
        padding="max_length",  # pad every example up to max_length
        truncation=True,       # cut longer reviews down to max_length
        max_length=max_length,
    )

# Handle one split at a time: tokenize it in batches, then restrict its output to the
# three columns listed below, returned in torch format.
train_dataset = train_dataset.map(tokenize, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

val_dataset = val_dataset.map(tokenize, batched=True)
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|██████████| 39121/39121 [00:02<00:00, 13193.67 examples/s]
Map: 100%|██████████| 9781/9781 [00:00<00:00, 13080.61 examples/s]


In [None]:
# Load the checkpoint through the Auto* class with a two-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

# Freeze every parameter of the RoBERTa backbone in a single call.
model.roberta.requires_grad_(False)


In [None]:
# Train only the classification head: a parameter stays trainable exactly when its
# name contains "classifier"; every other parameter is frozen.
for param_name, weight in model.named_parameters():
    weight.requires_grad = "classifier" in param_name

In [None]:
import torch  # needed here: this cell runs before the notebook's later `import torch`

# Adam over the trainable parameters only (those with requires_grad=True).
# NOTE(review): the Trainer built further down is not given this optimizer
# (no `optimizers=` argument), so it looks unused as written — confirm intent.
optimizer = torch.optim.Adam([p for p in model.parameters() if p.requires_grad], lr=1e-5)

In [None]:
# Load the "accuracy" metric through the `evaluate` library; compute_metrics calls it.
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Turn an evaluation result into an accuracy score.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row is
            the index of its largest logit along axis 1.

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted, references=gold)
    return {"accuracy": result["accuracy"]}

In [None]:
import torch

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Batch sizes; gradients are accumulated over 10 batches before each update.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=10,
    num_train_epochs=2,
    # Evaluate and checkpoint on the same 100-step schedule.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
    # Restore the checkpoint with the highest accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

In [None]:
# Assemble the Trainer from the model, the training arguments, the train/validation
# splits, the metric function and an early-stopping callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` is the replacement for the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

  trainer = Trainer(


In [None]:
# Run fine-tuning. As the last expression of the cell, the returned TrainOutput is displayed.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
100,0.1872,0.197911,0.922298
200,0.2148,0.191768,0.922912
300,0.1773,0.198305,0.922912
400,0.2296,0.192461,0.923014
500,0.1724,0.193725,0.923832
600,0.1963,0.199046,0.923321
700,0.1702,0.193055,0.923627


TrainOutput(global_step=700, training_loss=0.19458646348544528, metrics={'train_runtime': 26856.7942, 'train_samples_per_second': 2.913, 'train_steps_per_second': 0.073, 'total_flos': 6523519543296000.0, 'train_loss': 0.19458646348544528, 'epoch': 0.7156732440445762})