# RoBERTa Training for Sentiment Analysis

In [None]:
!pip install transformers datasets torch pandas scikit-learn evaluate accelerate -q

In [None]:
import pandas as pd
import json
import time
from datasets import Dataset
from transformers import (
    RobertaTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import evaluate
from sklearn.model_selection import train_test_split
import torch

In [None]:
# Read the labelled examples, then hold out 20% of them for validation.
train_df = pd.read_csv('dataset/train.csv')

# Stratifying on the 'Sentiment' column keeps the class proportions the same
# in both splits; the fixed random_state makes the split reproducible.
train, val = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['Sentiment'],
)

print(f"Train: {len(train)}, Val: {len(val)}")

In [None]:
# Build a deterministic label <-> integer-id mapping from every class present
# in the full training CSV (the sort fixes the id order across runs).
#
# `.tolist()` converts the values returned by `unique()` into native Python
# objects. For numeric labels `unique()` yields NumPy scalars (e.g. int64),
# which `json` refuses to serialise, and these mappings are written to JSON
# later in the notebook. For string labels the conversion is a no-op.
label_list = sorted(train_df['Sentiment'].unique().tolist())
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = dict(enumerate(label_list))
num_labels = len(label_list)

def encode_labels(example):
    """Attach the integer class id for a single row.

    Looks up the row's 'Sentiment' value in the module-level ``label2id``
    mapping and stores the result under the 'labels' key.

    Args:
        example: Mapping for one dataset row; must contain 'Sentiment'.

    Returns:
        The same ``example`` object, updated in place with 'labels'.

    Raises:
        KeyError: If the 'Sentiment' value is not a key of ``label2id``.
    """
    sentiment = example['Sentiment']
    example['labels'] = label2id[sentiment]
    return example

# Convert both splits to Hugging Face Datasets and add the integer 'labels'
# column. After the shuffled train/validation split the DataFrames carry a
# non-contiguous index, which `from_pandas` would otherwise store as an extra
# '__index_level_0__' column; `preserve_index=False` drops it.
train_dataset = Dataset.from_pandas(train, preserve_index=False).map(encode_labels)
val_dataset = Dataset.from_pandas(val, preserve_index=False).map(encode_labels)

In [None]:
# Load the tokenizer published with the 'roberta-base' checkpoint. It is used
# by tokenize_function below, handed to the Trainer, and saved next to the
# fine-tuned model at the end of the notebook.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_function(example):
    """Tokenize the 'Text' field with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 64 tokens.

    Args:
        example: Mapping with a 'Text' entry. The function is applied with
            ``batched=True`` in this notebook, so 'Text' is presumably a list
            of strings.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = example['Text']
    return tokenizer(
        texts,
        max_length=64,
        padding='max_length',
        truncation=True,
    )

# Tokenize each split in batches, drop the raw 'Text' column once it has been
# encoded, and switch the output format to torch.

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_train = tokenized_train.remove_columns(['Text'])
tokenized_train.set_format('torch')

tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_val = tokenized_val.remove_columns(['Text'])
tokenized_val.set_format('torch')

In [None]:
# Load the four `evaluate` metrics used by compute_metrics, in a fixed order
# that matches the names on the left-hand side.
accuracy, f1, precision, recall = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and macro F1/precision/recall.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is reduced
            with ``argmax(axis=1)``, so it is presumably an array of per-class
            scores with one row per example.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall"
        (in that order).
    """
    scores, references = eval_pred
    predicted_ids = scores.argmax(axis=1)

    results = {
        "accuracy": accuracy.compute(
            predictions=predicted_ids, references=references
        )["accuracy"],
    }

    # The remaining metrics share one call signature and are macro-averaged,
    # i.e. every class contributes equally regardless of its frequency.
    for name, metric in (("f1", f1), ("precision", precision), ("recall", recall)):
        results[name] = metric.compute(
            predictions=predicted_ids, references=references, average="macro"
        )[name]

    return results

In [None]:
# Load the 'roberta-base' checkpoint as a sequence classifier with one output
# per sentiment class. The label mappings are passed along with the label
# count so the model is configured with the same id <-> label assignment
# that was used to encode the datasets.
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base',
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels,
)

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing both happen every
# 50 steps so that the checkpoint with the best `f1` (as reported by
# compute_metrics) can be reloaded when training ends.
training_args = TrainingArguments(
    # --- bookkeeping ---
    output_dir="./results/roberta-base",
    seed=42,
    report_to="none",
    logging_steps=20,
    # --- optimisation ---
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # gradients from 2 batches of 16 per optimizer step
    learning_rate=3e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    # --- evaluation and checkpoints ---
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
# Early stopping with a patience of 3, attached to the Trainer as a callback.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the model, the training configuration, both tokenized splits, the
# tokenizer and the metric function into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning and measure how long it takes in wall-clock time.
start = time.time()
train_result = trainer.train()
elapsed = time.time() - start

# Split the duration into whole minutes and leftover seconds for display.
minutes, seconds = divmod(elapsed, 60)
print(f"\nTraining time: {int(minutes)}m {int(seconds)}s")

In [None]:
# Evaluate on the validation split and print the four metrics, each to four
# decimal places. The prefixes are padded so the values line up in a column.
eval_results = trainer.evaluate()

for prefix, metric_key in (
    ("Accuracy:  ", "eval_accuracy"),
    ("F1:        ", "eval_f1"),
    ("Precision: ", "eval_precision"),
    ("Recall:    ", "eval_recall"),
):
    print(f"{prefix}{eval_results[metric_key]:.4f}")

In [None]:
# Persist everything needed for inference into one directory: the model
# weights/config, the tokenizer files, and the label mappings as JSON.
best_model_path = "./best_roberta_model"

trainer.save_model(best_model_path)
tokenizer.save_pretrained(best_model_path)

label_mappings = dict(
    label2id=label2id,
    id2label=id2label,
    label_list=label_list,
)

# Note: JSON object keys are always strings, so the integer keys of
# `id2label` come back as strings when this file is read again.
with open(f"{best_model_path}/label_mappings.json", 'w') as f:
    json.dump(label_mappings, f, indent=2)

print(f"Model saved to {best_model_path}")

In [None]:
import torch.nn.functional as F

# Smoke test: reload the saved model, tokenizer and label mappings from disk
# and classify a few hand-written sentences.
loaded_model = AutoModelForSequenceClassification.from_pretrained(best_model_path)
loaded_tokenizer = RobertaTokenizer.from_pretrained(best_model_path)

with open(f"{best_model_path}/label_mappings.json", 'r') as f:
    loaded_labels = json.load(f)

test_texts = [
    "This is absolutely amazing! I love it!",
    "Terrible experience, very disappointed.",
    "It's okay, nothing special.",
]

# Put the model in evaluation mode before running inference.
loaded_model.eval()

for text in test_texts:
    inputs = loaded_tokenizer(
        text,
        max_length=64,
        truncation=True,
        return_tensors='pt',
    )

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)
        pred_class = logits.argmax(dim=-1).item()

    # The mapping was round-tripped through JSON, so its keys are strings.
    pred_label = loaded_labels['id2label'][str(pred_class)]
    confidence = probs[0, pred_class]
    print(f"{text[:50]:50s} -> {pred_label} ({confidence:.3f})")