# Fine-Tuning a Transformer for Sentiment Analysis
Goal: Fine-tune a pre-trained DistilBERT model to classify IMDb movie reviews as "positive" or "negative".


**Cell 1: Install Necessary Libraries**

In [None]:
!pip install --upgrade transformers datasets accelerate evaluate

**Cell 2: Imports and GPU Check**

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

# Choose where tensors will live: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

**Cell 3: Load and Prepare the Dataset**

In [None]:
# Load the IMDb movie-review dataset from the Hugging Face Hub.
# It is a common benchmark for binary sentiment analysis.
dataset = load_dataset("imdb")

# The loaded object is indexed by split name; "train" is used for fine-tuning
# and "test" for evaluation.

# How many examples to draw from each split, and the seed used to shuffle them.
# Lower the sizes for a quicker run.
# NOTE(review): 25,000 is believed to be the entire IMDb "train" split, in which
# case no training data is actually dropped here -- confirm against the dataset card.
TRAIN_SUBSET_SIZE = 25000
TEST_SUBSET_SIZE = 2500
SHUFFLE_SEED = 42

# Shuffle with a fixed seed so the selection is reproducible, then keep the
# first TRAIN_SUBSET_SIZE / TEST_SUBSET_SIZE rows of each shuffled split.
train_dataset = dataset["train"].shuffle(seed=SHUFFLE_SEED).select(range(TRAIN_SUBSET_SIZE))
test_dataset = dataset["test"].shuffle(seed=SHUFFLE_SEED).select(range(TEST_SUBSET_SIZE))

# Print one training example to show the structure of a row.
print("Sample training data:")
print(train_dataset[0])

**Cell 4: Preprocessing with a Tokenizer**

In [None]:
# Checkpoint name on the Hugging Face Hub. The tokenizer must come from the
# same checkpoint as the model so that token IDs line up with its vocabulary.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: A mapping whose ``"text"`` entry holds the raw review text
            (a list of strings when called through ``.map(..., batched=True)``).

    Returns:
        The tokenizer output for those texts. ``padding="max_length"`` pads
        every sequence up to the tokenizer's maximum length, and
        ``truncation=True`` cuts longer sequences down to that length.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
    )
    return encoded

# Run tokenize_function over both subsets with Dataset.map().
# batched=True passes many rows per call instead of one, which is more efficient.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Each row now carries the tokenizer's extra columns: 'input_ids' (the token IDs)
# and 'attention_mask' (marks which positions are real tokens versus padding).
print("\nSample tokenized data:")
print(tokenized_train_dataset[0])

**Cell 5: Load the Pre-trained Model**

In [None]:
# Instantiate the pre-trained checkpoint named by `model_name` with a
# sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # two output classes: negative and positive sentiment
)

# Place the model on the device picked during the GPU check
# (CUDA if it was available, otherwise CPU).
model.to(device)

**Cell 6: Define Training Arguments**

In [None]:
from transformers.trainer_utils import IntervalStrategy

# Hyperparameters and bookkeeping options that the Trainer will follow.
training_args = TrainingArguments(
    # --- Where artifacts are written ---
    output_dir="./results",  # checkpoints and the trained model
    logging_dir="./logs",    # training logs
    # --- Schedule ---
    num_train_epochs=3,      # number of full passes over the training data
    warmup_steps=500,        # steps over which the learning rate ramps up, for a stable start
    # --- Batching (per GPU/CPU); larger batches are faster but need more memory ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- Regularization ---
    weight_decay=0.01,       # weight decay strength, to help limit overfitting
    # --- Logging and checkpoint retention ---
    logging_steps=100,       # log the training loss and other metrics every 100 steps
    # Keep at most one checkpoint on disk; older checkpoints are deleted as new
    # ones are saved. This retains the most recent checkpoint, not necessarily
    # the best one -- that would additionally need load_best_model_at_end.
    save_total_limit=1,
)

In [None]:
import evaluate

# Load the accuracy metric from the `evaluate` library (the replacement for the
# older `datasets.load_metric` API). compute_metrics uses this object to score
# the model's predictions during evaluation.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass; passed to the Trainer as its metrics callback.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``: the model's
            raw output scores and the true class labels.

    Returns:
        Whatever ``metric.compute`` returns for the predicted versus true labels
        (presumably a dict holding the accuracy -- verify against the metric).
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest score on the last axis.
    predicted_classes = scores.argmax(axis=-1)
    result = metric.compute(predictions=predicted_classes, references=references)
    return result

**Cell 7: Create the Trainer and Train!**

In [None]:
# Bundle the model, its training configuration, both tokenized datasets and
# the metrics callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,  # lets evaluation report accuracy in addition to loss
    eval_dataset=tokenized_test_dataset,
    train_dataset=tokenized_train_dataset,
)

# Run the fine-tuning loop, announcing its start and end.
print("Starting training...")
trainer.train()
print("Training finished!")

**Cell 8: Evaluate the Fine-Tuned Model**

In [None]:
# Evaluate on the eval_dataset the Trainer was built with and keep the metrics.
print("Evaluating the model on the test set...")
evaluation_results = trainer.evaluate()

# Report the two headline numbers, one per line, to four decimal places.
print("\n--- Evaluation Results ---")
print(
    f"Accuracy: {evaluation_results['eval_accuracy']:.4f}",
    f"Loss: {evaluation_results['eval_loss']:.4f}",
    sep="\n",
)

**Cell 9: Use the Model for a New Prediction**

In [None]:
from torch.nn.functional import softmax

# Let's test with two different reviews
reviews = [
    "This movie was absolutely fantastic! The acting was brilliant and the plot was engaging.",
    "It was a complete waste of time. The plot was predictable and the characters were boring."
]

# The labels are 0 for 'negative' and 1 for 'positive'
labels = ["Negative", "Positive"]

# Put the model in evaluation mode so dropout is disabled. Without this the
# predictions can be noisy if this cell runs straight after training.
model.eval()

# Ask the model where its weights actually live instead of reusing the
# notebook-level `device`: the Trainer may have placed the model on a
# different device than the one chosen by the CUDA check.
model_device = model.device

for review in reviews:
    # Tokenize the new text into PyTorch tensors
    inputs = tokenizer(review, return_tensors="pt", padding=True, truncation=True)

    # Move the input tensors to the same device as the model (GPU or CPU)
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    # Get predictions from the model
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)
        logits = outputs.logits

        # Apply softmax to convert logits to probabilities
        probabilities = softmax(logits, dim=1)

        # Get the most likely class
        prediction_index = torch.argmax(probabilities, dim=1).item()

    print("\n--------------------")
    print(f"Review: '{review}'")
    print(f"Prediction: {labels[prediction_index]}")
    print(f"Confidence (Probabilities): Negative={probabilities[0][0]:.4f}, Positive={probabilities[0][1]:.4f}")