In [1]:
!pip install -q torch torchvision torchaudio
!pip install -q transformers datasets evaluate accelerate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q tensorboard


In [3]:
# ==============================================
# IMDb Sentiment Classification Fine-Tuning (Colab + GPU)
# ==============================================

import os
import torch
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed,
)

# ---------------------------
# Config
# ---------------------------
MODEL_NAME = "distilbert-base-uncased"
DATASET = "imdb"
TEXT_COLUMN = "text"
LABEL_COLUMN = "label"
OUTPUT_DIR = "./outputs/imdb-finetune"

SEED = 42
NUM_EPOCHS = 3
LR = 2e-5
BATCH_SIZE = 16
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06

set_seed(SEED)

# ---------------------------
# Device Check (GPU/CPU)
# ---------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔹 Using device: {device}")

# ---------------------------
# Load Dataset
# ---------------------------
print("🔹 Loading IMDb dataset...")
raw_datasets = load_dataset(DATASET)

# ---------------------------
# Tokenizer
# ---------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    return tokenizer(examples[TEXT_COLUMN], truncation=True)

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# ---------------------------
# Model
# ---------------------------
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels).to(device)

# ---------------------------
# Data Collator
# ---------------------------
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ---------------------------
# Metric
# ---------------------------
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

# ---------------------------
# Training Arguments
# ---------------------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",   # ✅ corrected (was eval_strategy)
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    logging_dir=f"{OUTPUT_DIR}/logs",
    load_best_model_at_end=True,
    report_to="none",
)

# ---------------------------
# Trainer
# ---------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,   # ✅ updated
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ---------------------------
# Train
# ---------------------------
print("🚀 Starting training...")
trainer.train()

# ---------------------------
# Evaluate
# ---------------------------
print("📊 Evaluating model...")
metrics = trainer.evaluate()
print(metrics)

# ---------------------------
# Save Model
# ---------------------------
print("💾 Saving fine-tuned model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✅ Model fine-tuned and saved at {OUTPUT_DIR}")


🔹 Using device: cuda
🔹 Loading IMDb dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

🚀 Starting training...


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2266,0.223637,0.91156
2,0.1584,0.228116,0.93052
3,0.0939,0.276004,0.93316


📊 Evaluating model...


{'eval_loss': 0.22363707423210144, 'eval_accuracy': 0.91156, 'eval_runtime': 356.519, 'eval_samples_per_second': 70.122, 'eval_steps_per_second': 4.384, 'epoch': 3.0}
💾 Saving fine-tuned model...
✅ Model fine-tuned and saved at ./outputs/imdb-finetune


In [4]:
# ==============================================
# Inference Script - IMDb Sentiment Prediction
# ==============================================

from transformers import pipeline

# Path to fine-tuned model
MODEL_PATH = "./outputs/imdb-finetune"

# Load pipeline with fine-tuned model
sentiment_pipeline = pipeline(
    "text-classification",
    model=MODEL_PATH,
    tokenizer=MODEL_PATH,
    device=0 if torch.cuda.is_available() else -1   # ✅ GPU if available
)

# Test sample reviews
test_reviews = [
    "This movie was absolutely fantastic! I loved the story and acting.",
    "Terrible movie. Waste of time, I wouldn’t recommend it.",
]

print("🔹 Running Inference...\n")
for review in test_reviews:
    result = sentiment_pipeline(review)[0]
    label = "Positive 😀" if result["label"] == "LABEL_1" else "Negative 😡"
    print(f"Review: {review}")
    print(f"Prediction: {label} (score={result['score']:.4f})\n")


Device set to use cuda:0


🔹 Running Inference...

Review: This movie was absolutely fantastic! I loved the story and acting.
Prediction: Positive 😀 (score=0.9916)

Review: Terrible movie. Waste of time, I wouldn’t recommend it.
Prediction: Negative 😡 (score=0.9935)



In [5]:
# ==============================================
# Gradio Sentiment Analysis App
# ==============================================
import gradio as gr
from transformers import pipeline

# Path to fine-tuned model
MODEL_PATH = "./outputs/imdb-finetune"

# Load fine-tuned pipeline
sentiment_pipeline = pipeline(
    "text-classification",
    model=MODEL_PATH,
    tokenizer=MODEL_PATH,
    device=0 if torch.cuda.is_available() else -1
)

def predict_sentiment(review):
    result = sentiment_pipeline(review)[0]
    label = "Positive 😀" if result["label"] == "LABEL_1" else "Negative 😡"
    return f"{label} (confidence: {result['score']:.2f})"

# Launch Gradio app
demo = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(lines=3, placeholder="Enter a movie review..."),
    outputs="text",
    title="🎬 IMDb Sentiment Classifier",
    description="Fine-tuned DistilBERT model predicting movie review sentiment."
)

demo.launch(share=True)   # share=True gives public link in Colab


Device set to use cuda:0


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://68e254dd121aa40a4f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


