# 🧠 Fine-Tune a Transformer (HuggingFace Trainer)

Yo, bro! Ready to teach a model to vibe with movie reviews? 😎
We’re fine-tuning **DistilBERT** on the IMDB dataset for sentiment analysis — positive or negative, let’s find out! 🎬
This is step 4 of your learning path, so let’s make this model a movie critic star! 🌟

In [None]:
# ✅ Step 1: Install the tools we need
# Grabbing HuggingFace’s transformers, datasets, and evaluate for metrics. Let’s roll!
try:
    # Install exact versions of transformers, datasets, evaluate and fsspec
    # (presumably pinned so reruns get the same library behavior).
    # NOTE(review): `!pip` is an IPython shell escape; a failing install is not
    # expected to raise a Python exception, so the success message below may
    # print and the `except` branch may never run even when pip reports an
    # error — confirm, and check pip's own output.
    # NOTE(review): confirm the fsspec pin falls inside the range that
    # datasets==2.20.0 declares as compatible.
    !pip install -q transformers==4.41.2 datasets==2.20.0 evaluate==0.4.3 fsspec==2024.6.0
    print("🎉 Libraries installed — ready to make some magic!")
except Exception as e:
    # Report the problem, then re-raise so the cell is marked as failed.
    print(f"😕 Installation failed: {e}")
    raise

In [None]:
# ✅ Step 2: Clear cache to avoid loading issues
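# Let’s start fresh to dodge any pesky cache errors! Heads-up: this deletes everything under /root/.cache/huggingface, so anything cached there gets downloaded again.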
try:
    import os
    import shutil

    # Location of the HuggingFace cache for the root user (the Colab default).
    cache_dir = "/root/.cache/huggingface"

    # Drop the whole cache tree if one is present...
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
        print("🧹 Cleared cache to start fresh!")

    # ...and always leave an empty directory behind for later downloads.
    os.makedirs(cache_dir, exist_ok=True)
except Exception as err:
    # Surface the reason, then re-raise so the cell fails visibly.
    print(f"😕 Cache clearing failed: {err}")
    raise

In [None]:
# ✅ Step 3: Import the goodies
# Loading libraries to make fine-tuning and evaluation a breeze.
try:
    # Tensor library used for device selection and inference.
    import torch
    # Re-imported here so this cell also works without running Step 2 first.
    import os
    # Dataset containers used to wrap the manually loaded IMDB files.
    from datasets import load_dataset, Dataset, DatasetDict
    # Tokenizer, classification model and the Trainer API.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
    # Metric loader (accuracy is loaded in Step 9).
    import evaluate
    # Used to assemble the raw reviews into a table before conversion.
    import pandas as pd
    print("🛠️ Libraries imported — let’s get to work!")
except Exception as e:
    # Report which import failed, then re-raise so the cell is marked as failed.
    print(f"😕 Import failed: {e}")
    raise

In [None]:
# ✅ Step 4: Check for GPU (faster training!)
# Let’s see if we can speed things up with a GPU.
try:
    # Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"🚀 Running on {device} — let’s make it happen!")
except Exception as err:
    # Surface the reason, then re-raise so the cell fails visibly.
    print(f"😕 Device check failed: {err}")
    raise

In [None]:
# ✅ Step 5: Download and load the IMDB dataset manually
# Downloading the dataset files directly to avoid issues with load_dataset("imdb").
try:
    # Fetch and unpack the archive only when the extracted folder is absent,
    # so re-running the cell does not download the tarball again.
    # NOTE(review): the `!` lines are IPython shell escapes; a failed download
    # or extraction is not expected to raise here — verify. A partially
    # extracted `aclImdb` folder would also make this check skip the download.
    if not os.path.exists("aclImdb"):
        !wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
        !tar -xf aclImdb_v1.tar.gz
        print("\nContents of aclImdb directory after extraction:")
        !ls aclImdb
    else:
        print("✅ Dataset directory 'aclImdb' already exists.")

    def load_imdb_data(directory):
        """Read the labelled reviews stored under *directory* into a DataFrame.

        *directory* must contain a ``pos`` and a ``neg`` sub-folder. Every
        ``.txt`` file inside them is read as UTF-8 and becomes one row; other
        files are ignored.

        Returns a DataFrame with a ``text`` column (the review) and a ``label``
        column (1 for ``pos``, 0 for ``neg``). All ``pos`` rows come first,
        and rows within a label are ordered by filename.
        """
        reviews = []
        labels = []
        for label in ['pos', 'neg']:
            subdir = os.path.join(directory, label)
            label_id = 1 if label == 'pos' else 0
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # row order — and therefore any later seeded shuffle/select —
            # identical across machines and runs.
            for filename in sorted(os.listdir(subdir)):
                if filename.endswith(".txt"):
                    with open(os.path.join(subdir, filename), 'r', encoding='utf-8') as f:
                        reviews.append(f.read())
                    labels.append(label_id)
        return pd.DataFrame({'text': reviews, 'label': labels})

    # Read both official splits from the extracted archive into DataFrames.
    train_df = load_imdb_data("aclImdb/train")
    test_df = load_imdb_data("aclImdb/test")

    # Convert the DataFrames into HuggingFace Dataset objects.
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    # Bundle the splits under the 'train' / 'test' keys used by later cells.
    dataset = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })
    print(f"📚 Loaded IMDB dataset with {len(dataset['train'])} train and {len(dataset['test'])} test samples!")
except Exception as e:
    # Report the failure with recovery hints, then re-raise so the cell fails.
    print(f"😕 Failed to load dataset: {e}")
    print("👉 Try restarting Colab (Runtime > Restart session) or checking your internet.")
    raise

In [None]:
# ✅ Step 6: Grab a smaller chunk to keep Colab happy
# Using 2000 train and 1000 test samples to avoid memory hiccups.
try:
    # Shuffle each split with a fixed seed, then keep only the leading rows:
    # 2000 for training and 1000 for evaluation.
    small_train = (
        dataset['train']
        .shuffle(seed=42)
        .select(range(2000))
    )
    small_test = (
        dataset['test']
        .shuffle(seed=42)
        .select(range(1000))
    )
    print(f"✂️ Using {len(small_train)} train and {len(small_test)} test samples — nice and lightweight!")
except Exception as err:
    # Surface the reason, then re-raise so the cell fails visibly.
    print(f"😕 Failed to split dataset: {err}")
    raise

In [None]:
# ✅ Step 7: Load tokenizer and model
# DistilBERT is a speedy, small version of BERT — perfect for Colab’s free tier.
try:
    # Checkpoint shared by the tokenizer and the classification model.
    model_ckpt = "distilbert-base-uncased"

    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    # Two output labels: negative (0) and positive (1).
    model = AutoModelForSequenceClassification.from_pretrained(
        model_ckpt,
        num_labels=2,
    )
    # Place the weights on the device chosen in Step 4.
    model = model.to(device)
    print("🧠 Loaded DistilBERT model and tokenizer — ready to learn those reviews!")
except Exception as err:
    # Surface the reason, then re-raise so the cell fails visibly.
    print(f"😕 Failed to load model/tokenizer: {err}")
    raise

In [None]:
# ✅ Step 8: Tokenize the reviews
# Turning text into numbers the model can understand (like teaching it movie lingo).
try:
    def tokenize_fn(batch):
        """Tokenize a batch of reviews into fixed-length model inputs.

        *batch* is the mapping that ``Dataset.map(..., batched=True)`` passes
        in; its ``'text'`` entry holds the review strings.

        Returns the tokenizer output as plain Python lists. Every sequence is
        truncated or padded to exactly 512 tokens, so all stored examples have
        the same length regardless of which map batch they were processed in.
        """
        # No `return_tensors` / `.to(device)` here: the mapped dataset is
        # stored on the CPU side, so building GPU tensors at this point only
        # costs a round trip. Device placement is left to the Trainer.
        return tokenizer(
            batch['text'],
            padding='max_length',
            truncation=True,
            max_length=512,
        )

    tokenized_train = small_train.map(tokenize_fn, batched=True)
    tokenized_test = small_test.map(tokenize_fn, batched=True)
    print("✨ Tokenized datasets — we're speaking DistilBERT’s language now!")
except Exception as e:
    # Report the failure, then re-raise so the cell is marked as failed.
    print(f"😕 Tokenization failed: {e}")
    raise

In [None]:
# ✅ Step 9: Set up accuracy metric
# Let’s measure how well our model predicts positive vs. negative vibes.
try:
    accuracy = evaluate.load('accuracy')

    def compute_metrics(eval_pred):
        """Return the accuracy for one evaluation pass.

        *eval_pred* unpacks into ``(logits, labels)``: the per-class scores
        produced by the model and the reference class ids.

        Returns the dictionary produced by the ``accuracy`` metric.
        """
        logits, labels = eval_pred
        # The highest-scoring class is the prediction. This runs on the CPU:
        # sending the logits to the GPU just for an argmax would only add two
        # copies, and would tie this function to the global `device`.
        preds = torch.as_tensor(logits).argmax(dim=-1).numpy()
        return accuracy.compute(predictions=preds, references=labels)

    print("📊 Accuracy metric ready — ready to check our score!")
except Exception as e:
    # Report the failure, then re-raise so the cell is marked as failed.
    print(f"😕 Failed to load metric: {e}")
    raise

In [None]:
# ✅ Step 10: Set up the training pipeline
# This is our study plan for DistilBERT to learn the movie review game.
try:
    training_args = TrainingArguments(
        # Where checkpoints and logs are written.
        output_dir='./results',
        logging_dir='./logs',
        logging_steps=10,
        report_to='none',
        # Schedule: two epochs, batches of 8 for both training and evaluation.
        num_train_epochs=2,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        # Evaluate and checkpoint once per epoch, then reload the checkpoint
        # with the best accuracy when training finishes.
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
    )

    # Wire the model, tokenized splits and metric function together.
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
    )
    print("⚙️ Training pipeline ready — time to train like a champ!")
except Exception as err:
    # Surface the reason, then re-raise so the cell fails visibly.
    print(f"😕 Failed to set up trainer: {err}")
    raise

In [None]:
# ✅ Step 11: Train the model
# Let’s teach DistilBERT to predict those movie review sentiments!
try:
    # Run the fine-tuning loop configured in Step 10.
    trainer.train()
    print("🎉 Training complete — our model’s got some serious movie review skills now!")
except Exception as err:
    # Show the error plus a memory-related hint, then re-raise.
    print(f"😕 Training failed: {err}")
    print("👉 Try reducing batch size to 4 or restarting Colab if memory runs out!")
    raise

In [None]:
# ✅ Step 12: Evaluate the model
# How good is our model at predicting sentiments? Let’s find out!
try:
    # Score the model on the evaluation split given to the Trainer.
    results = trainer.evaluate()
    print(f"\n📈 Evaluation results: {results}")
except Exception as err:
    # Surface the reason, then re-raise so the cell fails visibly.
    print(f"😕 Evaluation failed: {err}")
    raise

In [None]:
# ✅ Step 13: Test with your own review
# Throw in a movie review and see what the model thinks!
try:
    test_review = "I absolutely loved this movie, it was fantastic!"

    # Switch to inference behaviour (dropout off) so the prediction does not
    # depend on whether the evaluation cell happened to run before this one.
    model.eval()

    # Put the inputs on whatever device the model currently lives on.
    inputs = tokenizer(
        test_review,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    ).to(model.device)

    # No gradients are needed for a forward-only prediction.
    with torch.no_grad():
        outputs = model(**inputs)

    # Index of the highest-scoring class for the single review in the batch.
    prediction = outputs.logits.argmax(dim=-1).item()
    print(f"\n🎥 Your review: '{test_review}'")
    print(f"🤖 Prediction: {'Positive' if prediction == 1 else 'Negative'}")
except Exception as e:
    # Report the failure, then re-raise so the cell is marked as failed.
    print(f"😕 Prediction failed: {e}")
    raise

# 📚 Tips for Having Fun
- Got a GPU? Try a bigger dataset in Step 6 — e.g., change `range(2000)` to `range(5000)` to train on 5000 samples.
- Play with Step 13: Test reviews like "This movie was awful!" or your own.
- Want a bigger model? Set `model_ckpt = "bert-base-uncased"` in Step 7 (needs more memory).
- Dive into HuggingFace’s Trainer docs (https://huggingface.co/docs/transformers/main_classes/trainer) for pro tips!

# 🚀 What’s Next?
- Save this as your fourth notebook in your learning path.
- Use this model in your LLM Evaluation notebook (step 3) to check its outputs.
- Explore RAG with LangChain/ChromaDB or Weaviate (steps 5–6) for more fun!
