In [None]:
# Installing Hugging Face transformers for model handling
# Installing datasets library for easy data loading
# Setting up evaluation metrics
# Checking GPU availability (crucial for faster training)

# Install required libraries
!pip install transformers datasets accelerate evaluate torch scikit-learn -q

# Import libraries
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import evaluate
import warnings
warnings.filterwarnings('ignore')

print("✓ All libraries installed successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h✓ All libraries installed successfully!
PyTorch version: 2.9.0+cu128
CUDA available: True


In [None]:
# IMDb contains 50,000 movie reviews (25k train, 25k test)
# Binary labels: 0 (negative), 1 (positive)
# Reviews are variable length text
# We use a subset to speed up training for learning purposes

# Load IMDb dataset from Hugging Face
print("Loading IMDb dataset...")
dataset = load_dataset("imdb")

# Explore the dataset
print("\n📊 Dataset Information:")
print(dataset)
print(f"\nTraining samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")

# Look at sample data
print("\n📝 Sample Review:")
sample = dataset['train'][0]
print(f"Text: {sample['text'][:200]}...")
print(f"Label: {sample['label']} (0=negative, 1=positive)")

# Create a smaller subset for faster training (optional for learning)
# For production, use full dataset
small_train = dataset['train'].shuffle(seed=42).select(range(2000))
small_test = dataset['test'].shuffle(seed=42).select(range(500))

print(f"\n🎯 Using subset: {len(small_train)} training, {len(small_test)} test samples")

Loading IMDb dataset...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]


📊 Dataset Information:
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

Training samples: 25000
Test samples: 25000

📝 Sample Review:
Text: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ev...
Label: 0 (0=negative, 1=positive)

🎯 Using subset: 2000 training, 500 test samples


In [None]:
# Tokenizer: Converts text to numbers the model understands
# Pre-trained model: Already knows language patterns, we're adapting it
# Sequence classification: Modified architecture for classification tasks
# Parameters: 82M means model has 82 million learnable weights

# Model configuration
model_name = "distilgpt2"  # 82M parameters
num_labels = 2  # Binary classification

print(f"Loading {model_name}...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 doesn't have a padding token, so we add one
tokenizer.pad_token = tokenizer.eos_token

# Load model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1}
)

# Configure padding token in model
model.config.pad_token_id = tokenizer.pad_token_id

# Check model size
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\n📦 Model loaded successfully!")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model size: ~{total_params / 1e6:.1f}M parameters")

Loading distilgpt2...


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

GPT2ForSequenceClassification LOAD REPORT from: distilgpt2
Key                                        | Status     | 
-------------------------------------------+------------+-
transformer.h.{0, 1, 2, 3, 4, 5}.attn.bias | UNEXPECTED | 
score.weight                               | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.



📦 Model loaded successfully!
Total parameters: 81,914,112
Trainable parameters: 81,914,112
Model size: ~81.9M parameters


In [None]:
# Converts text → token IDs (numbers)
# Truncates long texts to max_length
# Pads short texts to same length
# Creates attention masks (tells model which tokens are real vs padding)

# Tokenization function
def tokenize_function(examples):
    """
    Tokenizes the input text with truncation and padding
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=512  # Maximum sequence length
    )

print("Tokenizing dataset...")

# Apply tokenization to entire dataset
tokenized_train = small_train.map(tokenize_function, batched=True)
tokenized_test = small_test.map(tokenize_function, batched=True)

# Set format for PyTorch
tokenized_train.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_test.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

print("✓ Tokenization complete!")
print(f"\nSample tokenized output:")
print(f"Input IDs shape: {tokenized_train[0]['input_ids'].shape}")
print(f"Attention mask shape: {tokenized_train[0]['attention_mask'].shape}")

Tokenizing dataset...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

✓ Tokenization complete!

Sample tokenized output:
Input IDs shape: torch.Size([512])
Attention mask shape: torch.Size([512])


In [5]:
# Accuracy: (Correct predictions) / (Total predictions)
# Precision: True Positives / (True Positives + False Positives)
# Recall: True Positives / (True Positives + False Negatives)
# F1 Score: 2 × (Precision × Recall) / (Precision + Recall)

# Load accuracy metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """
    Computes accuracy, precision, recall, and F1 score
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Calculate metrics
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary'
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

print("Evaluation metrics configured:")
print("- Accuracy: Overall correctness")
print("- Precision: Of predicted positives, how many are correct")
print("- Recall: Of actual positives, how many we found")
print("- F1: Harmonic mean of precision and recall")

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluation metrics configured:
- Accuracy: Overall correctness
- Precision: Of predicted positives, how many are correct
- Recall: Of actual positives, how many we found
- F1: Harmonic mean of precision and recall


In [6]:
# Epoch: One complete pass through the training data
# Batch size: Number of samples processed before updating weights
# Learning rate: Step size for weight updates (too high = unstable, too low = slow)
# Weight decay: Prevents overfitting by penalizing large weights
# Warmup: Gradually increase learning rate at start (stabilizes training)

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",                    # Where to save model
    eval_strategy="epoch",                # Evaluate after each epoch
    save_strategy="epoch",                     # Save after each epoch
    learning_rate=2e-5,                        # Learning rate
    per_device_train_batch_size=8,             # Batch size for training
    per_device_eval_batch_size=8,              # Batch size for evaluation
    num_train_epochs=3,                        # Number of training epochs
    weight_decay=0.01,                         # Regularization
    load_best_model_at_end=True,               # Load best model at end
    metric_for_best_model="accuracy",          # Metric to determine best model
    push_to_hub=False,                         # Don't push to Hugging Face Hub
    logging_dir='./logs',                      # TensorBoard logs
    logging_steps=50,                          # Log every 50 steps
    warmup_steps=100,                          # Warmup steps
    report_to="none",                          # Disable wandb/tensorboard auto-logging
)

print("Training Configuration:")
print(f"- Epochs: {training_args.num_train_epochs}")
print(f"- Batch size: {training_args.per_device_train_batch_size}")
print(f"- Learning rate: {training_args.learning_rate}")
print(f"- Total training steps: ~{len(tokenized_train) // training_args.per_device_train_batch_size * training_args.num_train_epochs}")

`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


Training Configuration:
- Epochs: 3
- Batch size: 8
- Learning rate: 2e-05
- Total training steps: ~750


In [7]:
# Model makes predictions on batch
# Calculates loss (how wrong predictions are)
# Backpropagation: Calculates gradients
# Updates weights to reduce loss
# Repeats for all batches (1 epoch)
# Repeats for all epochs

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

print("🚀 Starting training...")
print("This may take 10-20 minutes depending on GPU availability\n")

# Train the model
train_result = trainer.train()

# Print training results
print("\n✅ Training completed!")
print(f"\nTraining metrics:")
print(f"- Total training time: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"- Training loss: {train_result.metrics['train_loss']:.4f}")
print(f"- Training samples per second: {train_result.metrics['train_samples_per_second']:.2f}")

🚀 Starting training...
This may take 10-20 minutes depending on GPU availability



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.37641,0.652138,0.786,0.708709,0.95935,0.815199
2,0.274887,0.32991,0.872,0.855469,0.890244,0.87251
3,0.255923,0.476322,0.86,0.830827,0.898374,0.863281


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


✅ Training completed!

Training metrics:
- Total training time: 407.38 seconds
- Training loss: 0.4195
- Training samples per second: 14.73


In [8]:
# Good performance: Accuracy > 85%, F1 > 0.85
# Moderate performance: Accuracy 70-85%, F1 0.70-0.85
# Poor performance: Accuracy < 70%, F1 < 0.70

print("\n📊 Evaluating model on test set...")

# Evaluate
eval_results = trainer.evaluate()

print("\n🎯 Final Test Results:")
print(f"- Accuracy: {eval_results['eval_accuracy']:.4f} ({eval_results['eval_accuracy']*100:.2f}%)")
print(f"- Precision: {eval_results['eval_precision']:.4f}")
print(f"- Recall: {eval_results['eval_recall']:.4f}")
print(f"- F1 Score: {eval_results['eval_f1']:.4f}")
print(f"- Test Loss: {eval_results['eval_loss']:.4f}")

# Save detailed results
results_df = pd.DataFrame([eval_results])
results_df.to_csv('evaluation_results.csv', index=False)
print("\n✓ Results saved to 'evaluation_results.csv'")


📊 Evaluating model on test set...



🎯 Final Test Results:
- Accuracy: 0.8720 (87.20%)
- Precision: 0.8555
- Recall: 0.8902
- F1 Score: 0.8725
- Test Loss: 0.3299

✓ Results saved to 'evaluation_results.csv'


In [9]:
# Confidence > 90%: Model is very certain
# Confidence 70-90%: Model is fairly confident
# Confidence < 70%: Model is uncertain

# Test the model with custom reviews
def predict_sentiment(text):
    """
    Predicts sentiment for a given text
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(predictions, dim=-1).item()
    confidence = predictions[0][predicted_class].item()

    sentiment = "positive" if predicted_class == 1 else "negative"

    return sentiment, confidence

# Test examples
test_reviews = [
    "This movie was absolutely fantastic! Best film I've seen all year.",
    "Terrible waste of time. I want my money back.",
    "It was okay, nothing special but watchable.",
    "An absolute masterpiece of cinema!",
    "Boring and predictable plot. Very disappointing."
]

print("\n🔮 Testing model with custom reviews:\n")
for review in test_reviews:
    sentiment, confidence = predict_sentiment(review)
    print(f"Review: {review[:60]}...")
    print(f"Prediction: {sentiment.upper()} (confidence: {confidence:.2%})\n")


🔮 Testing model with custom reviews:

Review: This movie was absolutely fantastic! Best film I've seen all...
Prediction: POSITIVE (confidence: 99.29%)

Review: Terrible waste of time. I want my money back....
Prediction: NEGATIVE (confidence: 97.41%)

Review: It was okay, nothing special but watchable....
Prediction: POSITIVE (confidence: 73.96%)

Review: An absolute masterpiece of cinema!...
Prediction: POSITIVE (confidence: 98.56%)

Review: Boring and predictable plot. Very disappointing....
Prediction: NEGATIVE (confidence: 71.54%)



In [10]:
# Save model and tokenizer
save_directory = "./fine_tuned_distilgpt2_imdb"

print(f"💾 Saving model to {save_directory}...")
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

print("✓ Model saved successfully!")
print(f"\nTo load this model later:")
print(f"model = AutoModelForSequenceClassification.from_pretrained('{save_directory}')")
print(f"tokenizer = AutoTokenizer.from_pretrained('{save_directory}')")

💾 Saving model to ./fine_tuned_distilgpt2_imdb...


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

✓ Model saved successfully!

To load this model later:
model = AutoModelForSequenceClassification.from_pretrained('./fine_tuned_distilgpt2_imdb')
tokenizer = AutoTokenizer.from_pretrained('./fine_tuned_distilgpt2_imdb')
