# BART-Base Fine-tuning for Document Summarization
This notebook demonstrates fine-tuning BART-base model on CNN/DailyMail dataset for document summarization tasks.

In [6]:
# Install required packages
%pip install evaluate accelerate

Collecting evaluate
  Using cached evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting accelerate
  Using cached accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Using cached evaluate-0.4.5-py3-none-any.whl (84 kB)
Using cached accelerate-1.10.1-py3-none-any.whl (374 kB)
Installing collected packages: accelerate, evaluate

   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [acce

In [1]:
# Import libraries
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    pipeline
)
import warnings
warnings.filterwarnings('ignore')

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

Using device: cuda
GPU: NVIDIA GeForce RTX 3050 Laptop GPU
GPU Memory: 4.0 GB


In [2]:
# Load CNN/DailyMail dataset
print("Loading CNN/DailyMail dataset...")
dataset = load_dataset('abisee/cnn_dailymail', '3.0.0')

# Use subset for faster training - adjust as needed
train_subset_size = 5000  # Use 5k samples for training
val_subset_size = 500     # Use 500 samples for validation

dataset['train'] = dataset['train'].select(range(train_subset_size))
dataset['validation'] = dataset['validation'].select(range(val_subset_size))

print(f"Training samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")

# Show a sample
print("\nSample article (first 200 chars):")
print(dataset['train'][0]['article'][:200] + "...")
print("\nSample summary:")
print(dataset['train'][0]['highlights'])

Loading CNN/DailyMail dataset...
Training samples: 5000
Validation samples: 500

Sample article (first 200 chars):
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on ...

Sample summary:
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [3]:
# Load BART-base model and tokenizer
model_name = "facebook/bart-base"
print(f"Loading {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print(f"Model loaded: {model_name}")
print(f"Model parameters: {model.num_parameters():,}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")

# Move model to device
model.to(device)
print(f"Model moved to {device}")

Loading facebook/bart-base...
Model loaded: facebook/bart-base
Model parameters: 139,420,416
Tokenizer vocab size: 50265
Model moved to cuda


In [4]:
# Preprocessing function
def preprocess_function(examples):
    # Get articles and summaries
    articles = [doc for doc in examples["article"]]
    summaries = [doc for doc in examples["highlights"]]
    
    # Tokenize inputs and targets
    model_inputs = tokenizer(
        articles,
        max_length=512,          # Max input length
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    
    # Tokenize targets (summaries)
    labels = tokenizer(
        text_target=summaries,
        max_length=128,          # Max summary length
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to dataset
print("Preprocessing dataset...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

print("Preprocessing completed!")
print(f"Tokenized training samples: {len(tokenized_dataset['train'])}")
print(f"Tokenized validation samples: {len(tokenized_dataset['validation'])}")

Preprocessing dataset...


Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Preprocessing completed!
Tokenized training samples: 5000
Tokenized validation samples: 500


In [None]:
# Preprocessing function
def preprocess_function(examples):
    # Get articles and summaries
    articles = [doc for doc in examples["article"]]
    summaries = [doc for doc in examples["highlights"]]
    
    # Tokenize inputs and targets
    model_inputs = tokenizer(
        articles,
        max_length=512,          # Max input length
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    
    # Tokenize targets (summaries)
    labels = tokenizer(
        text_target=summaries,
        max_length=128,          # Max summary length
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to dataset
print("Preprocessing dataset...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

print("Preprocessing completed!")
print(f"Tokenized training samples: {len(tokenized_dataset['train'])}")
print(f"Tokenized validation samples: {len(tokenized_dataset['validation'])}")

In [5]:
# Data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

# Training arguments optimized for speed
training_args = TrainingArguments(
    output_dir="./bart_cnn_summarization",
    num_train_epochs=2,                    # 2 epochs for balance of speed/quality
    per_device_train_batch_size=4,         # Adjust based on your GPU memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,         # Effective batch size = 4*2 = 8
    learning_rate=3e-5,                    # Good learning rate for fine-tuning
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,                             # Mixed precision for speed
    dataloader_num_workers=2,
    remove_unused_columns=True,
    report_to="none",                      # Disable wandb
    push_to_hub=False,
)

print("Training arguments configured:")
print(f"  - Epochs: {training_args.num_train_epochs}")
print(f"  - Batch size: {training_args.per_device_train_batch_size}")
print(f"  - Learning rate: {training_args.learning_rate}")
print(f"  - Mixed precision: {training_args.fp16}")

Training arguments configured:
  - Epochs: 2
  - Batch size: 4
  - Learning rate: 3e-05
  - Mixed precision: True


In [6]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("Trainer initialized!")
print("Starting training...")

# Start training
try:
    trainer.train()
    print("Training completed successfully!")
except Exception as e:
    print(f"Training failed with error: {e}")
    raise

Trainer initialized!
Starting training...


Step,Training Loss,Validation Loss
500,1.5844,1.170038
1000,1.3531,1.14455


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Training completed successfully!


In [7]:
# Save the fine-tuned model
model_save_path = "./fine_tuned_bart_cnn"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to: {model_save_path}")
print("You can now use this model for inference!")

Model saved to: ./fine_tuned_bart_cnn
You can now use this model for inference!
