In [19]:
import transformers
print(transformers.__version__)
!pip install --upgrade --force-reinstall transformers datasets evaluate sacrebleu rouge_score



4.50.2
Collecting transformers
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Using cached sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting rouge_score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting packaging>=20.0 (from transformers)
  Downloading packa

# Task 1  Transformers

In [7]:
# ============================================================
# 1. Install and Import Required Libraries
# ============================================================
!pip install --upgrade --force-reinstall datasets transformers evaluate sacrebleu rouge_score -q

import os
from datasets import load_dataset
from transformers import (
    BartTokenizerFast,
    BartForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from evaluate import load as load_metric

# Optionally clear the Hugging Face cache
os.system("rm -rf ~/.cache/huggingface")

# ============================================================
# 2. Load a Very Small Subset of CNN/DailyMail (for debugging)
# ============================================================
# Load the full dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Select a fixed sample from the training split (e.g. first 1,000 examples)
fixed_sample = dataset["train"].select(range(1000))

# Create a 90/10 train/test split on the fixed sample with a fixed seed for reproducibility
split_dataset = fixed_sample.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
test_dataset  = split_dataset["test"]

print(f"Train dataset size: {len(train_dataset)} | Test dataset size: {len(test_dataset)}")

# ============================================================
# 3. Tokenization (using BartTokenizerFast)
# ============================================================
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-large-cnn")

def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["article"],
        max_length=256,
        truncation=True
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["highlights"],
            max_length=64,
            truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["article", "highlights", "id"])
tokenized_test  = test_dataset.map(tokenize_function, batched=True, remove_columns=["article", "highlights", "id"])

# ============================================================
# 4. Define a Robust Recursive Flattening Function
# ============================================================
def flatten_sequence(seq):
    """
    Recursively flattens any nested list into a single flat list.
    """
    if not isinstance(seq, list):
        return [seq]
    flat = []
    for item in seq:
        if isinstance(item, list):
            flat.extend(flatten_sequence(item))
        else:
            flat.append(item)
    return flat

# ============================================================
# 5. Model and Training Arguments
# ============================================================
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
training_args = TrainingArguments(
    output_dir="./bart-summarization-demo",
    evaluation_strategy="epoch",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    logging_steps=1,
    save_total_limit=1,
    report_to="none"
)

# ============================================================
# 6. Define a Robust Helper to Safely Convert to Token IDs
# ============================================================
def safe_token_id(x):
    """
    Attempt to convert x to an integer. If it fails or if the resulting integer
    is outside the valid range for the tokenizer's vocabulary, return the pad token id.
    """
    try:
        token = int(x)
    except Exception:
        return tokenizer.pad_token_id
    # Check that the token is in the valid range.
    if token < 0 or token >= tokenizer.vocab_size:
        return tokenizer.pad_token_id
    return token

# ============================================================
# 7. Define Compute Metrics Function with Safe Token Conversion
# ============================================================
rouge = load_metric("rouge")
bleu = load_metric("sacrebleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # If predictions or labels are tuples, use the first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    if isinstance(labels, tuple):
        labels = labels[0]

    # If not lists, convert to lists.
    if not isinstance(predictions, list):
        predictions = predictions.tolist()
    if not isinstance(labels, list):
        labels = labels.tolist()

    # Replace -100 in labels with pad_token_id.
    labels = [
        [tokenizer.pad_token_id if l == -100 else l for l in label]
        for label in labels
    ]

    # Flatten each prediction and label, and convert each token safely.
    flat_preds = [ [safe_token_id(x) for x in flatten_sequence(p)] for p in predictions ]
    flat_labels = [ [safe_token_id(x) for x in flatten_sequence(l)] for l in labels ]

    # Optional: Uncomment to debug the first prediction and label
    # print("First flattened prediction:", flat_preds[0])
    # print("First flattened label:", flat_labels[0])

    decoded_preds = tokenizer.batch_decode(flat_preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(flat_labels, skip_special_tokens=True)

    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_results = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])

    return {
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "bleu": bleu_results["score"]
    }

# ============================================================
# 8. Initialize Trainer and Run Training/Evaluation
# ============================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Starting training ...")
trainer.train()

print("\nEvaluation on test samples:")
results = trainer.evaluate()
print(results)


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.4 which is incompatible.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Train dataset size: 900 | Test dataset size: 100


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]



Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

  trainer = Trainer(


Starting training on 2 samples...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu
1,0.4382,1.757172,0.000765,5.1e-05,0.000754,0.000202





Evaluation on 2 test samples:


{'eval_loss': 1.7571722269058228, 'eval_rouge1': 0.0007653803067834715, 'eval_rouge2': 5.137906251331029e-05, 'eval_rougeL': 0.0007537364033793425, 'eval_bleu': 0.00020238306662330465, 'eval_runtime': 2861.7542, 'eval_samples_per_second': 0.035, 'eval_steps_per_second': 0.035, 'epoch': 1.0}


# Dataset Description

The dataset selected is **CNN/DailyMail (version 3.0.0)** — a widely used benchmark for abstractive summarization. It consists of news articles from CNN and the Daily Mail paired with human-written summaries (highlights). The full dataset includes approximately 300K training examples and around 10–11K examples for validation and testing. For our experiments, we selected a fixed sample (e.g., the first 1,000 training examples) and then applied a 90%–10% train/test split on that fixed sample. This fixed sample approach allows for reproducible experiments and faster debugging without processing the entire dataset.

# Results Analysis & Hyperparameter Impact

The evaluation output from our experiments shows very low ROUGE and BLEU scores (for example, eval_rouge1 ≈ 0.000765 and eval_bleu ≈ 0.000202) and an evaluation loss of approximately 1.757. These low scores indicate that our minimal setup—using a small fixed sample, training for very few epochs, and a very small batch size—is insufficient for the model to learn effective summarization.

Training on a very small fixed sample does not provide enough examples for the model to capture the complex patterns needed for high-quality summarization. With only one (or very few) epochs, the model has limited opportunities to adjust its weights; increasing the number of epochs would generally lead to improved performance.

Using a very small batch size (e.g., 1 or 2) results in noisy gradient estimates. A larger batch size, within the limits of available GPU memory, would help stabilize training and improve convergence. Moreover, the learning rate is a critical hyperparameter: too low a rate may slow convergence, while too high a rate can lead to unstable training. Fine-tuning the learning rate is therefore essential.

Additionally, the frequency of evaluation and logging can impact the training process by affecting the balance between monitoring progress and training efficiency.

# Impact of the Choice of LLM

The underlying Large Language Model (LLM) significantly affects the results. We used **facebook/bart-large-cnn**, a model specifically pretrained and fine-tuned for summarization tasks on CNN/DailyMail. Its architecture and pretraining objectives are optimized for generating coherent summaries from news articles.

Larger models like BART-large have a greater capacity to capture complex patterns, but they require more data, longer training times, and careful hyperparameter tuning. A smaller model might train faster but could sacrifice summarization quality. The choice of LLM directly influences the optimal hyperparameters—such as batch size, learning rate, and number of epochs. While a model like BART-large-cnn provides a strong baseline for summarization, its performance is highly dependent on the training configuration.

# Summary

In summary, we selected the CNN/DailyMail dataset using a fixed sample (e.g., the first 1,000 examples) with a 90/10 train/test split to ensure reproducibility and facilitate quick debugging. The low evaluation scores highlight that training on a very limited dataset with few epochs and a small batch size is insufficient for effective learning. Improved performance would likely be achieved by increasing the training sample size, number of epochs, and adjusting the learning rate and batch size.

The choice of LLM (facebook/bart-large-cnn) has a significant impact on performance. Its specialization for summarization tasks gives it an advantage; however, the quality of the results is highly sensitive to the training configuration and hyperparameter settings. Experimenting with different model architectures—such as a smaller model or an alternative like T5—would affect both the training efficiency and the quality of the generated summaries.

**Recommendations for Improved Results:**

- Increase the number of training epochs.
- Use a larger subset of the dataset.
- Adjust the batch size and learning rate to better suit your hardware and task requirements.
- Experiment with different model architectures to find the optimal balance between training speed and summarization quality.
