# Fine-tuning T5-Small for ArXiv Paper Summarization


Install Required Libraries (if not installed)

In [None]:
#%pip install datasets evaluate transformers rouge-score nltk

Note: you may need to restart the kernel to use updated packages.


## Import Required Libraries

In [2]:
import numpy as np
import torch
from datasets import load_dataset
from evaluate import load
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') 

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Setup Device (CPU/GPU)

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


Load Pre-trained Model and Tokenizer

In [4]:
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

## Load Dataset and Evaluation Metric

Load ArXiv summarization dataset

In [5]:
raw_datasets = load_dataset("ccdv/arxiv-summarization")
metric = load("rouge")

## Define Data Processing Parameters and Functions

Set maximum lengths for input and output

In [None]:
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """ 
    Preprocess and tokenize the dataset for the model.

    Args:
        examples: Raw datset examples

    Returns:
        model_inputs: Tokenized and formatted inputs for the model
    """
    inputs = [f"abstract: {doc}" for doc in examples["article"]]
    model_inputs = tokenizer(
        inputs, 
        max_length=max_input_length, 
        truncation=True,
        padding="max_length"
    )
    
    labels = tokenizer(
        text_target=examples["abstract"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

## Preprocess Dataset

Apply preprocessing to entire dataset

In [8]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Configure Training Parameters

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-small-finetuned-arxiv",
    evaluation_strategy="steps",
    eval_steps=1000,  
    save_steps=1000, 
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    logging_dir="./logs",
    report_to="none",
    push_to_hub=False
)


# Initialize data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



Define Evaluation Metrics

In [None]:
def compute_metrics(eval_pred):
    """
    Compute the metrics for evaluation.

    Args:
        eval_pred: Evaluation predictions

    Returns:
        result: Dictionary of computed metrics
    """
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        use_aggregator=True
    )
    return {k: round(v * 100, 4) for k, v in result.items()}

## Initialize and Start Training
Setup trainer


In [11]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


Start training


In [12]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1000,2.7368,2.57882,17.8685,6.416,14.1888,16.1815
2000,2.6537,2.524384,18.122,6.7082,14.4169,16.4287
3000,2.628,2.492149,18.1831,6.733,14.4298,16.4544
4000,2.5906,2.468353,18.2407,6.794,14.461,16.5224
5000,2.5771,2.457046,18.3844,6.8785,14.5734,16.6396
6000,2.5688,2.442801,18.4195,6.9044,14.5646,16.6825
7000,2.531,2.431459,18.4318,6.91,14.5544,16.6857
8000,2.5419,2.422232,18.4713,6.9145,14.597,16.72
9000,2.5099,2.415556,18.4608,6.9495,14.6006,16.7166
10000,2.5115,2.40943,18.5402,6.9834,14.6445,16.7832


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=12690, training_loss=2.5846937631978304, metrics={'train_runtime': 25631.5724, 'train_samples_per_second': 7.921, 'train_steps_per_second': 0.495, 'total_flos': 5.495878669094093e+16, 'train_loss': 2.5846937631978304, 'epoch': 1.0})

Save model

In [13]:
trainer.save_model("t5-small-finetuned-arxiv")