In [25]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
import torch

In [21]:
# 1. Load Dataset
dataset = load_dataset("ccdv/arxiv-summarization")
dataset = dataset["train"].select(range(1000))  # Use a smaller subset for faster training

In [23]:
# Preview a few samples
for i in range(3):
    print(f"\nSample {i+1}:")
    print("Article:\n", dataset[i]["article"][:10000])  # first 1000 chars
    print("\nAbstract:\n", dataset[i]["abstract"])


Sample 1:
Article:
 additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models . 
 it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models . 
 many examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0 , see e.g. @xcite . in the last years 
 many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.g. @xcite , @xcite , @xcite , @xcite , @xcite , @xcite and the references th

In [26]:
# 2. Tokenizer and Model Setup
model_checkpoint = "google/long-t5-tglobal-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
#     # model = torch.nn.DataParallel(model)

# model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
# 3. Preprocessing Function
def preprocess_function(examples):
    inputs = ["summarize: " + article for article in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["abstract"], max_length=150, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [28]:
# 4. Tokenize Dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [29]:
# 5. Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    run_name="longt5-arxiv-summarization-run",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    save_steps=500,
    fp16=torch.cuda.is_available(),
    report_to="none",
    remove_unused_columns=False
)

In [30]:
# 6. Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

In [31]:
print("🚀 Starting training...")
!nvidia-smi 

🚀 Starting training...
Wed Jun 18 12:35:22 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   72C    P0             30W /   70W |    5943MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4          

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
# 7. Train
trainer.train()
!nvidia-smi

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,5.1449
200,4.0221
300,3.3728
400,3.171
500,3.0113
600,2.8728
700,2.8395
800,2.746
900,2.7541
1000,2.6924




Wed Jun 18 12:51:15 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   73C    P0             30W /   70W |    6629MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [33]:
# 8. Inference on a Sample
sample_text = dataset[0]["article"]
input_ids = tokenizer("summarize: " + sample_text, return_tensors="pt", max_length=512, truncation=True).input_ids.to("cuda")
output_ids = model.module.generate(input_ids, max_length=150, num_beams=2) if hasattr(model, "module") else model.generate(input_ids, max_length=150, num_beams=2)
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [34]:
print("\nGenerated Summary:\n", summary)


Generated Summary:
 we study the case of regularized kernel based methods based on a general convex and on a bounded loss function on a general kernel . we consider the case of regularized kernel based methods based on a general convex and on a bounded loss function on a general kernel . we also consider a general case of regularized kernel based methods based on a general convex and on a bounded loss function on a general kernel . we also consider the case of regularized kernel based methods based on a general convex and on a bounded loss function .


In [39]:
pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=16c320310333a436c1ff3ba1a7bcd2159bae6fff6de919f35c91ab1613c85c58
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [40]:
# Evaluate with ROUGE
import evaluate
rouge = evaluate.load("rouge")

In [41]:
preds, refs = [], []
for ex in dataset.select(range(10)):
    input_ids = tokenizer("summarize: " + ex["article"], return_tensors="pt", max_length=2048, truncation=True).input_ids.to("cuda")
    output_ids = model.generate(input_ids, max_length=256, num_beams=2)
    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    preds.append(pred)
    refs.append(ex["abstract"])

In [42]:
results = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
print("\nROUGE Evaluation (10 samples):\n", results)


ROUGE Evaluation (10 samples):
 {'rouge1': 0.263204747006002, 'rouge2': 0.06315976492163909, 'rougeL': 0.18880264946761055, 'rougeLsum': 0.22619862034347016}
