In [1]:
!pip uninstall -y transformers accelerate peft sentence-transformers
!pip install transformers==4.38.2 datasets==2.18.0 accelerate==0.28.0 peft==0.9.0 sentence-transformers==2.2.2

Found existing installation: transformers 4.53.3
Uninstalling transformers-4.53.3:
  Successfully uninstalled transformers-4.53.3
Found existing installation: accelerate 1.9.0
Uninstalling accelerate-1.9.0:
  Successfully uninstalled accelerate-1.9.0
Found existing installation: peft 0.16.0
Uninstalling peft-0.16.0:
  Successfully uninstalled peft-0.16.0
Found existing installation: sentence-transformers 4.1.0
Uninstalling sentence-transformers-4.1.0:
  Successfully uninstalled sentence-transformers-4.1.0
Collecting transformers==4.38.2
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.18.0
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate==0.28.0
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Collecting peft==0.9.0
  Downloading peft-0.9.0-py3-n

In [2]:
import torch
from datasets import load_dataset, DatasetDict
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import math
import shutil # For zipping the file

def fine_tune_xsum():
    # --- 1. Define Model and Dataset ---
    model_name = "google/flan-t5-base"
    dataset_name = "xsum"
    
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"Number of GPUs being used: {torch.cuda.device_count()}")

    # --- 2. Load Dataset, Tokenizer, and Model ---
    print("Loading dataset, tokenizer, and model...")
    
    # --- THIS IS THE CHANGE for DATASET SIZE ---
    # Load only the first 100,000 examples for training
    train_split = load_dataset(dataset_name, split="train[:100000]")
    validation_split = load_dataset(dataset_name, split="validation")
    test_split = load_dataset(dataset_name, split="test")

    dataset = DatasetDict({
        'train': train_split,
        'validation': validation_split,
        'test': test_split
    })
    # -----------------------------------------

    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # --- 3. Preprocessing Function (Adapted for XSum) ---
    def preprocess_function(examples):
        prefix = "summarize: "
        # NOTE: XSum uses 'document' and 'summary'
        inputs = [prefix + doc for doc in examples["document"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # --- 4. Apply Preprocessing ---
    print("Preprocessing the dataset...")
    tokenized_datasets = dataset.map(preprocess_function, batched=True)
    print("Preprocessing complete.\n")

    # --- 5. Set Up Training Arguments with All Fixes ---
    training_args = Seq2SeqTrainingArguments(
        output_dir="/kaggle/working/flan-t5-xsum-model",
        num_train_epochs=1,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        save_total_limit=3,
        fp16=True,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='/kaggle/working/logs_xsum',
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=1000,
        save_steps=1000,
        load_best_model_at_end=True,
        report_to="none",
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # --- 7. Start Fine-Tuning ---
    print("Starting the fine-tuning process...")
    trainer.train()

    # --- 8. Save the Final Model ---
    print("Training complete. Saving the final, best model...")
    final_model_path = "/kaggle/working/flan-t5-xsum-model/final"
    trainer.save_model(final_model_path)
    print(f"Model saved successfully to: {final_model_path}")

    # --- 9. AUTOMATICALLY CREATE A ZIP FILE ---
    print("\nCreating a zip file of the final model for easy download...")
    shutil.make_archive("/kaggle/working/my_final_xsum_model", 'zip', final_model_path)
    print("Successfully created my_final_xsum_model.zip in /kaggle/working/")
    print("You can now download this file from the 'Output' panel on the right.")

if __name__ == "__main__":
    fine_tune_xsum()

2025-10-14 09:52:46.369182: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760435566.561636      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760435566.623776      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


CUDA available: True
Number of GPUs being used: 2
Loading dataset, tokenizer, and model...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Preprocessing the dataset...


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]



Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

Preprocessing complete.



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Starting the fine-tuning process...




Step,Training Loss,Validation Loss
1000,0.5064,0.445638
2000,0.4881,0.441391
3000,0.4942,0.439014
4000,0.4864,0.437051
5000,0.4738,0.435665
6000,0.4842,0.434833


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Training complete. Saving the final, best model...
Model saved successfully to: /kaggle/working/flan-t5-xsum-model/final

Creating a zip file of the final model for easy download...
Successfully created my_final_xsum_model.zip in /kaggle/working/
You can now download this file from the 'Output' panel on the right.
