In [1]:
!pip uninstall -y transformers accelerate peft sentence-transformers
!pip install transformers==4.38.2 datasets==2.18.0 accelerate==0.28.0 peft==0.9.0 sentence-transformers==2.2.2

Found existing installation: transformers 4.53.3
Uninstalling transformers-4.53.3:
  Successfully uninstalled transformers-4.53.3
Found existing installation: accelerate 1.9.0
Uninstalling accelerate-1.9.0:
  Successfully uninstalled accelerate-1.9.0
Found existing installation: peft 0.16.0
Uninstalling peft-0.16.0:
  Successfully uninstalled peft-0.16.0
Found existing installation: sentence-transformers 4.1.0
Uninstalling sentence-transformers-4.1.0:
  Successfully uninstalled sentence-transformers-4.1.0
Collecting transformers==4.38.2
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m130.7/130.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.18.0
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate==0.28.0
  Downloading accelerate-0.28.0-py3-none-an

In [2]:
import os
import torch
import shutil
from tqdm.auto import tqdm
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

def fine_tune_flan_t5_optimized():
    # --- 1Ô∏è‚É£ Setup ---
    model_name = "google/flan-t5-base"
    dataset_name = "cnn_dailymail"
    dataset_version = "3.0.0"
    cache_dir = "/kaggle/working/tokenized_cnn"
    output_dir = "/kaggle/working/flan-t5-cnn-dailymail-model"

    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"GPUs detected: {torch.cuda.device_count()}")

    # --- 2Ô∏è‚É£ Load Dataset ---
    print("\nüì¶ Loading dataset...")
    train_split = load_dataset(dataset_name, dataset_version, split="train[:100000]")
    validation_split = load_dataset(dataset_name, dataset_version, split="validation")
    test_split = load_dataset(dataset_name, dataset_version, split="test")

    dataset = DatasetDict({
        "train": train_split,
        "validation": validation_split,
        "test": test_split
    })

    # --- 3Ô∏è‚É£ Tokenizer & Model ---
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # --- 4Ô∏è‚É£ Preprocessing Function ---
    def preprocess_function(examples):
        prefix = "summarize: "
        inputs = [prefix + doc for doc in examples["article"]]
        model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
        labels = tokenizer(examples["highlights"], max_length=128, truncation=True, padding="max_length")
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # --- 5Ô∏è‚É£ Cache Handling ---
    rebuild_cache = False
    if os.path.exists(cache_dir):
        try:
            print("\nüìÇ Loading tokenized dataset from cache...")
            tokenized_datasets = load_from_disk(cache_dir)
            # Check if splits are non-empty
            if any(len(split) == 0 for split in tokenized_datasets.values()):
                print("‚ö†Ô∏è Cache appears empty ‚Äî rebuilding...")
                rebuild_cache = True
        except Exception as e:
            print(f"‚ö†Ô∏è Cache corrupted ({e}) ‚Äî rebuilding...")
            rebuild_cache = True
    else:
        rebuild_cache = True

    # --- 6Ô∏è‚É£ Tokenization Step (if needed) ---
    if rebuild_cache:
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
        print("‚öôÔ∏è Tokenizing dataset ‚Äî this may take a few minutes...")
        tokenized_datasets = dataset.map(
            preprocess_function,
            batched=True,
            num_proc=4,
            remove_columns=dataset["train"].column_names,
            desc="Tokenizing"
        )
        tokenized_datasets.save_to_disk(cache_dir)
        print("‚úÖ Tokenized dataset saved to cache.")
    else:
        print("‚úÖ Using cached dataset.")

    print({k: len(v) for k, v in tokenized_datasets.items()})

    # --- 7Ô∏è‚É£ Training Arguments ---
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=4,   # Larger batch = faster training
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,
        save_total_limit=2,
        fp16=True,
        bf16=False,
        optim="adafactor",              # ‚úÖ Adafactor saves VRAM
        evaluation_strategy="steps",
        eval_steps=2000,
        save_steps=2000,
        logging_steps=200,
        predict_with_generate=True,
        report_to="none",
        load_best_model_at_end=True
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # --- 8Ô∏è‚É£ Trainer ---
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # --- 9Ô∏è‚É£ Train ---
    print("\nüöÄ Starting fine-tuning...")
    trainer.train()

    # --- üîü Save Model ---
    final_model_path = f"{output_dir}/final"
    print("\nüíæ Saving model...")
    trainer.save_model(final_model_path)

    # --- üîÅ Zip Model ---
    print("\nüì¶ Creating ZIP archive...")
    shutil.make_archive("/kaggle/working/my_final_cnn_model", 'zip', final_model_path)
    print("‚úÖ Model archived at /kaggle/working/my_final_cnn_model.zip")

if __name__ == "__main__":
    fine_tune_flan_t5_optimized()


2025-10-14 05:20:20.904033: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760419221.330685      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760419221.451449      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


CUDA available: True
GPUs detected: 2

üì¶ Loading dataset...


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 257M/257M [00:01<00:00, 163MB/s]
Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 257M/257M [00:01<00:00, 184MB/s]
Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 259M/259M [00:01<00:00, 183MB/s]
Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 34.7M/34.7M [00:00<00:00, 110MB/s]
Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30.0M/30.0M [00:00<00:00, 101MB/s] 


Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

‚öôÔ∏è Tokenizing dataset ‚Äî this may take a few minutes...


Tokenizing (num_proc=4):   0%|          | 0/100000 [00:00<?, ? examples/s]

Tokenizing (num_proc=4):   0%|          | 0/13368 [00:00<?, ? examples/s]

Tokenizing (num_proc=4):   0%|          | 0/11490 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/13368 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11490 [00:00<?, ? examples/s]

‚úÖ Tokenized dataset saved to cache.
{'train': 100000, 'validation': 13368, 'test': 11490}


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



üöÄ Starting fine-tuning...




Step,Training Loss,Validation Loss
2000,1.0179,1.056085
4000,0.9996,1.056477
6000,1.0044,1.052414


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].



üíæ Saving model...

üì¶ Creating ZIP archive...
‚úÖ Model archived at /kaggle/working/my_final_cnn_model.zip
