In [2]:
import torch
print(torch.cuda.is_available())
print(torch.version.cuda)
print(torch.__version__)
# print(torch.cuda.get_device_name(0))

True
12.8
2.8.0+cu128


Install dependencies

In [3]:
%pip install transformers datasets rouge-score nltk sentencepiece accelerate -q

import nltk
nltk.download("punkt")


Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to C:\Users\Intern-
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Load CNN/DailyMail dataset**

In [None]:
from datasets import load_dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

train_dataset = dataset["train"].shuffle(seed=42).select(range(20000))
val_dataset   = dataset["validation"].shuffle(seed=42).select(range(2000))

print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))


  from .autonotebook import tqdm as notebook_tqdm


Train size: 20000
Validation size: 2000


**Load T5-small model & tokenizer**

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print("Model loaded:", model_name)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Model loaded: t5-small


**Preprocessing function**

In [6]:
def preprocess(batch):
    inputs = ["summarize: " + x for x in batch["article"]]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        batch["highlights"],
        max_length=150,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)
val_dataset   = val_dataset.map(preprocess, batched=True, remove_columns=dataset["validation"].column_names)

**ROUGE evaluation function**

In [7]:
from rouge_score import rouge_scorer
import numpy as np

scorer = rouge_scorer.RougeScorer(["rouge1","rouge2","rougeL"], use_stemmer=True)

def compute_rouge(eval_pred):
    preds, labels = eval_pred
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    r1, r2, rL = [], [], []
    for p, l in zip(preds, labels):
        scores = scorer.score(l, p)
        r1.append(scores["rouge1"].fmeasure)
        r2.append(scores["rouge2"].fmeasure)
        rL.append(scores["rougeL"].fmeasure)
    
    return {
        "rouge1": np.mean(r1),
        "rouge2": np.mean(r2),
        "rougeL": np.mean(rL)
    }


**Training Arguments**

In [18]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="t5-small-1070-colab",
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=2,        # fits 8GB VRAM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,        # simulates larger batch
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=5,
    save_strategy="epoch",                # saves checkpoint each epoch
    predict_with_generate=True,
    fp16=True,                             # mixed precision
    logging_steps=50,
    report_to="none"
)


**Trainer**

In [19]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_rouge
)


  trainer = Seq2SeqTrainer(


**Start Training**

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,0.9963,0.997183,0.248773,0.121371,0.205225
2,0.9719,0.988497,0.250995,0.122495,0.205746


TrainOutput(global_step=5000, training_loss=1.0155254341125488, metrics={'train_runtime': 5674.3396, 'train_samples_per_second': 7.049, 'train_steps_per_second': 0.881, 'total_flos': 5413672058880000.0, 'train_loss': 1.0155254341125488, 'epoch': 2.0})

In [20]:
trainer.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
4,0.9061,0.991228,0.249119,0.120736,0.204701
5,0.9577,0.989651,0.249724,0.121664,0.205419


TrainOutput(global_step=12500, training_loss=0.3760144564819336, metrics={'train_runtime': 6540.2537, 'train_samples_per_second': 15.29, 'train_steps_per_second': 1.911, 'total_flos': 1.35341801472e+16, 'train_loss': 0.3760144564819336, 'epoch': 5.0})

**Save Final Model**

In [21]:
trainer.save_model("t5-small-1070-colab")
tokenizer.save_pretrained("t5-small-1070-colab")


('t5-small-1070-colab\\tokenizer_config.json',
 't5-small-1070-colab\\special_tokens_map.json',
 't5-small-1070-colab\\spiece.model',
 't5-small-1070-colab\\added_tokens.json')

In [4]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the latest checkpoint (YOUR REAL TRAINED MODEL)
checkpoint_path = r"C:\Users\Intern-Tech\Documents\Text_Summarizer\t5-small-1070-colab\checkpoint-12500"

model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)
tokenizer = T5Tokenizer.from_pretrained(checkpoint_path)

# Save final clean model directory
model.save_pretrained("t5-small-final")
tokenizer.save_pretrained("t5-small-final")

print("Model saved to t5-small-final/")


Weights Saved!


In [None]:
torch.save(model.state_dict(), "t5_weights.pt")
print("Weights Saved!")