In [None]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score

Collecting transformers
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.48.2-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.1
    Uninstalling transformers-4.47.1:
      Successfully uninstalled transformers-4.47.1
Successfully installed transformers-4.48.2
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multi

In [None]:
import torch
import pprint
import evaluate
import numpy as np

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [None]:
from huggingface_hub import login


In [None]:
# Fetch the SAMSum corpus from the Hugging Face Hub.
# (An earlier revision of this cell loaded a copy from
#  "/content/drive/MyDrive/Summarization Datasets/samsum" instead.)
Dataset = load_dataset("knkarthick/samsum")

# Pull the three splits out into their own names.
dataset_train, dataset_valid, dataset_test = (
    Dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# Show the features and row count of each split, one per line.
print(dataset_train, dataset_valid, dataset_test, sep="\n")

README.md:   0%|          | 0.00/4.36k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/9.26M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/504k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/522k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 14732
})
Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 818
})
Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 819
})


In [None]:
# Checkpoint identifier passed to from_pretrained for both the model and the tokenizer.
MODEL = 'csb05/T5-base-ami'
# Per-device batch size, used for training and for evaluation alike.
BATCH_SIZE = 4
# Number of worker processes for the Dataset.map tokenization passes.
NUM_PROCS = 4
# Number of training epochs handed to TrainingArguments.
EPOCHS = 5
# Google Drive folder used as the Trainer output dir, the logging dir,
# and the destination of the final save_model / save_pretrained calls.
OUT_DIR = '/content/drive/MyDrive/Summarization Model/T5-AMI-Samsum'
# Token limit applied when tokenizing both the dialogues and the summaries
# (sequences are truncated and padded to exactly this length).
MAX_LENGTH = 512


In [None]:
# Ask PyTorch to release its cached CUDA memory before loading the checkpoint.
torch.cuda.empty_cache()

# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): use_cache=False is set on the loaded model and is presumably
# carried along by the later save/push calls -- confirm that is intended.
model = T5ForConditionalGeneration.from_pretrained(MODEL, use_cache=False)
model.to(device)

# Report the overall parameter count and the subset that requires gradients.
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_counts)
total_trainable_params = sum(
    count for count, trainable in param_counts if trainable
)
print(f"{total_params:,} total parameters.")
print(f"{total_trainable_params:,} training parameters.")

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

222,903,552 total parameters.
222,903,552 training parameters.


In [None]:
# Tokenizer that belongs to the checkpoint named by MODEL.
tokenizer = T5Tokenizer.from_pretrained(
    MODEL,
    legacy=False,
)

def preprocess_function(examples):
    """
    Tokenize a batch of rows into fixed-length model inputs and labels.

    Args:
        examples: batch mapping with a 'dialogue' list and a 'summary' list of
            equal length (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the "summarize: "-prefixed dialogues, with an
        added "labels" entry holding the summary token ids. Padding positions
        in the labels are set to -100 so the loss skips them.
    """
    inputs = [f"summarize: {article}" for article in examples['dialogue']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    labels = tokenizer(
        text_target=examples['summary'],
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # The labels are padded out to MAX_LENGTH. If the pad ids stay in place the
    # loss is averaged over every pad position as well, so it is dominated by
    # trivially predictable padding instead of the summary tokens. -100 is the
    # index the cross-entropy loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the training and validation splits with identical settings:
# batched calls to preprocess_function spread over NUM_PROCS worker processes.
tokenized_train, tokenized_valid = (
    split.map(preprocess_function, batched=True, num_proc=NUM_PROCS)
    for split in (dataset_train, dataset_valid)
)

tokenizer_config.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/14732 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
# ROUGE scorer loaded through the `evaluate` library; compute_metrics calls it.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """
    Score predicted token ids against the reference summaries with ROUGE.

    Args:
        eval_pred: object whose `predictions[0]` holds the predicted token ids
            and whose `label_ids` holds the reference token ids, with -100
            marking positions that are excluded from the loss.

    Returns:
        dict with 'rouge1', 'rouge2', 'rougeL' and 'gen_len' (mean number of
        non-pad predicted tokens), each rounded to 4 decimals.
    """
    predictions = np.asarray(eval_pred.predictions[0])
    labels = np.asarray(eval_pred.label_ids)
    pad_id = tokenizer.pad_token_id

    # Positions excluded from the loss have no reference token, so whatever was
    # predicted there must not leak into the decoded text or the length count.
    ignored = labels == -100
    if predictions.shape == labels.shape:
        predictions = np.where(ignored, pad_id, predictions)
    # -100 is not a valid vocabulary id; swap it for the pad id before decoding
    # (presumably it can also reach the predictions when batches of different
    # lengths are padded together -- TODO confirm).
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(ignored, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    # Average count of non-pad tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Reduce the model outputs to predicted token ids before they are collected
    for metric computation.

    Keeping only the argmax ids avoids holding on to the full logits tensors
    (the original note: the Trainer may otherwise store too many tensors that
    are not needed).

    Args:
        logits: either the logits tensor itself, or a tuple/list of model
            outputs whose first element is the logits tensor.
        labels: reference label ids, passed through unchanged.

    Returns:
        (pred_ids, labels), where pred_ids is the argmax over the last
        dimension of the logits.
    """
    # Only unwrap when a tuple of outputs was passed; indexing a bare tensor
    # with [0] would silently drop everything but the first batch row.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [None]:
# Training configuration. Checkpoints and TensorBoard logs both go to OUT_DIR.
# Evaluation runs every 200 steps, while checkpoints are written once per epoch
# and only the two most recent are kept.
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=300,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=3e-5,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
    # Mixed precision needs a CUDA device. The model cell already falls back to
    # the CPU when none is available, so follow the same check here instead of
    # hard-coding True (which is rejected on a CPU-only runtime).
    fp16=torch.cuda.is_available()
)

# preprocess_logits_for_metrics shrinks the logits to token ids before they are
# accumulated; compute_metrics then turns those ids into ROUGE scores.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

# Run the fine-tuning loop and keep the returned training summary.
history = trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.1275,0.114289,0.529,0.2662,0.4901,27.6247
400,0.1059,0.094944,0.5762,0.306,0.5433,28.4792
600,0.0891,0.091366,0.5855,0.3132,0.5542,28.5293
800,0.1009,0.088989,0.5933,0.3218,0.5628,28.5306
1000,0.0998,0.087576,0.5954,0.3287,0.5661,28.5306
1200,0.0946,0.086251,0.6026,0.3366,0.5744,28.5306
1400,0.0907,0.085878,0.6067,0.3393,0.5784,28.5306
1600,0.0842,0.084933,0.61,0.3463,0.581,28.5306
1800,0.0749,0.0841,0.6124,0.3446,0.584,28.5306
2000,0.0831,0.083589,0.6127,0.3516,0.5855,28.5306


In [None]:
# Write the model (through the Trainer) and the tokenizer files into OUT_DIR,
# the Google Drive folder. Run this only after training has finished.
trainer.save_model(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)

('/content/drive/MyDrive/Summarization Model/T5-AMI-Samsum/tokenizer_config.json',
 '/content/drive/MyDrive/Summarization Model/T5-AMI-Samsum/special_tokens_map.json',
 '/content/drive/MyDrive/Summarization Model/T5-AMI-Samsum/spiece.model',
 '/content/drive/MyDrive/Summarization Model/T5-AMI-Samsum/added_tokens.json')

In [None]:
# Also write the model and tokenizer to a "T5-base-ami-samsum" directory,
# given as a path relative to the current working directory.
model.save_pretrained("T5-base-ami-samsum")
tokenizer.save_pretrained("T5-base-ami-samsum")


('T5-base-ami-samsum/tokenizer_config.json',
 'T5-base-ami-samsum/special_tokens_map.json',
 'T5-base-ami-samsum/spiece.model',
 'T5-base-ami-samsum/added_tokens.json')

In [None]:
# Upload the model and the tokenizer to the "csb05/t5-base-ami-samsum" repository.
# NOTE(review): `login` is imported earlier but never called in the visible
# cells -- presumably authentication is configured elsewhere; confirm before re-running.
model.push_to_hub("csb05/t5-base-ami-samsum")
tokenizer.push_to_hub("csb05/t5-base-ami-samsum")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/csb05/T5-base-ami-samsum/commit/1c8993eb691a50db50b40ba37552ad9c97b1f20f', commit_message='Upload tokenizer', commit_description='', oid='1c8993eb691a50db50b40ba37552ad9c97b1f20f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/csb05/T5-base-ami-samsum', endpoint='https://huggingface.co', repo_type='model', repo_id='csb05/T5-base-ami-samsum'), pr_revision=None, pr_num=None)