In [None]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score

Collecting transformers
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.48.2-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.1
    Uninstalling transformers-4.47.1:
      Successfully uninstalled transformers-4.47.1
Successfully installed transformers-4.48.2
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multi

In [None]:
import torch
import pprint
import evaluate
import numpy as np

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [None]:
from huggingface_hub import login


In [None]:
# Read the AMI summarization dataset from Google Drive and pull out its three splits.
dataset_path = "/content/drive/MyDrive/Summarization Datasets/ami"
Dataset = load_dataset(dataset_path)
dataset_train, dataset_valid, dataset_test = (
    Dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# Show the features and row count of each split, one per line.
print(dataset_train, dataset_valid, dataset_test, sep="\n")

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 209
})
Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 42
})
Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 28
})


In [None]:
# Run configuration shared by the cells below.
MODEL = 't5-base'  # checkpoint name given to from_pretrained() for the model and the tokenizer
BATCH_SIZE = 4  # per-device batch size, used for both training and evaluation
NUM_PROCS = 4  # number of worker processes passed to Dataset.map()
EPOCHS = 10  # number of training epochs
# NOTE(review): the folder is named "T5-Large" although MODEL is 't5-base' — confirm this is intended.
OUT_DIR = '/content/drive/MyDrive/Summarization Model/T5-Large'  # checkpoints, logs and the final model
MAX_LENGTH = 512  # token limit applied to both the input dialogues and the target summaries

In [None]:
# Ask PyTorch to release any cached, unused CUDA memory before loading the model.
torch.cuda.empty_cache()

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# use_cache=False is handed to from_pretrained() as an extra keyword argument.
model = T5ForConditionalGeneration.from_pretrained(MODEL, use_cache=False)
model.to(device)

# Report the overall parameter count and how many of those require gradients.
total_params = 0
total_trainable_params = 0
for param in model.parameters():
    total_params += param.numel()
    if param.requires_grad:
        total_trainable_params += param.numel()
print(f"{total_params:,} total parameters.")
print(f"{total_trainable_params:,} training parameters.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

222,903,552 total parameters.
222,903,552 training parameters.


In [None]:
tokenizer = T5Tokenizer.from_pretrained(MODEL, legacy=False)

def preprocess_function(examples):
    """Tokenize a batch of dialogue/summary pairs for seq2seq fine-tuning.

    Args:
        examples: A batch of dataset rows (the function is mapped with
            ``batched=True``) providing the ``'dialogue'`` and ``'summary'``
            columns as lists of strings.

    Returns:
        The tokenizer output for the prefixed dialogues, padded/truncated to
        ``MAX_LENGTH``, with an added ``"labels"`` entry holding the summary
        token ids in which every padding position is replaced by -100.
    """
    # Prepend the task prefix to every dialogue before tokenizing.
    inputs = [f"summarize: {article}" for article in examples['dialogue']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    targets = examples['summary']
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Summaries are padded out to MAX_LENGTH, so most label positions are padding.
    # Mark them with -100 (the label value the loss ignores) so the model is trained
    # on real summary tokens only. compute_metrics maps -100 back to the pad id
    # before decoding.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the train and validation splits batch-wise across NUM_PROCS worker processes.
map_kwargs = dict(batched=True, num_proc=NUM_PROCS)
tokenized_train = dataset_train.map(preprocess_function, **map_kwargs)
tokenized_valid = dataset_valid.map(preprocess_function, **map_kwargs)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/209 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/42 [00:00<?, ? examples/s]

In [None]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean prediction length for an evaluation pass.

    Args:
        eval_pred: Evaluation output whose ``predictions`` is a tuple with the
            predicted token ids as its first element (as returned by
            ``preprocess_logits_for_metrics``) and whose ``label_ids`` holds the
            reference token ids, possibly containing -100 at ignored positions.

    Returns:
        A dict with ``rouge1``, ``rouge2``, ``rougeL`` and ``gen_len`` (mean
        number of non-padding predicted tokens), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids
    pad_token_id = tokenizer.pad_token_id

    # -100 is not a valid token id and cannot be decoded; it may show up in
    # predictions as padding added while batches are gathered (see HF examples).
    predictions = np.where(predictions != -100, predictions, pad_token_id)
    # Where the label is -100 the loss ignores the position, so whatever the model
    # outputs there is meaningless; blank it out so it does not pollute ROUGE.
    # The shape guard keeps this a no-op if predictions are not aligned to labels.
    if predictions.shape == labels.shape:
        predictions = np.where(labels != -100, predictions, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    # Average count of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Shrink what is kept for metric computation during evaluation.

    Only the first element of ``logits`` is used; it is reduced to the highest-scoring
    index along its last dimension, so token ids are kept instead of full score
    tensors. The labels are passed through unchanged alongside them.
    """
    first_output = logits[0]
    return first_output.argmax(dim=-1), labels

In [None]:
# Training configuration. Checkpoints and TensorBoard logs both go to OUT_DIR.
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Warm up over the first 10% of optimizer steps. A fixed 500-step warmup would
    # span almost the whole run here: 209 training rows at batch size 4 for 10 epochs
    # is roughly 530 steps on a single device, so the learning rate would barely
    # reach its target before training ends.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    # Evaluate every 200 steps; checkpoints are written once per epoch (last 2 kept).
    eval_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0001,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
    # fp16 mixed precision needs a CUDA device; use full precision when running on CPU.
    fp16=torch.cuda.is_available()
)

# preprocess_logits_for_metrics reduces logits to token ids before they are accumulated;
# compute_metrics then turns those ids into ROUGE scores.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

history = trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,2.1783,1.779863,0.548,0.2055,0.3867,300.5714
400,1.8953,1.601898,0.5797,0.24,0.4318,301.381


In [None]:
# Write the trained model and the tokenizer files to OUT_DIR (on Google Drive)
# so both can later be reloaded from the same folder.
trainer.save_model(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)

('/content/drive/MyDrive/Summarization Model/T5-Large/tokenizer_config.json',
 '/content/drive/MyDrive/Summarization Model/T5-Large/special_tokens_map.json',
 '/content/drive/MyDrive/Summarization Model/T5-Large/spiece.model',
 '/content/drive/MyDrive/Summarization Model/T5-Large/added_tokens.json')

In [None]:
# Upload the model and the tokenizer to the Hugging Face Hub repo "csb05/T5-base-ami".
# NOTE(review): pushing needs an authenticated session; `login` is imported in an earlier
# cell but no call to it is visible in this notebook — confirm how the token is provided.
model.push_to_hub("csb05/T5-base-ami")
tokenizer.push_to_hub("csb05/T5-base-ami")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/csb05/T5-base-ami/commit/847e40b457195806208fcd7f8fc026adc0f986f2', commit_message='Upload tokenizer', commit_description='', oid='847e40b457195806208fcd7f8fc026adc0f986f2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/csb05/T5-base-ami', endpoint='https://huggingface.co', repo_type='model', repo_id='csb05/T5-base-ami'), pr_revision=None, pr_num=None)