In [7]:
pip uninstall -y datasets > /dev/null 2>&1 && pip install datasets==3.6.0 > /dev/null 2>&1

In [None]:
from datasets import load_dataset, Dataset as HFDataset
from datasets.utils.logging import set_verbosity_error
import pandas as pd

from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer,T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

In [None]:
# Hub identifiers of the two corpora and the CNN/DailyMail configuration to load.
DATASET_NAMES = ["cnn_dailymail", "xsum"]
VERSION = "3.0.0"

# Restrict the `datasets` logger to errors.
set_verbosity_error()

# XSum: drop the id column and expose the article under the shared "text" name.
xsum = (
    load_dataset(DATASET_NAMES[1], trust_remote_code=True)
    .remove_columns(['id'])
    .rename_columns({"document": "text", "summary": "summary"})
)

# CNN/DailyMail: same treatment, mapping article/highlights onto text/summary.
cnn_dailymail = (
    load_dataset(DATASET_NAMES[0], VERSION)
    .remove_columns(['id'])
    .rename_columns({"article": "text", "highlights": "summary"})
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


In [None]:
# Total number of rows wanted per split; each corpus contributes half of it.
TRAIN_SAMPLES = 5000
VAL_SAMPLES = 500
TEST_SAMPLES = 500


def _sample_split(corpus, split, n_rows, seed=42):
    """Return ``n_rows`` randomly sampled rows of ``corpus[split]`` as a DataFrame.

    The full split is converted to pandas only inside this call, so the large
    intermediate frame can be released as soon as the sample has been drawn
    instead of all six full frames being held at the same time.

    Args:
        corpus: Mapping from split name to a dataset exposing ``to_pandas()``.
        split: Name of the split to draw from (e.g. ``"train"``).
        n_rows: Number of rows to sample.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        pandas.DataFrame: The sampled rows, keeping their original index.
    """
    return corpus[split].to_pandas().sample(n=n_rows, random_state=seed)


df_train_xsum = _sample_split(xsum, "train", TRAIN_SAMPLES // 2)
df_validation_xsum = _sample_split(xsum, "validation", VAL_SAMPLES // 2)
df_test_xsum = _sample_split(xsum, "test", TEST_SAMPLES // 2)

df_train_cnn = _sample_split(cnn_dailymail, "train", TRAIN_SAMPLES // 2)
df_validation_cnn = _sample_split(cnn_dailymail, "validation", VAL_SAMPLES // 2)
df_test_cnn = _sample_split(cnn_dailymail, "test", TEST_SAMPLES // 2)

# Stack the two corpora per split (XSum rows first, then CNN/DailyMail rows).
df_train = pd.concat([df_train_xsum, df_train_cnn])
df_validation = pd.concat([df_validation_xsum, df_validation_cnn])
df_test = pd.concat([df_test_xsum, df_test_cnn])

print(f"Train Size: {df_train.shape}")
print(f"Validation Size: {df_validation.shape}")
print(f"Test Size: {df_test.shape}")

# train_dataset = concatenate_datasets([xsum["train"], cnn_dailymail["train"]])
# validation_dataset = concatenate_datasets([xsum["validation"], cnn_dailymail["validation"]])
# test_dataset = concatenate_datasets([xsum["test"], cnn_dailymail["test"]])

# print(train_dataset)
# print(validation_dataset)
# print(test_dataset)

Dataset({
    features: ['text', 'summary'],
    num_rows: 491158
})
Dataset({
    features: ['text', 'summary'],
    num_rows: 24700
})
Dataset({
    features: ['text', 'summary'],
    num_rows: 22824
})


## Dataset Preprocessing

In [None]:
class Config:
    """Hyper-parameters, paths and thresholds shared by the cells of this notebook."""

    # --- model and sequence lengths ---
    MODEL_NAME = "t5-base"
    MAX_INPUT_LENGTH = 512    # max_length used when tokenizing the source text
    MAX_TARGET_LENGTH = 128   # max_length used when tokenizing the target summary

    # --- optimisation ---
    EPOCHS = 3
    BATCH_SIZE = 8            # per-device size for both training and evaluation
    LEARNING_RATE = 3e-4
    WEIGHT_DECAY = 0.01
    WARMUP_STEPS = 500

    # --- logging / checkpointing cadence (in optimisation steps) ---
    LOGGING_STEPS = 100
    EVAL_STEPS = 1000
    SAVE_STEPS = 1000
    OUTPUT_DIR = "./t5_summarization_model"

    # --- compression-ratio cut-offs (summary words / text words) ---
    HARSH_THRESHOLD = 0.3     # ratio below this -> "harsh"
    STANDARD_THRESHOLD = 0.6  # ratio below this (and >= harsh) -> "standard"

In [None]:
def calculate_compression_ratio(text, summary):
    """Return the summary-to-text length ratio, measured in whitespace-separated words.

    Args:
        text: Source document string.
        summary: Summary string.

    Returns:
        ``len(summary words) / len(text words)``, or ``0`` when the text
        contains no words.
    """
    n_text_words = len(text.split())
    n_summary_words = len(summary.split())
    # Guard against dividing by zero for empty / whitespace-only documents.
    return n_summary_words / n_text_words if n_text_words else 0

def assign_length_category(ratio, harsh_threshold=None, standard_threshold=None):
    """Map a compression ratio onto one of the three length-style labels.

    Args:
        ratio: Compression ratio (summary length divided by text length).
        harsh_threshold: Upper bound (exclusive) of the "harsh" bucket.
            Defaults to ``Config.HARSH_THRESHOLD`` when ``None``.
        standard_threshold: Upper bound (exclusive) of the "standard" bucket.
            Defaults to ``Config.STANDARD_THRESHOLD`` when ``None``.

    Returns:
        str: ``"harsh"``, ``"standard"`` or ``"detailed"``.
    """
    # Resolve the defaults lazily so the function honours the Config values that
    # are current at call time.
    if harsh_threshold is None:
        harsh_threshold = Config.HARSH_THRESHOLD
    if standard_threshold is None:
        standard_threshold = Config.STANDARD_THRESHOLD

    if ratio < harsh_threshold:
        return "harsh"
    if ratio < standard_threshold:
        return "standard"
    # Everything else (including NaN, which fails both comparisons) is "detailed".
    return "detailed"

def preprocess_df(df):
    """Add the model-ready columns to a frame holding ``text`` and ``summary``.

    The following columns are written onto ``df`` itself (the frame is modified
    in place and also returned):

    * ``compression_ratio`` - result of ``calculate_compression_ratio``.
    * ``length_category``   - result of ``assign_length_category``.
    * ``input_text``        - ``"summarize <category>: <text>"`` prompt.
    * ``target_text``       - copy of ``summary``.

    Args:
        df: DataFrame with ``text`` and ``summary`` columns.

    Returns:
        pandas.DataFrame: The same ``df`` object with the four columns added.
    """
    # Build the columns positionally from plain lists rather than with
    # ``df.apply(..., axis=1)``: row-wise apply on an empty frame yields a
    # DataFrame (not a Series), which cannot be assigned to a single column, and
    # it is also considerably slower on large frames.
    ratios = [
        calculate_compression_ratio(text, summary)
        for text, summary in zip(df['text'], df['summary'])
    ]
    categories = [assign_length_category(ratio) for ratio in ratios]

    df['compression_ratio'] = ratios
    df['length_category'] = categories
    df['input_text'] = [
        f"summarize {category}: {text}"
        for category, text in zip(categories, df['text'])
    ]
    df['target_text'] = df['summary']
    return df



In [None]:
class SummarizationDataset(Dataset):
    """Torch dataset that tokenizes ``input_text`` / ``target_text`` rows on access.

    Args:
        dataframe: DataFrame with ``input_text`` and ``target_text`` columns.
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is invoked
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` keyword arguments.
        max_input_length: ``max_length`` used for the input text.
        max_target_length: ``max_length`` used for the target text.
    """

    def __init__(self, dataframe, tokenizer, max_input_length, max_target_length):
        # Positional access is used below, so discard whatever index came in.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` positions as a (1, L) batch."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``.

        Padding positions in ``labels`` are replaced by ``-100``.
        """
        row = self.data.iloc[idx]

        source = self._encode(row['input_text'], self.max_input_length)
        target = self._encode(row['target_text'], self.max_target_length)

        target_ids = target['input_ids']
        # Out-of-place fill so the tokenizer's own output tensor is left untouched.
        labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)

        # Drop only the leading batch dimension. A bare ``squeeze()`` would also
        # collapse the sequence dimension when the maximum length is 1.
        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': labels.squeeze(0)
        }

## Model Initialization, Training & Inference

In [None]:
def setup_model_and_tokenizer():
    """Load the pretrained model and tokenizer identified by ``Config.MODEL_NAME``.

    Returns:
        tuple: ``(model, tokenizer)`` - a ``T5ForConditionalGeneration`` and the
        matching ``T5Tokenizer``.
    """
    checkpoint = Config.MODEL_NAME
    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    return t5_model, t5_tokenizer

In [None]:
def train_model(df_train, df_val, df_test, text_col='text', summary_col='summary'):
    """Fine-tune the checkpoint named by ``Config.MODEL_NAME`` for summarization.

    Args:
        df_train: DataFrame of training examples.
        df_val: DataFrame of validation examples, used for the periodic
            evaluation and for selecting the best checkpoint.
        df_test: Held-out DataFrame. Accepted for interface compatibility; it
            is not used during training.
        text_col: Name of the column holding the source documents.
        summary_col: Name of the column holding the reference summaries.

    Returns:
        tuple: ``(model, tokenizer, trainer)`` after training, with the model
        and tokenizer saved to ``Config.OUTPUT_DIR``.

    Raises:
        KeyError: If ``text_col`` or ``summary_col`` is missing from the
            training or validation frame.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model, tokenizer = setup_model_and_tokenizer()
    model.to(device)

    def _prepare(frame, split_name):
        """Map the caller's columns onto text/summary and add the derived columns."""
        missing = [col for col in (text_col, summary_col) if col not in frame.columns]
        if missing:
            raise KeyError(f"{split_name} frame is missing column(s): {missing}")
        # ``rename`` returns a new frame, so the caller's DataFrame is not
        # modified by the preprocessing step.
        renamed = frame.rename(columns={text_col: 'text', summary_col: 'summary'})
        return preprocess_df(renamed)

    # Use the function's own arguments here. The validation frame previously came
    # from a notebook-level global, which silently ignored ``df_val``.
    train_frame = _prepare(df_train, "train")
    val_frame = _prepare(df_val, "validation")

    train_dataset = SummarizationDataset(
        train_frame, tokenizer, Config.MAX_INPUT_LENGTH, Config.MAX_TARGET_LENGTH
    )
    validation_dataset = SummarizationDataset(
        val_frame, tokenizer, Config.MAX_INPUT_LENGTH, Config.MAX_TARGET_LENGTH
    )

    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True
    )

    training_args = TrainingArguments(
        output_dir=Config.OUTPUT_DIR,
        num_train_epochs=Config.EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        learning_rate=Config.LEARNING_RATE,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_steps=Config.WARMUP_STEPS,
        logging_steps=Config.LOGGING_STEPS,
        eval_steps=Config.EVAL_STEPS,
        save_steps=Config.SAVE_STEPS,
        eval_strategy="steps",
        save_strategy="steps",
        # Keep the checkpoint with the lowest validation loss.
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        push_to_hub=False,
        report_to="none",
        # NOTE(review): T5 checkpoints are reported to be numerically unstable
        # under fp16; consider bf16 where the GPU supports it — confirm.
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print("\n" + "="*50)
    print("Starting training...")
    print("="*50 + "\n")

    trainer.train()

    print("\nSaving final model...")
    trainer.save_model(Config.OUTPUT_DIR)
    tokenizer.save_pretrained(Config.OUTPUT_DIR)

    print(f"\nModel saved to {Config.OUTPUT_DIR}")

    return model, tokenizer, trainer

In [None]:
def generate_summary(text, model, tokenizer, length_style="standard", max_length=150, device=None):
    """Generate a summary of ``text`` in the requested length style.

    Args:
        text: Document to summarize.
        model: Sequence-to-sequence model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        length_style: ``"harsh"``, ``"standard"`` or anything else (treated as
            the long, "detailed" style). The value is also embedded in the prompt.
        max_length: Generation length budget for the "standard" style; scaled
            by 0.6 for "harsh" and by 1.4 for every other style.
        device: Device the inputs are moved to. When ``None`` the device of the
            model's parameters is used, so inputs and weights always agree and
            the call also works on CPU-only machines.

    Returns:
        str: The decoded summary with special tokens removed.
    """
    model.eval()

    if device is None:
        device = next(model.parameters()).device

    prompt = f"summarize {length_style}: {text}"

    inputs = tokenizer(
        prompt,
        max_length=Config.MAX_INPUT_LENGTH,
        truncation=True,
        return_tensors='pt'
    ).to(device)

    # Scale the generation budget according to the requested style.
    if length_style == "standard":
        max_gen_length = max_length
    else:
        scale = 0.6 if length_style == "harsh" else 1.4
        max_gen_length = int(max_length * scale)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_gen_length,
            num_beams=4,
            length_penalty=1.0,
            early_stopping=True,
            no_repeat_ngram_size=3
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
if __name__ == "__main__":
    # The frames passed below are the sampled XSum + CNN/DailyMail splits built
    # earlier in the notebook; no dummy data is created, so say what really happens.
    print("Fine-tuning on the sampled XSum + CNN/DailyMail data...")

    model, tokenizer, trainer = train_model(
        df_train, df_validation, df_test,
        text_col='text',
        summary_col='summary'
    )

    print("\n" + "="*50)
    print("Example Inference")
    print("="*50 + "\n")

    sample_text = "Artificial intelligence is transforming the world. Machine learning models are becoming more powerful every day."

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Summarize the same text once per supported length style.
    for style in ["harsh", "standard", "detailed"]:
        summary = generate_summary(sample_text, model, tokenizer, style, device=device)
        print(f"{style.upper()}: {summary}\n")