# Imports & Setup

In [None]:
# Install libraries
# All pip output (stdout and stderr) is redirected to /dev/null, so a failed
# install is silent here and will only show up later as an import error.
# The preinstalled `datasets` is removed first and replaced by the pinned 2.17.0.
# NOTE(review): presumably pinned because the `xsum` loader used later relies on
# script-based loading (`trust_remote_code=True`) — confirm before unpinning.
!pip uninstall -y datasets > /dev/null 2>&1
!pip install datasets==2.17.0 > /dev/null 2>&1
# The remaining packages are installed unpinned (latest available version).
!pip install evaluate rouge_score transformers accelerate torch > /dev/null 2>&1

import os
import random
import numpy as np
import torch
import evaluate
import pandas as pd
from google.colab import drive
from datasets import load_dataset, concatenate_datasets

from transformers import (
    T5TokenizerFast as T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

# One shared seed for every RNG the notebook touches (Python, NumPy, PyTorch).
SEED = 42
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7cb1894078f0>

In [None]:
# Mount Google Drive and make sure the folder that will receive the
# fine-tuned model is present.
drive.mount('/content/drive')
DRIVE_PATH = "/content/drive/MyDrive/Project/Text Summarizer/ModelT5Base"

if os.path.exists(DRIVE_PATH):
    print(f"Directory exists: {DRIVE_PATH}")
else:
    os.makedirs(DRIVE_PATH)
    print(f"Created directory: {DRIVE_PATH}")

Mounted at /content/drive
Directory exists: /content/drive/MyDrive/Project/Text Summarizer/ModelT5Base


# Dataset Preparation & Information

In [None]:
# Configurations
TRAIN_SAMPLES = 4500  # rows taken from the head of each corpus' train split

print("Loading datasets...")
xsum = load_dataset('xsum', trust_remote_code=True, split='train')
cnn = load_dataset('cnn_dailymail', '3.0.0', split='train')

# For each corpus: keep the first TRAIN_SAMPLES rows, drop the unused 'id'
# column and rename the remaining columns to a shared ('text', 'summary') schema.
xsum = (
    xsum.select(range(TRAIN_SAMPLES))
    .remove_columns(['id'])
    .rename_columns({'document': 'text', 'summary': 'summary'})
)

cnn = (
    cnn.select(range(TRAIN_SAMPLES))
    .remove_columns(['id'])
    .rename_columns({'article': 'text', 'highlights': 'summary'})
)

def format_harsh(example):
    """Prefix the example's text with the "harsh" style prompt.

    The example mapping is modified in place and the same object is returned.
    """
    source_text = example["text"]
    example["text"] = f"summarize harsh: {source_text}"
    return example

def format_detailed(example):
    """Prefix the example's text with the "detailed" style prompt.

    The example mapping is modified in place and the same object is returned.
    """
    source_text = example["text"]
    example["text"] = f"summarize detailed: {source_text}"
    return example

print("Applying style tags...")
xsum = xsum.map(format_harsh)
cnn = cnn.map(format_detailed)

dataset = concatenate_datasets([xsum, cnn])

# Keep only pairs whose summary has fewer whitespace-separated words than the
# text. Note the text already carries the two-word style prefix at this point.
dataset = dataset.filter(lambda x: len(x["summary"].split()) < len(x["text"].split()))

full_dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=SEED)
dataset_train = full_dataset['train']

# Evaluate on at most VALID_SAMPLES held-out rows. The cap is clamped to the
# size of the held-out split so that lowering TRAIN_SAMPLES for a quick run
# does not make `select` fail with an out-of-range index.
VALID_SAMPLES = 200
held_out = full_dataset['test']
dataset_valid = held_out.select(range(min(VALID_SAMPLES, len(held_out))))

print(f"Training samples: {len(dataset_train)}")
print(f"Validation samples: {len(dataset_valid)}")

Loading datasets...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Applying style tags...


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9000 [00:00<?, ? examples/s]

Training samples: 8095
Validation samples: 200


# Configurations & Parameters

In [None]:
# Checkpoint used for both the tokenizer and the model.
MODEL_NAME = 'google/flan-t5-base'
# Token limits applied with truncation when encoding source texts and
# reference summaries respectively (see preprocess_function).
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 150

# `T5Tokenizer` is the import alias for `T5TokenizerFast` in this notebook.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

# Tokenization & Style Processing

In [None]:
# `tokenizer` was already created from MODEL_NAME in the configuration cell
# above, so it is not loaded a second time here; only the ROUGE metric is new.
rouge = evaluate.load("rouge")

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of texts and reference summaries for seq2seq training.

    Args:
        examples: Batch mapping with a "text" list (model inputs, already
            carrying the style prefix) and a "summary" list (targets).

    Returns:
        The tokenizer output for the texts with an added "labels" entry
        holding the token ids of the summaries.

    Sequences are truncated but deliberately NOT padded here. Padding labels
    to a fixed length with the tokenizer's pad id makes every padding position
    count as a real target in the loss. Leaving the sequences ragged lets
    DataCollatorForSeq2Seq pad each batch to its own longest example and fill
    label padding with -100, which the loss ignores.
    """
    model_inputs = tokenizer(
        examples["text"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )

    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocess_function to both splits. `batched=True` hands the function
# a batch of rows per call, so `examples["text"]` / `examples["summary"]` are lists.
print("Tokenizing...")
tokenized_train = dataset_train.map(preprocess_function, batched=True)
tokenized_valid = dataset_valid.map(preprocess_function, batched=True)

Tokenizing...


Map:   0%|          | 0/8095 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

# Model Loading & Training

In [None]:
def compute_metrics(eval_pred):
    """Decode predictions and references and score them with ROUGE.

    Args:
        eval_pred: Pair of (predicted token ids, label token ids).

    Returns:
        Dict of the ROUGE scores, each rounded to 4 decimal places.
    """
    pred_ids, label_ids = eval_pred

    # Force every predicted id into [0, vocab_size - 1] before decoding.
    pred_ids = np.clip(pred_ids, 0, tokenizer.vocab_size - 1)
    # -100 marks ignored label positions; swap it for the pad id so the
    # labels can be decoded (pads are then dropped as special tokens).
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    ref_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    scores = rouge.compute(predictions=pred_texts, references=ref_texts, use_stemmer=True)

    rounded_scores = {}
    for metric_name, metric_value in scores.items():
        rounded_scores[metric_name] = round(metric_value, 4)
    return rounded_scores

# Pick the GPU when one is visible, otherwise stay on the CPU, then load the
# pretrained checkpoint and place it on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(device)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [None]:
# T5 / Flan-T5 checkpoints are numerically unstable under fp16 autocast: the
# activations overflow, the loss becomes NaN, and the run silently stops
# learning (training loss logged as 0.0, empty validation loss, identical
# ROUGE at every evaluation). bf16 keeps the fp32 exponent range, so it is
# used when the GPU supports it natively (compute capability >= 8);
# otherwise training falls back to full fp32.
use_bf16 = (
    torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 8
    and torch.cuda.is_bf16_supported()
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",

    # HYPERPARAMETERS
    num_train_epochs=8,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # effective batch size of 8 per device
    weight_decay=0.01,
    warmup_ratio=0.05,

    # Evaluation
    # save_steps matches eval_steps, which load_best_model_at_end requires.
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,

    # Optimization
    fp16=False,      # never fp16 with T5, see the note above
    bf16=use_bf16,
    predict_with_generate=True,
    # Generate up to the same length the reference summaries were truncated to.
    generation_max_length=MAX_TARGET_LENGTH,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    compute_metrics=compute_metrics
)


print("Starting training...")
trainer.train()

Starting training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
500,0.0,,0.2755,0.098,0.2076,0.2066
1000,0.0,,0.2755,0.098,0.2076,0.2066
1500,0.0,,0.2755,0.098,0.2076,0.2066
2000,0.0,,0.2755,0.098,0.2076,0.2066
2500,0.0,,0.2755,0.098,0.2076,0.2066
3000,0.0,,0.2755,0.098,0.2076,0.2066
3500,0.0,,0.2755,0.098,0.2076,0.2066
4000,0.0,,0.2755,0.098,0.2076,0.2066
4500,0.0,,0.2755,0.098,0.2076,0.2066
5000,0.0,,0.2755,0.098,0.2076,0.2066


TrainOutput(global_step=8096, training_loss=0.0, metrics={'train_runtime': 8022.1027, 'train_samples_per_second': 8.073, 'train_steps_per_second': 1.009, 'total_flos': 4.434488614453248e+16, 'train_loss': 0.0, 'epoch': 8.0})

In [None]:
print(f"Saving final model to {DRIVE_PATH}...")
# Write the model (through the trainer) and then the tokenizer into the same
# Drive folder.
for save_into in (trainer.save_model, tokenizer.save_pretrained):
    save_into(DRIVE_PATH)
print("Done!")

Saving final model to /content/drive/MyDrive/Project/Text Summarizer/ModelT5Base...
Done!


# Model Testing & Evaluation (Inference)

In [None]:
# Inference setup: the path and device are (re)defined here and the saved
# model and tokenizer are read back from Drive.
DRIVE_PATH = "/content/drive/MyDrive/Project/Text Summarizer/ModelT5Base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Loading model from {DRIVE_PATH}...")
model = T5ForConditionalGeneration.from_pretrained(DRIVE_PATH)
model = model.to(device)
tokenizer = T5Tokenizer.from_pretrained(DRIVE_PATH)
print("Model loaded successfully!")

Loading model from /content/drive/MyDrive/Project/Text Summarizer/ModelT5Base...
Model loaded successfully!


In [None]:
def generate_summary(text, style, model, tokenizer):
    """Summarize ``text`` in the requested ``style`` with beam search.

    Args:
        text: Raw text to summarize (without any prompt prefix).
        style: "harsh" or "detailed" select dedicated length and repetition
            presets; any other value falls back to generic defaults. The value
            is also embedded in the prompt as ``summarize {style}: ...``.
        model: Seq2seq model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded summary string (special tokens stripped).
    """
    model.eval()

    input_text = f"summarize {style}: {text}"

    # The length budget is derived from the whitespace word count of the raw text.
    input_words = len(text.split())

    # Put the encoded prompt on the model's own device instead of depending on
    # a global `device` that may be undefined or out of sync with the model.
    inputs = tokenizer(
        input_text,
        max_length=512,
        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    beam_size = 4
    if style == "harsh":
        max_len = int(input_words * 0.35)
        min_len = 5
        rep_penalty = 2.5
    elif style == "detailed":
        max_len = int(input_words * 0.70)
        min_len = 30
        rep_penalty = 1.2
    else:
        max_len = 150
        min_len = 20
        rep_penalty = 1.5

    # Cap the budget at 250, but never let it drop below the style's minimum:
    # for short (or empty) inputs the ratio-based budget can be smaller than
    # min_len, or even 0, which gives generate() infeasible length constraints.
    max_len = max(min_len, min(max_len, 250))

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_len,
            min_length=min_len,
            num_beams=beam_size,
            repetition_penalty=rep_penalty,
            no_repeat_ngram_size=3,
            early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Sample Text (A complex paragraph to test hallucination)
# The literal is passed to generate_summary as-is, including the leading
# newline and per-line indentation.
text_to_summarize = """
    The James Webb Space Telescope (JWST) has captured a lush landscape of stellar birth.
    The new image shows the Cosmic Cliffs, which are the edge of a giant gaseous cavity within the star-forming region NGC 3324.
    The cavity has been carved from the nebula by the intense ultraviolet radiation and stellar winds from extremely massive, hot, young stars located in the center of the bubble, above the area shown in this image.
    The high peaks of the "cliffs" are about 7 light-years high.
    The blistering radiation from the young stars is sculpting the nebula's wall by slowly eroding it away.
    Simultaneously, the same gravity that pulls matter together to form stars also tears them apart in violent outbursts.
"""

# Whitespace-separated word count, the same measure generate_summary uses to
# size its output length budget.
print(f"Original Word Count: {len(text_to_summarize.split())}")
print("-" * 50)

# Run the same text through both trained style prompts for a side-by-side look.
harsh_out = generate_summary(text_to_summarize, "harsh", model, tokenizer)
print(f"\n--- HARSH (Short & Direct) ---\n{harsh_out}")

detailed_out = generate_summary(text_to_summarize, "detailed", model, tokenizer)
print(f"\n--- DETAILED (Comprehensive) ---\n{detailed_out}")

Original Word Count: 118
--------------------------------------------------

--- HARSH (Short & Direct) ---
The new image shows the Cosmic Cliffs, which are the edge of a giant gaseous cavity within the star-forming region NGC 3324.

--- DETAILED (Comprehensive) ---
The James Webb Space Telescope (JWST) has captured a lush landscape of stellar birth. The new image shows the Cosmic Cliffs, which are...


In [None]:
# Final cell: hand the Colab runtime back once everything above has finished.
# NOTE(review): presumably this disconnects and frees the VM, so anything not
# already written to Drive would be lost — confirm before relying on it.
from google.colab import runtime
runtime.unassign()