<a href="https://colab.research.google.com/github/Inzamam1234/NLP_ToolKit_FOSS/blob/main/NLP_ToolKit_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets peft accelerate



In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
# Hub identifier of the base checkpoint.
model_name = "google-t5/t5-base"

# Fetch the seq2seq weights and the matching tokenizer for that checkpoint.
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# First LoRA setup: adapters on the modules named "q" and "v" only.
lora_settings = dict(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=4,  # LoRA rank (tunable)
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)
lora_config = LoraConfig(**lora_settings)

# Wrap the base model and report how many parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 442,368 || all params: 223,345,920 || trainable%: 0.1981


In [None]:
from datasets import load_dataset

# Seed shared by the subsampling shuffle and the train/eval split so that both
# are reproducible after a kernel restart.
SPLIT_SEED = 42

# Work on a fixed 1,000-article sample of the CNN/DailyMail training split.
cnn_train = load_dataset("abisee/cnn_dailymail", "3.0.0")["train"]
subset = cnn_train.shuffle(seed=SPLIT_SEED).select(range(1000))

# Split into train (80%) and eval (20%), i.e. 800 / 200 examples.
# `dataset` stays a DatasetDict with "train" and "test" keys for later cells.
dataset = subset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_data = dataset["train"]
eval_data = dataset["test"]

print(f"Train samples: {len(train_data)}, Eval samples: {len(eval_data)}")

Train samples: 800, Eval samples: 200


In [None]:
# Display one raw training record (before tokenization) as a sanity check.
train_data[2]

 'highlights': 'More than 30 gunmen entered the town in disguise .\nThe gunmen were dressed in SWAT-style uniforms, authorities said .\nThey carried forged arrest warrants, says head of Haditha local council .\nProvincial council blames Iraqi army for not securing highways .',
 'id': '37a1f2b25defbc1ff2d1133094107a5327c02168'}

In [None]:
# Display the first held-out record (before tokenization) as a sanity check.
eval_data[0]

{'article': "By . Brendan Carlin . PUBLISHED: . 16:42 EST, 3 August 2013 . | . UPDATED: . 02:10 EST, 4 August 2013 . Tory Immigration Minister Mark Harper has denied claims by anti-racism campaigner Doreen Lawrence that controversial immigration spot checks were linked to people’s colour. Mr Harper sought to defuse a growing storm over the checks by also insisting they were not ‘random’ but based on ‘specific intelligence’. He also revealed that so far 17 people had been arrested on suspicion at two London Underground stations but was unable to say how many in total had been stopped. Migrant row: Tory Immigration Minister Mark Harper, left, has denied claims by anti-racism campaigner Baroness Doreen Lawrence, right, that controversial immigration spot checks were linked to people’s colour . The Equality and Human Rights Commission is now investigating the checks for possible discrimination. Last week, Doreen Lawrence, mother of murdered teenager Stephen Lawrence and who is now a Labour

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Start again from a fresh base checkpoint: `model` was already wrapped with a
# LoRA adapter earlier, and the next cell attaches a different adapter config.
model_name = "google-t5/t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter configuration applied to the freshly reloaded T5 model.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    target_modules=["q", "v"],  # T5 modules named "q" and "v"
    r=8,                        # Rank
    lora_alpha=32,              # Scaling factor
    lora_dropout=0.1,           # Dropout
)

model = get_peft_model(model, peft_config)
# The recorded run reports 884,736 trainable parameters (about 0.40% of all).
model.print_trainable_parameters()

trainable params: 884,736 || all params: 223,788,288 || trainable%: 0.3953


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of articles and reference summaries for fine-tuning.

    Args:
        examples: Batch mapping as supplied by ``Dataset.map(batched=True)``;
            the ``"article"`` and ``"highlights"`` columns are read.

    Returns:
        The tokenized inputs (``input_ids``, ``attention_mask``) with the
        tokenized summaries stored under ``"labels"``.

    Sequences are truncated but deliberately NOT padded here. Padding labels
    to ``max_length`` would store real pad-token ids in ``labels``, and the
    loss would then be computed on padding positions as well. Leaving the
    sequences ragged lets ``DataCollatorForSeq2Seq`` pad each batch to its
    longest member and fill label padding with -100, which the loss ignores.
    """
    # Same task prefix that is used at inference time.
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["highlights"], max_length=150, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches; the original text columns are kept alongside.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Show the resulting column names and the overall DatasetDict layout.
train_columns = list(tokenized_datasets["train"].features)
print(f"Keys of tokenized dataset: {train_columns}")
print(f"Tokenized dataset: {tokenized_datasets}")

Map:   0%|          | 0/800 [00:00<?, ? examples/s]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels']
Tokenized dataset: DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})


In [None]:
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)

In [None]:
# Batch collator for seq2seq training, bound to the current tokenizer and model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# NOTE: a later cell builds `training_args` again, and that later object is the
# one passed to the trainer; the values below record the first configuration.
training_args = Seq2SeqTrainingArguments(
    # Output and checkpointing
    output_dir="/content/t5_lora_summary",
    save_steps=100,  # checkpoint every 100 steps
    save_total_limit=2,
    # Optimisation
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    # Evaluation
    eval_strategy="epoch",  # "steps" would evaluate more frequently
    predict_with_generate=True,
    # Logging
    logging_strategy="steps",
    logging_steps=50,  # log every 50 steps
    logging_first_step=True,  # also log step 1
    logging_dir="/content/logs",
    report_to="tensorboard",
)


In [None]:
import os

# Tell Transformers to ignore W&B
os.environ["WANDB_DISABLED"] = "true"

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Effective training configuration (replaces any earlier `training_args`).
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Log once per epoch. The default logs every 500 steps, but an epoch here is
    # only 100 optimizer steps (800 samples / batch size 8), so the per-epoch
    # table showed "No log" or a stale training loss for most epochs.
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,  # the loss did not change after 50 epochs
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",  # disables W&B and other integrations
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `tokenizer=` is deprecated for the trainer constructor (it emitted a
    # FutureWarning in the recorded run); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,3.454426
2,No log,1.030941
3,No log,0.834584
4,No log,0.785211
5,2.420200,0.760251
6,2.420200,0.748752
7,2.420200,0.740583
8,2.420200,0.734789
9,2.420200,0.730455
10,0.837300,0.727756


TrainOutput(global_step=5000, training_loss=0.9451478271484375, metrics={'train_runtime': 2995.7345, 'train_samples_per_second': 13.352, 'train_steps_per_second': 1.669, 'total_flos': 2.446703198208e+16, 'train_loss': 0.9451478271484375, 'epoch': 50.0})

In [None]:
# Summarize the first held-out article and print it next to its reference summary.
test_record = dataset["test"][0]
sample_input = "summarize: " + test_record["article"]

inputs = tokenizer(sample_input, return_tensors="pt", truncation=True, max_length=512)
inputs = inputs.to(model.device)

# Beam-search decoding settings for this sample.
beam_search_kwargs = dict(
    max_new_tokens=120,
    min_length=50,
    num_beams=5,
    length_penalty=2.0,
    early_stopping=True,
)
summary_ids = model.generate(**inputs, **beam_search_kwargs)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Generated Summary:", generated_summary)
print("Actual Summary:", test_record["highlights"])

Generated Summary: Mark Harper sought to defuse a growing storm over the spot checks . He also insists they are not 'random' but based on 'specific intelligence' Equality and Human Rights Commission is investigating the checks for possible discrimination .
Actual Summary: Baroness Lawrence said she thought 'racial profiling' was being used .
But Immigration Minister Mark Harper insists they are not 'random'
Equality and Human Rights Commission to investigate the checks .


In [None]:
def summarize_text(text, mode="headline", max_input_length=512):
    """Summarize ``text`` with the fine-tuned model using beam search.

    Args:
        text: Source text to summarize.
        mode: ``"headline"`` selects the first decoding preset; any other
            value selects the second ("brief") preset.
        max_input_length: Maximum prompt length in tokens; longer prompts are
            truncated.

    Returns:
        The decoded summary with special tokens removed and surrounding
        whitespace stripped, or ``""`` when ``text`` is ``None``, empty, or
        whitespace-only.
    """
    text = (text or "").strip()
    if not text:
        # Nothing to summarize; skip generation instead of decoding from a bare prompt.
        return ""

    model.eval()

    # Use the exact task prefix the model was fine-tuned with (see
    # preprocess_function). Free-form instructions such as
    # "Summarize in ONE sentence: " were never seen during training.
    prompt = "summarize: " + text

    # A single sequence needs no padding; only truncate over-long prompts.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)

    # Per-mode beam width and length limits.
    if mode == "headline":
        gen_kwargs = dict(num_beams=4, max_new_tokens=80, min_length=20)
    else:
        gen_kwargs = dict(num_beams=5, max_new_tokens=60, min_length=18)

    # Shared settings. `encoder_no_repeat_ngram_size` is intentionally not set:
    # it forbids every 3-gram of the source from appearing in the output, which
    # prevents the summary from reusing names and phrases from the article.
    gen_kwargs.update(length_penalty=2.0, no_repeat_ngram_size=3, early_stopping=False)

    with torch.no_grad():
        out = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            **gen_kwargs
        )

    summary = tokenizer.decode(out[0], skip_special_tokens=True).strip()
    return summary


# Short out-of-dataset passage used as a smoke test for summarize_text().
example = (
    "Argentina defeated Brazil 3-1 in the Copa América final held in Rio de Janeiro. "
    "Lionel Messi scored once and assisted another, securing Argentina’s third consecutive title. "
    "Fans celebrated across Buenos Aires as Messi dedicated the victory to his teammates and supporters."
)

example_summary = summarize_text(example)
print("Generated Summary:", example_summary)

Generated Summary: positive spokesman for Argentine Insights . CUBA AMÉRICA TURN OUT THOUGHTS ABOUT INDIVIDUAL TYPES OF COMMUNICATIONS . CONTEXTE DEVELOPMENT AUTHORITY SURFACE . SAVE THINGS UP


In [None]:
import shutil

# Write the model and the tokenizer files into the same export directory.
export_dir = "/content/t5_summarizer"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

# Bundle the export directory as t5_summarizer.zip (for a local copy).
shutil.make_archive("t5_summarizer", "zip", export_dir)

# Trigger a browser download of the archive (Colab only).
from google.colab import files
files.download("t5_summarizer.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>