<a href="https://colab.research.google.com/github/NaVeen913/Datascienceproject/blob/main/summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Show what `nvidia-smi` reports for the GPU attached to this runtime.
!nvidia-smi

In [None]:
# Quietly (-q) install the notebook's dependencies: transformers with the sentencepiece
# extra, datasets, sacrebleu, rouge_score and py7zr. No versions are pinned.
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
# Upgrade accelerate, then uninstall transformers and accelerate and install both again.
# NOTE(review): the upgrade on the first line is undone by the uninstall that follows, and
# nothing here pins a version, so the result depends on what is newest at run time.
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
import pandas as pd

# from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

# Download NLTK's "punkt" sentence-tokenizer resource.
# NOTE(review): `sent_tokenize` is imported above but not called in the cells shown here —
# confirm this download is still needed.
nltk.download("punkt")


In [None]:
from transformers import AutoTokenizer, PegasusForConditionalGeneration

# Load the pegasus-xsum checkpoint and its tokenizer from the Hugging Face Hub.
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")

ARTICLE_TO_SUMMARIZE = (
    "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
    "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
    "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
)
# truncation=True makes the max_length cap explicit rather than leaving it to the
# tokenizer's implicit handling of a bare max_length.
inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, truncation=True, return_tensors="pt")

# Generate Summary (the attention mask produced by the tokenizer is forwarded explicitly).
summary_ids = model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"])
tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device


## Fine Tuning

In [None]:
# Checkpoint id used for fine-tuning.
# NOTE(review): `model` held the pegasus-xsum model object in the earlier demo cell; here the
# same name is rebound to a plain string.
model = "google/pegasus-cnn_dailymail"

# Load the matching tokenizer and seq2seq model, then move the model onto `device`.
tokenizer = AutoTokenizer.from_pretrained(model)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model).to(device)



In [None]:
# Quietly install the datasets package.
# NOTE(review): datasets is already installed by the first pip cell of this notebook.
!pip install datasets --quiet

In [None]:
from datasets import load_dataset
import pandas as pd
import os


os.makedirs("summarizer-data", exist_ok=True)

#  Load CNN/DailyMail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# OPTIONAL: use smaller subset so it's lighter (uncomment if needed)
# dataset["train"] = dataset["train"].select(range(20000))
# dataset["validation"] = dataset["validation"].select(range(2000))
# dataset["test"] = dataset["test"].select(range(2000))

# 3) Convert to CSV and save inside summarizer-data folder
dataset["train"].to_pandas().to_csv("summarizer-data/train.csv", index=False)
dataset["validation"].to_pandas().to_csv("summarizer-data/val.csv", index=False)
dataset["test"].to_pandas().to_csv("summarizer-data/test.csv", index=False)

# 4) Zip the folder
!zip -r summarizer-data.zip summarizer-data


In [None]:
from datasets import load_dataset
import pandas as pd
import os

# Create folder for smaller dataset
os.makedirs("summarizer-data", exist_ok=True)

# Load full dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")


train_small = dataset["train"].select(range(2000))        # 2000 samples
val_small = dataset["validation"].select(range(500))       # 500 samples
test_small = dataset["test"].select(range(500))            # 500 samples

# Convert to CSV
train_small.to_pandas().to_csv("summarizer-data/train.csv", index=False)
val_small.to_pandas().to_csv("summarizer-data/val.csv", index=False)
test_small.to_pandas().to_csv("summarizer-data/test.csv", index=False)

!zip -r summarizer-data.zip summarizer-data


In [None]:
# Download & unzip data
!wget https://raw.githubusercontent.com/NaVeen913/Text-Summarization/main/summarizer-data.zip
!unzip -o summarizer-data.zip


In [None]:
import pandas as pd

# Read the three CSV splits back into pandas DataFrames.
train_df, val_df, test_df = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        "summarizer-data/train.csv",
        "summarizer-data/val.csv",
        "summarizer-data/test.csv",
    )
)

# Preview the first training rows.
train_df.head()


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Checkpoint to load from the Hugging Face Hub.
model_name = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights first, then move them to the GPU when one is available.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_name)
if torch.cuda.is_available():
    model_pegasus = model_pegasus.to("cuda")
else:
    model_pegasus = model_pegasus.to("cpu")


In [None]:
def summarize(text):
    """Summarize `text` with the module-level `tokenizer` and `model_pegasus`.

    Args:
        text: Input handed to the tokenizer, which truncates it and pads to the
            longest item.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(model_pegasus.device)

    summary_ids = model_pegasus.generate(
        inputs["input_ids"],
        # Forward the tokenizer's mask so padded positions are not attended to.
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Take the first article; when there is no 'article' column, use the frame's first cell.
if 'article' in train_df.columns:
    sample_text = train_df['article'][0]
else:
    sample_text = train_df.iloc[0, 0]

summary = summarize(sample_text)

# Show the first 500 characters of the source, then the generated summary.
print("Original Text:\n", sample_text[:500], "...")
print("\nSummary:\n", summary)


In [None]:
# Quietly install transformers, datasets, rouge_score and sentencepiece.
# NOTE(review): these packages are also installed by earlier pip cells in this notebook.
!pip install -q transformers datasets rouge_score sentencepiece


In [None]:
import pandas as pd

# Reload the three CSV splits from disk.
_csv_by_split = {
    "train": "summarizer-data/train.csv",
    "val": "summarizer-data/val.csv",
    "test": "summarizer-data/test.csv",
}
train_df = pd.read_csv(_csv_by_split["train"])
val_df = pd.read_csv(_csv_by_split["val"])
test_df = pd.read_csv(_csv_by_split["test"])

# List the column names, then preview the first rows.
print(train_df.columns)
train_df.head()


In [None]:
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq
)
import torch

# Checkpoint shared by the tokenizer and the model.
model_name = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Choose the device, then load the model and move it there.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Wrap each pandas frame in a Hugging Face Dataset.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(_frame) for _frame in (train_df, val_df, test_df)
)

# Token limits used by the preprocessing function.
max_input_length, max_target_length = 512, 64

def preprocess_function(batch):
    """Tokenize a batch of articles and their highlights for seq2seq training.

    Args:
        batch: Mapping with an "article" list (model inputs) and a "highlights"
            list (target summaries).

    Returns:
        The tokenizer output for the articles, with the tokenized highlights'
        ids stored under "labels".
    """
    inputs = batch["article"]
    targets = batch["highlights"]

    # Tokenize the source articles, truncated to max_input_length tokens.
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
    )

    # Tokenize the targets through `text_target=`, which replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches, dropping each split's original columns.
train_tokenized, val_tokenized, test_tokenized = (
    _split.map(preprocess_function, batched=True, remove_columns=_split.column_names)
    for _split in (train_ds, val_ds, test_ds)
)

# Inspect the first tokenized training example.
train_tokenized[0]


In [None]:
# Seq2seq batch collator, built from the tokenizer and the Pegasus model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_pegasus)


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

import inspect

batch_size = 2
num_epochs = 2

# The evaluation-schedule keyword is `eval_strategy` in newer transformers releases and
# `evaluation_strategy` in older ones; the wrong spelling makes the constructor raise a
# TypeError. Pick whichever name this installation's constructor accepts.
_init_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _init_params else "evaluation_strategy"

# Evaluate and checkpoint once per epoch, keep at most two checkpoints, generate text
# during evaluation, and send logs to no external tracker (report_to=[]).
# NOTE(review): fp16 is switched on whenever CUDA is available — confirm this Pegasus
# checkpoint trains stably in half precision.
training_args = Seq2SeqTrainingArguments(
    output_dir="pegasus-summarizer-checkpoints",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=num_epochs,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to=[],
    **{_eval_strategy_key: "epoch"},
)


In [None]:
# Quietly upgrade (-U) transformers, datasets, rouge_score and sentencepiece.
# NOTE(review): these packages are already imported by earlier cells; an upgrade at this
# point may not take effect until the runtime is restarted — confirm.
!pip install -U transformers datasets rouge_score sentencepiece -q


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

batch_size = 2
num_epochs = 2

# Start from the defaults; only the output directory goes through the constructor.
training_args = Seq2SeqTrainingArguments("pegasus-summarizer-checkpoints")

# Core hyperparameters, assigned on the instance after construction.
for _field, _value in (
    ("num_train_epochs", num_epochs),
    ("per_device_train_batch_size", batch_size),
    ("per_device_eval_batch_size", batch_size),
    ("learning_rate", 5e-5),
    ("weight_decay", 0.01),
    ("logging_steps", 50),
    ("save_total_limit", 2),
):
    setattr(training_args, _field, _value)

# Optional settings: each is applied only when the instance exposes that attribute.
for _field, _value in (
    ("evaluation_strategy", "epoch"),
    ("save_strategy", "epoch"),
    ("predict_with_generate", True),
):
    if hasattr(training_args, _field):
        setattr(training_args, _field, _value)

# The fp16 flag follows CUDA availability, again only when the attribute exists.
if hasattr(training_args, "fp16"):
    training_args.fp16 = torch.cuda.is_available()



In [None]:
# Quietly install the evaluate and rouge_score packages.
!pip install -q evaluate rouge_score


In [None]:
import evaluate
import numpy as np

# Load the "rouge" metric through the evaluate library; compute_metrics below calls it.
rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from every prediction and label string.

    Returns:
        A (preds, labels) tuple of new lists holding the stripped strings.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    cleaned_labels = []
    for text in labels:
        cleaned_labels.append(text.strip())

    return cleaned_preds, cleaned_labels

def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) and the mean generated length.

    Args:
        eval_pred: Pair of (predictions, labels) token-id arrays. Entries equal
            to -100 are replaced with the tokenizer's pad id before decoding.

    Returns:
        dict mapping each ROUGE key, plus "gen_len", to a value rounded to
        4 decimal places.
    """
    predictions, labels = eval_pred

    # If predictions arrive as a tuple, take its first element as the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id for decoding; swap it for the pad id in both arrays.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # Express the scores as percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Average number of non-pad tokens per prediction.
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)

    # Unpack key and value: iterating `for k in result.items()` left `v` undefined.
    return {k: round(v, 4) for k, v in result.items()}


In [None]:
import os
# Set WANDB_DISABLED to "true" in this process's environment.
# NOTE(review): the following cell removes this variable again, so the net effect of the
# two cells is that WANDB_DISABLED ends up unset.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
import os
# Remove WANDB_DISABLED from the environment; the None default avoids a KeyError when the
# variable is absent.
# NOTE(review): this undoes the assignment made in the preceding cell — confirm which of the
# two cells is meant to stay.
os.environ.pop("WANDB_DISABLED", None)


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

# Per-device batch size and number of epochs for fine-tuning.
batch_size = 2
num_epochs = 2

# Build the arguments object with only the output directory; everything else is assigned
# on the instance afterwards.
# NOTE(review): values assigned after construction are not seen by whatever checks the
# constructor performs — confirm this is intended.
training_args = Seq2SeqTrainingArguments("pegasus-summarizer-checkpoints")

training_args.num_train_epochs = num_epochs
training_args.per_device_train_batch_size = batch_size
training_args.per_device_eval_batch_size = batch_size
training_args.learning_rate = 5e-5
training_args.weight_decay = 0.01
training_args.logging_steps = 50
training_args.save_total_limit = 2

# Each of these settings is applied only when the instance exposes the attribute.
# NOTE(review): if the installed transformers names the field `eval_strategy` instead of
# `evaluation_strategy`, the first guard is False and per-epoch evaluation is silently
# skipped — confirm against the installed version.
if hasattr(training_args, "evaluation_strategy"):
    training_args.evaluation_strategy = "epoch"
if hasattr(training_args, "save_strategy"):
    training_args.save_strategy = "epoch"
if hasattr(training_args, "predict_with_generate"):
    training_args.predict_with_generate = True


# An empty report_to list requests no logging integrations.
if hasattr(training_args, "report_to"):
    training_args.report_to = []      # or ["none"]


In [None]:
# Collect the trainer's inputs: model, arguments, tokenized splits, tokenizer, collator
# and the metric callback.
_trainer_kwargs = dict(
    model=model_pegasus,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**_trainer_kwargs)

# Launch fine-tuning.
trainer.train()


In [None]:
# Unbind the `trainer` name used for training; a new trainer is created in a later cell.
del trainer


In [None]:
import evaluate
import numpy as np

# Load the "rouge" metric through the evaluate library (same call as the earlier metrics cell).
rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Return (preds, labels) with surrounding whitespace stripped from each string."""

    def _strip_all(texts):
        # Build a new list; the input sequence is left untouched.
        return [text.strip() for text in texts]

    return _strip_all(preds), _strip_all(labels)

def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) and the mean generated length.

    Args:
        eval_pred: Pair of (predictions, labels) token-id arrays. Entries equal
            to -100 are replaced with the tokenizer's pad id before decoding.

    Returns:
        dict mapping each ROUGE key, plus "gen_len", to a value rounded to
        4 decimal places.
    """
    predictions, labels = eval_pred

    # If predictions arrive as a tuple, take its first element as the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id for decoding; swap it for the pad id in the
    # predictions as well as the labels so decoding and gen_len ignore it.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Convert to percentages, rounded to 4 decimals.
    final_result = {key: round(value * 100, 4) for key, value in result.items()}

    # Add the average number of non-pad tokens per prediction.
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]
    final_result["gen_len"] = round(np.mean(prediction_lens), 4)

    return final_result


In [None]:
# Build a fresh trainer around the same model and data so that it picks up the
# compute_metrics defined in the cell above.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model_pegasus,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
)


In [None]:
# Evaluate with the trainer (its eval_dataset is the tokenized validation split) and print
# the returned metrics.
val_metrics = trainer.evaluate()
print(val_metrics)


In [None]:
# Run prediction over the tokenized test split and print the metrics attached to the result.
test_results = trainer.predict(test_tokenized)
print("Test metrics:\n", test_results.metrics)


In [None]:
# Directory that receives the fine-tuned model and tokenizer files.
save_dir = "pegasus-text-summarizer"

# Both calls write into the same directory, so it can be reloaded with from_pretrained.
trainer.save_model(save_dir)          # saves model weights
tokenizer.save_pretrained(save_dir)   # saves tokenizer

print("Model saved to:", save_dir)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Run on the GPU when available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Reload the tokenizer and model saved under the pegasus-text-summarizer directory.
loaded_tokenizer = AutoTokenizer.from_pretrained("pegasus-text-summarizer")
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("pegasus-text-summarizer")
loaded_model = loaded_model.to(device)

def generate_summary(text, max_len=128, max_input_len=512, num_beams=5):
    """Summarize `text` with the reloaded `loaded_model` / `loaded_tokenizer`.

    Args:
        text: Input handed to the tokenizer.
        max_len: Maximum length passed to generation (default 128).
        max_input_len: Token limit the input is truncated to (default 512).
        num_beams: Beam count used for beam search (default 5).

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # Put the inputs on whatever device the model currently lives on.
    inputs = loaded_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_len
    ).to(loaded_model.device)

    summary_ids = loaded_model.generate(
        inputs["input_ids"],
        # Forward the tokenizer's mask instead of leaving generate() to infer one.
        attention_mask=inputs["attention_mask"],
        num_beams=num_beams,
        max_length=max_len,
        early_stopping=True
    )

    return loaded_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Summarize the first training article with the reloaded model.
sample_text = train_df["article"][0]

# Print the first 600 characters of the source, then the generated summary.
_preview = sample_text[:600]
print("ORIGINAL:\n", _preview, "...\n")
_generated = generate_summary(sample_text)
print("SUMMARY:\n", _generated)
