<a href="https://colab.research.google.com/github/Huzaifa3242/Text_summarization-using-t5-transformer/blob/main/Text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

In [None]:
# Step 1: Install required libraries
# pip install datasets pandas nltk

from datasets import load_dataset
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords

# Step 2: Fetch the NLTK resources used below (stop-word list and
# tokenizer data for nltk.word_tokenize).
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Step 3: Load the dataset, pinned to one specific config (snapshot).
dataset = load_dataset(
    "permutans/fineweb-bbc-news",
    "CC-MAIN-2020-10",
)

# Step 4: Materialize the train split as a pandas DataFrame.
train_split = dataset['train']
df = pd.DataFrame(train_split)

# Step 5: Text normalization helper.
# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Return *text* lower-cased, stripped to letters, without stop words.

    Every character other than a-z and whitespace is deleted, the result is
    tokenized with ``nltk.word_tokenize``, tokens found in ``stop_words`` are
    dropped, and the remaining tokens are joined with single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    kept_tokens = filter(
        lambda token: token not in stop_words,
        nltk.word_tokenize(letters_only),
    )
    return ' '.join(kept_tokens)

# Step 6: Pair every raw article with its preprocessed counterpart.
articles = df['text']
final_df = pd.DataFrame({
    'article': articles,
    'summary': articles.apply(preprocess_text),
})

# Optional: Save to CSV
# final_df.to_csv("bbc_news_preprocessed.csv", index=False)

# Preview the first rows.
preview = final_df.head()
print(preview)

In [None]:
# Keep the first 15,000 rows of the article/summary frame built above.
# Slicing `df` here (as before) threw that frame away and left the raw
# dataset columns in place, which does not match the two-column rename
# applied in the next cell.
final_df = final_df.iloc[0:15000, :]

In [None]:
# Drop rows with missing values, then exact duplicate rows.
df = final_df.dropna().drop_duplicates()
# Rename the two columns to the names the dataset class reads.
df.columns = ["text", "summary"]

In [None]:
# Remove every non-ASCII character from both columns by round-tripping
# through an ASCII encode that ignores unencodable characters.
for column_name in ("text", "summary"):
    ascii_bytes = df[column_name].str.encode('ascii', 'ignore')
    df[column_name] = ascii_bytes.str.decode('ascii')

In [None]:
!pip install --quiet transformers

In [None]:
!pip install --quiet pytorch-lightning

In [None]:
from torch.utils.data import Dataset,DataLoader
import torch
import numpy as np
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split

In [None]:
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)
from torch.optim import AdamW
from tqdm.auto import tqdm

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the same
# split (and therefore comparable metrics) is produced on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
class SummaryDataset(Dataset):
    """Map-style dataset that tokenizes (text, summary) rows for seq2seq training.

    Args:
        data: Frame with a "text" column (model input) and a "summary"
            column (target).
        tokenizer: Tokenizer called on each string; it must return
            "input_ids" and "attention_mask" tensors and expose
            ``pad_token_id``.
        text_max_token_len: Length the input text is padded/truncated to.
        summary_max_token_len: Length the summary is padded/truncated to.
    """

    # Label value that the Hugging Face seq2seq loss skips.
    IGNORE_INDEX = -100

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "T5Tokenizer",
        text_max_token_len: int = 512,
        summary_max_token_len: int = 128,
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def _encode(self, text: str, max_length: int):
        """Tokenize *text* into fixed-length tensors of size *max_length*."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

    def __getitem__(self, index: int):
        """Return the raw strings and the tokenized tensors for row *index*.

        Returns:
            dict with keys ``text``, ``summary``, ``text_input_ids``,
            ``text_attention_mask``, ``labels`` and ``labels_attention_mask``.
            The tensors are flattened to one dimension.
        """
        data_row = self.data.iloc[index]
        text = data_row["text"]
        summary = data_row["summary"]

        text_encodings = self._encode(text, self.text_max_token_len)
        summary_encodings = self._encode(summary, self.summary_max_token_len)

        # Mask padding positions so they do not contribute to the loss.
        # The previous code rewrote id 0 to pad_token_id, which is a no-op
        # whenever the pad id is 0 (as it is for T5), so padding was being
        # trained on. Work on a copy to leave the encoding untouched.
        labels = summary_encodings["input_ids"].clone()
        labels[labels == self.tokenizer.pad_token_id] = self.IGNORE_INDEX

        return dict(
            text=text,
            summary=summary,
            text_input_ids=text_encodings["input_ids"].flatten(),
            text_attention_mask=text_encodings["attention_mask"].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encodings["attention_mask"].flatten(),
        )

In [None]:
class SummaryDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train and test frames.

    ``setup`` builds one ``SummaryDataset`` per frame. The validation loader
    serves the same dataset as the test loader.

    Args:
        train_df: Rows used for training.
        test_df: Rows used for both validation and testing.
        tokenizer: Tokenizer handed to each ``SummaryDataset``.
        batch_size: Batch size of every loader.
        text_max_token_len: Token length for input texts.
        summary_max_token_len: Token length for summaries.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        text_max_token_len: int = 512,
        summary_max_token_len: int = 128,
    ):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def _make_dataset(self, frame):
        """Wrap *frame* in a SummaryDataset using this module's settings."""
        return SummaryDataset(
            frame,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len,
        )

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader with the worker/pinning settings shared by all splits."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=2,
            pin_memory=True,
        )

    def setup(self, stage=None):
        """Create the train and test datasets; *stage* is not consulted."""
        self.train_dataset = self._make_dataset(self.train_df)
        self.test_dataset = self._make_dataset(self.test_df)

    def train_dataloader(self):
        """Return the shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def test_dataloader(self):
        """Return the unshuffled loader over the test dataset."""
        return self._make_loader(self.test_dataset, shuffle=False)

    def val_dataloader(self):
        """Return an unshuffled loader over the test dataset, reused for validation."""
        return self._make_loader(self.test_dataset, shuffle=False)

In [None]:
EPOCHS = 10
BATCH_SIZE = 16

# The tokenizer was referenced here (and later in summarize_text) without
# ever being created; build it from the same checkpoint the model uses.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# BATCH_SIZE used to be declared but never passed, so the module silently
# fell back to its default of 8. Lower BATCH_SIZE if the GPU runs out of
# memory.
data_module = SummaryDataModule(
    train_df,
    test_df,
    tokenizer,
    batch_size=BATCH_SIZE,
)

In [None]:
import pytorch_lightning as pl
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning.loggers import CSVLogger

class SummaryModel(pl.LightningModule):
    """Lightning wrapper around a pretrained "t5-base" conditional-generation model.

    The train, validation and test steps run the same forward pass and log
    the loss under "train_loss", "val_loss" and "test_loss" respectively.
    """

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
        self.tokenizer = T5Tokenizer.from_pretrained("t5-base")

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        Args:
            input_ids: Token ids of the input text.
            attention_mask: Attention mask for ``input_ids``.
            decoder_attention_mask: Attention mask for the target sequence.
            labels: Target token ids, forwarded to the model unchanged.
        """
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        return output.loss, output.logits

    def _shared_step(self, batch, metric_name):
        """Compute the loss for *batch*, log it as *metric_name*, and return it."""
        loss, _ = self(
            input_ids=batch["text_input_ids"],
            attention_mask=batch["text_attention_mask"],
            decoder_attention_mask=batch["labels_attention_mask"],
            labels=batch["labels"]
        )
        self.log(metric_name, loss, prog_bar=True, logger=True, on_epoch=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for *batch* (logged as "train_loss")."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for *batch* (logged as "val_loss")."""
        return self._shared_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss for *batch* (logged as "test_loss")."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters (lr=1e-4, weight_decay=0.01)."""
        from torch.optim import AdamW
        return AdamW(self.parameters(), lr=1e-4, weight_decay=0.01)

In [None]:
# Instantiate the Lightning module; its constructor loads the pretrained
# "t5-base" model and tokenizer.
model=SummaryModel()

In [None]:
# Load the TensorBoard notebook extension and open it on the directory
# that the TensorBoardLogger below is configured to write to.
%load_ext tensorboard
%tensorboard --logdir ./lightening_logs

In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

# Checkpointing: keep only the single checkpoint with the lowest val_loss.
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    dirpath="checkpoints",
    filename="best-checkpoint",
    verbose=True,
)

# Early stopping: also driven by val_loss (lower is better), patience of 2.
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=2,
    verbose=True,
)

# Two loggers side by side: TensorBoard for plots, CSV for tabular metrics.
tensorboard_logger = TensorBoardLogger("lightening_logs", name="bbc-news-summary")
csv_logger = CSVLogger("logs", name="bbc-news-summary")

training_loggers = [tensorboard_logger, csv_logger]
training_callbacks = [checkpoint_callback, early_stop_callback]

# Single-GPU trainer capped at EPOCHS epochs, with anomaly detection enabled.
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    accelerator="gpu",
    devices=1,
    callbacks=training_callbacks,
    logger=training_loggers,
    detect_anomaly=True,
    enable_model_summary=True,
    enable_progress_bar=True,
)


In [None]:
# Train the model using the loaders supplied by the data module.
trainer.fit(model,data_module)

In [None]:
# Reload the best checkpoint recorded by the trainer's checkpoint callback
# and freeze it for inference.
best_checkpoint_path = trainer.checkpoint_callback.best_model_path
trained_model = SummaryModel.load_from_checkpoint(best_checkpoint_path)
trained_model.freeze()

In [None]:
from operator import length_hint
def summarize_text(text):
    """Generate a summary of *text* with ``trained_model`` and return it as a string.

    The text is tokenized to a fixed length of 512 tokens, moved to the
    model's device, and decoded with 2-beam search capped at 200 tokens.
    All decoded sequences are concatenated without a separator.
    """
    encoded = tokenizer(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

    # The inputs must live on the same device as the model weights.
    target_device = trained_model.device
    input_ids = encoded["input_ids"].to(target_device)
    attention_mask = encoded["attention_mask"].to(target_device)

    generated_sequences = trained_model.model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=200,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )

    decoded = []
    for sequence in generated_sequences:
        decoded.append(
            tokenizer.decode(
                sequence,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        )
    return "".join(decoded)

In [None]:
!pip install rouge_score
!pip install evaluate

In [None]:
import evaluate
# Reference summaries come straight from the test frame (it must have a
# "summary" column); predictions are generated one article at a time.
refs = test_df["summary"].tolist()
preds = [summarize_text(article) for article in test_df["text"]]

# Load the ROUGE metric and score the predictions against the references.
rouge = evaluate.load("rouge")
results = rouge.compute(predictions=preds, references=refs)

# Print one line per reported score.
for metric_name, score in results.items():
    print(f"{metric_name}: {score}")