<a href="https://colab.research.google.com/github/LohithVarun/NLP_Project/blob/main/NLP_Text_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install transformers
!pip install PyPDF2
!pip install rouge
!pip install bert-extractive-summarizer
!pip install sentencepiece

import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import pipeline
import PyPDF2
import os
import numpy as np
from rouge import Rouge
import re
from typing import List, Dict
import logging

class TextSummarizer:
    """
    Abstractive summarizer for PDF and plain-text files backed by a Pegasus model.

    The input text is cleaned, split into character-bounded chunks, each chunk
    is summarized independently, and the per-chunk summaries are joined into
    the final summary. ROUGE scores of the summary against the source text are
    reported when they can be computed.
    """

    def __init__(self, model_name: str = "google/pegasus-large"):
        """
        Initialize the summarizer with specified model
        Args:
            model_name (str): Name of the pretrained model to use
        """
        # Run on the GPU when one is visible to torch, otherwise on the CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.model = PegasusForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.rouge = Rouge()
        self.logger = self._setup_logger()

    def _setup_logger(self) -> logging.Logger:
        """
        Configure root logging at INFO level and return this module's logger
        Returns:
            logging.Logger: Logger named after the current module
        """
        logging.basicConfig(level=logging.INFO)
        return logging.getLogger(__name__)

    def read_pdf(self, file_path: str) -> str:
        """
        Read and extract text from PDF file
        Args:
            file_path (str): Path to the PDF file
        Returns:
            str: Preprocessed text of all pages, or "" if the file could not
                be read (the error is logged, not raised)
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # `or ""` guards against a page yielding no text (None/empty).
                page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        except Exception as e:
            # Best-effort boundary: any open/parse failure yields empty text.
            self.logger.error("Error reading PDF file: %s", e)
            return ""

        # Join pages with a newline so the last word of one page is not fused
        # with the first word of the next; preprocessing collapses it to a space.
        return self._preprocess_text("\n".join(page_texts))

    def _preprocess_text(self, text: str) -> str:
        """
        Preprocess the input text
        Args:
            text (str): Input text
        Returns:
            str: Text with whitespace runs collapsed to single spaces and every
                character other than word characters, whitespace and . , ! ? -
                removed
        """
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        return text.strip()

    def _chunk_text(self, text: str, max_chunk_size: int = 1024) -> List[str]:
        """
        Split text into chunks for processing
        Args:
            text (str): Input text
            max_chunk_size (int): Maximum chunk size, counted in characters
                with one separator character budgeted per word
        Returns:
            List[str]: List of non-empty text chunks; a single word longer
                than max_chunk_size becomes its own (oversized) chunk
        """
        chunks = []
        current_chunk = []
        current_size = 0

        for word in text.split():
            word_size = len(word) + 1  # +1 budgets the joining space
            if current_size + word_size <= max_chunk_size:
                current_chunk.append(word)
                current_size += word_size
                continue

            # Flush only a non-empty chunk: when the very first word is already
            # too long there is nothing accumulated yet, and emitting '' would
            # send an empty input to the model.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_size = word_size

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _summarize_chunk(self, chunk: str, max_length: int, min_length: int) -> str:
        """
        Generate the summary of a single text chunk
        Args:
            chunk (str): Text chunk to summarize
            max_length (int): Maximum length passed to generation
            min_length (int): Minimum length passed to generation
        Returns:
            str: Decoded summary with special tokens removed
        """
        inputs = self.tokenizer(
            chunk, return_tensors="pt", max_length=1024, truncation=True
        ).to(self.device)
        # Pass the full encoding so the attention mask accompanies input_ids.
        summary_ids = self.model.generate(
            **inputs,
            max_length=max_length,
            min_length=min_length,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def _score_summary(self, summary: str, reference: str):
        """
        Compute ROUGE scores of a summary against its source text
        Args:
            summary (str): Generated summary (hypothesis)
            reference (str): Source text (reference)
        Returns:
            The value returned by Rouge.get_scores, or None when the source
            has 10 words or fewer, the summary is blank, or scoring fails
        """
        # Only calculate ROUGE if there's enough text and something to score.
        if len(reference.split()) <= 10 or not summary.strip():
            return None
        try:
            return self.rouge.get_scores(summary, reference)
        except (ValueError, RecursionError) as e:
            # Scoring is auxiliary: a failure here (e.g. a hypothesis that is
            # empty after the scorer's own cleanup; very long inputs are
            # reported to hit recursion limits) must not discard the summary.
            self.logger.warning("Could not compute ROUGE scores: %s", e)
            return None

    def summarize(self, file_path: str, max_length: int = 150, min_length: int = 50) -> Dict:
        """
        Generate summary for the given file
        Args:
            file_path (str): Path to the input file; a name ending in ".pdf"
                (any letter case) is read as PDF, anything else as UTF-8 text
            max_length (int): Maximum length of each chunk summary
            min_length (int): Minimum length of each chunk summary
        Returns:
            Dict: On success, "summary", "rouge_scores" (None when not
                computed), "original_length" and "summary_length" (word
                counts). On failure, "summary" (empty) and "error" (message);
                the error is logged rather than raised.
        """
        try:
            # Read and preprocess text
            if file_path.lower().endswith('.pdf'):
                text = self.read_pdf(file_path)
            else:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = self._preprocess_text(file.read())

            if not text:
                raise ValueError("No text could be extracted from the file")

            # Summarize each chunk independently and combine the results
            summaries = [
                self._summarize_chunk(chunk, max_length, min_length)
                for chunk in self._chunk_text(text)
            ]
            final_summary = " ".join(summaries)

            return {
                "summary": final_summary,
                "rouge_scores": self._score_summary(final_summary, text),
                "original_length": len(text.split()),
                "summary_length": len(final_summary.split())
            }

        except Exception as e:
            self.logger.error("Error in summarization: %s", e)
            return {
                "summary": "",
                "error": str(e)
            }

# Example usage
def main():
    """
    Summarize the bundled sample PDF and print the summary and ROUGE scores.

    If summarization fails, the error message from the result is printed
    instead of an empty summary.
    """
    summarizer = TextSummarizer()

    # Example with sample file
    result = summarizer.summarize("NLP_Text_Summarization.pdf")

    # summarize() reports failures through an "error" entry instead of raising.
    if result.get("error"):
        print("Summarization failed:", result["error"])
        return

    print("Summary:", result["summary"])

    rouge_scores = result.get("rouge_scores")
    if rouge_scores:
        metrics = rouge_scores[0]
        print("\nROUGE Scores:")
        print("ROUGE-1:", metrics["rouge-1"])
        print("ROUGE-2:", metrics["rouge-2"])
        # The rouge package keys the LCS metric as lowercase "rouge-l".
        print("ROUGE-L:", metrics["rouge-l"])


if __name__ == "__main__":
    main()