<a href="https://colab.research.google.com/github/LohithVarun/NLP_Project/blob/main/NLP_Text_Summarizer_Real.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install PyPDF2
!pip install rouge
!pip install sentencepiece
!pip install unsloth_zoo
!pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [9]:
from unsloth import FastLanguageModel
import torch
from unsloth.chat_templates import get_chat_template
from typing import List, Dict, Optional
import logging
from PyPDF2 import PdfReader
import re
from tqdm.auto import tqdm

class ResearchPaperSummarizer:
    """Summarize a research paper (PDF or plain-text file) with a chat LLM.

    The paper text is cleaned, split into sentence-aligned chunks, each chunk
    is summarized on its own, and the chunk summaries are then condensed into
    a single final summary.
    """

    def __init__(
        self,
        model_name: str = "unsloth/Llama-3.2-3B-Instruct",
        max_seq_length: int = 2048*6,
        load_in_4bit: bool = True
    ):
        """Load the model and tokenizer through unsloth and switch to inference mode.

        Args:
            model_name: Model identifier handed to ``FastLanguageModel.from_pretrained``.
            max_seq_length: Maximum sequence length handed to the model loader.
            load_in_4bit: Whether to request 4-bit loading from the model loader.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.logger = self._setup_logger()

        # Initialize model using unsloth
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=max_seq_length,
            dtype=None,  # Auto detect
            load_in_4bit=load_in_4bit,
        )

        # Configure model with LoRA for efficient fine-tuning
        # NOTE(review): this class only ever runs inference; the adapters are
        # kept to preserve the existing setup -- confirm they are still needed.
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=16,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
            lora_alpha=16,
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=3407,
        )

        # Set up for inference
        FastLanguageModel.for_inference(self.model)

        # Set up chat template
        self.tokenizer = get_chat_template(
            self.tokenizer,
            chat_template="llama-3.1",
        )

    def _setup_logger(self) -> logging.Logger:
        """Configure logging at INFO level and return this module's logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        return logging.getLogger(__name__)

    def read_pdf(self, file_path: str) -> str:
        """Extract and clean the text of every page of a PDF file.

        Args:
            file_path: Path to the PDF file.

        Returns:
            The preprocessed text of the whole document, or ``""`` when the
            file cannot be read (the error is logged, not raised).
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                # Extract while the file is still open. ``or ""`` guards
                # against a page that yields no text at all.
                page_texts = [
                    page.extract_text() or "" for page in pdf_reader.pages
                ]
            # Join with a space so the last word of one page is not fused
            # with the first word of the next; preprocessing collapses any
            # surplus whitespace afterwards.
            return self._preprocess_text(" ".join(page_texts))
        except Exception as e:
            self.logger.error(f"Error reading PDF file: {str(e)}")
            return ""

    def _preprocess_text(self, text: str) -> str:
        """Normalize whitespace and drop characters outside a small allowed set.

        Word characters, whitespace and the punctuation ``.,!?;:()-'`` are
        kept; everything else is removed.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned text with single spaces and no leading/trailing space.
        """
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep essential punctuation
        text = re.sub(r'[^\w\s.,!?;:()\-\']', '', text)
        # Removing characters can leave two spaces side by side
        text = re.sub(r'\s{2,}', ' ', text)
        return text.strip()

    def _chunk_text(self, text: str, max_chunk_size: int = 2048*5) -> List[str]:
        """Split text into chunks of at most ``max_chunk_size`` words.

        Chunks are built from whole sentences (split after ``.``, ``!`` or
        ``?``). A single sentence longer than the budget is hard-split on word
        boundaries so that no chunk ever exceeds the budget.

        NOTE(review): the budget is counted in whitespace-separated words, not
        model tokens -- confirm the default leaves enough room within the
        model's maximum sequence length.

        Args:
            text: Text to split.
            max_chunk_size: Maximum number of words per chunk; must be >= 1.

        Returns:
            The list of chunks in document order; empty when ``text`` contains
            no words.

        Raises:
            ValueError: If ``max_chunk_size`` is smaller than 1.
        """
        if max_chunk_size < 1:
            raise ValueError("max_chunk_size must be a positive integer")

        chunks: List[str] = []
        current_chunk: List[str] = []
        current_length = 0

        for sentence in re.split(r'(?<=[.!?])\s+', text):
            words = sentence.split()
            if not words:
                # Empty input or stray whitespace: nothing to summarize.
                continue

            if len(words) > max_chunk_size:
                # The sentence alone is over budget: flush what we have and
                # cut the sentence itself into budget-sized pieces.
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                    current_chunk, current_length = [], 0
                for start in range(0, len(words), max_chunk_size):
                    chunks.append(' '.join(words[start:start + max_chunk_size]))
                continue

            if current_length + len(words) > max_chunk_size:
                # current_chunk is non-empty here because the sentence on its
                # own fits within the budget.
                chunks.append(' '.join(current_chunk))
                current_chunk, current_length = [], 0

            current_chunk.append(sentence)
            current_length += len(words)

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _generate_summary(self, text: str, max_new_tokens: int = 500) -> str:
        """Ask the model to summarize one piece of text.

        Args:
            text: The excerpt to summarize; it is embedded in a fixed user prompt.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The model's reply with special tokens removed and whitespace stripped.
        """
        messages = [{
            "role": "user",
            "content": (
                "Please summarize the following research paper excerpt. Focus on key findings, "
                "methodology, and conclusions. Format the summary in clear, concise language:\n\n"
                f"{text}"
            )
        }]

        inputs = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(self.device)

        outputs = self.model.generate(
            input_ids=inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            min_p=0.1,
            do_sample=True,
            use_cache=True
        )

        # The generated sequence starts with the prompt tokens. Decode only
        # what follows them instead of searching the decoded text for the
        # word "assistant", which also occurs in ordinary paper text and
        # summaries and would cut the reply at the wrong place.
        prompt_length = inputs.shape[-1]
        generated_tokens = outputs[0][prompt_length:]
        return self.tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        ).strip()

    def summarize(self, file_path: str) -> Dict:
        """
        Generate a comprehensive summary of a research paper

        Args:
            file_path: Path to the PDF or text file containing the research paper.
                A path ending in ``.pdf`` (any letter case) is read as a PDF;
                anything else is read as UTF-8 text.

        Returns:
            On success, a dictionary with the keys ``summary``,
            ``original_length`` and ``summary_length`` (both in words) and
            ``num_chunks_processed``. On failure, a dictionary with an empty
            ``summary`` and an ``error`` message; the exception is logged
            rather than propagated.
        """
        try:
            # Read and preprocess the paper
            if file_path.lower().endswith('.pdf'):
                text = self.read_pdf(file_path)
            else:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = self._preprocess_text(file.read())

            if not text:
                raise ValueError("No text could be extracted from the file")

            # Split into chunks and summarize each section
            chunks = self._chunk_text(text)
            self.logger.info(f"Processing paper in {len(chunks)} chunks")

            summaries = [
                self._generate_summary(chunk)
                for chunk in tqdm(chunks, desc="Summarizing chunks")
            ]

            # Combine chunk summaries into a final summary
            combined_summary = " ".join(summaries)

            # Generate a final, condensed summary. The instruction below ends
            # up inside the standard excerpt prompt built by _generate_summary.
            final_summary = self._generate_summary(
                "Please provide a concise, well-structured final summary of this research paper: " +
                combined_summary
            )

            return {
                "summary": final_summary,
                "original_length": len(text.split()),
                "summary_length": len(final_summary.split()),
                "num_chunks_processed": len(chunks)
            }

        except Exception as e:
            # Top-level boundary: callers expect an error dictionary, not an
            # exception.
            self.logger.error(f"Error in summarization process: {str(e)}")
            return {
                "summary": "",
                "error": str(e)
            }

def main():
    """Summarize the sample paper and print the summary with length statistics."""
    paper_summarizer = ResearchPaperSummarizer()
    outcome = paper_summarizer.summarize("test_nlp.pdf")

    # Failure path: report the error and stop.
    if "error" in outcome:
        print(f"Error: {outcome['error']}")
        return

    divider = "-" * 80
    source_words = outcome['original_length']
    summary_words = outcome['summary_length']

    print("\nSummary:")
    print(divider)
    print(outcome["summary"])
    print(divider)
    print(f"\nOriginal length: {source_words} words")
    print(f"Summary length: {summary_words} words")
    print(f"Compression ratio: {summary_words/source_words:.2%}")


# Run the demo only when executed as a script or notebook cell.
if __name__ == "__main__":
    main()

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.50.0.dev0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Summarizing chunks:   0%|          | 0/1 [00:00<?, ?it/s]


Summary:
--------------------------------------------------------------------------------
**Summary of Research Paper Excerpt: Sentiment Analysis**

**Key Findings:**

1. Sentiment analysis is crucial in various applications, including customer service, market research, and social media monitoring.
2. The complexity of language and variability of human emotions pose significant challenges for sentiment analysis.
3. Machine learning algorithms and NLP techniques are effective approaches to analyzing sentiment in text data.
4. Human evaluation is critical in sentiment analysis, and more effective methods are needed to evaluate sentiment analysis systems.

**Methodology:**

1. The authors reviewed various approaches to sentiment analysis, including machine learning algorithms, NLP techniques, and rule-based systems.
2. They evaluated the strengths and weaknesses of each approach and provided examples of their application in different domains.
3. A new approach to sentiment analysis was p