<a href="https://colab.research.google.com/github/LohithVarun/NLP_Project/blob/main/NLP_Text_Summarizer_Real.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q --no-deps xformers trl peft accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install git+https://github.com/huggingface/transformers
!pip install PyPDF2
!pip install rouge
!pip install sentencepiece
!pip install unsloth_zoo
!pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-e1kmb8o9
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-e1kmb8o9
  Resolved https://github.com/huggingface/transformers to commit a40f1ac602fe900281722254c52ce3773f28eb0e
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... done
  Created wheel for transformers: filename=transformers-4.50.0.dev0-py3-none-any.whl size=10860615 sha256=c6e1e3e9d8d3a97250f01a8a5c7b780e2be63341be3e2f4ce4311befba71d3ea
  Stored in directory: /tmp/pip-ephem-wheel-cache-ug0c62ff/wheels/04/a3/f1/b88775f8e1665827525b19ac7590250f1038d947067beba9fb
Successfully built transformers

Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-8bk83mwg
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-8bk83mwg
  Resolved https://github.com/unslothai/unsloth.git to commit 088765042786ede9e62dd888d9956424293232dd
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... done
  Created wheel for unsloth: filename=unsloth-2025.2.15-py3-none-any.whl size=189844 sha256=de0c3d177206c3180a00551e365be6ccd90d4048c5fa76d3e334cd96d61e51e2
  Stored in directory: /tmp/pip-ephem-wheel-cache-kdscgn7v/wheels/d1/17/05/850ab10c33284a4763b0595cd8ea9d01fce6e221cac24b3c01
Successfully built unsloth
Installing collected packages: unsloth


In [None]:
from unsloth import FastLanguageModel
import torch
from unsloth.chat_templates import get_chat_template
from typing import List, Dict, Optional
import logging
from PyPDF2 import PdfReader
import re
from tqdm.auto import tqdm

class ResearchPaperSummarizer:
    """Summarize research papers (PDF or plain text) with a Llama chat model.

    The paper text is cleaned, split into word-bounded chunks, each chunk is
    summarized on its own, and the partial summaries are then condensed in one
    final generation pass.
    """

    def __init__(
        self,
        model_name: str = "unsloth/Llama-3.2-3B-Instruct",
        max_seq_length: int = 2048*6,
        load_in_4bit: bool = True
    ):
        """Load the model and tokenizer through unsloth and prepare them for inference.

        Args:
            model_name: Model identifier handed to
                ``FastLanguageModel.from_pretrained``.
            max_seq_length: Token budget used both when loading the model and
                when trimming prompts in ``_generate_summary``.
            load_in_4bit: Forwarded to ``from_pretrained``.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.logger = self._setup_logger()
        self.max_seq_length = max_seq_length

        # Initialize model using unsloth
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=max_seq_length,
            dtype=None,  # Auto detect
            load_in_4bit=load_in_4bit,
        )

        # Attach LoRA adapters.
        # NOTE(review): nothing in this class trains the adapters; the model is
        # switched to inference mode right below. Confirm whether this step is
        # still needed.
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=16,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
            lora_alpha=16,
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=3407,
        )

        # Set up for inference
        FastLanguageModel.for_inference(self.model)

        # Set up chat template
        self.tokenizer = get_chat_template(
            self.tokenizer,
            chat_template="llama-3.1",
        )

    def _setup_logger(self) -> logging.Logger:
        """Configure root logging at INFO level and return this module's logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        return logging.getLogger(__name__)

    def read_pdf(self, file_path: str) -> str:
        """Extract and clean the text of every page of a PDF.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The preprocessed text of the document, or an empty string when
            reading or parsing fails (the error is logged).
        """
        try:
            with open(file_path, 'rb') as file:
                # Pages must be extracted while the file handle is still open.
                # `or ""` guards against a page that yields no text at all.
                page_texts = [
                    page.extract_text() or "" for page in PdfReader(file).pages
                ]
        except Exception as e:
            self.logger.error(f"Error reading PDF file: {str(e)}")
            return ""
        # Join with a newline so the last word of one page cannot fuse with
        # the first word of the next; _preprocess_text turns it into a space.
        return self._preprocess_text("\n".join(page_texts))

    def _preprocess_text(self, text: str) -> str:
        """Normalize whitespace and strip characters outside the kept set.

        Kept characters are word characters, whitespace and the punctuation
        ``. , ! ? ; : ( ) - '``; everything else is removed.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned text without leading or trailing whitespace.
        """
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep essential punctuation
        text = re.sub(r'[^\w\s.,!?;:()\-\']', '', text)
        # Removing a character between two spaces leaves a double space behind
        text = re.sub(r'\s{2,}', ' ', text)
        return text.strip()

    def _chunk_text(self, text: str, max_chunk_size: int = 1536) -> List[str]:
        """Split text into chunks of at most ``max_chunk_size`` words.

        Sentences (split after ``.``, ``!`` or ``?``) are packed greedily into
        chunks. A single sentence longer than the limit is cut into word
        windows so that no chunk exceeds the limit.

        Args:
            text: Text to split.
            max_chunk_size: Maximum number of whitespace-separated words per
                chunk.

        Returns:
            The list of chunks; empty when ``text`` contains no words.

        Raises:
            ValueError: If ``max_chunk_size`` is smaller than 1.
        """
        if max_chunk_size < 1:
            raise ValueError("max_chunk_size must be a positive integer")

        chunks: List[str] = []
        current_chunk: List[str] = []
        current_length = 0

        for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
            words = sentence.split()
            if not words:
                # Blank input produces one empty "sentence"; emit nothing for it.
                continue

            if len(words) > max_chunk_size:
                # Oversize sentence: close the running chunk, then emit the
                # sentence as fixed-size word windows.
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_length = 0
                for start in range(0, len(words), max_chunk_size):
                    chunks.append(' '.join(words[start:start + max_chunk_size]))
                continue

            if current_length + len(words) > max_chunk_size:
                # The sentence does not fit any more; start a new chunk.
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0

            current_chunk.append(sentence)
            current_length += len(words)

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _estimate_token_length(self, text: str) -> int:
        """Return a rough token count for ``text`` (one token per 4 characters)."""
        # A rough approximation: 1 token ≈ 4 characters in English
        return len(text) // 4

    def _generate_summary(self, text: str, max_new_tokens: int = 500) -> str:
        """Generate a summary for one piece of text.

        Args:
            text: Text to summarize; it is trimmed when the estimated prompt
                length would not leave room for the answer.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The generated answer only, without the prompt, stripped of
            surrounding whitespace.
        """
        prompt = (
            "Please summarize the following research paper excerpt. Focus on key findings, "
            "methodology, and conclusions. Format the summary in clear, concise language:\n\n"
            f"{text}"
        )

        # Check if the prompt might be too long and trim if necessary
        if self._estimate_token_length(prompt) > (self.max_seq_length - max_new_tokens - 100):
            # Cut the text to a safe limit and add an indication it was trimmed
            safe_limit = (self.max_seq_length - max_new_tokens - 200) * 4  # Convert back to chars
            text = text[:safe_limit] + "... [text truncated due to length constraints]"
            prompt = (
                "Please summarize the following research paper excerpt. Focus on key findings, "
                "methodology, conclusions and technical words. Format the summary in clear, concise language:\n\n"
                f"{text}"
            )

        messages = [{"role": "user", "content": prompt}]

        # return_dict=False is spelled out so the call yields a bare tensor of
        # input ids even if the library default for return_dict differs.
        inputs = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=False,
        ).to(self.device)

        # Safety net for the case where the character-based estimate was too
        # optimistic and the tokenized prompt is still over the limit.
        # NOTE(review): this cuts the tail of the templated prompt, which is
        # where the generation prompt sits; the trimming above is meant to
        # keep this branch from triggering.
        if inputs.shape[1] > self.max_seq_length:
            self.logger.warning(f"Input still too long ({inputs.shape[1]} tokens), truncating to {self.max_seq_length}")
            inputs = inputs[:, :self.max_seq_length]

        prompt_length = inputs.shape[1]

        outputs = self.model.generate(
            input_ids=inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            min_p=0.1,
            do_sample=True,
            use_cache=True
        )

        # The returned sequence starts with the prompt tokens. Decode only the
        # continuation instead of searching the decoded text for the word
        # "assistant", which also occurs in ordinary paper text and summaries.
        generated_tokens = outputs[0][prompt_length:]
        return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    def summarize(self, file_path: str) -> Dict:
        """
        Generate a comprehensive summary of a research paper

        Args:
            file_path: Path to the PDF or text file containing the research paper.
                Files whose name ends in ".pdf" (any letter case) are parsed as
                PDF; everything else is read as UTF-8 text.

        Returns:
            On success, a dictionary with the keys "summary",
            "original_length", "summary_length" and "num_chunks_processed".
            On failure, a dictionary with an empty "summary" and an "error"
            message; the error is also logged.
        """
        try:
            # Read and preprocess the paper
            if file_path.lower().endswith('.pdf'):
                text = self.read_pdf(file_path)
            else:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = self._preprocess_text(file.read())

            if not text:
                raise ValueError("No text could be extracted from the file")

            # Split into chunks and summarize each section
            chunks = self._chunk_text(text)
            self.logger.info(f"Processing paper in {len(chunks)} chunks")

            summaries = [
                self._generate_summary(chunk)
                for chunk in tqdm(chunks, desc="Summarizing chunks")
            ]

            # Combine chunk summaries into a final summary
            combined_summary = " ".join(summaries)

            # Generate a final, condensed summary
            final_summary = self._generate_summary(
                "Please provide a concise, well-structured final summary of this research paper: " +
                combined_summary
            )

            return {
                "summary": final_summary,
                "original_length": len(text.split()),
                "summary_length": len(final_summary.split()),
                "num_chunks_processed": len(chunks)
            }

        except Exception as e:
            # Top-level boundary: callers get an error dictionary, not an exception.
            self.logger.error(f"Error in summarization process: {str(e)}")
            return {
                "summary": "",
                "error": str(e)
            }

def main():
    """Summarize the sample paper and print the summary with length statistics."""
    result = ResearchPaperSummarizer().summarize("SAMPLE RESEARCH PAPER.pdf")

    # Failure path: report the error and stop.
    if "error" in result:
        print(f"Error: {result['error']}")
        return

    separator = "-" * 80
    original_words = result["original_length"]
    summary_words = result["summary_length"]

    print("\nSummary:")
    print(separator)
    print(result["summary"])
    print(separator)
    print(f"\nOriginal length: {original_words} words")
    print(f"Summary length: {summary_words} words")
    print(f"Compression ratio: {summary_words/original_words:.2%}")

if __name__ == "__main__":
    main()