<a href="https://colab.research.google.com/github/LohithVarun/Text_Summarization/blob/main/TextSummarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q --no-deps xformers trl peft accelerate bitsandbytes
!pip install git+https://github.com/huggingface/transformers
!pip install PyPDF2
!pip install rouge
!pip install sentencepiece
!pip install unsloth_zoo
!pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
# Install required packages
!pip install unsloth
!pip install PyPDF2
!pip install nltk
!pip install scikit-learn
!pip install torch

In [4]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from typing import List, Dict, Optional
import logging
from PyPDF2 import PdfReader
import re
from tqdm.auto import tqdm
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize
from google.colab import files
import gc

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

class ColabResearchPaperSummarizer:
    def __init__(
        self,
        model_name: str = "unsloth/Llama-3.2-3B-Instruct",
        max_seq_length: int = 1024,  # Reduced for Colab
        load_in_4bit: bool = True
    ):
        """Initialize the research paper summarizer optimized for Google Colab"""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.logger = self._setup_logger()

        # Clear CUDA cache before model loading
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()

        # Initialize model using unsloth with memory-optimized settings
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=max_seq_length,
            dtype=None,  # Auto detect
            load_in_4bit=load_in_4bit,
        )

        # Configure model with reduced LoRA parameters for Colab
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=8,  # Reduced from 16
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
            lora_alpha=8,  # Reduced from 16
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=3407,
        )

        FastLanguageModel.for_inference(self.model)
        self.tokenizer = get_chat_template(self.tokenizer, chat_template="llama-3.1")
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

    def _setup_logger(self) -> logging.Logger:
        """Setup logging configuration"""
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        return logging.getLogger(__name__)

    def read_pdf(self, file_path: str) -> str:
        """Extract text from PDF file with memory-efficient processing"""
        try:
            text = ""
            with open(file_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text()
                    if len(text) > 100000:  # Every 100KB
                        gc.collect()
            return self._preprocess_text(text)
        except Exception as e:
            self.logger.error(f"Error reading PDF file: {str(e)}")
            return ""

    def _preprocess_text(self, text: str) -> str:
        """Clean and preprocess the input text"""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:()\-\']', '', text)
        return text.strip()

    def _chunk_text(self, text: str, max_chunk_size: int = 512) -> List[str]:
        """Split text into smaller chunks for Colab's memory constraints"""
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence.split())
            if current_length + sentence_length <= max_chunk_size:
                current_chunk.append(sentence)
                current_length += sentence_length
            else:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = sentence_length

                # Periodic memory cleanup
                if len(chunks) % 5 == 0:
                    gc.collect()
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _generate_summary(self, text: str, max_new_tokens: int = 250) -> str:
        """Generate summary with proper attention mask"""
        try:
            messages = [{
                "role": "user",
                "content": (
                    "Summarize this research paper excerpt concisely, focusing on key findings "
                    f"and methodology:\n\n{text}"
                )
            }]

            # Generate input_ids and attention_mask
            inputs = self.tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(self.device)

            # Create attention mask (1 for all tokens)
            attention_mask = torch.ones_like(inputs).to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=inputs,
                    attention_mask=attention_mask,
                    max_new_tokens=max_new_tokens,
                    temperature=0.7,
                    min_p=0.1,
                    do_sample=True,
                    use_cache=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            summary = summary.split("assistant")[1].split("<|eot")[0].strip()

            # Clear memory
            del outputs, inputs, attention_mask
            torch.cuda.empty_cache()
            gc.collect()

            return summary

        except RuntimeError as e:
            if "out of memory" in str(e):
                torch.cuda.empty_cache()
                gc.collect()
                return "Error: Out of memory. Try processing a smaller chunk of text."
            raise e

    def analyze_content(self, text: str) -> Dict:
        """Improved content analysis with better key terms extraction"""
        try:
            # Basic metrics
            sentences = sent_tokenize(text)
            words = text.split()

            # Improved key terms extraction
            technical_text = re.sub(r'[^\w\s]', ' ', text.lower())
            technical_text = re.sub(r'\s+', ' ', technical_text).strip()

            # Create word sequences (unigrams and bigrams)
            words = technical_text.split()
            unigrams = [word for word in words if len(word) > 2]
            bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words)-1)]

            # Use TF-IDF on both unigrams and bigrams
            all_terms = unigrams + bigrams
            if len(all_terms) > 0:
                vectorizer = TfidfVectorizer(
                    max_features=1000,
                    stop_words='english',
                    ngram_range=(1, 2)
                )

                try:
                    tfidf_matrix = vectorizer.fit_transform([' '.join(all_terms)])
                    feature_names = np.array(vectorizer.get_feature_names_out())
                    tfidf_scores = tfidf_matrix.toarray()[0]

                    # Get top terms (combining unigrams and bigrams)
                    top_indices = tfidf_scores.argsort()[-10:][::-1]
                    key_terms = feature_names[top_indices].tolist()
                except ValueError:
                    key_terms = all_terms[:10]  # Fallback to simple frequency
            else:
                key_terms = []

            # Clear some memory
            gc.collect()

            return {
                'key_terms': key_terms,
                'avg_sentence_length': len(words) / max(len(sentences), 1),
                'total_sentences': len(sentences)
            }
        except Exception as e:
            self.logger.error(f"Analysis error: {str(e)}")
            return {'key_terms': [], 'avg_sentence_length': 0, 'total_sentences': 0}

    def summarize(self, file_path: str) -> Dict:
        """Generate summary and analysis with Colab optimization"""
        try:
            # Read and preprocess
            if file_path.endswith('.pdf'):
                text = self.read_pdf(file_path)
            else:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = self._preprocess_text(file.read())

            if not text:
                raise ValueError("No text could be extracted from the file")

            # Process in smaller chunks
            chunks = self._chunk_text(text)
            self.logger.info(f"Processing paper in {len(chunks)} chunks")

            summaries = []
            for i, chunk in enumerate(tqdm(chunks, desc="Summarizing")):
                summary = self._generate_summary(chunk)
                summaries.append(summary)

                # Memory cleanup every few chunks
                if i % 3 == 0:
                    gc.collect()
                    torch.cuda.empty_cache()

            # Combine summaries
            combined_summary = " ".join(summaries)

            # Generate final summary
            final_summary = self._generate_summary(
                "Provide a concise final summary of this research paper: " +
                combined_summary[:1500]  # Limit length for final summary
            )

            # Analyze content
            analysis = self.analyze_content(text)

            return {
                "summary": final_summary,
                "key_terms": analysis.get('key_terms', []),
                "statistics": {
                    "original_length": len(text.split()),
                    "summary_length": len(final_summary.split()),
                    "compression_ratio": len(final_summary.split()) / len(text.split()),
                    "avg_sentence_length": analysis.get('avg_sentence_length', 0),
                    "total_sentences": analysis.get('total_sentences', 0)
                }
            }

        except Exception as e:
            self.logger.error(f"Summarization error: {str(e)}")
            return {"error": str(e)}

def main():
    print("Initializing summarizer...")
    summarizer = ColabResearchPaperSummarizer()

    print("\nUpload your research paper (PDF or text file)...")
    uploaded = files.upload()

    file_name = list(uploaded.keys())[0]
    print(f"\nProcessing {file_name}...")

    result = summarizer.summarize(file_name)

    if "error" in result:
        print(f"\nError: {result['error']}")
    else:
        print("\nSummary:")
        print("-" * 80)
        print(result["summary"])
        print("-" * 80)

        print("\nKey Technical Terms:")
        print(", ".join(result["key_terms"]))

        print("\nStatistics:")
        for key, value in result["statistics"].items():
            if isinstance(value, float):
                print(f"{key}: {value:.2%}")
            else:
                print(f"{key}: {value}")

if __name__ == "__main__":
    main()

Initializing summarizer...
==((====))==  Unsloth 2025.2.14: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Upload your research paper (PDF or text file)...


Saving test_nlp.pdf to test_nlp (3).pdf

Processing test_nlp (3).pdf...


Summarizing:   0%|          | 0/8 [00:00<?, ?it/s]


Summary:
--------------------------------------------------------------------------------
Here's a concise summary of the research paper excerpt:

**Key Findings:**

* The study successfully uses logistic regression and the Effective Word Score heuristic to improve sentiment analysis accuracy on Twitter data.
* The approach can classify tweets into positive, negative, or neutral sentiments.

**Methodology:**

* The study employs unigram feature vectors and k-fold cross-validation to evaluate model accuracy.
* Tweet subjectivity is used to select precise training samples.
* The logistic regression model classifies tweets, and the Effective Word Score heuristic is used to identify frequently used words with polarity scores.
* The study aims to address the challenges of sentiment analysis on Twitter data due to its short length and informal nature.
--------------------------------------------------------------------------------

Key Technical Terms:
sentiment, analysis, twitter, used, se