<a href="https://colab.research.google.com/github/LohithVarun/Text_Summarization/blob/main/Updetd_Text_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup (shell commands executed through IPython's `!` escape).
# First line: quiet install of the listed packages with --no-deps, i.e. without
# letting pip pull in their transitive dependencies.
!pip install -q --no-deps xformers trl peft accelerate bitsandbytes
# transformers is installed from the GitHub repository rather than from PyPI.
!pip install git+https://github.com/huggingface/transformers
# PDF text extraction (imported later as `from PyPDF2 import PdfReader`).
!pip install PyPDF2
# NOTE(review): `rouge` and `sentencepiece` are not imported by any cell visible
# in this notebook -- confirm they are still needed.
!pip install rouge
!pip install sentencepiece
!pip install unsloth_zoo
# unsloth itself from GitHub: upgrade in place, bypass the pip cache, skip deps.
!pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [5]:
# Install required packages
# NOTE(review): none of these installs pin a version, so a fresh run may resolve
# to different releases than the ones shown in the captured output below.
# NOTE(review): `unsloth` and `PyPDF2` are also installed by the previous cell;
# presumably this PyPI install of unsloth replaces or duplicates the GitHub
# install done there -- confirm which one is intended.
!pip install unsloth
!pip install PyPDF2
# nltk: sentence tokenizing, POS tagging and stop words used by the summarizer.
!pip install nltk
# scikit-learn: provides TfidfVectorizer for key-term ranking.
!pip install scikit-learn
!pip install torch

Collecting unsloth
  Downloading unsloth-2025.2.15-py3-none-any.whl.metadata (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.2.7 (from unsloth)
  Downloading unsloth_zoo-2025.2.7-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.16-py3-none-any.whl.metadata (9.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Downloading t

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
import nltk
# Fetch two NLTK data packages into the local nltk_data directory.
# Presumably these are the resources that `sent_tokenize` and `nltk.pos_tag`
# load at runtime in the main cell -- TODO confirm against the installed NLTK.
nltk.download('punkt_tab')
# Last expression of the cell, so its boolean return value is displayed.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [8]:
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from typing import List, Dict, Optional
import logging
from PyPDF2 import PdfReader
import re
from tqdm.auto import tqdm
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize
from google.colab import files
import gc

# Download required NLTK data (sentence tokenizer, POS tagger, stop words).
# Both the legacy resource names and the 'punkt_tab' /
# 'averaged_perceptron_tagger_eng' names requested by the earlier download cell
# are fetched here, so this cell no longer depends on that cell having been run.
# nltk.download() reports failure through its return value instead of raising,
# so a name unknown to the installed NLTK data index is simply skipped.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(_nltk_resource, quiet=True)

def print_section_header(title: str, underline_char: str = "=") -> None:
    """Write a blank line, the title, and an underline of matching length.

    Args:
        title: Heading text to display.
        underline_char: String repeated once per character of ``title`` to
            build the underline row.
    """
    underline = underline_char * len(title)
    # One print call emits all three rows: empty line, title, underline.
    print("", title, underline, sep="\n")

def print_formatted_text(text: str, indent: int = 0, width: int = 80) -> None:
    """Print text word-wrapped to a maximum line width, with optional indentation.

    The text is split on newlines into paragraphs; each paragraph is re-wrapped
    greedily on whitespace. Paragraphs without any words produce no output line,
    regardless of the indent. A single blank line is printed after all text.

    Args:
        text: Text to print; "\\n" separates paragraphs.
        indent: Number of spaces prefixed to every printed line.
        width: Maximum length of a printed line, indentation included. A word
            that cannot fit on a line by itself is printed alone on an
            over-long line rather than being broken.
    """
    indent_str = " " * indent
    for paragraph in text.split("\n"):
        line_words = []
        line_length = len(indent_str)
        for word in paragraph.split():
            # The joining space only costs a column when the line already
            # holds a word.
            needed = len(word) + (1 if line_words else 0)
            # Only wrap when there is something to flush: an over-long word on
            # an empty line must not emit a blank line first.
            if line_words and line_length + needed > width:
                print(indent_str + " ".join(line_words))
                line_words = [word]
                line_length = len(indent_str) + len(word)
            else:
                line_words.append(word)
                line_length += needed
        if line_words:
            print(indent_str + " ".join(line_words))
    print()

def create_progress_bar(title: str, total: int) -> tqdm:
    """Build a tqdm progress bar with this notebook's house formatting.

    Args:
        title: Label shown in front of the bar (prefixed with an arrow glyph).
        total: Number of expected iterations.

    Returns:
        A new tqdm instance, 80 columns wide with a 30-character bar.
    """
    bar_options = {
        "total": total,
        "desc": f"► {title}",
        "bar_format": "{l_bar}{bar:30}{r_bar}",
        "ncols": 80,
    }
    return tqdm(**bar_options)

class ColabResearchPaperSummarizer:
    """Summarize a research paper (PDF or plain-text file) with a chat LLM.

    Pipeline implemented by :meth:`summarize`: read and clean the document,
    split it into word-bounded chunks, summarize each chunk with the model,
    summarize the concatenated chunk summaries once more, and compute simple
    key-term / sentence statistics on the cleaned text.
    """

    # Placeholder string returned by _generate_summary when generation hits a
    # CUDA out-of-memory error; summarize() uses it to recognise failed chunks.
    _OOM_MESSAGE = "Error: Out of memory. Try processing a smaller chunk of text."

    def __init__(
        self,
        model_name: str = "unsloth/Llama-3.2-3B-Instruct",
        max_seq_length: int = 1024,
        load_in_4bit: bool = True
    ):
        """Initialize the research paper summarizer optimized for Google Colab.

        Args:
            model_name: Model identifier passed to
                ``FastLanguageModel.from_pretrained``.
            max_seq_length: Maximum sequence length passed to the loader.
            load_in_4bit: Forwarded to the loader's ``load_in_4bit`` option.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.logger = self._setup_logger()

        print_section_header("Initializing Model", "-")
        print_formatted_text(f"Using device: {self.device}")
        print_formatted_text(f"Model: {model_name}")

        # Release any cached GPU memory left by earlier cells before loading.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()

        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=max_seq_length,
            dtype=None,
            load_in_4bit=load_in_4bit,
        )

        # NOTE(review): LoRA adapters are attached here although nothing in
        # this class trains them -- confirm they are needed for inference-only
        # use.
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=8,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
            lora_alpha=8,
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=3407,
        )

        FastLanguageModel.for_inference(self.model)
        self.tokenizer = get_chat_template(self.tokenizer, chat_template="llama-3.1")
        # Not used by any method of this class; kept so external code reading
        # the attribute keeps working.
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

    def _setup_logger(self) -> logging.Logger:
        """Configure root logging at INFO level and return this module's logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        return logging.getLogger(__name__)

    def read_pdf(self, file_path: str) -> str:
        """Extract and clean the text of every page of a PDF file.

        Args:
            file_path: Path of the PDF to read.

        Returns:
            The preprocessed text of all pages, or "" when reading fails (the
            error is logged rather than raised).
        """
        try:
            page_texts = []
            with open(file_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                for page in tqdm(pdf_reader.pages, desc="Reading PDF"):
                    # Guard against a page yielding no text object at all.
                    page_texts.append(page.extract_text() or "")
            # Join pages with a newline so the last word of one page is not
            # fused with the first word of the next; _preprocess_text turns
            # the newline into a single space.
            return self._preprocess_text("\n".join(page_texts))
        except Exception as e:
            self.logger.error(f"Error reading PDF file: {str(e)}")
            return ""

    def _preprocess_text(self, text: str) -> str:
        """Collapse whitespace runs and drop unsupported characters.

        Keeps word characters, whitespace and the punctuation ``. , ! ? ; : ( )
        - '``; everything else is removed. The result is stripped.
        """
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:()\-\']', '', text)
        return text.strip()

    def _chunk_text(self, text: str, max_chunk_size: int = 512) -> List[str]:
        """Split text into chunks of at most ``max_chunk_size`` words.

        Sentences (split after ``.``, ``!`` or ``?``) are packed greedily into
        chunks. A sentence that alone exceeds the limit is cut into consecutive
        word windows so that no chunk is larger than the limit.

        Args:
            text: Text to split.
            max_chunk_size: Maximum number of whitespace-separated words per
                chunk; must be at least 1.

        Returns:
            The list of chunks; empty when ``text`` contains no words.

        Raises:
            ValueError: If ``max_chunk_size`` is smaller than 1.
        """
        if max_chunk_size < 1:
            raise ValueError("max_chunk_size must be at least 1")

        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in re.split(r'(?<=[.!?])\s+', text):
            words = sentence.split()
            if not words:
                continue

            if len(words) <= max_chunk_size:
                # Keep the sentence text exactly as it appeared.
                pieces = [(sentence, len(words))]
            else:
                # Over-long "sentence" (e.g. text without punctuation): cut it
                # into windows of max_chunk_size words.
                pieces = [
                    (' '.join(words[start:start + max_chunk_size]),
                     len(words[start:start + max_chunk_size]))
                    for start in range(0, len(words), max_chunk_size)
                ]

            for piece_text, piece_length in pieces:
                if current_chunk and current_length + piece_length > max_chunk_size:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_length = 0
                current_chunk.append(piece_text)
                current_length += piece_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _generate_summary(self, text: str, max_new_tokens: int = 250) -> str:
        """Ask the model for a summary of ``text`` and return only its reply.

        The text is wrapped in a single user message, rendered with the
        tokenizer's chat template and sampled with temperature 0.7 / min_p 0.1.

        Args:
            text: Excerpt to summarize.
            max_new_tokens: Upper bound on generated tokens.

        Returns:
            The decoded model reply, or the out-of-memory placeholder message
            when generation fails with an "out of memory" RuntimeError.

        Raises:
            RuntimeError: Any runtime error other than out-of-memory.
        """
        try:
            messages = [{
                "role": "user",
                "content": (
                    "Summarize this research paper excerpt concisely, focusing on key findings, "
                    f"the problem tackled, and methodology:\n\n{text}"
                )
            }]

            inputs = self.tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(self.device)

            # Single un-padded sequence, so every position is attended to.
            attention_mask = torch.ones_like(inputs).to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=inputs,
                    attention_mask=attention_mask,
                    max_new_tokens=max_new_tokens,
                    temperature=0.7,
                    min_p=0.1,
                    do_sample=True,
                    use_cache=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            # The generated sequence starts with the prompt tokens. Decode only
            # what follows them instead of searching the decoded text for the
            # word "assistant", which also occurs in ordinary paper text and
            # may be missing altogether.
            prompt_length = inputs.shape[-1]
            generated_tokens = outputs[0][prompt_length:]
            summary = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
            # Defensive: trim at an end-of-turn marker should one survive
            # decoding as plain text.
            summary = summary.split("<|eot")[0].strip()

            del outputs, inputs, attention_mask, generated_tokens
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()

            return summary

        except RuntimeError as e:
            if "out of memory" in str(e):
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                gc.collect()
                return self._OOM_MESSAGE
            raise

    def analyze_content(self, text: str) -> Dict:
        """Compute key terms and sentence statistics for ``text``.

        Key terms are the TF-IDF-ranked vocabulary (at most 10 features) of the
        noun phrases found in the stop-word-filtered text. When there are no
        noun phrases, or TF-IDF cannot build a vocabulary from them, the 10
        most frequent filtered words are used instead.

        Args:
            text: Cleaned document text.

        Returns:
            Dict with ``key_terms`` (list of str), ``avg_sentence_length``
            (words per sentence) and ``total_sentences``. On any error the
            error is logged and empty/zero values are returned.
        """
        try:
            sentences = sent_tokenize(text)
            words = text.split()

            stop_words = set(nltk.corpus.stopwords.words('english'))
            # Generic academic filler words that would otherwise dominate.
            stop_words.update([
                "used", "using", "approach", "method", "study",
                "paper", "result", "results"
            ])

            technical_text = re.sub(r'[^\w\s]', ' ', text.lower())
            technical_text = re.sub(r'\s+', ' ', technical_text).strip()
            filtered_words = [
                word for word in technical_text.split()
                if word not in stop_words and len(word) > 2
            ]

            tagged_words = nltk.pos_tag(filtered_words)
            # Noun phrase = optional adjectives followed by one or more nouns.
            chunk_parser = nltk.RegexpParser("""
                NP: {<JJ.*>*<NN.*>+}
            """)
            tree = chunk_parser.parse(tagged_words)
            noun_phrases = [
                ' '.join(w[0] for w in subtree.leaves())
                for subtree in tree.subtrees()
                if subtree.label() == 'NP'
            ]

            key_terms = None
            if noun_phrases:
                try:
                    vectorizer = TfidfVectorizer(max_features=10, stop_words='english')
                    tfidf_matrix = vectorizer.fit_transform(noun_phrases)
                    feature_names = vectorizer.get_feature_names_out()
                    tfidf_scores = tfidf_matrix.toarray().sum(axis=0)
                    top_indices = tfidf_scores.argsort()[::-1]
                    key_terms = [str(feature_names[i]) for i in top_indices]
                except ValueError:
                    # Raised e.g. when every noun-phrase token is filtered out
                    # and the vocabulary ends up empty; use the frequency
                    # fallback below instead of losing all statistics.
                    key_terms = None

            if key_terms is None:
                word_counts = Counter(filtered_words)
                key_terms = [word for word, count in word_counts.most_common(10)]

            gc.collect()

            return {
                'key_terms': key_terms,
                'avg_sentence_length': len(words) / max(len(sentences), 1),
                'total_sentences': len(sentences)
            }
        except Exception as e:
            self.logger.error(f"Analysis error: {str(e)}")
            return {'key_terms': [], 'avg_sentence_length': 0, 'total_sentences': 0}

    def summarize(self, file_path: str) -> Dict:
        """Summarize the document at ``file_path`` and gather statistics.

        Files whose name ends in ``.pdf`` (case-insensitive) are read with
        :meth:`read_pdf`; anything else is read as UTF-8 text.

        Args:
            file_path: Path of the PDF or text file to process.

        Returns:
            On success a dict with ``summary``, ``key_terms`` and
            ``statistics``. On any failure the error is logged and
            ``{"error": <message>}`` is returned instead of raising.
        """
        try:
            print_section_header("Reading Document", "-")
            if file_path.lower().endswith('.pdf'):
                text = self.read_pdf(file_path)
            else:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = self._preprocess_text(file.read())

            if not text:
                raise ValueError("No text could be extracted from the file")

            chunks = self._chunk_text(text)
            print_formatted_text(f"Processing paper in {len(chunks)} chunks")

            print_section_header("Generating Summary", "-")
            summaries = []
            for chunk in tqdm(chunks, desc="Summarizing"):
                summary = self._generate_summary(chunk)
                # Leave failed or empty chunk results out so the error
                # placeholder never becomes part of the text being summarized.
                if not summary or summary == self._OOM_MESSAGE:
                    self.logger.warning("Skipping a chunk that produced no summary")
                    continue
                summaries.append(summary)

            if not summaries:
                raise ValueError("No chunk summaries could be generated")

            combined_summary = " ".join(summaries)

            print_section_header("Generating Final Summary", "-")
            # NOTE(review): only the first 1500 characters of the combined
            # chunk summaries are passed on, so later chunks may not influence
            # the final summary -- confirm this budget is intended.
            final_summary = self._generate_summary(
                "Provide a concise final summary of this research paper: " +
                combined_summary[:1500]
            )
            if final_summary == self._OOM_MESSAGE:
                raise RuntimeError(final_summary)

            print_section_header("Analyzing Content", "-")
            analysis = self.analyze_content(text)

            original_length = len(text.split())
            summary_length = len(final_summary.split())

            return {
                "summary": final_summary,
                "key_terms": analysis.get('key_terms', []),
                "statistics": {
                    "original_length": original_length,
                    "summary_length": summary_length,
                    "compression_ratio": summary_length / max(original_length, 1),
                    "avg_sentence_length": analysis.get('avg_sentence_length', 0),
                    "total_sentences": analysis.get('total_sentences', 0)
                }
            }

        except Exception as e:
            self.logger.error(f"Summarization error: {str(e)}")
            return {"error": str(e)}

def main():
    """Interactive Colab entry point: upload a paper, summarize it, print a report.

    Builds the summarizer, asks for a file upload, runs the summarization on
    the first uploaded file, and prints either the error message or the
    summary, key terms and statistics table.
    """
    print_section_header("Research Paper Summarizer", "=")
    print_formatted_text("Welcome to the Research Paper Summarizer!")

    summarizer = ColabResearchPaperSummarizer()

    print_section_header("File Upload", "-")
    print_formatted_text("Please upload your research paper (PDF or text file)...")
    uploaded = files.upload()
    if not uploaded:
        print_formatted_text("No file was uploaded. Exiting...")
        return

    # Only the first uploaded entry is processed; its key is used as the path.
    file_name = next(iter(uploaded))
    print_section_header(f"Processing {file_name}", "-")
    result = summarizer.summarize(file_name)

    # Failure path: summarize() reports problems through an "error" key.
    if "error" in result:
        print_section_header("Error", "!")
        print_formatted_text(result["error"])
        return

    print_section_header("Summary")
    print_formatted_text(result["summary"])

    print_section_header("Key Technical Terms")
    print_formatted_text(", ".join(result["key_terms"]), indent=2)

    print_section_header("Statistics")
    stats = result["statistics"]
    rows = [
        ("Original Length", stats["original_length"]),
        ("Summary Length", stats["summary_length"]),
        ("Compression Ratio", f"{stats['compression_ratio']:.2%}"),
        ("Average Sentence Length", f"{stats['avg_sentence_length']:.2f}"),
        ("Total Sentences", stats["total_sentences"]),
    ]

    # Left-align labels to the widest one so the colons line up.
    label_width = max(len(label) for label, _ in rows)
    for label, value in rows:
        print(f"  {label:<{label_width}} : {value}")

if __name__ == "__main__":
    main()


Research Paper Summarizer
Welcome to the Research Paper Summarizer!


Initializing Model
------------------
Using device: cuda

Model: unsloth/Llama-3.2-3B-Instruct

==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

File Upload
-----------
Please upload your research paper (PDF or text file)...



Saving test_nlp.pdf to test_nlp (6).pdf

Processing test_nlp (6).pdf
---------------------------

Reading Document
----------------


Reading PDF:   0%|          | 0/5 [00:00<?, ?it/s]

Processing paper in 8 chunks


Generating Summary
------------------


Summarizing:   0%|          | 0/8 [00:00<?, ?it/s]


Generating Final Summary
------------------------

Analyzing Content
-----------------

Summary
Here's a concise summary of the research paper excerpt:
**Problem Tackled:** Sentiment analysis in tweets, a challenging task due to the
noisy nature of microblogging data, with applications in opinion mining,
sentiment analysis, and social media monitoring.
**Key Findings:**
* A proposed approach using logistic regression classification and the Effective
Word Score heuristic achieves accurate sentiment classification (positive,
negative, or neutral).
* The Effective Word Score heuristic speeds up the classification process.
**Methodology:**
* Logistic regression classification is used as the classifier.
* The Effective Word Score heuristic is used to find word polarity scores.
* k-fold cross-validation is used to evaluate accuracy.
* Unigrams (single words) are used as feature vectors to represent tweets.


Key Technical Terms
  sentiment, analysis, classification, tweets, sentiments, twit